Skip to content

[SYCL][Fusion] Enable fusion of rounded-range kernels #12492

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions sycl/doc/design/KernelFusionJIT.md
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,18 @@ During the fusion process at runtime, the JIT will load the LLVM IR and
finalize the fused kernel to the final target. More information is available
[here](./CompilerAndRuntimeDesign.md#kernel-fusion-support).

### Interaction with `parallel_for` range rounding

DPCPP's [range rounding](./ParallelForRangeRounding.md) transformation is
transparent for fusion, meaning the generated wrapper kernel with the rounded up
range will be used.

[Private internalization](#internalization-behavior) is supported when fusing
such kernels. We use the original, unrounded global size in dimension 0 when
computing the private memory size. As range rounding only applies to basic
kernels (parametrized by a `sycl::range`), local internalization is not affected
by the range rounding transformation.

### Unsupported SYCL constructs

The following SYCL API constructs are currently not officially supported for
Expand Down
56 changes: 44 additions & 12 deletions sycl/source/detail/jit_compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,10 +203,17 @@ static Promotion getInternalizationInfo(Requirement *Req) {
return (AccPromotion != Promotion::None) ? AccPromotion : BuffPromotion;
}

static std::optional<size_t> getLocalSize(NDRDescT NDRange, Requirement *Req,
Promotion Target) {
static std::optional<size_t> getLocalSize(NDRDescT NDRange,
std::optional<size_t> UserGlobalSize,
Requirement *Req, Promotion Target) {
assert((!UserGlobalSize.has_value() || Target != Promotion::Local) &&
"Unexpected range rounding");
auto NumElementsMem = static_cast<SYCLMemObjT *>(Req->MSYCLMemObj)->size();
if (Target == Promotion::Private) {
if (UserGlobalSize.has_value()) {
// Only the first dimension is affected by range rounding.
NDRange.GlobalSize[0] = *UserGlobalSize;
}
auto NumWorkItems = NDRange.GlobalSize.size();
// For private internalization, the local size is
// (Number of elements in buffer)/(number of work-items)
Expand Down Expand Up @@ -237,13 +244,15 @@ static bool accessorEquals(Requirement *Req, Requirement *Other) {

static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,
unsigned ArgFunctionIndex, NDRDescT NDRange,
std::optional<size_t> UserGlobalSize,
PromotionMap &Promotions) {
assert(Arg.MType == kernel_param_kind_t::kind_accessor);

Requirement *Req = static_cast<Requirement *>(Arg.MPtr);

auto ThisPromotionTarget = getInternalizationInfo(Req);
auto ThisLocalSize = getLocalSize(NDRange, Req, ThisPromotionTarget);
auto ThisLocalSize =
getLocalSize(NDRange, UserGlobalSize, Req, ThisPromotionTarget);

if (Promotions.count(Req->MSYCLMemObj)) {
// We previously encountered an accessor for the same buffer.
Expand Down Expand Up @@ -278,7 +287,7 @@ static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,
// Recompute the local size for the previous definition with adapted
// promotion target.
auto NewPrevLocalSize =
getLocalSize(PreviousDefinition.NDRange,
getLocalSize(PreviousDefinition.NDRange, std::nullopt,
PreviousDefinition.Definition, Promotion::Local);

if (!NewPrevLocalSize.has_value()) {
Expand Down Expand Up @@ -316,7 +325,8 @@ static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,

if (PreviousDefinition.PromotionTarget == Promotion::Local) {
// Recompute the local size with adapted promotion target.
auto ThisLocalSize = getLocalSize(NDRange, Req, Promotion::Local);
auto ThisLocalSize =
getLocalSize(NDRange, std::nullopt, Req, Promotion::Local);
if (!ThisLocalSize.has_value()) {
printPerformanceWarning("Work-group size for local promotion not "
"specified, not performing internalization");
Expand Down Expand Up @@ -591,11 +601,12 @@ updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
// argument is later on passed to the kernel.
const size_t SizeAccField =
sizeof(size_t) * (Req->MDims == 0 ? 1 : Req->MDims);
// Compute the local size and use it for the range parameters.
auto LocalSize = getLocalSize(NDRange, Req,
(PromotedToPrivate) ? Promotion::Private
: Promotion::Local);
range<3> AccessRange{1, 1, LocalSize.value()};
// Compute the local size and use it for the range parameters (only
// relevant for local promotion).
size_t LocalSize = PromotedToLocal ? *getLocalSize(NDRange, std::nullopt,
Req, Promotion::Local)
: 0;
range<3> AccessRange{1, 1, LocalSize};
auto *RangeArg = storePlainArg(FusedArgStorage, AccessRange);
// Use all-zero as the offset
id<3> AcessOffset{0, 0, 0};
Expand All @@ -604,7 +615,7 @@ updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
// Override the arguments.
// 1. Override the pointer with a std-layout argument with 'nullptr' as
// value. handler.cpp does the same for local accessors.
int SizeInBytes = Req->MElemSize * LocalSize.value();
int SizeInBytes = Req->MElemSize * LocalSize;
FusedArgs[ArgIndex] =
ArgDesc{kernel_param_kind_t::kind_std_layout, nullptr, SizeInBytes,
static_cast<int>(ArgIndex)};
Expand Down Expand Up @@ -694,6 +705,26 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
return A.MIndex < B.MIndex;
});

// Determine whether the kernel has been subject to DPCPP's range rounding.
// If so, the first argument will be the original ("user") range.
std::optional<size_t> UserGlobalSize;
if ((KernelName.find("_ZTSN4sycl3_V16detail18RoundedRangeKernel") == 0 ||
KernelName.find("_ZTSN4sycl3_V16detail19__pf_kernel_wrapper") == 0) &&
!Args.empty()) {
auto &A0 = Args[0];
auto Dims = KernelCG->MNDRDesc.Dims;
assert(A0.MPtr && A0.MSize == static_cast<int>(Dims * sizeof(size_t)) &&
A0.MType == kernel_param_kind_t::kind_std_layout &&
"Unexpected signature for rounded range kernel");

size_t *UGS = reinterpret_cast<size_t *>(A0.MPtr);
// Range-rounding only applies to the first dimension.
assert(UGS[0] > KernelCG->MNDRDesc.GlobalSize[1]);
assert(Dims < 2 || UGS[1] == KernelCG->MNDRDesc.GlobalSize[1]);
assert(Dims < 3 || UGS[2] == KernelCG->MNDRDesc.GlobalSize[2]);
UserGlobalSize = UGS[0];
}

::jit_compiler::SYCLArgumentDescriptor ArgDescriptor{Args.size()};
size_t ArgIndex = 0;
// The kernel function in SPIR-V will only have the non-eliminated
Expand All @@ -719,7 +750,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
if (!Eliminated) {
if (Arg.MType == kernel_param_kind_t::kind_accessor) {
resolveInternalization(Arg, KernelIndex, ArgFunctionIndex,
KernelCG->MNDRDesc, PromotedAccs);
KernelCG->MNDRDesc, UserGlobalSize,
PromotedAccs);
}
FusedParams.emplace_back(Arg, KernelIndex, ArgFunctionIndex, true);
++ArgFunctionIndex;
Expand Down
11 changes: 10 additions & 1 deletion sycl/test-e2e/KernelFusion/different_nd_ranges.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// RUN: %{build} -o %t.out
// RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s
// RUN: env SYCL_RT_WARNING_LEVEL=1 SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=16:32:64 \
// RUN: %{run} %t.out 2>&1 | FileCheck %s

// Test complete fusion of kernels with different ND-ranges.

Expand Down Expand Up @@ -262,4 +263,12 @@ int main() {
// 1-D, 2-D and 3-D kernels with different global sizes.
test({RangeDesc{{10}, R5}, RangeDesc{{10, 1}, {5, 1}},
RangeDesc{{10, 1, 1}, {5, 1, 1}}});

// Test global sizes that trigger the rounded range kernel insertion.
// Note that we lower the RR threshold when running this test.
test({RangeDesc{67}, RangeDesc{87}, RangeDesc{64}});

// Test multi-dimensional range-rounded kernels. Only the first dimension will
// be rounded up.
test({RangeDesc{30, 67}, RangeDesc{76, 55}, RangeDesc{64, 64}});
}
20 changes: 15 additions & 5 deletions sycl/test-e2e/KernelFusion/private_internalization.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
// RUN: %{run} %t.out
// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=16:32:512 %{run} %t.out

// Test complete fusion with private internalization specified on the
// accessors.
Expand All @@ -8,8 +8,9 @@

using namespace sycl;

int main() {
constexpr size_t dataSize = 512;
template <typename BaseName, size_t dataSize> class KernelName;

template <size_t dataSize> static void test() {
int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];

for (size_t i = 0; i < dataSize; ++i) {
Expand Down Expand Up @@ -39,7 +40,7 @@ int main() {
auto accIn2 = bIn2.get_access(cgh);
auto accTmp = bTmp.get_access(
cgh, sycl::ext::codeplay::experimental::property::promote_private{});
cgh.parallel_for<class KernelOne>(
cgh.parallel_for<KernelName<class KernelOne, dataSize>>(
dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
});

Expand All @@ -48,7 +49,7 @@ int main() {
cgh, sycl::ext::codeplay::experimental::property::promote_private{});
auto accIn3 = bIn3.get_access(cgh);
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<class KernelTwo>(
cgh.parallel_for<KernelName<class KernelTwo, dataSize>>(
dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
});

Expand All @@ -63,6 +64,15 @@ int main() {
assert(out[i] == (20 * i * i) && "Computation error");
assert(tmp[i] == -1 && "Not internalized");
}
}

int main() {
// Test power-of-two size.
test<512>();

// Test prime size large enough to trigger rounded-range kernel insertion.
// Note that we lower the RR threshold when running this test.
test<523>();

return 0;
}
21 changes: 15 additions & 6 deletions sycl/test-e2e/KernelFusion/two_dimensional.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
// RUN: %{run} %t.out
// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=16:32:64 %{run} %t.out

// Test complete fusion with private internalization specified on the
// accessors for two-dimensional range.
Expand All @@ -8,9 +8,9 @@

using namespace sycl;

int main() {
constexpr size_t sizeX = 16;
constexpr size_t sizeY = 32;
template <typename BaseName, size_t sizeX, size_t sizeY> class KernelName;

template <size_t sizeX, size_t sizeY> static void test() {
constexpr size_t dataSize = sizeX * sizeY;
int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];

Expand Down Expand Up @@ -42,7 +42,7 @@ int main() {
auto accIn2 = bIn2.get_access(cgh);
auto accTmp = bTmp.get_access(
cgh, sycl::ext::codeplay::experimental::property::promote_private{});
cgh.parallel_for<class KernelOne>(
cgh.parallel_for<KernelName<class KernelOne, sizeX, sizeY>>(
xyRange, [=](id<2> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
});

Expand All @@ -51,7 +51,7 @@ int main() {
cgh, sycl::ext::codeplay::experimental::property::promote_private{});
auto accIn3 = bIn3.get_access(cgh);
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<class KernelTwo>(
cgh.parallel_for<KernelName<class KernelTwo, sizeX, sizeY>>(
xyRange, [=](id<2> i) { accOut[i] = accTmp[i] * accIn3[i]; });
});

Expand All @@ -66,6 +66,15 @@ int main() {
assert(out[i] == (20 * i * i) && "Computation error");
assert(tmp[i] == -1 && "Not internalized");
}
}

int main() {
// Test power-of-two size.
test<16, 32>();

// Test prime sizes large enough to trigger rounded-range kernel insertion.
// Note that we lower the RR threshold when running this test.
test<67, 79>();

return 0;
}