Skip to content

Commit

Permalink
[SYCL][Fusion] Test caching of fused kernels (intel/llvm-test-suite#1551
Browse files Browse the repository at this point in the history
)

Check that JIT compilation for kernel fusion is or is not repeated, depending on whether a newly submitted sequence of kernels is equivalent to a previous sequence.

Next to the sequence of kernels, other invocation information, e.g., the user-specified internalization properties play a role to assess equivalence with a previous compilation. Different scenarios are tested by the test added in this PR. 

Implementation: intel#8051

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
  • Loading branch information
sommerlukas authored Feb 7, 2023
1 parent b10f20f commit 0fccf29
Showing 1 changed file with 142 additions and 0 deletions.
142 changes: 142 additions & 0 deletions SYCL/KernelFusion/jit_caching.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %CPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION"
// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %GPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION"
// UNSUPPORTED: cuda || hip
// REQUIRES: fusion

// Test caching for JIT fused kernels. Also test for debug messages being
// printed when SYCL_RT_WARNING_LEVEL=1.

#include <iostream>
#include <sycl/sycl.hpp>

using namespace sycl;

constexpr size_t dataSize = 512;

enum class Internalization { None, Local, Private };

void performFusion(queue &q, Internalization internalize, range<1> globalSize,
int beta, int gamma, bool insertBarriers = false) {
int alpha = 1;
int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];

for (size_t i = 0; i < dataSize; ++i) {
in1[i] = i * 2;
in2[i] = i * 3;
in3[i] = i * 4;
tmp[i] = -1;
out[i] = -1;
}
{
buffer<int> bIn1{in1, globalSize};
buffer<int> bIn2{in2, globalSize};
buffer<int> bIn3{in3, globalSize};
buffer<int> bTmp{tmp, globalSize};
buffer<int> bOut{out, globalSize};

ext::codeplay::experimental::fusion_wrapper fw{q};
fw.start_fusion();

assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");

q.submit([&](handler &cgh) {
auto accIn1 = bIn1.get_access(cgh);
auto accIn2 = bIn2.get_access(cgh);
property_list properties{};
if (internalize == Internalization::Private) {
properties = {
sycl::ext::codeplay::experimental::property::promote_private{}};
} else if (internalize == Internalization::Local) {
properties = {
sycl::ext::codeplay::experimental::property::promote_local{}};
}
accessor<int> accTmp = bTmp.get_access(cgh, properties);
cgh.parallel_for<class KernelOne>(globalSize, [=](id<1> i) {
accTmp[i] = accIn1[i] + accIn2[i] * alpha;
});
});

q.submit([&](handler &cgh) {
property_list properties{};
if (internalize == Internalization::Private) {
properties = {
sycl::ext::codeplay::experimental::property::promote_private{}};
} else if (internalize == Internalization::Local) {
properties = {
sycl::ext::codeplay::experimental::property::promote_local{}};
}
accessor<int> accTmp = bTmp.get_access(cgh, properties);
auto accIn3 = bIn3.get_access(cgh);
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<class KernelTwo>(globalSize, [=](id<1> i) {
accOut[i] = accTmp[i] * accIn3[i] * beta * gamma;
});
});

if (insertBarriers) {
fw.complete_fusion();
} else {
fw.complete_fusion(
{ext::codeplay::experimental::property::no_barriers{}});
}

assert(!fw.is_in_fusion_mode() &&
"Queue should not be in fusion mode anymore");
}

// Check the results
size_t numErrors = 0;
size_t numInternalized = 0;
for (size_t i = 0; i < dataSize; ++i) {
if (i < globalSize.size() && out[i] != (20 * i * i * beta * gamma)) {
++numErrors;
}
if (tmp[i] == -1) {
++numInternalized;
}
}
if (numErrors) {
std::cout << "COMPUTATION ERROR\n";
}
if ((internalize == Internalization::None) && numInternalized) {
std::cout << "WRONG INTERNALIZATION\n";
}
}

int main() {
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};

// Initial invocation
performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 1);
// CHECK: JIT DEBUG: Compiling new kernel, no suitable cached kernel found

// Identical invocation, should lead to JIT cache hit.
performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 1);
// CHECK-NEXT: JIT DEBUG: Re-using cached JIT kernel
// CHECK-NEXT: INFO: Re-using existing device binary for fused kernel

// Invocation with a different beta. Because beta was identical to alpha so
// far, this should lead to a cache miss.
performFusion(q, Internalization::Private, range<1>{dataSize}, 2, 1);
// CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found

// Invocation with barrier insertion should lead to a cache miss.
performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 1,
/* insertBarriers */ true);
// CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found

// Invocation with different internalization target should lead to a cache
// miss.
performFusion(q, Internalization::None, range<1>{dataSize}, 1, 1);
// CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found

// Invocation with a different gamma should lead to a cache miss because gamma
// participates in constant propagation.
performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 2);
// CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found

return 0;
}

0 comments on commit 0fccf29

Please sign in to comment.