diff --git a/clang/include/clang/Driver/Action.h b/clang/include/clang/Driver/Action.h
index 2c9f59cdde9b0..45e9133b7ed13 100644
--- a/clang/include/clang/Driver/Action.h
+++ b/clang/include/clang/Driver/Action.h
@@ -660,9 +660,14 @@ class OffloadUnbundlingJobAction final : public JobAction {
 class OffloadWrapperJobAction : public JobAction {
   void anchor() override;
 
+  bool EmbedIR;
+
 public:
   OffloadWrapperJobAction(ActionList &Inputs, types::ID Type);
-  OffloadWrapperJobAction(Action *Input, types::ID OutputType);
+  OffloadWrapperJobAction(Action *Input, types::ID OutputType,
+                          bool EmbedIR = false);
+
+  bool isEmbeddedIR() const { return EmbedIR; }
 
   static bool classof(const Action *A) {
     return A->getKind() == OffloadWrapperJobClass;
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 4ef2167f9debf..9fb719ef46469 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2973,6 +2973,8 @@ def fintelfpga : Flag<["-"], "fintelfpga">, Group<f_Group>,
   HelpText<"Perform ahead-of-time compilation for FPGA">;
 def fsycl_device_only : Flag<["-"], "fsycl-device-only">, Flags<[CoreOption]>,
   HelpText<"Compile SYCL kernels for device">;
+def fsycl_embed_ir : Flag<["-"], "fsycl-embed-ir">, Flags<[CoreOption]>,
+  HelpText<"Embed LLVM IR for runtime kernel fusion">;
 defm sycl_esimd_force_stateless_mem : BoolFOption<"sycl-esimd-force-stateless-mem",
     LangOpts<"SYCLESIMDForceStatelessMem">, DefaultFalse,
     PosFlag<SetTrue, [], "Enforce using stateless memory accesses. "
diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp
index e8c29e7a2f147..4cb0225cad293 100644
--- a/clang/lib/Driver/Action.cpp
+++ b/clang/lib/Driver/Action.cpp
@@ -478,11 +478,11 @@ void OffloadWrapperJobAction::anchor() {}
 
 OffloadWrapperJobAction::OffloadWrapperJobAction(ActionList &Inputs,
                                                  types::ID Type)
-  : JobAction(OffloadWrapperJobClass, Inputs, Type) {}
+    : JobAction(OffloadWrapperJobClass, Inputs, Type), EmbedIR(false) {}
 
-OffloadWrapperJobAction::OffloadWrapperJobAction(Action *Input,
-                                                 types::ID Type)
-    : JobAction(OffloadWrapperJobClass, Input, Type) {}
+OffloadWrapperJobAction::OffloadWrapperJobAction(Action *Input, types::ID Type,
+                                                 bool IsEmbeddedIR)
+    : JobAction(OffloadWrapperJobClass, Input, Type), EmbedIR(IsEmbeddedIR) {}
 
 void OffloadPackagerJobAction::anchor() {}
 
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 811fb82240ff6..235fa30d8251e 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -5516,6 +5516,8 @@ class OffloadingActionBuilder final {
         //   s - device code split requested
         //   r - relocatable device code is requested
         //   f - link object output type is TY_Tempfilelist (fat archive)
+        //   e - Embedded IR for fusion (-fsycl-embed-ir) was requested
+        //       and target is NVPTX.
         //   * - "all other cases"
         //     - no condition means output/input is "always" present
         // First symbol indicates output/input type
@@ -5535,58 +5537,58 @@ class OffloadingActionBuilder final {
         //                |             |
         //                |             |
         //         .---------------------------------------.
-        //         |               PostLink                |
-        //         .---------------------------------------.
-        //                           [+*]                [+]
-        //                             |                  |
-        //                             |                  |
-        //                             |---------         |
-        //                             |        |         |
-        //                             |        |         |
-        //                             |      [+!rf]      |
-        //                             |  .-------------. |
-        //                             |  | llvm-foreach| |
-        //                             |  .-------------. |
-        //                             |        |         |
-        //                            [+*]    [+!rf]      |
-        //                      .-----------------.       |
-        //                      | FileTableTform  |       |
-        //                      | (extract "Code")|       |
-        //                      .-----------------.       |
-        //                              [-]               |-----------
-        //           --------------------|                           |
-        //           |                   |                           |
-        //           |                   |-----------------          |
-        //           |                   |                |          |
-        //           |                   |               [-!rf]      |
-        //           |                   |         .--------------.  |
-        //           |                   |         |FileTableTform|  |
-        //           |                   |         |   (merge)    |  |
-        //           |                   |         .--------------.  |
-        //           |                   |               [-]         |-------
-        //           |                   |                |          |      |
-        //           |                   |                |    ------|      |
-        //           |                   |        --------|    |            |
-        //          [.]                 [-*]   [-!rf]        [+!rf]         |
-        //   .---------------.  .-------------------. .--------------.      |
-        //   | finalizeNVPTX  | |  SPIRVTranslator  | |FileTableTform|      |
-        //   | finalizeAMDGCN | |                   | |   (merge)    |      |
-        //   .---------------.  .-------------------. . -------------.      |
-        //          [.]             [-as]      [-!a]         |              |
-        //           |                |          |           |              |
-        //           |              [-s]         |           |              |
-        //           |       .----------------.  |           |              |
-        //           |       | BackendCompile |  |           |              |
-        //           |       .----------------.  |     ------|              |
-        //           |              [-s]         |     |                    |
-        //           |                |          |     |                    |
-        //           |              [-a]      [-!a]  [-!rf]                 |
-        //           |              .--------------------.                  |
-        //           -----------[-n]|   FileTableTform   |[+*]--------------|
-        //                          |  (replace "Code")  |
-        //                          .--------------------.
-        //                                      |
-        //                                    [+*]
+        //         |               PostLink                |[+e]----------------
+        //         .---------------------------------------.                   |
+        //                           [+*]                [+]                   |
+        //                             |                  |                    |
+        //                             |                  |                    |
+        //                             |---------         |                    |
+        //                             |        |         |                    |
+        //                             |        |         |                    |
+        //                             |      [+!rf]      |                    |
+        //                             |  .-------------. |                    |
+        //                             |  | llvm-foreach| |                    |
+        //                             |  .-------------. |                    |
+        //                             |        |         |                    |
+        //                            [+*]    [+!rf]      |                    |
+        //                      .-----------------.       |                    |
+        //                      | FileTableTform  |       |                    |
+        //                      | (extract "Code")|       |                    |
+        //                      .-----------------.       |                    |
+        //                              [-]               |-----------         |
+        //           --------------------|                           |         |
+        //           |                   |                           |         |
+        //           |                   |-----------------          |         |
+        //           |                   |                |          |         |
+        //           |                   |               [-!rf]      |         |
+        //           |                   |         .--------------.  |         |
+        //           |                   |         |FileTableTform|  |         |
+        //           |                   |         |   (merge)    |  |         |
+        //           |                   |         .--------------.  |         |
+        //           |                   |               [-]         |-------  |
+        //           |                   |                |          |      |  |
+        //           |                   |                |    ------|      |  |
+        //           |                   |        --------|    |            |  |
+        //          [.]                 [-*]   [-!rf]        [+!rf]         |  |
+        //   .---------------.  .-------------------. .--------------.      |  |
+        //   | finalizeNVPTX  | |  SPIRVTranslator  | |FileTableTform|      |  |
+        //   | finalizeAMDGCN | |                   | |   (merge)    |      |  |
+        //   .---------------.  .-------------------. . -------------.      |  |
+        //          [.]             [-as]      [-!a]         |              |  |
+        //           |                |          |           |              |  |
+        //           |              [-s]         |           |              |  |
+        //           |       .----------------.  |           |              |  |
+        //           |       | BackendCompile |  |           |              |  |
+        //           |       .----------------.  |     ------|              |  |
+        //           |              [-s]         |     |                    |  |
+        //           |                |          |     |                    |  |
+        //           |              [-a]      [-!a]  [-!rf]                 |  |
+        //           |              .--------------------.                  |  |
+        //           -----------[-n]|   FileTableTform   |[+*]--------------|  |
+        //                          |  (replace "Code")  |                     |
+        //                          .--------------------.                     |
+        //                                      |      -------------------------
+        //                                    [+*]     | [+e]
         //         .--------------------------------------.
         //         |            OffloadWrapper            |
         //         .--------------------------------------.
@@ -5693,6 +5695,16 @@ class OffloadingActionBuilder final {
             return TypedPostLinkAction;
           };
           Action *PostLinkAction = createPostLinkAction();
+          if (isNVPTX && Args.hasArg(options::OPT_fsycl_embed_ir)) {
+            // When compiling for Nvidia/CUDA devices and the user requested the
+            // IR to be embedded in the application (via option), run the output
+            // of sycl-post-link (filetable referencing LLVM Bitcode + symbols)
+            // through the offload wrapper and link the resulting object to the
+            // application.
+            auto *WrapBitcodeAction = C.MakeAction<OffloadWrapperJobAction>(
+                PostLinkAction, types::TY_Object, true);
+            DA.add(*WrapBitcodeAction, *TC, BoundArch, Action::OFK_SYCL);
+          }
           bool NoRDCFatStaticArchive =
               !IsRDC &&
               FullDeviceLinkAction->getType() == types::TY_Tempfilelist;
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 750bbf9197ea0..38c5d832c1639 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -9272,6 +9272,14 @@ void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA,
       createArgString("-link-opts=");
     }
 
+    bool IsEmbeddedIR = cast<OffloadWrapperJobAction>(JA).isEmbeddedIR();
+    if (IsEmbeddedIR) {
+      // When the offload-wrapper is called to embed LLVM IR, add a prefix to
+      // the target triple to distinguish the LLVM IR from the actual device
+      // binary for that target.
+      TargetTripleOpt = ("llvm_" + TargetTripleOpt).str();
+    }
+
     WrapperArgs.push_back(
         C.getArgs().MakeArgString(Twine("-target=") + TargetTripleOpt));
 
@@ -9293,7 +9301,7 @@ void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA,
     assert(I.isFilename() && "Invalid input.");
 
     if (I.getType() == types::TY_Tempfiletable ||
-        I.getType() == types::TY_Tempfilelist)
+        I.getType() == types::TY_Tempfilelist || IsEmbeddedIR)
       // wrapper actual input files are passed via the batch job file table:
       WrapperArgs.push_back(C.getArgs().MakeArgString("-batch"));
     WrapperArgs.push_back(C.getArgs().MakeArgString(I.getFilename()));
diff --git a/sycl-fusion/common/include/Kernel.h b/sycl-fusion/common/include/Kernel.h
index 1962dd042ffbe..87726b3368d63 100644
--- a/sycl-fusion/common/include/Kernel.h
+++ b/sycl-fusion/common/include/Kernel.h
@@ -34,7 +34,7 @@ enum class ParameterKind : uint32_t {
 };
 
 /// Different binary formats supported as input to the JIT compiler.
-enum class BinaryFormat : uint32_t { INVALID, LLVM, SPIRV };
+enum class BinaryFormat : uint32_t { INVALID, LLVM, SPIRV, PTX };
 
 /// Information about a device intermediate representation module (e.g., SPIR-V,
 /// LLVM IR) from DPC++.
diff --git a/sycl-fusion/common/lib/KernelIO.h b/sycl-fusion/common/lib/KernelIO.h
index 12c194f8b4dd4..09058d61e9981 100644
--- a/sycl-fusion/common/lib/KernelIO.h
+++ b/sycl-fusion/common/lib/KernelIO.h
@@ -47,6 +47,7 @@ template <> struct ScalarEnumerationTraits<jit_compiler::BinaryFormat> {
   static void enumeration(IO &IO, jit_compiler::BinaryFormat &BF) {
     IO.enumCase(BF, "LLVM", jit_compiler::BinaryFormat::LLVM);
     IO.enumCase(BF, "SPIRV", jit_compiler::BinaryFormat::SPIRV);
+    IO.enumCase(BF, "PTX", jit_compiler::BinaryFormat::PTX);
     IO.enumCase(BF, "INVALID", jit_compiler::BinaryFormat::INVALID);
   }
 };
diff --git a/sycl-fusion/jit-compiler/CMakeLists.txt b/sycl-fusion/jit-compiler/CMakeLists.txt
index de6a73b1eab3d..bf323239679b4 100644
--- a/sycl-fusion/jit-compiler/CMakeLists.txt
+++ b/sycl-fusion/jit-compiler/CMakeLists.txt
@@ -2,13 +2,15 @@
 add_llvm_library(sycl-fusion
    lib/KernelFusion.cpp
    lib/JITContext.cpp
+   lib/translation/KernelTranslation.cpp
    lib/translation/SPIRVLLVMTranslation.cpp
    lib/fusion/FusionPipeline.cpp
    lib/fusion/FusionHelper.cpp
    lib/fusion/ModuleHelper.cpp
    lib/helper/ConfigHelper.cpp
 
-  LINK_COMPONENTS
+   LINK_COMPONENTS
+   BitReader
    Core
    Support
    Analysis
@@ -18,6 +20,10 @@ add_llvm_library(sycl-fusion
    Linker
    ScalarOpts
    InstCombine
+   Target
+   TargetParser
+   MC
+   ${LLVM_TARGETS_TO_BUILD}
 )
 
 target_include_directories(sycl-fusion
@@ -40,6 +46,10 @@ target_link_libraries(sycl-fusion
   ${CMAKE_THREAD_LIBS_INIT}
 )
 
+if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
+  target_compile_definitions(sycl-fusion PRIVATE FUSION_JIT_SUPPORT_PTX)
+endif()
+
 if (BUILD_SHARED_LIBS)
   if(NOT MSVC AND NOT APPLE)
     # Manage symbol visibility through the linker to make sure no LLVM symbols
diff --git a/sycl-fusion/jit-compiler/include/JITContext.h b/sycl-fusion/jit-compiler/include/JITContext.h
index 4c0616e267941..69465a74d8371 100644
--- a/sycl-fusion/jit-compiler/include/JITContext.h
+++ b/sycl-fusion/jit-compiler/include/JITContext.h
@@ -36,17 +36,21 @@ using CacheKeyT =
                std::optional<std::vector<NDRange>>>;
 
 ///
-/// Wrapper around a SPIR-V binary.
-class SPIRVBinary {
+/// Wrapper around a kernel binary.
+class KernelBinary {
 public:
-  explicit SPIRVBinary(std::string Binary);
+  explicit KernelBinary(std::string &&Binary, BinaryFormat Format);
 
   jit_compiler::BinaryAddress address() const;
 
   size_t size() const;
 
+  BinaryFormat format() const;
+
 private:
   std::string Blob;
+
+  BinaryFormat Format;
 };
 
 ///
@@ -61,7 +65,10 @@ class JITContext {
 
   llvm::LLVMContext *getLLVMContext();
 
-  SPIRVBinary &emplaceSPIRVBinary(std::string Binary);
+  template <typename... Ts> KernelBinary &emplaceKernelBinary(Ts &&...Args) {
+    WriteLockT WriteLock{BinariesMutex};
+    return Binaries.emplace_back(std::forward<Ts>(Args)...);
+  }
 
   std::optional<SYCLKernelInfo> getCacheEntry(CacheKeyT &Identifier) const;
 
@@ -79,7 +86,7 @@ class JITContext {
 
   MutexT BinariesMutex;
 
-  std::vector<SPIRVBinary> Binaries;
+  std::vector<KernelBinary> Binaries;
 
   mutable MutexT CacheMutex;
 
diff --git a/sycl-fusion/jit-compiler/include/Options.h b/sycl-fusion/jit-compiler/include/Options.h
index 335f58fb64cf7..841a229adb7a3 100644
--- a/sycl-fusion/jit-compiler/include/Options.h
+++ b/sycl-fusion/jit-compiler/include/Options.h
@@ -9,12 +9,14 @@
 #ifndef SYCL_FUSION_JIT_COMPILER_OPTIONS_H
 #define SYCL_FUSION_JIT_COMPILER_OPTIONS_H
 
+#include "Kernel.h"
+
 #include <memory>
 #include <unordered_map>
 
 namespace jit_compiler {
 
-enum OptionID { VerboseOutput, EnableCaching };
+enum OptionID { VerboseOutput, EnableCaching, TargetFormat };
 
 class OptionPtrBase {};
 
@@ -78,6 +80,9 @@ struct JITEnableVerbose : public OptionBase<OptionID::VerboseOutput, bool> {};
 
 struct JITEnableCaching : public OptionBase<OptionID::EnableCaching, bool> {};
 
+struct JITTargetFormat
+    : public OptionBase<OptionID::TargetFormat, BinaryFormat> {};
+
 } // namespace option
 } // namespace jit_compiler
 
diff --git a/sycl-fusion/jit-compiler/lib/JITContext.cpp b/sycl-fusion/jit-compiler/lib/JITContext.cpp
index 68c7031b9d8a9..4499dd93f4d76 100644
--- a/sycl-fusion/jit-compiler/lib/JITContext.cpp
+++ b/sycl-fusion/jit-compiler/lib/JITContext.cpp
@@ -11,14 +11,17 @@
 
 using namespace jit_compiler;
 
-SPIRVBinary::SPIRVBinary(std::string Binary) : Blob{std::move(Binary)} {}
+KernelBinary::KernelBinary(std::string &&Binary, BinaryFormat Fmt)
+    : Blob{std::move(Binary)}, Format{Fmt} {}
 
-jit_compiler::BinaryAddress SPIRVBinary::address() const {
+jit_compiler::BinaryAddress KernelBinary::address() const {
   // FIXME: Verify it's a good idea to perform this reinterpret_cast here.
   return reinterpret_cast<jit_compiler::BinaryAddress>(Blob.c_str());
 }
 
-size_t SPIRVBinary::size() const { return Blob.size(); }
+size_t KernelBinary::size() const { return Blob.size(); }
+
+BinaryFormat KernelBinary::format() const { return Format; }
 
 JITContext::JITContext() : LLVMCtx{new llvm::LLVMContext}, Binaries{} {}
 
@@ -26,14 +29,6 @@ JITContext::~JITContext() = default;
 
 llvm::LLVMContext *JITContext::getLLVMContext() { return LLVMCtx.get(); }
 
-SPIRVBinary &JITContext::emplaceSPIRVBinary(std::string Binary) {
-  WriteLockT WriteLock{BinariesMutex};
-  // NOTE: With C++17, which returns a reference from emplace_back, the
-  // following code would be even simpler.
-  Binaries.emplace_back(std::move(Binary));
-  return Binaries.back();
-}
-
 std::optional<SYCLKernelInfo>
 JITContext::getCacheEntry(CacheKeyT &Identifier) const {
   ReadLockT ReadLock{CacheMutex};
diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
index 56ea9401c465b..be7515d935247 100644
--- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
+++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
@@ -15,6 +15,7 @@
 #include "fusion/FusionPipeline.h"
 #include "helper/ConfigHelper.h"
 #include "helper/ErrorHandling.h"
+#include "translation/KernelTranslation.h"
 #include "translation/SPIRVLLVMTranslation.h"
 #include <llvm/Support/Error.h>
 #include <sstream>
@@ -47,6 +48,22 @@ gatherNDRanges(llvm::ArrayRef<SYCLKernelInfo> KernelInformation) {
   return NDRanges;
 }
 
+static bool isTargetFormatSupported(BinaryFormat TargetFormat) {
+  switch (TargetFormat) {
+  case BinaryFormat::SPIRV:
+    return true;
+  case BinaryFormat::PTX: {
+#ifdef FUSION_JIT_SUPPORT_PTX
+    return true;
+#else  // FUSION_JIT_SUPPORT_PTX
+    return false;
+#endif // FUSION_JIT_SUPPORT_PTX
+  }
+  default:
+    return false;
+  }
+}
+
 FusionResult KernelFusion::fuseKernels(
     JITContext &JITCtx, Config &&JITConfig,
     const std::vector<SYCLKernelInfo> &KernelInformation,
@@ -55,6 +72,10 @@ FusionResult KernelFusion::fuseKernels(
     int BarriersFlags,
     const std::vector<jit_compiler::ParameterInternalization> &Internalization,
     const std::vector<jit_compiler::JITConstant> &Constants) {
+  // Initialize the configuration helper to make the options for this invocation
+  // available (on a per-thread basis).
+  ConfigHelper::setConfig(std::move(JITConfig));
+
   const auto NDRanges = gatherNDRanges(KernelInformation);
 
   if (!isValidCombination(NDRanges)) {
@@ -63,9 +84,18 @@ FusionResult KernelFusion::fuseKernels(
         "different global sizes in dimensions [2, N) and non-zero offsets"};
   }
 
-  // Initialize the configuration helper to make the options for this invocation
-  // available (on a per-thread basis).
-  ConfigHelper::setConfig(std::move(JITConfig));
+  bool IsHeterogeneousList = jit_compiler::isHeterogeneousList(NDRanges);
+
+  BinaryFormat TargetFormat = ConfigHelper::get<option::JITTargetFormat>();
+
+  if (!isTargetFormatSupported(TargetFormat)) {
+    return FusionResult(
+        "Fusion output target format not supported by this build");
+  }
+
+  if (TargetFormat == BinaryFormat::PTX && IsHeterogeneousList) {
+    return FusionResult{"Heterogeneous ND ranges not supported for CUDA"};
+  }
 
   bool CachingEnabled = ConfigHelper::get<option::JITEnableCaching>();
   CacheKeyT CacheKey{KernelsToFuse,
@@ -97,8 +127,8 @@ FusionResult KernelFusion::fuseKernels(
   // Load all input kernels from their respective SPIR-V modules into a single
   // LLVM IR module.
   llvm::Expected<std::unique_ptr<llvm::Module>> ModOrError =
-      translation::SPIRVLLVMTranslator::loadSPIRVKernels(
-          *JITCtx.getLLVMContext(), ModuleInfo.kernels());
+      translation::KernelTranslator::loadKernels(*JITCtx.getLLVMContext(),
+                                                 ModuleInfo.kernels());
   if (auto Error = ModOrError.takeError()) {
     return errorToFusionResult(std::move(Error), "SPIR-V translation failed");
   }
@@ -136,29 +166,14 @@ FusionResult KernelFusion::fuseKernels(
 
   SYCLKernelInfo &FusedKernelInfo = *NewModInfo->getKernelFor(FusedKernelName);
 
-  // Translate the LLVM IR module resulting from the fusion pass into SPIR-V.
-  llvm::Expected<jit_compiler::SPIRVBinary *> BinaryOrError =
-      translation::SPIRVLLVMTranslator::translateLLVMtoSPIRV(*NewMod, JITCtx);
-  if (auto Error = BinaryOrError.takeError()) {
+  if (auto Error = translation::KernelTranslator::translateKernel(
+          FusedKernelInfo, *NewMod, JITCtx, TargetFormat)) {
     return errorToFusionResult(std::move(Error),
-                               "Translation to SPIR-V failed");
+                               "Translation to output format failed");
   }
-  jit_compiler::SPIRVBinary *SPIRVBin = *BinaryOrError;
 
   FusedKernelInfo.NDR = FusedKernel.FusedNDRange;
 
-  // Update the KernelInfo for the fused kernel with the address and size of the
-  // SPIR-V binary resulting from translation.
-  SYCLKernelBinaryInfo &FusedBinaryInfo = FusedKernelInfo.BinaryInfo;
-  FusedBinaryInfo.Format = BinaryFormat::SPIRV;
-  // Output SPIR-V should use the same number of address bits as the input
-  // SPIR-V. SPIR-V translation requires all modules to use the same number of
-  // address bits, so it's safe to take the value from the first one.
-  FusedBinaryInfo.AddressBits =
-      ModuleInfo.kernels().front().BinaryInfo.AddressBits;
-  FusedBinaryInfo.BinaryStart = SPIRVBin->address();
-  FusedBinaryInfo.BinarySize = SPIRVBin->size();
-
   if (CachingEnabled) {
     JITCtx.addCacheEntry(CacheKey, FusedKernelInfo);
   }
diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
index 3ede007aa69a4..965202fe5bbfa 100644
--- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
+++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
@@ -24,6 +24,7 @@
 #include "llvm/IR/Verifier.h"
 #endif // NDEBUG
 #include "llvm/Passes/PassBuilder.h"
+#include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/InstCombine/InstCombine.h"
 #include "llvm/Transforms/Scalar/ADCE.h"
 #include "llvm/Transforms/Scalar/EarlyCSE.h"
@@ -35,6 +36,21 @@ using namespace llvm;
 using namespace jit_compiler;
 using namespace jit_compiler::fusion;
 
+static unsigned getFlatAddressSpace(Module &Mod) {
+  // Ideally, we could get this information from the TargetTransformInfo, but
+  // the SPIR-V backend does not yet seem to have an implementation for that.
+  llvm::Triple Tri(Mod.getTargetTriple());
+  if (Tri.isNVPTX()) {
+    return 0;
+  }
+  if (Tri.isSPIRV() || Tri.isSPIR()) {
+    return 4;
+  }
+  // Identical to the definition of "UninitializedAddressSpace" in
+  // "InferAddressSpaces.cpp".
+  return std::numeric_limits<unsigned>::max();
+}
+
 std::unique_ptr<SYCLModuleInfo>
 FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo,
                                 int BarriersFlags) {
@@ -86,9 +102,8 @@ FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo,
     // Run the InferAddressSpace pass to remove as many address-space casts
     // to/from generic address-space as possible, because these hinder
     // internalization.
-    // FIXME: TTI should tell the pass which address space to use.
     // Ideally, the static compiler should have performed that job.
-    constexpr unsigned FlatAddressSpace = 4;
+    const unsigned FlatAddressSpace = getFlatAddressSpace(Mod);
     FPM.addPass(InferAddressSpacesPass(FlatAddressSpace));
     MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
   }
diff --git a/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp b/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp
index c197fedf47e69..0d6ac7f48fbbe 100644
--- a/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp
+++ b/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp
@@ -8,6 +8,7 @@
 
 #include "ModuleHelper.h"
 
+#include "target/TargetFusionInfo.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Transforms/Utils/Cloning.h"
@@ -22,6 +23,13 @@ helper::ModuleHelper::cloneAndPruneModule(Module *Mod,
   SmallPtrSet<Function *, 16> UnusedFunctions;
   identifyUnusedFunctions(Mod, CGRoots, UnusedFunctions);
 
+  {
+    TargetFusionInfo TFI{Mod};
+    SmallVector<Function *> Unused{UnusedFunctions.begin(),
+                                   UnusedFunctions.end()};
+    TFI.notifyFunctionsDelete(Unused);
+  }
+
   // Clone the module, but use an external reference in place of the global
   // definition for unused functions.
   auto FunctionCloneMask = [&](const GlobalValue *GV) -> bool {
diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
new file mode 100644
index 0000000000000..e1435870eb3b2
--- /dev/null
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
@@ -0,0 +1,289 @@
+//==----------------------- KernelTranslation.cpp  -------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "KernelTranslation.h"
+
+#include "SPIRVLLVMTranslation.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace jit_compiler;
+using namespace jit_compiler::translation;
+using namespace llvm;
+
+///
+/// Get an attribute value consisting of NumValues scalar constant integers
+/// from the MDNode.
+static void getAttributeValues(std::vector<std::string> &Values, MDNode *MD) {
+  for (const auto &MDOp : MD->operands()) {
+    auto *ConstantMD = cast<ConstantAsMetadata>(MDOp);
+    auto *ConstInt = cast<ConstantInt>(ConstantMD->getValue());
+    Values.push_back(std::to_string(ConstInt->getZExtValue()));
+  }
+}
+
+// NOLINTNEXTLINE(readability-identifier-naming)
+static const char *REQD_WORK_GROUP_SIZE_ATTR = "reqd_work_group_size";
+// NOLINTNEXTLINE(readability-identifier-naming)
+static const char *WORK_GROUP_SIZE_HINT_ATTR = "work_group_size_hint";
+
+///
+/// Restore kernel attributes for the kernel in Info from the metadata
+/// attached to its kernel function in the LLVM module Mod.
+/// Currently supported attributes:
+///   - reqd_work_group_size
+///   - work_group_size_hint
+static void restoreKernelAttributes(Module *Mod, SYCLKernelInfo &Info) {
+  auto *KernelFunction = Mod->getFunction(Info.Name);
+  assert(KernelFunction && "Kernel function not present in module");
+  if (auto *MD = KernelFunction->getMetadata(REQD_WORK_GROUP_SIZE_ATTR)) {
+    SYCLKernelAttribute ReqdAttr{REQD_WORK_GROUP_SIZE_ATTR};
+    getAttributeValues(ReqdAttr.Values, MD);
+    Info.Attributes.push_back(ReqdAttr);
+  }
+  if (auto *MD = KernelFunction->getMetadata(WORK_GROUP_SIZE_HINT_ATTR)) {
+    SYCLKernelAttribute HintAttr{WORK_GROUP_SIZE_HINT_ATTR};
+    getAttributeValues(HintAttr.Values, MD);
+    Info.Attributes.push_back(HintAttr);
+  }
+}
+
+llvm::Expected<std::unique_ptr<llvm::Module>>
+KernelTranslator::loadKernels(llvm::LLVMContext &LLVMCtx,
+                              std::vector<SYCLKernelInfo> &Kernels) {
+  std::unique_ptr<Module> Result{nullptr};
+  bool First = true;
+  DenseSet<BinaryBlob> ParsedBinaries;
+  size_t AddressBits = 0;
+  for (auto &Kernel : Kernels) {
+    // FIXME: Currently, we use the front of the list.
+    // Do we need to iterate to find the most suitable
+    // SPIR-V module?
+    SYCLKernelBinaryInfo &BinInfo = Kernel.BinaryInfo;
+
+    const unsigned char *ModulePtr = BinInfo.BinaryStart;
+    size_t ModuleSize = BinInfo.BinarySize;
+    BinaryBlob BinBlob{ModulePtr, ModuleSize};
+    if (!ParsedBinaries.contains(BinBlob)) {
+      // Multiple kernels can be stored in the same SPIR-V or LLVM IR module.
+      // We only load if we did not encounter the same binary module before.
+      // NOTE: We compare the pointer as well as the size, in case
+      // a previous kernel only referenced part of the SPIR-V/LLVM IR module.
+      // Not sure this can actually happen, but better safe than sorry.
+      // Simply load and translate the SPIR-V into the currently still empty
+      // module.
+      std::unique_ptr<llvm::Module> NewMod;
+
+      switch (BinInfo.Format) {
+      case BinaryFormat::LLVM: {
+        auto ModOrError = loadLLVMKernel(LLVMCtx, Kernel);
+        if (auto Err = ModOrError.takeError()) {
+          return std::move(Err);
+        }
+        NewMod = std::move(*ModOrError);
+        break;
+      }
+      case BinaryFormat::SPIRV: {
+        auto ModOrError = loadSPIRVKernel(LLVMCtx, Kernel);
+        if (auto Err = ModOrError.takeError()) {
+          return std::move(Err);
+        }
+        NewMod = std::move(*ModOrError);
+        break;
+      }
+      default: {
+        return createStringError(
+            inconvertibleErrorCode(),
+            "Failed to load kernel from unsupported input format");
+      }
+      }
+
+      // We do not assume that the input binary information has the address bits
+      // set, but rather retrieve this information from the SPIR-V/LLVM module's
+      // data-layout.
+      BinInfo.AddressBits = NewMod->getDataLayout().getPointerSizeInBits();
+
+      if (First) {
+        // We can simply assign the module we just loaded from SPIR-V to the
+        // empty pointer on the first iteration.
+        Result = std::move(NewMod);
+        // The first module will dictate the address bits for the remaining.
+        AddressBits = BinInfo.AddressBits;
+        First = false;
+      } else {
+        // We have already loaded some module, so now we need to
+        // link the module we just loaded with the result so far.
+        // FIXME: We allow duplicates to be overridden by the module
+        // read last. This could cause problems if different modules contain
+        // definitions with the same name, but different body/content.
+        // Check that this is not problematic.
+        Linker::linkModules(*Result, std::move(NewMod),
+                            Linker::Flags::OverrideFromSrc);
+        if (AddressBits != BinInfo.AddressBits) {
+          return createStringError(
+              inconvertibleErrorCode(),
+              "Number of address bits between SPIR-V modules does not match");
+        }
+      }
+      ParsedBinaries.insert(BinBlob);
+    }
+    // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or
+    // 'work_group_size_hint' from metadata attached to the kernel function and
+    // store it in the SYCLKernelInfo.
+    restoreKernelAttributes(Result.get(), Kernel);
+  }
+  return std::move(Result);
+}
+
+llvm::Expected<std::unique_ptr<llvm::Module>>
+KernelTranslator::loadLLVMKernel(llvm::LLVMContext &LLVMCtx,
+                                 SYCLKernelInfo &Kernel) {
+  auto &BinInfo = Kernel.BinaryInfo;
+  llvm::StringRef RawData(reinterpret_cast<const char *>(BinInfo.BinaryStart),
+                          BinInfo.BinarySize);
+  return llvm::parseBitcodeFile(
+      MemoryBuffer::getMemBuffer(RawData, Kernel.Name,
+                                 /* RequiresNullTermnator*/ false)
+          ->getMemBufferRef(),
+      LLVMCtx);
+}
+
+llvm::Expected<std::unique_ptr<llvm::Module>>
+KernelTranslator::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx,
+                                  SYCLKernelInfo &Kernel) {
+  return SPIRVLLVMTranslator::loadSPIRVKernel(LLVMCtx, Kernel);
+}
+
+llvm::Error KernelTranslator::translateKernel(SYCLKernelInfo &Kernel,
+                                              llvm::Module &Mod,
+                                              JITContext &JITCtx,
+                                              BinaryFormat Format) {
+
+  KernelBinary *KernelBin = nullptr;
+  switch (Format) {
+  case BinaryFormat::SPIRV: {
+    llvm::Expected<KernelBinary *> BinaryOrError =
+        translateToSPIRV(Mod, JITCtx);
+    if (auto Error = BinaryOrError.takeError()) {
+      return Error;
+    }
+    KernelBin = *BinaryOrError;
+    break;
+  }
+  case BinaryFormat::PTX: {
+    llvm::Expected<KernelBinary *> BinaryOrError =
+        translateToPTX(Kernel, Mod, JITCtx);
+    if (auto Error = BinaryOrError.takeError()) {
+      return Error;
+    }
+    KernelBin = *BinaryOrError;
+    break;
+  }
+  default: {
+    return createStringError(
+        inconvertibleErrorCode(),
+        "Failed to translate kernel to unsupported output format");
+  }
+  }
+
+  // Update the KernelInfo for the fused kernel with the address and size of the
+  // SPIR-V binary resulting from translation.
+  SYCLKernelBinaryInfo &FusedBinaryInfo = Kernel.BinaryInfo;
+  FusedBinaryInfo.Format = Format;
+  // Output SPIR-V should use the same number of address bits as the input
+  // SPIR-V. SPIR-V translation requires all modules to use the same number of
+  // address bits, so it's safe to take the value from the first one.
+  FusedBinaryInfo.AddressBits = Mod.getDataLayout().getPointerSizeInBits();
+  FusedBinaryInfo.BinaryStart = KernelBin->address();
+  FusedBinaryInfo.BinarySize = KernelBin->size();
+  return Error::success();
+}
+
+llvm::Expected<KernelBinary *>
+KernelTranslator::translateToSPIRV(llvm::Module &Mod, JITContext &JITCtx) {
+  return SPIRVLLVMTranslator::translateLLVMtoSPIRV(Mod, JITCtx);
+}
+
+llvm::Expected<KernelBinary *>
+KernelTranslator::translateToPTX(SYCLKernelInfo &KernelInfo, llvm::Module &Mod,
+                                 JITContext &JITCtx) {
+#ifndef FUSION_JIT_SUPPORT_PTX
+  (void)KernelInfo;
+  (void)Mod;
+  (void)JITCtx;
+  return createStringError(inconvertibleErrorCode(),
+                           "PTX translation not supported in this build");
+#else  // FUSION_JIT_SUPPORT_PTX
+  LLVMInitializeNVPTXTargetInfo();
+  LLVMInitializeNVPTXTarget();
+  LLVMInitializeNVPTXAsmPrinter();
+  LLVMInitializeNVPTXTargetMC();
+
+  static const char *TARGET_CPU_ATTRIBUTE = "target-cpu";
+  static const char *TARGET_FEATURE_ATTRIBUTE = "target-features";
+
+  std::string TargetTriple{"nvptx64-nvidia-cuda"};
+
+  std::string ErrorMessage;
+  const auto *Target =
+      llvm::TargetRegistry::lookupTarget(TargetTriple, ErrorMessage);
+
+  if (!Target) {
+    return createStringError(
+        inconvertibleErrorCode(),
+        "Failed to load and translate PTX LLVM IR module with error %s",
+        ErrorMessage.c_str());
+  }
+
+  llvm::StringRef TargetCPU{"sm_50"};
+  llvm::StringRef TargetFeatures{"+sm_50,+ptx76"};
+  if (auto *KernelFunc = Mod.getFunction(KernelInfo.Name)) {
+    if (KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) {
+      TargetCPU =
+          KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString();
+    }
+    if (KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) {
+      TargetFeatures = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE)
+                           .getValueAsString();
+    }
+  }
+
+  // FIXME: Check whether we can provide more accurate target information here
+  auto *TargetMachine = Target->createTargetMachine(
+      TargetTriple, TargetCPU, TargetFeatures, {}, llvm::Reloc::PIC_,
+      std::nullopt, llvm::CodeGenOpt::Default);
+
+  llvm::legacy::PassManager PM;
+
+  std::string PTXASM;
+
+  {
+    llvm::raw_string_ostream ASMStream{PTXASM};
+    llvm::buffer_ostream BufferedASM{ASMStream};
+
+    if (TargetMachine->addPassesToEmitFile(PM, BufferedASM, nullptr,
+                                           llvm::CGFT_AssemblyFile)) {
+      return createStringError(
+          inconvertibleErrorCode(),
+          "Failed to construct pass pipeline to emit output");
+    }
+
+    PM.run(Mod);
+    ASMStream.flush();
+  }
+
+  return &JITCtx.emplaceKernelBinary(std::move(PTXASM), BinaryFormat::PTX);
+#endif // FUSION_JIT_SUPPORT_PTX
+}
diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
new file mode 100644
index 0000000000000..7e4816df9bf94
--- /dev/null
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
@@ -0,0 +1,50 @@
+//==- KernelTranslation - Translate SYCL kernels between different formats -==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef SYCL_FUSION_JIT_COMPILER_TRANSLATION_KERNELTRANSLATION_H
+#define SYCL_FUSION_JIT_COMPILER_TRANSLATION_KERNELTRANSLATION_H
+
+#include "JITContext.h"
+#include "Kernel.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Error.h"
+#include <vector>
+
+namespace jit_compiler {
+namespace translation {
+
+class KernelTranslator {
+
+public:
+  static llvm::Expected<std::unique_ptr<llvm::Module>>
+  loadKernels(llvm::LLVMContext &LLVMCtx, std::vector<SYCLKernelInfo> &Kernels);
+
+  static llvm::Error translateKernel(SYCLKernelInfo &Kernel, llvm::Module &Mod,
+                                     JITContext &JITCtx, BinaryFormat Format);
+
+private:
+  ///
+  /// Pair of address and size to represent a binary blob.
+  using BinaryBlob = std::pair<BinaryAddress, size_t>;
+
+  static llvm::Expected<std::unique_ptr<llvm::Module>>
+  loadLLVMKernel(llvm::LLVMContext &LLVMCtx, SYCLKernelInfo &Kernel);
+
+  static llvm::Expected<std::unique_ptr<llvm::Module>>
+  loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, SYCLKernelInfo &Kernel);
+
+  static llvm::Expected<KernelBinary *> translateToSPIRV(llvm::Module &Mod,
+                                                         JITContext &JITCtx);
+
+  static llvm::Expected<KernelBinary *>
+  translateToPTX(SYCLKernelInfo &Kernel, llvm::Module &Mod, JITContext &JITCtx);
+};
+} // namespace translation
+} // namespace jit_compiler
+
+#endif // SYCL_FUSION_JIT_COMPILER_TRANSLATION_KERNELTRANSLATION_H
diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
index 77217419e98bd..4092f9dd96fc8 100644
--- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
@@ -11,12 +11,9 @@
 #include "Kernel.h"
 #include "LLVMSPIRVLib.h"
 #include "helper/ErrorHandling.h"
-#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/IR/Constants.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Linker/Linker.h"
 #include "llvm/Support/raw_ostream.h"
 #include <iostream>
 #include <sstream>
@@ -25,37 +22,6 @@ using namespace jit_compiler;
 using namespace jit_compiler::translation;
 using namespace llvm;
 
-void SPIRVLLVMTranslator::getAttributeValues(std::vector<std::string> &Values,
-                                             MDNode *MD, size_t NumValues) {
-  assert(MD->getNumOperands() == NumValues && "Incorrect number of values");
-  for (const auto &MDOp : MD->operands()) {
-    auto *ConstantMD = cast<ConstantAsMetadata>(MDOp);
-    auto *ConstInt = cast<ConstantInt>(ConstantMD->getValue());
-    Values.push_back(std::to_string(ConstInt->getZExtValue()));
-  }
-}
-
-// NOLINTNEXTLINE(readability-identifier-naming)
-static const char *REQD_WORK_GROUP_SIZE_ATTR = "reqd_work_group_size";
-// NOLINTNEXTLINE(readability-identifier-naming)
-static const char *WORK_GROUP_SIZE_HINT_ATTR = "work_group_size_hint";
-
-void SPIRVLLVMTranslator::restoreKernelAttributes(Module *Mod,
-                                                  SYCLKernelInfo &Info) {
-  auto *KernelFunction = Mod->getFunction(Info.Name);
-  assert(KernelFunction && "Kernel function not present in module");
-  if (auto *MD = KernelFunction->getMetadata(REQD_WORK_GROUP_SIZE_ATTR)) {
-    SYCLKernelAttribute ReqdAttr{REQD_WORK_GROUP_SIZE_ATTR};
-    getAttributeValues(ReqdAttr.Values, MD, 3);
-    Info.Attributes.push_back(ReqdAttr);
-  }
-  if (auto *MD = KernelFunction->getMetadata(WORK_GROUP_SIZE_HINT_ATTR)) {
-    SYCLKernelAttribute HintAttr{WORK_GROUP_SIZE_HINT_ATTR};
-    getAttributeValues(HintAttr.Values, MD, 3);
-    Info.Attributes.push_back(HintAttr);
-  }
-}
-
 SPIRV::TranslatorOpts &SPIRVLLVMTranslator::translatorOpts() {
   static auto Opts = []() -> SPIRV::TranslatorOpts {
     // Options for translation between SPIR-V and LLVM IR.
@@ -86,12 +52,19 @@ SPIRV::TranslatorOpts &SPIRVLLVMTranslator::translatorOpts() {
   return Opts;
 }
 
-Expected<std::unique_ptr<Module>>
-SPIRVLLVMTranslator::readAndTranslateSPIRV(LLVMContext &LLVMCtx,
-                                           BinaryBlob Input) {
-  // Create an input stream for the binary blob.
+Expected<std::unique_ptr<llvm::Module>>
+SPIRVLLVMTranslator::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx,
+                                     SYCLKernelInfo &Kernel) {
+  std::unique_ptr<Module> Result{nullptr};
+
+  SYCLKernelBinaryInfo &BinInfo = Kernel.BinaryInfo;
+  assert(BinInfo.Format == BinaryFormat::SPIRV &&
+         "Only SPIR-V supported as input");
+
+  // Create an input stream for the SPIR-V binary.
   std::stringstream SPIRStream(
-      std::string(reinterpret_cast<const char *>(Input.first), Input.second),
+      std::string(reinterpret_cast<const char *>(BinInfo.BinaryStart),
+                  BinInfo.BinarySize),
       std::ios_base::in | std::ios_base::binary);
   std::string ErrMsg;
   // Create a raw pointer. readSpirv accepts a reference to a pointer,
@@ -105,80 +78,12 @@ SPIRVLLVMTranslator::readAndTranslateSPIRV(LLVMContext &LLVMCtx,
         "Failed to load and translate SPIR-V module with error %s",
         ErrMsg.c_str());
   }
-  return std::unique_ptr<Module>(LLVMMod);
-}
+  std::unique_ptr<Module> NewMod{LLVMMod};
 
-Expected<std::unique_ptr<llvm::Module>>
-SPIRVLLVMTranslator::loadSPIRVKernels(llvm::LLVMContext &LLVMCtx,
-                                      std::vector<SYCLKernelInfo> &Kernels) {
-  std::unique_ptr<Module> Result{nullptr};
-  bool First = true;
-  DenseSet<BinaryBlob> ParsedSPIRVModules;
-  size_t AddressBits = 0;
-  for (auto &Kernel : Kernels) {
-    // FIXME: Currently, we use the front of the list.
-    // Do we need to iterate to find the most suitable
-    // SPIR-V module?
-    SYCLKernelBinaryInfo &BinInfo = Kernel.BinaryInfo;
-    // TODO(Lukas, ONNX-399): Also support LLVM IR as input but simply skipping
-    // the translation from SPIR-V to LLVM.
-    assert(BinInfo.Format == BinaryFormat::SPIRV &&
-           "Only SPIR-V supported as input");
-    const unsigned char *SPRModulePtr = BinInfo.BinaryStart;
-    size_t SPRModuleSize = BinInfo.BinarySize;
-    BinaryBlob BinBlob{SPRModulePtr, SPRModuleSize};
-    if (ParsedSPIRVModules.contains(BinBlob)) {
-      // Multiple kernels can be stored in the same SPIR-V module.
-      // If we encountered the same SPIR-V module before, skip.
-      // NOTE: We compare the pointer as well as the size, in case
-      // a previous kernel only referenced part of the SPIR-V module.
-      // Not sure this can actually happen, but better safe than sorry.
-      continue;
-    }
-    // Simply load and translate the SPIR-V into the currently still empty
-    // module.
-    PROPAGATE_ERROR(NewMod, readAndTranslateSPIRV(LLVMCtx, BinBlob));
-
-    // We do not assume that the input binary information has the address bits
-    // set, but rather retrieve this information from the SPIR-V/LLVM module's
-    // data-layout.
-    BinInfo.AddressBits = NewMod->getDataLayout().getPointerSizeInBits();
-    assert((First || BinInfo.AddressBits == AddressBits) &&
-           "Address bits do not match");
-    // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or
-    // 'work_group_size_hint' from metadata attached to the kernel function and
-    // store it in the SYCLKernelInfo.
-    // TODO(Lukas, ONNX-399): Validate that DPC++ used metadata to represent
-    // that information.
-    restoreKernelAttributes(NewMod.get(), Kernel);
-
-    if (First) {
-      // We can simply assign the module we just loaded from SPIR-V to the
-      // empty pointer on the first iteration.
-      Result = std::move(NewMod);
-      // The first module will dictate the address bits for the remaining.
-      AddressBits = BinInfo.AddressBits;
-      First = false;
-    } else {
-      // We have already loaded some module, so now we need to
-      // link the module we just loaded with the result so far.
-      // FIXME: We allow duplicates to be overridden by the module
-      // read last. This could cause problems if different modules contain
-      // definitions with the same name, but different body/content.
-      // Check that this is not problematic.
-      Linker::linkModules(*Result, std::move(NewMod),
-                          Linker::Flags::OverrideFromSrc);
-      if (AddressBits != BinInfo.AddressBits) {
-        return createStringError(
-            inconvertibleErrorCode(),
-            "Number of address bits between SPIR-V modules does not match");
-      }
-    }
-  }
-  return std::move(Result);
+  return std::move(NewMod);
 }
 
-Expected<jit_compiler::SPIRVBinary *>
+Expected<jit_compiler::KernelBinary *>
 SPIRVLLVMTranslator::translateLLVMtoSPIRV(Module &Mod, JITContext &JITCtx) {
   std::ostringstream BinaryStream;
   std::string ErrMsg;
@@ -189,5 +94,5 @@ SPIRVLLVMTranslator::translateLLVMtoSPIRV(Module &Mod, JITContext &JITCtx) {
         "Translation of LLVM IR to SPIR-V failed with error %s",
         ErrMsg.c_str());
   }
-  return &JITCtx.emplaceSPIRVBinary(BinaryStream.str());
+  return &JITCtx.emplaceKernelBinary(BinaryStream.str(), BinaryFormat::SPIRV);
 }
diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h
index 5f1d416e45150..c8cdf2bf90ca0 100644
--- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h
+++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h
@@ -25,40 +25,15 @@ class SPIRVLLVMTranslator {
   ///
   /// Load a list of SPIR-V kernels into a single LLVM module.
   static llvm::Expected<std::unique_ptr<llvm::Module>>
-  loadSPIRVKernels(llvm::LLVMContext &LLVMCtx,
-                   std::vector<SYCLKernelInfo> &Kernels);
+  loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, SYCLKernelInfo &Kernel);
 
   ///
   /// Translate the LLVM IR module Mod to SPIR-V, store it in the JITContext and
   /// return a pointer to its container.
-  static llvm::Expected<SPIRVBinary *> translateLLVMtoSPIRV(llvm::Module &Mod,
-                                                            JITContext &JITCtx);
+  static llvm::Expected<KernelBinary *>
+  translateLLVMtoSPIRV(llvm::Module &Mod, JITContext &JITCtx);
 
 private:
-  ///
-  /// Pair of address and size to represent a binary blob.
-  using BinaryBlob = std::pair<BinaryAddress, size_t>;
-
-  ///
-  /// Get an attribute value consisting of NumValues scalar constant integers
-  /// from the MDNode.
-  static void getAttributeValues(std::vector<std::string> &Values,
-                                 llvm::MDNode *MD, size_t NumValues);
-
-  ///
-  /// Restore kernel attributes for the kernel in Info from the metadata
-  /// attached to its kernel function in the LLVM module Mod.
-  /// Currently supported attributes:
-  ///   - reqd_work_group_size
-  ///   - work_group_size_hint
-  static void restoreKernelAttributes(llvm::Module *Mod, SYCLKernelInfo &Info);
-
-  ///
-  /// Read the given SPIR-V binary and translate it to a new LLVM module
-  /// associated with the given context.
-  static llvm::Expected<std::unique_ptr<llvm::Module>>
-  readAndTranslateSPIRV(llvm::LLVMContext &LLVMCtx, BinaryBlob Input);
-
   ///
   /// Default settings for the SPIRV translation options.
   static SPIRV::TranslatorOpts &translatorOpts();
diff --git a/sycl-fusion/passes/CMakeLists.txt b/sycl-fusion/passes/CMakeLists.txt
index fe81b76b5bbcb..4693083be4faa 100644
--- a/sycl-fusion/passes/CMakeLists.txt
+++ b/sycl-fusion/passes/CMakeLists.txt
@@ -8,6 +8,7 @@ add_llvm_library(SYCLKernelFusion MODULE
   syclcp/SYCLCP.cpp
   cleanup/Cleanup.cpp
   debug/PassDebug.cpp
+  target/TargetFusionInfo.cpp
 
   DEPENDS
   intrinsics_gen
@@ -25,6 +26,10 @@ target_link_libraries(SYCLKernelFusion
   sycl-fusion-common
 )
 
+if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
+  target_compile_definitions(SYCLKernelFusion PRIVATE FUSION_JIT_SUPPORT_PTX)
+endif()
+
 # Static library for linking with the jit_compiler
 add_llvm_library(SYCLKernelFusionPasses
   SYCLFusionPasses.cpp
@@ -35,6 +40,7 @@ add_llvm_library(SYCLKernelFusionPasses
   syclcp/SYCLCP.cpp
   cleanup/Cleanup.cpp
   debug/PassDebug.cpp
+  target/TargetFusionInfo.cpp
 
   DEPENDS
   intrinsics_gen
@@ -44,6 +50,7 @@ add_llvm_library(SYCLKernelFusionPasses
   Support
   TransformUtils
   Passes
+  TargetParser
 )
 
 target_include_directories(SYCLKernelFusionPasses
@@ -57,3 +64,7 @@ target_link_libraries(SYCLKernelFusionPasses
   PRIVATE
   sycl-fusion-common
 )
+
+if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
+  target_compile_definitions(SYCLKernelFusionPasses PRIVATE FUSION_JIT_SUPPORT_PTX)
+endif()
diff --git a/sycl-fusion/passes/cleanup/Cleanup.cpp b/sycl-fusion/passes/cleanup/Cleanup.cpp
index 07fc86d9d0dd3..a5bc3e634c527 100644
--- a/sycl-fusion/passes/cleanup/Cleanup.cpp
+++ b/sycl-fusion/passes/cleanup/Cleanup.cpp
@@ -45,7 +45,8 @@ static void copyAttributesFrom(const BitVector &Mask, Function *NF,
                                        PAL.getRetAttrs(), Attributes));
 }
 
-static Function *createMaskedFunction(const BitVector &Mask, Function *F) {
+static Function *createMaskedFunction(const BitVector &Mask, Function *F,
+                                      TargetFusionInfo &TFI) {
   // Declare
   FunctionType *NFTy = createMaskedFunctionType(Mask, F->getFunctionType());
   Function *NF = Function::Create(NFTy, F->getLinkage(), F->getAddressSpace(),
@@ -78,7 +79,9 @@ static Function *createMaskedFunction(const BitVector &Mask, Function *F) {
   }
 
   // Erase old function
+  TFI.notifyFunctionsDelete(F);
   F->eraseFromParent();
+  TFI.addKernelFunction(NF);
   return NF;
 }
 
@@ -104,9 +107,9 @@ static void updateArgUsageMask(jit_compiler::SYCLKernelInfo *Info,
 
 static void applyArgMask(const jit_compiler::ArgUsageMask &NewArgInfo,
                          const BitVector &Mask, Function *F,
-                         ModuleAnalysisManager &AM) {
+                         ModuleAnalysisManager &AM, TargetFusionInfo &TFI) {
   // Create the function without the masked-out args.
-  Function *NF = createMaskedFunction(Mask, F);
+  Function *NF = createMaskedFunction(Mask, F, TFI);
   // Update the unused args mask.
   jit_compiler::SYCLModuleInfo *ModuleInfo =
       AM.getResult<SYCLModuleInfoAnalysis>(*NF->getParent()).ModuleInfo;
@@ -125,9 +128,7 @@ static void maskMD(const BitVector &Mask, Function *F) {
   SmallVector<std::pair<unsigned, MDNode *>> MD;
   F->getAllMetadata(MD);
   for (const auto &Entry : MD) {
-    auto MDKind = Entry.first;
-    if (MDKind == F->getContext().getMDKindID("reqd_work_group_size") ||
-        MDKind == F->getContext().getMDKindID("work_group_size_hint")) {
+    if (Entry.second->getNumOperands() != Mask.size()) {
       // Some metadata, e.g., the metadata for reqd_work_group_size and
       // work_group_size_hint is independent from the number of arguments
       // and must not be filtered by the argument usage mask.
@@ -144,7 +145,7 @@ static void maskMD(const BitVector &Mask, Function *F) {
 
 void llvm::fullCleanup(const jit_compiler::ArgUsageMask &ArgUsageInfo,
                        Function *F, ModuleAnalysisManager &AM,
-                       ArrayRef<StringRef> MDToErase) {
+                       TargetFusionInfo &TFI, ArrayRef<StringRef> MDToErase) {
   // Erase metadata.
   for (auto Key : MDToErase) {
     F->setMetadata(Key, nullptr);
@@ -158,5 +159,5 @@ void llvm::fullCleanup(const jit_compiler::ArgUsageMask &ArgUsageInfo,
   // Update metadata.
   maskMD(CleanupMask, F);
   // Remove arguments.
-  applyArgMask(ArgUsageInfo, CleanupMask, F, AM);
+  applyArgMask(ArgUsageInfo, CleanupMask, F, AM, TFI);
 }
diff --git a/sycl-fusion/passes/cleanup/Cleanup.h b/sycl-fusion/passes/cleanup/Cleanup.h
index 49619e4b9af07..491d96f46a886 100644
--- a/sycl-fusion/passes/cleanup/Cleanup.h
+++ b/sycl-fusion/passes/cleanup/Cleanup.h
@@ -10,6 +10,7 @@
 #define SYCL_FUSION_PASSES_CLEANUP_H
 
 #include "Kernel.h"
+#include "target/TargetFusionInfo.h"
 #include <llvm/ADT/ArrayRef.h>
 #include <llvm/ADT/BitVector.h>
 #include <llvm/ADT/StringRef.h>
@@ -25,7 +26,8 @@ namespace llvm {
 /// @param[in] AM Module analysis manager.
 /// @param[in] EraseMD Keys of metadata to remove.
 void fullCleanup(const jit_compiler::ArgUsageMask &ArgUsageInfo, Function *F,
-                 ModuleAnalysisManager &AM, ArrayRef<StringRef> EraseMD);
+                 ModuleAnalysisManager &AM, TargetFusionInfo &TFI,
+                 ArrayRef<StringRef> EraseMD);
 } // namespace llvm
 
 #endif // SYCL_FUSION_PASSES_CLEANUP_H
diff --git a/sycl-fusion/passes/internalization/Internalization.cpp b/sycl-fusion/passes/internalization/Internalization.cpp
index 61f3a0738921b..3d5c38c799e8f 100644
--- a/sycl-fusion/passes/internalization/Internalization.cpp
+++ b/sycl-fusion/passes/internalization/Internalization.cpp
@@ -19,16 +19,12 @@
 #include "cleanup/Cleanup.h"
 #include "debug/PassDebug.h"
 #include "metadata/MDParsing.h"
+#include "target/TargetFusionInfo.h"
 
 #define DEBUG_TYPE "sycl-fusion"
 
 using namespace llvm;
 
-// Corresponds to definition of spir_private and spir_local in
-// "clang/lib/Basic/Target/SPIR.h", "SPIRDefIsGenMap".
-constexpr static unsigned PrivateAS{0};
-constexpr static unsigned LocalAS{3};
-
 constexpr static StringLiteral PrivatePromotion{"private"};
 constexpr static StringLiteral LocalPromotion{"local"};
 constexpr static StringLiteral NoPromotion{"none"};
@@ -44,6 +40,8 @@ struct SYCLInternalizerImpl {
   StringRef Kind;
   /// Whether or not to create allocas.
   bool CreateAllocas;
+  /// Interface to target-specific information.
+  TargetFusionInfo TargetInfo;
 
   /// Implements internalization the pass run.
   PreservedAnalyses operator()(Module &M, ModuleAnalysisManager &AM) const;
@@ -338,11 +336,14 @@ Error SYCLInternalizerImpl::checkArgsPromotable(
 
 ///
 /// Function to perform the required cleaning actions.
-static void cleanup(Function *OldF, Function *NewF, bool KeepOriginal) {
+static void cleanup(Function *OldF, Function *NewF, bool KeepOriginal,
+                    const TargetFusionInfo &TFI) {
   if (!KeepOriginal) {
     NewF->takeName(OldF);
+    TFI.notifyFunctionsDelete(OldF);
     OldF->eraseFromParent();
   }
+  TFI.addKernelFunction(NewF);
 }
 
 void SYCLInternalizerImpl::promoteCall(CallBase *C, const Value *Val,
@@ -499,11 +500,6 @@ Value *replaceByNewAlloca(Argument *Arg, unsigned AS, std::size_t LocalSize) {
 Function *SYCLInternalizerImpl::promoteFunctionArgs(
     Function *OldF, ArrayRef<std::size_t> PromoteToLocal, bool CreateAllocas,
     bool KeepOriginal) const {
-  constexpr unsigned AddressSpaceBitWidth{32};
-
-  auto *NewAddrspace = ConstantAsMetadata::get(ConstantInt::get(
-      IntegerType::get(OldF->getContext(), AddressSpaceBitWidth), AS));
-
   // We first declare the promoted function with the new signature.
   Function *NewF =
       getPromotedFunctionDeclaration(OldF, PromoteToLocal, AS,
@@ -542,32 +538,9 @@ Function *SYCLInternalizerImpl::promoteFunctionArgs(
     promoteValue(Arg, LocalSize);
   }
 
-  {
-    constexpr StringLiteral KernelArgAddrSpaceMD{"kernel_arg_addr_space"};
-    if (auto *AddrspaceMD =
-            dyn_cast_or_null<MDNode>(NewF->getMetadata(KernelArgAddrSpaceMD))) {
-      // If we have kernel_arg_addr_space metadata in the original function,
-      // we should update it in the new one.
-      SmallVector<Metadata *> NewInfo{AddrspaceMD->op_begin(),
-                                      AddrspaceMD->op_end()};
-      for (auto I : enumerate(PromoteToLocal)) {
-        if (I.value() == 0) {
-          continue;
-        }
-        const auto Index = I.index();
-        if (const auto *PtrTy =
-                dyn_cast<PointerType>(NewF->getArg(Index)->getType())) {
-          if (PtrTy->getAddressSpace() == LocalAS) {
-            NewInfo[Index] = NewAddrspace;
-          }
-        }
-      }
-      NewF->setMetadata(KernelArgAddrSpaceMD,
-                        MDNode::get(NewF->getContext(), NewInfo));
-    }
-  }
+  TargetInfo.updateAddressSpaceMetadata(NewF, PromoteToLocal, AS);
 
-  cleanup(OldF, NewF, KeepOriginal);
+  cleanup(OldF, NewF, KeepOriginal, TargetInfo);
 
   return NewF;
 }
@@ -625,7 +598,8 @@ SYCLInternalizerImpl::operator()(Module &M, ModuleAnalysisManager &AM) const {
   return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
 }
 
-static void moduleCleanup(Module &M, ModuleAnalysisManager &AM) {
+static void moduleCleanup(Module &M, ModuleAnalysisManager &AM,
+                          TargetFusionInfo &TFI) {
   SmallVector<Function *> ToProcess;
   for (auto &F : M) {
     if (F.hasMetadata(SYCLInternalizer::Key)) {
@@ -650,24 +624,25 @@ static void moduleCleanup(Module &M, ModuleAnalysisManager &AM) {
         NewArgInfo.push_back(jit_compiler::ArgUsage::Used);
       }
     }
-    fullCleanup(NewArgInfo, F, AM,
+    fullCleanup(NewArgInfo, F, AM, TFI,
                 {SYCLInternalizer::Key, SYCLInternalizer::LocalSizeKey});
   }
 }
 
 PreservedAnalyses llvm::SYCLInternalizer::run(Module &M,
                                               ModuleAnalysisManager &AM) {
+  TargetFusionInfo TFI{&M};
   // Private promotion
-  const PreservedAnalyses Tmp =
-      SYCLInternalizerImpl{PrivateAS, PrivatePromotion, true}(M, AM);
+  const PreservedAnalyses Tmp = SYCLInternalizerImpl{
+      TFI.getPrivateAddressSpace(), PrivatePromotion, true, TFI}(M, AM);
   // Local promotion
-  PreservedAnalyses Res =
-      SYCLInternalizerImpl{LocalAS, LocalPromotion, false}(M, AM);
+  PreservedAnalyses Res = SYCLInternalizerImpl{
+      TFI.getLocalAddressSpace(), LocalPromotion, false, TFI}(M, AM);
 
   Res.intersect(Tmp);
 
   if (!Res.areAllPreserved()) {
-    moduleCleanup(M, AM);
+    moduleCleanup(M, AM, TFI);
   }
   return Res;
 }
diff --git a/sycl-fusion/passes/kernel-fusion/Builtins.cpp b/sycl-fusion/passes/kernel-fusion/Builtins.cpp
index 6de1c40e8f4cd..9cff120cd5888 100644
--- a/sycl-fusion/passes/kernel-fusion/Builtins.cpp
+++ b/sycl-fusion/passes/kernel-fusion/Builtins.cpp
@@ -595,18 +595,6 @@ jit_compiler::Remapper::remapBuiltins(Function *F, const NDRange &SrcNDRange,
   return Clone;
 }
 
-void jit_compiler::barrierCall(IRBuilderBase &Builder, int Flags) {
-  assert((Flags == 1 || Flags == 2 || Flags == 3) && "Invalid barrier flags");
-
-  // See
-  // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id-
-  createSPIRVCall(Builder, BarrierName,
-                  {Builder.getInt32(/*Exec Scope : Workgroup = */ 2),
-                   Builder.getInt32(/*Exec Scope : Workgroup = */ 2),
-                   Builder.getInt32(0x10 | (Flags % 2 == 1 ? 0x100 : 0x0) |
-                                    ((Flags >> 1 == 1 ? 0x200 : 0x0)))});
-}
-
 Value *jit_compiler::createSPIRVCall(IRBuilderBase &Builder,
                                      StringRef FunctionName,
                                      ArrayRef<Value *> Args) {
diff --git a/sycl-fusion/passes/kernel-fusion/Builtins.h b/sycl-fusion/passes/kernel-fusion/Builtins.h
index bc8e186f14188..a55b6efab39f9 100644
--- a/sycl-fusion/passes/kernel-fusion/Builtins.h
+++ b/sycl-fusion/passes/kernel-fusion/Builtins.h
@@ -60,10 +60,6 @@ constexpr llvm::StringLiteral OffloadStartWrapperName{
 llvm::Value *getGlobalLinearID(llvm::IRBuilderBase &Builder,
                                const NDRange &FusedNDRange);
 
-///
-/// Creates a call to a barrier function.
-void barrierCall(llvm::IRBuilderBase &Builder, int Flags);
-
 ///
 /// @return A call to a SPIRV function, which will be declared if not already in
 /// the module.
diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
index 241496ac9044f..b61a0936eb32e 100644
--- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
+++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
@@ -31,12 +31,6 @@
 
 using namespace llvm;
 
-constexpr static StringLiteral KernelArgAddrSpace{"kernel_arg_addr_space"};
-constexpr static StringLiteral KernelArgAccessQual{"kernel_arg_access_qual"};
-constexpr static StringLiteral KernelArgType{"kernel_arg_type"};
-constexpr static StringLiteral KernelArgBaseType{"kernel_arg_base_type"};
-constexpr static StringLiteral KernelArgTypeQual{"kernel_arg_type_qual"};
-
 constexpr StringLiteral SYCLKernelFusion::NDRangeMDKey;
 constexpr StringLiteral SYCLKernelFusion::NDRangesMDKey;
 
@@ -144,6 +138,8 @@ PreservedAnalyses SYCLKernelFusion::run(Module &M, ModuleAnalysisManager &AM) {
       AM.getResult<SYCLModuleInfoAnalysis>(M).ModuleInfo;
   assert(ModuleInfo && "No module information available");
 
+  TargetFusionInfo TFI{&M};
+
   // Iterate over the functions in the module and locate all
   // stub functions identified by metadata.
   SmallPtrSet<Function *, 8> ToCleanUp;
@@ -156,7 +152,7 @@ PreservedAnalyses SYCLKernelFusion::run(Module &M, ModuleAnalysisManager &AM) {
       // attached to this stub function.
       // The newly created function will carry the name also specified
       // in the metadata.
-      if (auto Err = fuseKernel(M, F, ModuleInfo, ToCleanUp)) {
+      if (auto Err = fuseKernel(M, F, ModuleInfo, TFI, ToCleanUp)) {
         DeferredErrs = joinErrors(std::move(DeferredErrs), std::move(Err));
       }
       // Rembember the stub for deletion, as it is not required anymore after
@@ -164,6 +160,10 @@ PreservedAnalyses SYCLKernelFusion::run(Module &M, ModuleAnalysisManager &AM) {
       ToCleanUp.insert(&F);
     }
   }
+  // Notify the target-specific logic that some functions will be erased
+  // shortly.
+  SmallVector<Function *> NotifyDelete{ToCleanUp.begin(), ToCleanUp.end()};
+  TFI.notifyFunctionsDelete(NotifyDelete);
   // Delete all the stub functions
   for (Function *SF : ToCleanUp) {
     SF->eraseFromParent();
@@ -230,11 +230,13 @@ static FusionInsertPoints addGuard(IRBuilderBase &Builder,
   return {Entry, CallInsertion, Exit};
 }
 
-static Expected<CallInst *> createFusionCall(
-    IRBuilderBase &Builder, Function *F, ArrayRef<Value *> CallArgs,
-    const jit_compiler::NDRange &SrcNDRange,
-    const jit_compiler::NDRange &FusedNDRange, bool IsLast, int BarriersFlags,
-    jit_compiler::Remapper &Remapper, bool ShouldRemap) {
+static Expected<CallInst *>
+createFusionCall(IRBuilderBase &Builder, Function *F,
+                 ArrayRef<Value *> CallArgs,
+                 const jit_compiler::NDRange &SrcNDRange,
+                 const jit_compiler::NDRange &FusedNDRange, bool IsLast,
+                 int BarriersFlags, jit_compiler::Remapper &Remapper,
+                 bool ShouldRemap, TargetFusionInfo &TargetInfo) {
   const auto IPs = addGuard(Builder, SrcNDRange, FusedNDRange, IsLast);
 
   if (ShouldRemap) {
@@ -260,7 +262,7 @@ static Expected<CallInst *> createFusionCall(
 
   // Insert barrier if needed
   if (!IsLast && BarriersFlags > 0) {
-    jit_compiler::barrierCall(Builder, BarriersFlags);
+    TargetInfo.createBarrierCall(Builder, BarriersFlags);
   }
 
   // Set insert point for future insertions
@@ -271,6 +273,7 @@ static Expected<CallInst *> createFusionCall(
 
 Error SYCLKernelFusion::fuseKernel(
     Module &M, Function &StubFunction, jit_compiler::SYCLModuleInfo *ModInfo,
+    TargetFusionInfo &TargetInfo,
     SmallPtrSetImpl<Function *> &ToCleanUp) const {
   // Retrieve the metadata from the stub function.
   // The first operand of the tuple is the name that the newly created,
@@ -343,12 +346,9 @@ Error SYCLKernelFusion::fuseKernel(
   SmallVector<std::string> FusedArgNames;
   SmallVector<AttributeSet> FusedParamAttributes;
   // We must keep track of some metadata attached to each parameter.
-  // Collect it in lists, so it can be attached to the fused function later on.
-  MDList KernelArgAddressSpaces;
-  MDList KernelArgAccessQualifiers;
-  MDList KernelArgTypes;
-  MDList KernelArgBaseTypes;
-  MDList KernelArgTypeQualifiers;
+  // Collect it, so it can be attached to the fused function later on.
+  MetadataCollection MDCollection{TargetInfo.getKernelMetadataKeys()};
+
   // Add the information about the new kernel to the SYCLModuleInfo.
   // Initialize the jit_compiler::SYCLKernelInfo with the name. The remaining
   // information for functor & argument layout and attributes will be filled in
@@ -423,16 +423,8 @@ Error SYCLKernelFusion::fuseKernel(
       ++ParamIndex;
     }
     // Add the metadata corresponding to the used arguments to the different
-    // lists. NOTE: We do not collect the "kernel_arg_name" metadata, because
-    // the kernel arguments receive new names in the fused kernel.
-    addToFusedMetadata(FF, KernelArgAddrSpace, UsedArgsMask,
-                       KernelArgAddressSpaces);
-    addToFusedMetadata(FF, KernelArgAccessQual, UsedArgsMask,
-                       KernelArgAccessQualifiers);
-    addToFusedMetadata(FF, KernelArgType, UsedArgsMask, KernelArgTypes);
-    addToFusedMetadata(FF, KernelArgBaseType, UsedArgsMask, KernelArgBaseTypes);
-    addToFusedMetadata(FF, KernelArgTypeQual, UsedArgsMask,
-                       KernelArgTypeQualifiers);
+    // lists.
+    MDCollection.collectFromFunction(FF, UsedArgsMask);
 
     // Update the fused kernel's KernelInfo with information from this input
     // kernel.
@@ -464,11 +456,20 @@ Error SYCLKernelFusion::fuseKernel(
       FT, GlobalValue::LinkageTypes::ExternalLinkage,
       M.getDataLayout().getProgramAddressSpace(), KernelName->getString(), &M);
   {
+    auto DefaultAttr = FusedFunction->getAttributes();
+    // Add uniform function attributes, i.e., attributes with identical value on
+    // each input function, to the fused function.
+    auto *FirstFunction = InputFunctions.front().F;
+    for (const auto &UniformKey : TargetInfo.getUniformKernelAttributes()) {
+      if (FirstFunction->hasFnAttribute(UniformKey)) {
+        DefaultAttr = DefaultAttr.addFnAttribute(
+            LLVMCtx, FirstFunction->getFnAttribute(UniformKey));
+      }
+    }
     // Add the collected parameter attributes to the fused function.
     // Copying the parameter attributes from their original definition in the
     // input kernels should be safe and they most likely can't be deducted later
     // on, as no caller is present in the module.
-    auto DefaultAttr = FusedFunction->getAttributes();
     auto FusedFnAttrs =
         AttributeList::get(LLVMCtx, DefaultAttr.getFnAttrs(),
                            DefaultAttr.getRetAttrs(), FusedParamAttributes);
@@ -491,37 +492,22 @@ Error SYCLKernelFusion::fuseKernel(
   }
 
   // Attach names to the arguments. The name includes a prefix for the kernel
-  // from which this argument came. The names are also attached as metadata
-  // with kind "kernel_arg_name".
-  // NOTE: While the kernel_arg_name metadata is required, naming the
-  // parameters themselves is not necessary for functionality, it just improves
-  // readibility for debugging purposes.
-  SmallVector<Metadata *, 16> KernelArgNames;
+  // from which this argument came. Naming the parameters themselves is not
+  // necessary for functionality, it just improves readibility for debugging
+  // purposes.
   for (const auto &AI : llvm::enumerate(FusedFunction->args())) {
     auto &ArgName = FusedArgNames[AI.index()];
     AI.value().setName(ArgName);
-    KernelArgNames.push_back(MDString::get(LLVMCtx, ArgName));
   }
-  // Attach the fused kernel_arg_* metadata collected from the different input
+  // Attach the fused metadata collected from the different input
   // kernels to the fused function.
-  attachFusedMetadata(FusedFunction, "kernel_arg_addr_space",
-                      KernelArgAddressSpaces);
-  attachFusedMetadata(FusedFunction, "kernel_arg_access_qual",
-                      KernelArgAccessQualifiers);
-  attachFusedMetadata(FusedFunction, "kernel_arg_type", KernelArgTypes);
-  attachFusedMetadata(FusedFunction, "kernel_arg_base_type",
-                      KernelArgBaseTypes);
-  attachFusedMetadata(FusedFunction, "kernel_arg_type_qual",
-                      KernelArgTypeQualifiers);
-  attachFusedMetadata(FusedFunction, "kernel_arg_name", KernelArgNames);
+  MDCollection.attachToFunction(FusedFunction);
   // Add metadata for reqd_work_group_size and work_group_size_hint
   attachKernelAttributeMD(LLVMCtx, FusedFunction, FusedKernelInfo);
 
-  // The fused kernel should be a SPIR-V kernel again.
-  // NOTE: If this pass is used in a scenario where input and output
-  // of the compilation are not SPIR-V, care must be taken of other
-  // potential calling conventions here (e.g., nvptx).
-  FusedFunction->setCallingConv(CallingConv::SPIR_KERNEL);
+  // Mark the fused function as a kernel by calling TargetFusionInfo, because
+  // this is target-specific.
+  TargetInfo.addKernelFunction(FusedFunction);
 
   // Fusion is implemented as a two step process: In the first step, we
   // simply create calls to the functions that should be fused into this
@@ -557,9 +543,9 @@ Error SYCLKernelFusion::fuseKernel(
         unsigned ParamIdx = ParamMapping[{FuncIndex, I}];
         CallArgs.push_back(FusedFunction->getArg(ParamIdx));
       }
-      auto CallOrErr = createFusionCall(Builder, IF, CallArgs, KF.ND, NDRange,
-                                        FuncIndex == BarriersEnd, BarriersFlags,
-                                        Remapper, IsHeterogeneousNDRangesList);
+      auto CallOrErr = createFusionCall(
+          Builder, IF, CallArgs, KF.ND, NDRange, FuncIndex == BarriersEnd,
+          BarriersFlags, Remapper, IsHeterogeneousNDRangesList, TargetInfo);
       // Add to the set of original kernel functions that can be deleted after
       // fusion is complete.
       ToCleanUp.insert(IF);
@@ -602,46 +588,8 @@ Error SYCLKernelFusion::fuseKernel(
     }
   }
 
-  // Remove all existing calls of the ITT instrumentation functions. Insert new
-  // ones in the entry block of the fused kernel and every exit block if the
-  // functions are present in the module.
-  // We cannot use the existing SPIRITTAnnotations pass, because that pass might
-  // insert calls to functions not present in the module (e.g., ITT
-  // instrumentations for barriers). As the JITed module is not linked with
-  // libdevice anymore, the functions would remain unresolved and cause the
-  // driver to fail.
-  Function *StartWrapperFunc = M.getFunction(ITTStartWrapper);
-  Function *FinishWrapperFunc = M.getFunction(ITTFinishWrapper);
-  bool InsertWrappers =
-      ((StartWrapperFunc && !StartWrapperFunc->isDeclaration()) &&
-       (FinishWrapperFunc && !FinishWrapperFunc->isDeclaration()));
-  auto *WrapperFuncTy =
-      FunctionType::get(Type::getVoidTy(M.getContext()), /*isVarArg*/ false);
-  for (auto &BB : *FusedFunction) {
-    for (auto Inst = BB.begin(); Inst != BB.end();) {
-      if (auto *CB = dyn_cast<CallBase>(Inst)) {
-        if (CB->getCalledFunction()->getName().starts_with("__itt_offload")) {
-          Inst = Inst->eraseFromParent();
-          continue;
-        }
-      }
-      ++Inst;
-    }
-    if (InsertWrappers) {
-      if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
-        auto *WrapperCall =
-            CallInst::Create(WrapperFuncTy, FinishWrapperFunc, "", RI);
-        WrapperCall->setCallingConv(CallingConv::SPIR_FUNC);
-      }
-    }
-  }
-  if (InsertWrappers) {
-    FusedFunction->getEntryBlock().getFirstInsertionPt();
-    auto *WrapperCall = CallInst::Create(
-        WrapperFuncTy, StartWrapperFunc, "",
-        &*FusedFunction->getEntryBlock().getFirstInsertionPt());
-    WrapperCall->setCallingConv(CallingConv::SPIR_FUNC);
-  }
+  // Perform target-specific post-processing of the new fused kernel.
+  TargetInfo.postProcessKernel(FusedFunction);
   return Error::success();
 }
 
@@ -717,23 +665,6 @@ static unsigned getUnsignedFromMD(Metadata *MD) {
   return ConstInt->getZExtValue();
 }
 
-void SYCLKernelFusion::addToFusedMetadata(
-    Function *InputFunction, const StringRef &Kind,
-    const ArrayRef<bool> IsArgPresentMask,
-    SmallVectorImpl<Metadata *> &FusedMDList) const {
-  // Retrieve metadata from one of the input kernels and add it to the list
-  // of fused metadata.
-  assert(InputFunction->hasMetadata(Kind) &&
-         "Required Metadata not present on input kernel");
-  if (auto *MD = InputFunction->getMetadata(Kind)) {
-    for (auto MaskedOps : llvm::zip(IsArgPresentMask, MD->operands())) {
-      if (std::get<0>(MaskedOps)) {
-        FusedMDList.emplace_back(std::get<1>(MaskedOps).get());
-      }
-    }
-  }
-}
-
 void SYCLKernelFusion::attachFusedMetadata(
     Function *FusedFunction, const StringRef &Kind,
     const ArrayRef<Metadata *> FusedMetadata) const {
diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h
index 5e26595d2d343..5f52f0a317d14 100644
--- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h
+++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h
@@ -10,6 +10,7 @@
 #define SYCL_FUSION_PASSES_SYCLKERNELFUSION_H
 
 #include "Kernel.h"
+#include "target/TargetFusionInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
@@ -52,8 +53,6 @@ class SYCLKernelFusion : public llvm::PassInfoMixin<SYCLKernelFusion> {
   // locate our own metadata again.
   static constexpr auto MetadataKind = "sycl.kernel.fused";
   static constexpr auto ParameterMDKind = "sycl.kernel.param";
-  static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper";
-  static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper";
 
   using MDList = llvm::SmallVector<llvm::Metadata *, 16>;
 
@@ -112,6 +111,7 @@ class SYCLKernelFusion : public llvm::PassInfoMixin<SYCLKernelFusion> {
   llvm::Error
   fuseKernel(llvm::Module &M, llvm::Function &StubFunction,
              jit_compiler::SYCLModuleInfo *ModInfo,
+             llvm::TargetFusionInfo &TargetInfo,
              llvm::SmallPtrSetImpl<llvm::Function *> &ToCleanUp) const;
 
   void canonicalizeParameters(
@@ -119,11 +119,6 @@ class SYCLKernelFusion : public llvm::PassInfoMixin<SYCLKernelFusion> {
 
   Parameter getParamFromMD(llvm::Metadata *MD) const;
 
-  void addToFusedMetadata(
-      llvm::Function *InputFunction, const llvm::StringRef &Kind,
-      const llvm::ArrayRef<bool> IsArgPresentMask,
-      llvm::SmallVectorImpl<llvm::Metadata *> &FusedMDList) const;
-
   void attachFusedMetadata(
       llvm::Function *FusedFunction, const llvm::StringRef &Kind,
       const llvm::ArrayRef<llvm::Metadata *> FusedMetadata) const;
diff --git a/sycl-fusion/passes/syclcp/SYCLCP.cpp b/sycl-fusion/passes/syclcp/SYCLCP.cpp
index b520620c232d1..b928a33760da0 100644
--- a/sycl-fusion/passes/syclcp/SYCLCP.cpp
+++ b/sycl-fusion/passes/syclcp/SYCLCP.cpp
@@ -41,7 +41,7 @@ static Expected<SmallVector<ConstantInfo>> getCPFromMD(Function *F) {
   MDNode *MD = F->getMetadata(SYCLCP::Key);
   if (!MD) {
     return createStringError(inconvertibleErrorCode(),
-                             "Private promotion metadata not available");
+                             "Constant progagation metadata not available");
   }
   for (auto I : enumerate(MD->operands())) {
     Expected<SmallVector<unsigned char>> Val =
@@ -205,7 +205,8 @@ static bool propagateConstants(Function *F, ArrayRef<ConstantInfo> Constants) {
   return Changed;
 }
 
-static void moduleCleanup(Module &M, ModuleAnalysisManager &AM) {
+static void moduleCleanup(Module &M, ModuleAnalysisManager &AM,
+                          TargetFusionInfo &TFI) {
   SmallVector<Function *> ToProcess;
   for (auto &F : M) {
     if (F.hasMetadata(SYCLCP::Key)) {
@@ -219,14 +220,13 @@ static void moduleCleanup(Module &M, ModuleAnalysisManager &AM) {
       if (const auto *MDS = dyn_cast<MDString>(I.value().get())) {
         // A value is masked-out if it has a non-empty MDString
         if (MDS->getLength() > 0) {
-          // And is either an integer or a FP number.
           NewArgInfo.push_back(jit_compiler::ArgUsage::Unused);
           continue;
         }
       }
       NewArgInfo.push_back(jit_compiler::ArgUsage::Used);
     }
-    fullCleanup(NewArgInfo, F, AM, {SYCLCP::Key});
+    fullCleanup(NewArgInfo, F, AM, TFI, {SYCLCP::Key});
   }
 }
 
@@ -249,8 +249,10 @@ PreservedAnalyses SYCLCP::run(Module &M, ModuleAnalysisManager &AM) {
     Changed = propagateConstants(F, *ConstantsOrErr) || Changed;
   }
 
+  TargetFusionInfo TFI{&M};
+
   if (Changed) {
-    moduleCleanup(M, AM);
+    moduleCleanup(M, AM, TFI);
   }
 
   return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
diff --git a/sycl-fusion/passes/target/TargetFusionInfo.cpp b/sycl-fusion/passes/target/TargetFusionInfo.cpp
new file mode 100644
index 0000000000000..e6e15a07e28e8
--- /dev/null
+++ b/sycl-fusion/passes/target/TargetFusionInfo.cpp
@@ -0,0 +1,379 @@
+//==---------------------- TargetFusionInfo.cpp ----------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TargetFusionInfo.h"
+
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
+#include "llvm/TargetParser/Triple.h"
+
+namespace llvm {
+class TargetFusionInfoImpl {
+
+public:
+  explicit TargetFusionInfoImpl(llvm::Module *Mod) : LLVMMod{Mod} {};
+
+  virtual ~TargetFusionInfoImpl() = default;
+
+  virtual void notifyFunctionsDelete(
+      [[maybe_unused]] llvm::ArrayRef<Function *> Funcs) const {}
+
+  virtual void addKernelFunction([[maybe_unused]] Function *KernelFunc) const {}
+
+  virtual void postProcessKernel([[maybe_unused]] Function *KernelFunc) const {}
+
+  virtual ArrayRef<StringRef> getKernelMetadataKeys() const { return {}; }
+
+  virtual ArrayRef<StringRef> getUniformKernelAttributes() const { return {}; }
+
+  virtual void createBarrierCall(IRBuilderBase &Builder,
+                                 int BarrierFlags) const = 0;
+
+  virtual unsigned getPrivateAddressSpace() const = 0;
+
+  virtual unsigned getLocalAddressSpace() const = 0;
+
+  virtual void
+  updateAddressSpaceMetadata([[maybe_unused]] Function *KernelFunc,
+                             [[maybe_unused]] ArrayRef<size_t> LocalSize,
+                             [[maybe_unused]] unsigned AddressSpace) const {}
+
+protected:
+  llvm::Module *LLVMMod;
+};
+
+namespace {
+
+//
+// SPIRVTargetFusionInfo
+//
+class SPIRVTargetFusionInfo : public TargetFusionInfoImpl {
+public:
+  using TargetFusionInfoImpl::TargetFusionInfoImpl;
+
+  void addKernelFunction(Function *KernelFunc) const override {
+    KernelFunc->setCallingConv(CallingConv::SPIR_KERNEL);
+  }
+
+  ArrayRef<StringRef> getKernelMetadataKeys() const override {
+    // NOTE: We do not collect the "kernel_arg_name" metadata, because
+    // the kernel arguments receive new names in the fused kernel.
+    static SmallVector<StringRef> Keys{
+        {"kernel_arg_addr_space", "kernel_arg_access_qual", "kernel_arg_type",
+         "kernel_arg_base_type", "kernel_arg_type_qual"}};
+    return Keys;
+  }
+
+  void postProcessKernel(Function *KernelFunc) const override {
+    // Attach the kernel_arg_name metadata.
+    SmallVector<Metadata *> KernelArgNames;
+    for (auto &P : KernelFunc->args()) {
+      KernelArgNames.push_back(
+          MDString::get(LLVMMod->getContext(), P.getName()));
+    }
+    auto *ArgNameMD = MDTuple::get(LLVMMod->getContext(), KernelArgNames);
+    KernelFunc->setMetadata("kernel_arg_name", ArgNameMD);
+
+    static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper";
+    static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper";
+    // Remove all existing calls of the ITT instrumentation functions. Insert
+    // new ones in the entry block of the fused kernel and every exit block if
+    // the functions are present in the module. We cannot use the existing
+    // SPIRITTAnnotations pass, because that pass might insert calls to
+    // functions not present in the module (e.g., ITT instrumentations for
+    // barriers). As the JITed module is not linked with libdevice anymore, the
+    // functions would remain unresolved and cause the driver to fail.
+    Function *StartWrapperFunc = LLVMMod->getFunction(ITTStartWrapper);
+    Function *FinishWrapperFunc = LLVMMod->getFunction(ITTFinishWrapper);
+    bool InsertWrappers =
+        ((StartWrapperFunc && !StartWrapperFunc->isDeclaration()) &&
+         (FinishWrapperFunc && !FinishWrapperFunc->isDeclaration()));
+    auto *WrapperFuncTy = FunctionType::get(
+        Type::getVoidTy(LLVMMod->getContext()), /*isVarArg*/ false);
+    for (auto &BB : *KernelFunc) {
+      for (auto Inst = BB.begin(); Inst != BB.end();) {
+        if (auto *CB = dyn_cast<CallBase>(Inst)) {
+          if (CB->getCalledFunction()->getName().starts_with("__itt_offload")) {
+            Inst = Inst->eraseFromParent();
+            continue;
+          }
+        }
+        ++Inst;
+      }
+      if (InsertWrappers) {
+        if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+          auto *WrapperCall =
+              CallInst::Create(WrapperFuncTy, FinishWrapperFunc, "", RI);
+          WrapperCall->setCallingConv(CallingConv::SPIR_FUNC);
+        }
+      }
+    }
+    if (InsertWrappers) {
+      KernelFunc->getEntryBlock().getFirstInsertionPt();
+      auto *WrapperCall =
+          CallInst::Create(WrapperFuncTy, StartWrapperFunc, "",
+                           &*KernelFunc->getEntryBlock().getFirstInsertionPt());
+      WrapperCall->setCallingConv(CallingConv::SPIR_FUNC);
+    }
+  }
+
+  void createBarrierCall(IRBuilderBase &Builder,
+                         int BarrierFlags) const override {
+    if (BarrierFlags == -1) {
+      return;
+    }
+    assert((BarrierFlags == 1 || BarrierFlags == 2 || BarrierFlags == 3) &&
+           "Invalid barrier flags");
+
+    static const auto FnAttrs = AttributeSet::get(
+        LLVMMod->getContext(),
+        {Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::Convergent),
+         Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::NoUnwind)});
+
+    static constexpr StringLiteral N{"_Z22__spirv_ControlBarrierjjj"};
+
+    Function *F = LLVMMod->getFunction(N);
+    if (!F) {
+      constexpr auto Linkage = GlobalValue::LinkageTypes::ExternalLinkage;
+
+      auto *Ty = FunctionType::get(
+          Builder.getVoidTy(),
+          {Builder.getInt32Ty(), Builder.getInt32Ty(), Builder.getInt32Ty()},
+          false /* isVarArg*/);
+
+      F = Function::Create(Ty, Linkage, N, *LLVMMod);
+
+      F->setAttributes(
+          AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {}));
+      F->setCallingConv(CallingConv::SPIR_FUNC);
+    }
+
+    // See
+    // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id-
+    SmallVector<Value *> Args{
+        Builder.getInt32(/*Exec Scope : Workgroup = */ 2),
+        Builder.getInt32(/*Exec Scope : Workgroup = */ 2),
+        Builder.getInt32(0x10 | (BarrierFlags % 2 == 1 ? 0x100 : 0x0) |
+                         ((BarrierFlags >> 1 == 1 ? 0x200 : 0x0)))};
+
+    auto *BarrierCallInst = Builder.CreateCall(F, Args);
+    BarrierCallInst->setAttributes(
+        AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {}));
+    BarrierCallInst->setCallingConv(CallingConv::SPIR_FUNC);
+  }
+
+  // Corresponds to definition of spir_private and spir_local in
+  // "clang/lib/Basic/Target/SPIR.h", "SPIRDefIsGenMap".
+  unsigned getPrivateAddressSpace() const override { return 0; }
+  unsigned getLocalAddressSpace() const override { return 3; }
+
+  void updateAddressSpaceMetadata(Function *KernelFunc,
+                                  ArrayRef<size_t> LocalSize,
+                                  unsigned AddressSpace) const override {
+    static constexpr unsigned AddressSpaceBitWidth{32};
+    static constexpr StringLiteral KernelArgAddrSpaceMD{
+        "kernel_arg_addr_space"};
+
+    auto *NewAddrspace = ConstantAsMetadata::get(ConstantInt::get(
+        IntegerType::get(LLVMMod->getContext(), AddressSpaceBitWidth),
+        AddressSpace));
+    if (auto *AddrspaceMD = dyn_cast_or_null<MDNode>(
+            KernelFunc->getMetadata(KernelArgAddrSpaceMD))) {
+      // If we have kernel_arg_addr_space metadata in the original function,
+      // we should update it in the new one.
+      SmallVector<Metadata *> NewInfo{AddrspaceMD->op_begin(),
+                                      AddrspaceMD->op_end()};
+      for (auto I : enumerate(LocalSize)) {
+        if (I.value() == 0) {
+          continue;
+        }
+        const auto Index = I.index();
+        if (const auto *PtrTy =
+                dyn_cast<PointerType>(KernelFunc->getArg(Index)->getType())) {
+          if (PtrTy->getAddressSpace() == getLocalAddressSpace()) {
+            NewInfo[Index] = NewAddrspace;
+          }
+        }
+      }
+      KernelFunc->setMetadata(KernelArgAddrSpaceMD,
+                              MDNode::get(KernelFunc->getContext(), NewInfo));
+    }
+  }
+};
+
+//
+// NVPTXTargetFusionInfo
+//
+#ifdef FUSION_JIT_SUPPORT_PTX
+class NVPTXTargetFusionInfo : public TargetFusionInfoImpl {
+public:
+  using TargetFusionInfoImpl::TargetFusionInfoImpl;
+
+  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const override {
+    SmallPtrSet<Constant *, 8> DeletedFuncs{Funcs.begin(), Funcs.end()};
+    SmallVector<MDNode *> ValidKernels;
+    auto *OldAnnotations = LLVMMod->getNamedMetadata("nvvm.annotations");
+    for (auto *Op : OldAnnotations->operands()) {
+      if (auto *TOp = dyn_cast<MDTuple>(Op)) {
+        if (auto *COp = dyn_cast_if_present<ConstantAsMetadata>(
+                TOp->getOperand(0).get())) {
+          if (!DeletedFuncs.contains(COp->getValue())) {
+            ValidKernels.push_back(Op);
+            // Add to the set to also remove duplicate entries.
+            DeletedFuncs.insert(COp->getValue());
+          }
+        }
+      }
+    }
+    LLVMMod->eraseNamedMetadata(OldAnnotations);
+    auto *NewAnnotations =
+        LLVMMod->getOrInsertNamedMetadata("nvvm.annotations");
+    for (auto *Kernel : ValidKernels) {
+      NewAnnotations->addOperand(Kernel);
+    }
+  }
+
+  void addKernelFunction(Function *KernelFunc) const override {
+    auto *NVVMAnnotations =
+        LLVMMod->getOrInsertNamedMetadata("nvvm.annotations");
+    auto *MDOne = ConstantAsMetadata::get(
+        ConstantInt::get(Type::getInt32Ty(LLVMMod->getContext()), 1));
+    auto *MDKernelString = MDString::get(LLVMMod->getContext(), "kernel");
+    auto *MDFunc = ConstantAsMetadata::get(KernelFunc);
+    SmallVector<Metadata *, 3> KernelMD({MDFunc, MDKernelString, MDOne});
+    auto *Tuple = MDTuple::get(LLVMMod->getContext(), KernelMD);
+    NVVMAnnotations->addOperand(Tuple);
+  }
+
+  ArrayRef<StringRef> getKernelMetadataKeys() const override {
+    // FIXME: Check whether we need to take care of sycl_fixed_targets.
+    static SmallVector<StringRef> Keys{{"kernel_arg_buffer_location",
+                                        "kernel_arg_runtime_aligned",
+                                        "kernel_arg_exclusive_ptr"}};
+    return Keys;
+  }
+
+  ArrayRef<StringRef> getUniformKernelAttributes() const override {
+    static SmallVector<StringRef> Keys{
+        {"target-cpu", "target-features", "uniform-work-group-size"}};
+    return Keys;
+  }
+
+  void createBarrierCall(IRBuilderBase &Builder,
+                         int BarrierFlags) const override {
+    if (BarrierFlags == -1) {
+      return;
+    }
+    // Emit a call to llvm.nvvm.barrier0. From the user manual of the NVPTX
+    // backend: "The ‘@llvm.nvvm.barrier0()’ intrinsic emits a PTX bar.sync 0
+    // instruction, equivalent to the __syncthreads() call in CUDA."
+    Builder.CreateIntrinsic(Intrinsic::NVVMIntrinsics::nvvm_barrier0, {}, {});
+  }
+
+  // Corresponds to the definitions in the LLVM NVPTX backend user guide:
+  // https://llvm.org/docs/NVPTXUsage.html#address-spaces
+  unsigned getPrivateAddressSpace() const override { return 0; }
+  unsigned getLocalAddressSpace() const override { return 3; }
+};
+#endif // FUSION_JIT_SUPPORT_PTX
+
+} // anonymous namespace
+
+//
+// TargetFusionInfo
+//
+
+TargetFusionInfo::TargetFusionInfo(llvm::Module *Mod) {
+  llvm::Triple Tri(Mod->getTargetTriple());
+#ifdef FUSION_JIT_SUPPORT_PTX
+  if (Tri.isNVPTX()) {
+    Impl = std::make_shared<NVPTXTargetFusionInfo>(Mod);
+    return;
+  }
+#endif // FUSION_JIT_SUPPORT_PTX
+  if (Tri.isSPIRV() || Tri.isSPIR()) {
+    Impl = std::make_shared<SPIRVTargetFusionInfo>(Mod);
+    return;
+  }
+  llvm_unreachable("Unsupported target for fusion");
+}
+
+void TargetFusionInfo::notifyFunctionsDelete(
+    llvm::ArrayRef<Function *> Funcs) const {
+  Impl->notifyFunctionsDelete(Funcs);
+}
+
+void TargetFusionInfo::addKernelFunction(llvm::Function *KernelFunc) const {
+  Impl->addKernelFunction(KernelFunc);
+}
+
+void TargetFusionInfo::postProcessKernel(Function *KernelFunc) const {
+  Impl->postProcessKernel(KernelFunc);
+}
+
+llvm::ArrayRef<llvm::StringRef>
+TargetFusionInfo::getKernelMetadataKeys() const {
+  return Impl->getKernelMetadataKeys();
+}
+
+void TargetFusionInfo::createBarrierCall(IRBuilderBase &Builder,
+                                         int BarrierFlags) const {
+  Impl->createBarrierCall(Builder, BarrierFlags);
+}
+
+unsigned TargetFusionInfo::getPrivateAddressSpace() const {
+  return Impl->getPrivateAddressSpace();
+}
+
+unsigned TargetFusionInfo::getLocalAddressSpace() const {
+  return Impl->getLocalAddressSpace();
+}
+
+void TargetFusionInfo::updateAddressSpaceMetadata(Function *KernelFunc,
+                                                  ArrayRef<size_t> LocalSize,
+                                                  unsigned AddressSpace) const {
+  Impl->updateAddressSpaceMetadata(KernelFunc, LocalSize, AddressSpace);
+}
+
+llvm::ArrayRef<llvm::StringRef>
+TargetFusionInfo::getUniformKernelAttributes() const {
+  return Impl->getUniformKernelAttributes();
+}
+
+//
+// MetadataCollection
+//
+
+MetadataCollection::MetadataCollection(ArrayRef<StringRef> MDKeys)
+    : Keys{MDKeys}, Collection(MDKeys.size()) {}
+
+void MetadataCollection::collectFromFunction(
+    llvm::Function *Func, const ArrayRef<bool> IsArgPresentMask) {
+  for (auto &Key : Keys) {
+    // TODO: Do we want to assert for the presence of the metadata here?
+    if (auto *MD = Func->getMetadata(Key)) {
+      for (auto MaskedOps : llvm::zip(IsArgPresentMask, MD->operands())) {
+        if (std::get<0>(MaskedOps)) {
+          Collection[Key].emplace_back(std::get<1>(MaskedOps).get());
+        }
+      }
+    }
+  }
+}
+
+void MetadataCollection::attachToFunction(llvm::Function *Func) {
+  for (auto &Key : Keys) {
+    // Attach a list of fused metadata for a kind to the fused function.
+    auto *MDEntries = MDNode::get(Func->getContext(), Collection[Key]);
+    Func->setMetadata(Key, MDEntries);
+  }
+}
+
+} // namespace llvm
diff --git a/sycl-fusion/passes/target/TargetFusionInfo.h b/sycl-fusion/passes/target/TargetFusionInfo.h
new file mode 100644
index 0000000000000..f88476c01ebc3
--- /dev/null
+++ b/sycl-fusion/passes/target/TargetFusionInfo.h
@@ -0,0 +1,93 @@
+//==-- TargetFusionInfo.h - Encapsule target-specific fusion functionality -==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYCL_FUSION_PASSES_TARGET_TARGETFUSIONINFO_H
+#define SYCL_FUSION_PASSES_TARGET_TARGETFUSIONINFO_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+
+namespace llvm {
+
+class TargetFusionInfoImpl;
+
+///
+/// Common interface to target-specific logic around handling of kernel
+/// functions.
+class TargetFusionInfo {
+public:
+  ///
+  /// Create the correct target-specific implementation based on the target
+  /// triple of \p Module.
+  explicit TargetFusionInfo(llvm::Module *Module);
+
+  ///
+  /// Notify the target-specific implementation that set of functions \p Funcs
+  /// is about to be erased from the module. This should be called BEFORE
+  /// erasing the functions.
+  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const;
+
+  ///
+  /// Notify the target-specific implementation that the function \p KernelFunc
+  /// was added as a new kernel. This should be called AFTER the function has
+  /// been added.
+  void addKernelFunction(llvm::Function *KernelFunc) const;
+
+  ///
+  /// Target-specific post-processing of the new kernel function \p KernelFunc.
+  /// This should be called AFTER the function has been added and defined.
+  void postProcessKernel(Function *KernelFunc) const;
+
+  ///
+  /// Get the target-specific list of argument metadata attached to each
+  /// function that should be collected and attached to the fused kernel.
+  llvm::ArrayRef<llvm::StringRef> getKernelMetadataKeys() const;
+
+  ///
+  /// Get the target-specific list of kernel function attributes that are
+  /// uniform across all input kernels and should be attached to the fused
+  /// kernel.
+  llvm::ArrayRef<llvm::StringRef> getUniformKernelAttributes() const;
+
+  void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) const;
+
+  unsigned getPrivateAddressSpace() const;
+
+  unsigned getLocalAddressSpace() const;
+
+  void updateAddressSpaceMetadata(Function *KernelFunc,
+                                  ArrayRef<size_t> LocalSize,
+                                  unsigned AddressSpace) const;
+
+private:
+  using ImplPtr = std::shared_ptr<TargetFusionInfoImpl>;
+
+  ImplPtr Impl;
+};
+
+///
+/// Simple helper to collect a target-specific set of kernel argument metadata
+/// from input functions and attach it to a fused kernel.
+class MetadataCollection {
+public:
+  explicit MetadataCollection(llvm::ArrayRef<llvm::StringRef> MDKeys);
+
+  void collectFromFunction(llvm::Function *Func,
+                           const ArrayRef<bool> IsArgPresentMask);
+
+  void attachToFunction(llvm::Function *Func);
+
+private:
+  llvm::SmallVector<llvm::StringRef> Keys;
+
+  llvm::StringMap<llvm::SmallVector<llvm::Metadata *>> Collection;
+};
+} // namespace llvm
+
+#endif // SYCL_FUSION_PASSES_TARGET_TARGETFUSIONINFO_H
diff --git a/sycl/doc/design/CompilerAndRuntimeDesign.md b/sycl/doc/design/CompilerAndRuntimeDesign.md
index 3db49acbd36b1..f06b0d52c257d 100644
--- a/sycl/doc/design/CompilerAndRuntimeDesign.md
+++ b/sycl/doc/design/CompilerAndRuntimeDesign.md
@@ -758,6 +758,29 @@ entry:
 
 Note: Kernel naming is not fully stable for now.
 
+##### Kernel Fusion Support
+
+The [experimental kernel fusion
+extension](../extensions/experimental/sycl_ext_codeplay_kernel_fusion.asciidoc)
+also supports the CUDA backend. However, as neither CUBIN nor PTX are a suitable
+input format for the [kernel fusion JIT compiler](KernelFusionJIT.md), a
+suitable IR has to be added as an additional device binary.
+
+Therefore, in case kernel fusion should be performed for the CUDA backend, the
+user needs to specify the additional flag `-fsycl-embed-ir` during compilation,
+to add LLVM IR as an additional device binary. When the flag `-fsycl-embed-ir`
+is specified, the LLVM IR produced by Clang for the CUDA backend device
+compilation is added to the fat binary file. To this end, the resulting
+file-table from `sycl-post-link` is additionally passed to the
+`clang-offload-wrapper`, creating a wrapper object with target `llvm_nvptx64`.
+
+This device binary in LLVM IR format can be retrieved by the SYCL runtime and
+used by the kernel fusion JIT compiler. The resulting fused kernel is compiled
+to PTX assembly by the kernel fusion JIT compiler at runtime.
+
+Note that the device binary in LLVM IR does not replace the device binary in
+CUBIN/PTX format, but is embed in addition to it.
+
 ### Integration with SPIR-V format
 
 This section explains how to generate SPIR-V specific types and operations from
diff --git a/sycl/doc/design/KernelFusionJIT.md b/sycl/doc/design/KernelFusionJIT.md
index 6678b9359f832..53367b5a930b3 100644
--- a/sycl/doc/design/KernelFusionJIT.md
+++ b/sycl/doc/design/KernelFusionJIT.md
@@ -162,11 +162,20 @@ The metadata is attached to a function that will become the fused kernel:
 
 ### Support for non SPIR-V targets
 
-Non SPIR-V targets (NVPTX / AMDGCN) are not supported at the moment as they cannot ingest a SPIR-V module. However, we are looking into adding support for these targets once the initial SPIR-V based path is operational.
+Fusion is currently supported for the NVPTX/CUDA backend. 
 
-In this scenario, two options are possible to add JIT support:
+As this backend cannot ingest a SPIR-V module, additional changes to the
+compilation flow are necessary. During static compilation the LLVM module for
+this backend is stored in addition to the finalized binary. 
 
- - During static compilation we store the LLVM module on top of the finalized binary. This behavior could be controlled by a flag to avoid a too important binary inflation. Then, during the fusion process, the JIT will load that LLVM IR and finalize the fused kernel to the final target as driven by the PI plugin.
- - SPIR-V ingestion support is added for these targets. The module to be loaded could then be the generic SPIR-V module. This path would however exclude target specific optimizations written in user's code. The current state of the SPIR-V translator does not allow this at the moment and significant work is needed to add this support.
+This behavior is controlled by the `-fsycl-embed-ir` flag to avoid binary
+inflation in case kernel fusion is not used. If users want to use kernel fusion
+at runtime on the NVPTX/CUDA backend, they need to pass the `-fsycl-embed-ir`
+flag during static compilation. 
 
-In these cases, PI will need to be extended to allow to somehow drive the JIT process, so it is tailored to the plugin target needs.
+During the fusion process at runtime, the JIT will load the LLVM IR and
+finalize the fused kernel to the final target. More information is available
+[here](./CompilerAndRuntimeDesign.md#kernel-fusion-support). 
+
+Support for the AMD GPU/HIP/AMDGCN backend is not yet implemented, but could
+follow an approach similar to the NVPTX/CUDA backend.
diff --git a/sycl/doc/design/images/DevicePTXProcessing.svg b/sycl/doc/design/images/DevicePTXProcessing.svg
index df690ec5fb08b..79a9c5e5c4fc9 100644
--- a/sycl/doc/design/images/DevicePTXProcessing.svg
+++ b/sycl/doc/design/images/DevicePTXProcessing.svg
@@ -1,20 +1,20 @@
 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
 <svg
-   xmlns:dc="http://purl.org/dc/elements/1.1/"
-   xmlns:cc="http://creativecommons.org/ns#"
-   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-   xmlns:svg="http://www.w3.org/2000/svg"
-   xmlns="http://www.w3.org/2000/svg"
-   xmlns:xlink="http://www.w3.org/1999/xlink"
-   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
-   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
    sodipodi:docname="DevicePTXProcessing.svg"
-   inkscape:version="0.92.3 (2405546, 2018-03-11)"
+   inkscape:version="1.2.2 (732a01da63, 2022-12-09)"
    id="svg8"
    version="1.1"
    viewBox="0 0 205.79753 221.03191"
    height="221.03191mm"
-   width="205.79753mm">
+   width="205.79753mm"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:xlink="http://www.w3.org/1999/xlink"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:dc="http://purl.org/dc/elements/1.1/">
   <defs
      id="defs2">
     <marker
@@ -2197,6 +2197,20 @@
          d="M 5.77,0 -2.88,5 V -5 Z"
          id="path4942-3" />
     </marker>
+    <marker
+       inkscape:isstock="true"
+       style="overflow:visible"
+       id="marker4944-3-5"
+       refX="0"
+       refY="0"
+       orient="auto"
+       inkscape:stockid="TriangleOutM">
+      <path
+         transform="scale(0.4)"
+         style="fill:#4472c4;fill-opacity:1;fill-rule:evenodd;stroke:#4472c4;stroke-width:1pt;stroke-opacity:1"
+         d="M 5.77,0 -2.88,5 V -5 Z"
+         id="path4942-3-2" />
+    </marker>
   </defs>
   <sodipodi:namedview
      fit-margin-bottom="0"
@@ -2204,16 +2218,16 @@
      fit-margin-left="0"
      fit-margin-top="0"
      inkscape:window-maximized="1"
-     inkscape:window-y="0"
-     inkscape:window-x="1920"
-     inkscape:window-height="1163"
-     inkscape:window-width="1920"
+     inkscape:window-y="-8"
+     inkscape:window-x="-8"
+     inkscape:window-height="1377"
+     inkscape:window-width="2560"
      showgrid="false"
      inkscape:document-rotation="0"
      inkscape:current-layer="layer1"
      inkscape:document-units="mm"
-     inkscape:cy="330.42371"
-     inkscape:cx="228.34098"
+     inkscape:cy="330.71429"
+     inkscape:cx="228.57143"
      inkscape:zoom="1.4"
      inkscape:pageshadow="2"
      inkscape:pageopacity="0.0"
@@ -2222,7 +2236,10 @@
      pagecolor="#ffffff"
      id="base"
      inkscape:snap-bbox="false"
-     inkscape:snap-global="false" />
+     inkscape:snap-global="false"
+     inkscape:showpageshadow="2"
+     inkscape:pagecheckerboard="0"
+     inkscape:deskcolor="#d1d1d1" />
   <metadata
      id="metadata5">
     <rdf:RDF>
@@ -2318,16 +2335,14 @@
        d="m 125.31238,20.80355 0.32455,-1.291166 1.29117,-0.324556 z" />
     <text
        id="text899"
-       style="font-variant:normal;font-weight:normal;font-size:4.95300007px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#404040;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.35277799"
+       style="font-variant:normal;font-weight:normal;font-size:4.953px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#404040;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        x="99.965302"
-       y="16.988504">
-      <tspan
+       y="16.988504"><tspan
          id="tspan897"
          sodipodi:role="line"
-         style="stroke-width:0.35277799"
+         style="stroke-width:0.352778"
          x="99.965302"
-         y="16.988504">File table</tspan>
-    </text>
+         y="16.988504">File table</tspan></text>
     <path
        inkscape:connector-curvature="0"
        id="path901"
@@ -2428,13 +2443,11 @@
        id="text1309"
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Consolas;-inkscape-font-specification:'Consolas, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        x="101.31163"
-       y="84.249756">
-      <tspan
+       y="84.249756"><tspan
          sodipodi:role="line"
          id="tspan1686"
          x="101.31163"
-         y="84.249756">Clang</tspan>
-    </text>
+         y="84.249756">Clang</tspan></text>
     <g
        id="g1323"
        transform="matrix(0.35277777,0,0,-0.35277777,-47.188892,221.19433)">
@@ -2495,13 +2508,12 @@
         <text
            id="text1385"
            style="font-variant:normal;font-weight:normal;font-size:14.04px;font-family:Consolas;-inkscape-font-specification:Consolas;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
-           transform="matrix(1,0,0,-1,360.07,89.904)">
-          <tspan
+           transform="matrix(1,0,0,-1,360.07,89.904)"
+           y="0"><tspan
              id="tspan1383"
              sodipodi:role="line"
              y="0"
-             x="0 7.7220001 15.444 23.166 30.94416">clang</tspan>
-        </text>
+             x="0 7.7220001 15.444 23.166 30.94416">clang</tspan></text>
       </g>
     </g>
     <g
@@ -2513,12 +2525,10 @@
         <text
            id="text1397"
            style="font-variant:normal;font-weight:normal;font-size:14.04px;font-family:Consolas;-inkscape-font-specification:Consolas;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
-           transform="matrix(1,0,0,-1,398.71,89.904)">
-          <tspan
+           transform="matrix(1,0,0,-1,398.71,89.904)"><tspan
              id="tspan1395"
              y="0"
-             x="0">-</tspan>
-        </text>
+             x="0">-</tspan></text>
       </g>
     </g>
     <g
@@ -2530,13 +2540,12 @@
         <text
            id="text1409"
            style="font-variant:normal;font-weight:normal;font-size:14.04px;font-family:Consolas;-inkscape-font-specification:Consolas;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
-           transform="matrix(1,0,0,-1,406.51,89.904)">
-          <tspan
+           transform="matrix(1,0,0,-1,406.51,89.904)"
+           y="0"><tspan
              id="tspan1407"
              sodipodi:role="line"
              y="0"
-             x="0 7.7922001 15.5142 23.2362 30.9582 38.736359 46.528561">offload</tspan>
-        </text>
+             x="0 7.7922001 15.5142 23.2362 30.9582 38.736359 46.528561">offload</tspan></text>
       </g>
     </g>
     <g
@@ -2548,12 +2557,10 @@
         <text
            id="text1421"
            style="font-variant:normal;font-weight:normal;font-size:14.04px;font-family:Consolas;-inkscape-font-specification:Consolas;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
-           transform="matrix(1,0,0,-1,460.78,89.904)">
-          <tspan
+           transform="matrix(1,0,0,-1,460.78,89.904)"><tspan
              id="tspan1419"
              y="0"
-             x="0">-</tspan>
-        </text>
+             x="0">-</tspan></text>
       </g>
     </g>
     <g
@@ -2565,13 +2572,12 @@
         <text
            id="text1433"
            style="font-variant:normal;font-weight:normal;font-size:14.04px;font-family:Consolas;-inkscape-font-specification:Consolas;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
-           transform="matrix(1,0,0,-1,468.58,89.904)">
-          <tspan
+           transform="matrix(1,0,0,-1,468.58,89.904)"
+           y="0"><tspan
              id="tspan1431"
              sodipodi:role="line"
              y="0"
-             x="0 7.7922001 15.5142 23.2362 30.9582 38.736359 46.458359">wrapper</tspan>
-        </text>
+             x="0 7.7922001 15.5142 23.2362 30.9582 38.736359 46.458359">wrapper</tspan></text>
       </g>
     </g>
     <g
@@ -2629,13 +2635,11 @@
        id="text1509"
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Consolas;-inkscape-font-specification:'Consolas, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        x="81.597427"
-       y="60.97049">
-      <tspan
+       y="60.97049"><tspan
          sodipodi:role="line"
          id="tspan2072"
          x="81.597427"
-         y="60.97049">PTX target processing</tspan>
-    </text>
+         y="60.97049">PTX target processing</tspan></text>
     <g
        id="g2083"
        transform="matrix(0.35277777,0,0,-0.35277777,-47.188892,221.19433)">
@@ -2697,13 +2701,12 @@
         <text
            id="text2129"
            style="font-variant:normal;font-weight:normal;font-size:14.04px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#404040;fill-opacity:1;fill-rule:nonzero;stroke:none"
-           transform="matrix(1,0,0,-1,397.06,43.272)">
-          <tspan
+           transform="matrix(1,0,0,-1,397.06,43.272)"
+           y="0"><tspan
              id="tspan2127"
              sodipodi:role="line"
              y="0"
-             x="0 12.11652 16.79184 23.517 30.831841 38.146679 45.138599 50.038559 53.141399 60.540482 67.911484 71.267036 78.258957 84.099602">Wrapper object</tspan>
-        </text>
+             x="0 12.11652 16.79184 23.517 30.831841 38.146679 45.138599 50.038559 53.141399 60.540482 67.911484 71.267036 78.258957 84.099602">Wrapper object</tspan></text>
       </g>
     </g>
     <g
@@ -2767,13 +2770,12 @@
         <text
            id="text2177"
            style="font-variant:normal;font-weight:normal;font-size:14.064px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#404040;fill-opacity:1;fill-rule:nonzero;stroke:none"
-           transform="matrix(1,0,0,-1,407.18,26.328)">
-          <tspan
+           transform="matrix(1,0,0,-1,407.18,26.328)"
+           y="0"><tspan
              id="tspan2175"
              sodipodi:role="line"
              y="0"
-             x="0 8.6493597 15.484464 21.841393 25.076113 30.968927 37.930607 41.151264 46.917503 54.329231 61.712833">Device code</tspan>
-        </text>
+             x="0 8.6493597 15.484464 21.841393 25.076113 30.968927 37.930607 41.151264 46.917503 54.329231 61.712833">Device code</tspan></text>
       </g>
     </g>
     <path
@@ -2790,13 +2792,11 @@
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Calibri;-inkscape-font-specification:'Calibri, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        id="text2193"
        x="28.692085"
-       y="17.476978">
-      <tspan
+       y="17.476978"><tspan
          sodipodi:role="line"
          id="tspan1595"
          x="28.692085"
-         y="17.476978">(from sycl-post-link)</tspan>
-    </text>
+         y="17.476978">(from sycl-post-link)</tspan></text>
     <g
        id="g1668"
        transform="translate(-29.00824,13.042864)">
@@ -2814,13 +2814,11 @@
          style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Calibri;-inkscape-font-specification:'Calibri, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#404040;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
          id="text899-9"
          x="73.031509"
-         y="68.195061">
-        <tspan
+         y="68.195061"><tspan
            y="68.195061"
            x="73.031509"
            id="tspan1662"
-           sodipodi:role="line">libspirv.bc</tspan>
-      </text>
+           sodipodi:role="line">libspirv.bc</tspan></text>
     </g>
     <g
        id="g2345"
@@ -2839,13 +2837,11 @@
          y="81.860855"
          x="71.144508"
          style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Calibri;-inkscape-font-specification:'Calibri, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#404040;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
-         id="text899-0">
-        <tspan
+         id="text899-0"><tspan
            y="81.860855"
            x="71.144508"
            id="tspan2332"
-           sodipodi:role="line">libdevice.bc</tspan>
-      </text>
+           sodipodi:role="line">libdevice.bc</tspan></text>
     </g>
     <g
        id="g1263-0"
@@ -2902,13 +2898,11 @@
        id="text1309-1"
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Consolas;-inkscape-font-specification:'Consolas, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        x="102.06758"
-       y="113.47669">
-      <tspan
+       y="113.47669"><tspan
          sodipodi:role="line"
          id="tspan2597"
          x="102.06758"
-         y="113.47669">ptxas</tspan>
-    </text>
+         y="113.47669">ptxas</tspan></text>
     <g
        id="g1263-02"
        transform="matrix(0.35277777,0,0,-0.35277777,-7.4064096,246.33646)">
@@ -2964,13 +2958,11 @@
        id="text1309-7"
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Consolas;-inkscape-font-specification:'Consolas, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        x="102.06758"
-       y="137.37424">
-      <tspan
+       y="137.37424"><tspan
          sodipodi:role="line"
          id="tspan2611"
          x="102.06758"
-         y="137.37424">fatbin</tspan>
-    </text>
+         y="137.37424">fatbin</tspan></text>
     <path
        style="fill:#538f7b;fill-opacity:1;fill-rule:evenodd;stroke:#538f7b;stroke-width:0.60211;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#marker3606)"
        d="m 109.38615,118.87647 v 9.24756"
@@ -2980,24 +2972,20 @@
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Calibri;-inkscape-font-specification:'Calibri, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        id="text2193-1"
        x="110.58295"
-       y="97.450661">
-      <tspan
+       y="97.450661"><tspan
          y="97.450661"
          x="110.58295"
          sodipodi:role="line"
-         id="tspan3794">ptx</tspan>
-    </text>
+         id="tspan3794">ptx</tspan></text>
     <text
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Calibri;-inkscape-font-specification:'Calibri, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        id="text2193-1-3"
        x="111.25836"
-       y="125.55264">
-      <tspan
+       y="125.55264"><tspan
          y="125.55264"
          x="111.25836"
          sodipodi:role="line"
-         id="tspan3916">cubin</tspan>
-    </text>
+         id="tspan3916">cubin</tspan></text>
     <path
        style="fill:none;fill-rule:evenodd;stroke:#538f7b;stroke-width:0.616;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#marker4036)"
        d="M 108.63556,91.358218 V 104.16258"
@@ -3014,13 +3002,11 @@
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Calibri;-inkscape-font-specification:'Calibri, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        id="text2193-1-4"
        x="110.90725"
-       y="73.327454">
-      <tspan
+       y="73.327454"><tspan
          sodipodi:role="line"
          id="tspan4924"
          x="110.90725"
-         y="73.327454">LLVM IR</tspan>
-    </text>
+         y="73.327454">LLVM IR</tspan></text>
     <path
        style="fill:none;fill-rule:evenodd;stroke:#4472c4;stroke-width:0.616;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#marker4944)"
        d="m 72.28691,79.325713 h 9.08716 v 5.34539 h 6.94901"
@@ -3036,35 +3022,29 @@
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Calibri;-inkscape-font-specification:'Calibri, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        id="text2193-1-3-9"
        x="111.2326"
-       y="150.29218">
-      <tspan
+       y="150.29218"><tspan
          y="150.29218"
          x="111.2326"
          sodipodi:role="line"
-         id="tspan2303">CUDA fatbin</tspan>
-    </text>
+         id="tspan2303">CUDA fatbin</tspan></text>
     <text
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Calibri;-inkscape-font-specification:'Calibri, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        id="text2193-5"
        x="31.587437"
-       y="216.68318">
-      <tspan
+       y="216.68318"><tspan
          sodipodi:role="line"
          id="tspan1653"
          x="31.587437"
-         y="216.68318">(to host linker)</tspan>
-    </text>
+         y="216.68318">(to host linker)</tspan></text>
     <text
        id="text1309-0"
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.05694px;font-family:Consolas;-inkscape-font-specification:'Consolas, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        x="91.899139"
-       y="88.973877">
-      <tspan
+       y="88.973877"><tspan
          sodipodi:role="line"
          id="tspan1706"
          x="91.899139"
-         y="88.973877">(nvptx backend)</tspan>
-    </text>
+         y="88.973877">(nvptx backend)</tspan></text>
     <path
        inkscape:connector-curvature="0"
        id="path995-7"
@@ -3140,35 +3120,29 @@
        id="text1509-0"
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Consolas;-inkscape-font-specification:'Consolas, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        x="86.807594"
-       y="33.349266">
-      <tspan
+       y="33.349266"><tspan
          sodipodi:role="line"
          id="tspan2072-3"
          x="86.807594"
-         y="33.349266">file-table-tform</tspan>
-    </text>
+         y="33.349266">file-table-tform</tspan></text>
     <text
        id="text1509-0-9"
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Consolas;-inkscape-font-specification:'Consolas, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        x="90.741898"
-       y="37.806171">
-      <tspan
+       y="37.806171"><tspan
          sodipodi:role="line"
          id="tspan2072-3-5"
          x="90.741898"
-         y="37.806171">(Copy &quot;Code&quot;)</tspan>
-    </text>
+         y="37.806171">(Copy &quot;Code&quot;)</tspan></text>
     <text
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Calibri;-inkscape-font-specification:'Calibri, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        id="text2193-1-4-1"
        x="110.37068"
-       y="47.484673">
-      <tspan
+       y="47.484673"><tspan
          sodipodi:role="line"
          id="tspan4924-3"
          x="110.37068"
-         y="47.484673">LLVM IR</tspan>
-    </text>
+         y="47.484673">LLVM IR</tspan></text>
     <path
        inkscape:connector-curvature="0"
        id="path995-7-4"
@@ -3244,40 +3218,40 @@
        id="text1509-0-5"
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Consolas;-inkscape-font-specification:'Consolas, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        x="87.676941"
-       y="161.4454">
-      <tspan
+       y="161.4454"><tspan
          sodipodi:role="line"
          id="tspan2072-3-0"
          x="87.676941"
-         y="161.4454">file-table-tform</tspan>
-    </text>
+         y="161.4454">file-table-tform</tspan></text>
     <text
        id="text1509-0-9-5"
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Consolas;-inkscape-font-specification:'Consolas, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        x="87.907066"
-       y="165.9023">
-      <tspan
+       y="165.9023"><tspan
          sodipodi:role="line"
          id="tspan2072-3-5-4"
          x="87.907066"
-         y="165.9023">(Replace &quot;Code&quot;)</tspan>
-    </text>
+         y="165.9023">(Replace &quot;Code&quot;)</tspan></text>
     <text
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Calibri;-inkscape-font-specification:'Calibri, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        id="text2193-1-3-9-2"
        x="110.86103"
-       y="175.67789">
-      <tspan
+       y="175.67789"><tspan
          y="175.67789"
          x="110.86103"
          sodipodi:role="line"
-         id="tspan2303-1">File table</tspan>
-    </text>
+         id="tspan2303-1">File table</tspan></text>
     <path
        style="fill:none;fill-rule:evenodd;stroke:#4472c4;stroke-width:0.616;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#marker4944-3)"
        d="m 108.65966,22.915376 h 69.56335 l 0,138.959974 h -9.30396"
        id="path4926-6"
        inkscape:connector-curvature="0"
        sodipodi:nodetypes="cccc" />
+    <path
+       style="fill:none;fill-rule:evenodd;stroke:#4472c4;stroke-width:0.616;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#marker4944-3-5)"
+       d="m 177.4031,22.943668 h 14.37883 L 191.97091,188.17299 169.2488,187.984"
+       id="path4926-6-8"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cccc" />
   </g>
 </svg>
diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp
index efb105d1ba090..85cedb47e6e92 100644
--- a/sycl/source/detail/device_info.hpp
+++ b/sycl/source/detail/device_info.hpp
@@ -771,7 +771,8 @@ struct get_device_info_impl<
     // Currently fusion is only supported for SPIR-V based backends, i.e. OpenCL
     // and LevelZero.
     return (Dev->getBackend() == backend::ext_oneapi_level_zero) ||
-           (Dev->getBackend() == backend::opencl);
+           (Dev->getBackend() == backend::opencl) ||
+           (Dev->getBackend() == backend::ext_oneapi_cuda);
 #else  // SYCL_EXT_CODEPLAY_KERNEL_FUSION
     (void)Dev;
     return false;
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index 4e092ef12c4e3..ed64e35c06509 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -39,6 +39,80 @@ translateBinaryImageFormat(pi::PiDeviceBinaryType Type) {
   }
 }
 
+::jit_compiler::BinaryFormat getTargetFormat(QueueImplPtr &Queue) {
+  auto Backend = Queue->getDeviceImplPtr()->getBackend();
+  switch (Backend) {
+  case backend::ext_oneapi_level_zero:
+  case backend::opencl:
+    return ::jit_compiler::BinaryFormat::SPIRV;
+  case backend::ext_oneapi_cuda:
+    return ::jit_compiler::BinaryFormat::PTX;
+  default:
+    throw sycl::exception(
+        sycl::make_error_code(sycl::errc::feature_not_supported),
+        "Backend unsupported by kernel fusion");
+  }
+}
+
+std::pair<const RTDeviceBinaryImage *, RT::PiProgram>
+retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) {
+  auto KernelName = KernelCG->getKernelName();
+
+  bool isNvidia =
+      Queue->getDeviceImplPtr()->getBackend() == backend::ext_oneapi_cuda;
+  if (isNvidia) {
+    auto KernelID = ProgramManager::getInstance().getSYCLKernelID(KernelName);
+    std::vector<kernel_id> KernelIds{KernelID};
+    auto DeviceImages =
+        ProgramManager::getInstance().getRawDeviceImages(KernelIds);
+    auto DeviceImage = std::find_if(
+        DeviceImages.begin(), DeviceImages.end(), [](RTDeviceBinaryImage *DI) {
+          return DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE &&
+                 DI->getRawData().DeviceTargetSpec ==
+                     std::string("llvm_nvptx64");
+        });
+    if (DeviceImage == DeviceImages.end()) {
+      return {nullptr, nullptr};
+    }
+    auto ContextImpl = Queue->getContextImplPtr();
+    auto Context = detail::createSyclObjFromImpl<context>(ContextImpl);
+    auto DeviceImpl = Queue->getDeviceImplPtr();
+    auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
+    RT::PiProgram Program =
+        detail::ProgramManager::getInstance().createPIProgram(**DeviceImage,
+                                                              Context, Device);
+    return {*DeviceImage, Program};
+  }
+
+  const RTDeviceBinaryImage *DeviceImage = nullptr;
+  RT::PiProgram Program = nullptr;
+  if (KernelCG->getKernelBundle() != nullptr) {
+    // Retrieve the device image from the kernel bundle.
+    auto KernelBundle = KernelCG->getKernelBundle();
+    kernel_id KernelID =
+        detail::ProgramManager::getInstance().getSYCLKernelID(KernelName);
+
+    auto SyclKernel = detail::getSyclObjImpl(
+        KernelBundle->get_kernel(KernelID, KernelBundle));
+
+    DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref();
+    Program = SyclKernel->getDeviceImage()->get_program_ref();
+  } else if (KernelCG->MSyclKernel != nullptr) {
+    DeviceImage = KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref();
+    Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref();
+  } else {
+    auto ContextImpl = Queue->getContextImplPtr();
+    auto Context = detail::createSyclObjFromImpl<context>(ContextImpl);
+    auto DeviceImpl = Queue->getDeviceImplPtr();
+    auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
+    DeviceImage = &detail::ProgramManager::getInstance().getDeviceImage(
+        KernelCG->MOSModuleHandle, KernelName, Context, Device);
+    Program = detail::ProgramManager::getInstance().createPIProgram(
+        *DeviceImage, Context, Device);
+  }
+  return {DeviceImage, Program};
+}
+
 static ::jit_compiler::ParameterKind
 translateArgType(kernel_param_kind_t Kind) {
   using PK = ::jit_compiler::ParameterKind;
@@ -378,7 +452,15 @@ static ParamIterator preProcessArguments(
     // which will go out-of-scope before we execute the fused kernel. Therefore,
     // we need to copy the argument to a permant location and update the
     // argument.
-    Arg->Arg.MPtr = storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize);
+    if (Arg->Arg.MPtr) {
+      Arg->Arg.MPtr =
+          storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize);
+      // Propagate values of scalar parameters as constants to the JIT
+      // compiler.
+      JITConstants.emplace_back(
+          ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
+          Arg->Arg.MPtr, Arg->Arg.MSize);
+    }
     // Standard layout arguments do not participate in identical argument
     // detection, but we still add it to the list here. As the SYCL runtime can
     // only check the raw bytes for identical content, but is unaware of the
@@ -386,14 +468,9 @@ static ParamIterator preProcessArguments(
     // not be materialized by the JIT compiler. Instead of removing some
     // standard layout arguments due to identity and missing some in case the
     // materialization is not possible, we rely on constant propagation to
-    // replace standard layout arguments by constants (see below).
+    // replace standard layout arguments by constants.
     NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
                                     true);
-    // Propagate values of scalar parameters as constants to the JIT
-    // compiler.
-    JITConstants.emplace_back(
-        ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
-        Arg->Arg.MPtr, Arg->Arg.MSize);
     return ++Arg;
   }
   // First check if there's already another parameter with identical
@@ -576,43 +653,20 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
           "Cannot fuse kernel with invalid kernel function name");
       return nullptr;
     }
-    const RTDeviceBinaryImage *DeviceImage = nullptr;
-    RT::PiProgram Program = nullptr;
+
+    auto [DeviceImage, Program] = retrieveKernelBinary(Queue, KernelCG);
+
+    if (!DeviceImage || !Program) {
+      printPerformanceWarning("No suitable IR available for fusion");
+      return nullptr;
+    }
     const KernelArgMask *EliminatedArgs = nullptr;
-    if (KernelCG->getKernelBundle() != nullptr) {
-      // Retrieve the device image from the kernel bundle.
-      auto KernelBundle = KernelCG->getKernelBundle();
-      kernel_id KernelID =
-          detail::ProgramManager::getInstance().getSYCLKernelID(KernelName);
-
-      auto SyclKernel = detail::getSyclObjImpl(
-          KernelBundle->get_kernel(KernelID, KernelBundle));
-
-      DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref();
-      Program = SyclKernel->getDeviceImage()->get_program_ref();
-      EliminatedArgs = SyclKernel->getKernelArgMask();
-    } else if (KernelCG->MSyclKernel != nullptr) {
-      DeviceImage =
-          KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref();
-      Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref();
-      EliminatedArgs = KernelCG->MSyclKernel->getKernelArgMask();
-    } else {
-      auto ContextImpl = Queue->getContextImplPtr();
-      auto Context = detail::createSyclObjFromImpl<context>(ContextImpl);
-      auto DeviceImpl = Queue->getDeviceImplPtr();
-      auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
-      DeviceImage = &detail::ProgramManager::getInstance().getDeviceImage(
-          KernelCG->MOSModuleHandle, KernelName, Context, Device);
-      Program = detail::ProgramManager::getInstance().createPIProgram(
-          *DeviceImage, Context, Device);
+    if (Program && (KernelCG->MSyclKernel == nullptr ||
+                    !KernelCG->MSyclKernel->isCreatedFromSource())) {
       EliminatedArgs =
           detail::ProgramManager::getInstance().getEliminatedKernelArgMask(
               KernelCG->MOSModuleHandle, Program, KernelName);
     }
-    if (!DeviceImage || !Program) {
-      printPerformanceWarning("No suitable IR available for fusion");
-      return nullptr;
-    }
 
     // Collect information about the arguments of this kernel.
 
@@ -666,8 +720,7 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
       return nullptr;
     }
     ::jit_compiler::SYCLKernelBinaryInfo BinInfo{
-        translateBinaryImageFormat(DeviceImage->getFormat()), 0,
-        RawDeviceImage.BinaryStart, DeviceImageSize};
+        BinaryImageFormat, 0, RawDeviceImage.BinaryStart, DeviceImageSize};
 
     constexpr auto SYCLTypeToIndices = [](auto Val) -> ::jit_compiler::Indices {
       return {Val.get(0), Val.get(1), Val.get(2)};
@@ -757,6 +810,9 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
   JITConfig.set<::jit_compiler::option::JITEnableCaching>(
       detail::SYCLConfig<detail::SYCL_ENABLE_FUSION_CACHING>::get());
 
+  ::jit_compiler::BinaryFormat TargetFormat = getTargetFormat(Queue);
+  JITConfig.set<::jit_compiler::option::JITTargetFormat>(TargetFormat);
+
   auto FusionResult = ::jit_compiler::KernelFusion::fuseKernels(
       *MJITContext, std::move(JITConfig), InputKernelInfo, InputKernelNames,
       FusedKernelName.str(), ParamIdentities, BarrierFlags, InternalizeParams,
@@ -796,11 +852,19 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
   }(FusedKernelInfo.NDR);
   updatePromotedArgs(FusedKernelInfo, NDRDesc, FusedArgs, ArgsStorage);
 
+  OSModuleHandle Handle = OSUtil::DummyModuleHandle;
   if (!FusionResult.cached()) {
-    auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo);
+    auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo, TargetFormat);
     detail::ProgramManager::getInstance().addImages(PIDeviceBinaries);
-  } else if (DebugEnabled) {
-    std::cerr << "INFO: Re-using existing device binary for fused kernel\n";
+    Handle = OSUtil::getOSModuleHandle(PIDeviceBinaries->DeviceBinaries);
+    CachedModules.emplace(FusedKernelInfo.Name, Handle);
+  } else {
+    if (DebugEnabled) {
+      std::cerr << "INFO: Re-using existing device binary for fused kernel\n";
+    }
+    // Retrieve an OSModuleHandle for the cached binary.
+    assert(CachedModules.count(FusedKernelInfo.Name) && "No cached binary");
+    Handle = CachedModules.at(FusedKernelInfo.Name);
   }
 
   // Create a kernel bundle for the fused kernel.
@@ -809,22 +873,45 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
       FusedKernelInfo.Name);
   std::vector<std::shared_ptr<const void>> RawExtendedMembers;
 
-  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
-      detail::getSyclObjImpl(get_kernel_bundle<bundle_state::executable>(
-          Queue->get_context(), {Queue->get_device()}, {FusedKernelId}));
+  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr;
+  if (TargetFormat == ::jit_compiler::BinaryFormat::SPIRV) {
+    detail::getSyclObjImpl(get_kernel_bundle<bundle_state::executable>(
+        Queue->get_context(), {Queue->get_device()}, {FusedKernelId}));
+  }
 
   std::unique_ptr<detail::CG> FusedCG;
   FusedCG.reset(new detail::CGExecKernel(
       NDRDesc, nullptr, nullptr, std::move(KernelBundleImplPtr),
       std::move(ArgsStorage), std::move(AccStorage),
       std::move(RawExtendedMembers), std::move(Requirements), std::move(Events),
-      std::move(FusedArgs), FusedKernelInfo.Name, OSUtil::DummyModuleHandle, {},
-      {}, CG::CGTYPE::Kernel, KernelCacheConfig));
+      std::move(FusedArgs), FusedKernelInfo.Name, Handle, {}, {},
+      CG::CGTYPE::Kernel, KernelCacheConfig));
   return FusedCG;
 }
 
 pi_device_binaries jit_compiler::createPIDeviceBinary(
-    const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo) {
+    const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
+    ::jit_compiler::BinaryFormat Format) {
+
+  const char *TargetSpec = nullptr;
+  pi_device_binary_type BinFormat = PI_DEVICE_BINARY_TYPE_NATIVE;
+  switch (Format) {
+  case ::jit_compiler::BinaryFormat::PTX: {
+    TargetSpec = __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64;
+    BinFormat = PI_DEVICE_BINARY_TYPE_NONE;
+    break;
+  }
+  case ::jit_compiler::BinaryFormat::SPIRV: {
+    TargetSpec = (FusedKernelInfo.BinaryInfo.AddressBits == 64)
+                     ? __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64
+                     : __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV32;
+    BinFormat = PI_DEVICE_BINARY_TYPE_SPIRV;
+    break;
+  }
+  default:
+    sycl::exception(sycl::make_error_code(sycl::errc::invalid),
+                    "Invalid output format");
+  }
 
   DeviceBinaryContainer Binary;
 
@@ -849,11 +936,33 @@ pi_device_binaries jit_compiler::createPIDeviceBinary(
 
   Binary.addProperty(std::move(ArgMaskPropSet));
 
+  if (Format == ::jit_compiler::BinaryFormat::PTX) {
+    // Add a program metadata property with the reqd_work_group_size attribute.
+    // See CUDA PI (pi_cuda.cpp) _pi_program::set_metadata for reference.
+    auto ReqdWGS = std::find_if(
+        FusedKernelInfo.Attributes.begin(), FusedKernelInfo.Attributes.end(),
+        [](const ::jit_compiler::SYCLKernelAttribute &Attr) {
+          return Attr.AttributeName == "reqd_work_group_size";
+        });
+    if (ReqdWGS != FusedKernelInfo.Attributes.end()) {
+      auto Encoded = encodeReqdWorkGroupSize(*ReqdWGS);
+      std::stringstream PropName;
+      PropName << FusedKernelInfo.Name;
+      PropName << __SYCL_PI_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE;
+      PropertyContainer ReqdWorkGroupSizeProp{
+          PropName.str(), Encoded.data(), Encoded.size(),
+          pi_property_type::PI_PROPERTY_TYPE_BYTE_ARRAY};
+      PropertySetContainer ProgramMetadata{
+          __SYCL_PI_PROPERTY_SET_PROGRAM_METADATA};
+      ProgramMetadata.addProperty(std::move(ReqdWorkGroupSizeProp));
+      Binary.addProperty(std::move(ProgramMetadata));
+    }
+  }
+
   DeviceBinariesCollection Collection;
-  Collection.addDeviceBinary(std::move(Binary),
-                             FusedKernelInfo.BinaryInfo.BinaryStart,
-                             FusedKernelInfo.BinaryInfo.BinarySize,
-                             FusedKernelInfo.BinaryInfo.AddressBits);
+  Collection.addDeviceBinary(
+      std::move(Binary), FusedKernelInfo.BinaryInfo.BinaryStart,
+      FusedKernelInfo.BinaryInfo.BinarySize, TargetSpec, BinFormat);
 
   JITDeviceBinaries.push_back(std::move(Collection));
   return JITDeviceBinaries.back().getPIDeviceStruct();
@@ -889,6 +998,23 @@ std::vector<uint8_t> jit_compiler::encodeArgUsageMask(
   return Encoded;
 }
 
+std::vector<uint8_t> jit_compiler::encodeReqdWorkGroupSize(
+    const ::jit_compiler::SYCLKernelAttribute &Attr) const {
+  assert(Attr.AttributeName == "reqd_work_group_size");
+  size_t NumBytes = sizeof(uint64_t) + (Attr.Values.size() * sizeof(uint32_t));
+  std::vector<uint8_t> Encoded(NumBytes, 0u);
+  uint8_t *Ptr = Encoded.data();
+  // Skip 64-bit wide size argument with value 0 at the start of the data.
+  // See CUDA PI (pi_cuda.cpp) _pi_program::set_metadata for reference.
+  Ptr += sizeof(uint64_t);
+  for (const auto &Val : Attr.Values) {
+    uint32_t UVal = std::stoul(Val);
+    std::memcpy(Ptr, &UVal, sizeof(uint32_t));
+    Ptr += sizeof(uint32_t);
+  }
+  return Encoded;
+}
+
 } // namespace detail
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
 } // namespace sycl
diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp
index 522c0749ef75b..fae774cadd09a 100644
--- a/sycl/source/detail/jit_compiler.hpp
+++ b/sycl/source/detail/jit_compiler.hpp
@@ -12,9 +12,13 @@
 #include <detail/scheduler/commands.hpp>
 #include <detail/scheduler/scheduler.hpp>
 
+#include <unordered_map>
+
 namespace jit_compiler {
+enum class BinaryFormat : uint32_t;
 class JITContext;
 struct SYCLKernelInfo;
+struct SYCLKernelAttribute;
 using ArgUsageMask = std::vector<unsigned char>;
 } // namespace jit_compiler
 
@@ -46,14 +50,20 @@ class jit_compiler {
   jit_compiler &operator=(const jit_compiler &&) = delete;
 
   pi_device_binaries
-  createPIDeviceBinary(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo);
+  createPIDeviceBinary(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
+                       ::jit_compiler::BinaryFormat Format);
 
   std::vector<uint8_t>
   encodeArgUsageMask(const ::jit_compiler::ArgUsageMask &Mask) const;
 
+  std::vector<uint8_t> encodeReqdWorkGroupSize(
+      const ::jit_compiler::SYCLKernelAttribute &Attr) const;
+
   // Manages the lifetime of the PI structs for device binaries.
   std::vector<DeviceBinariesCollection> JITDeviceBinaries;
 
+  std::unordered_map<std::string, OSModuleHandle> CachedModules;
+
   std::unique_ptr<::jit_compiler::JITContext> MJITContext;
 };
 
diff --git a/sycl/source/detail/jit_device_binaries.cpp b/sycl/source/detail/jit_device_binaries.cpp
index 0aa778da14240..d0cc9e824bc07 100644
--- a/sycl/source/detail/jit_device_binaries.cpp
+++ b/sycl/source/detail/jit_device_binaries.cpp
@@ -81,10 +81,12 @@ void DeviceBinaryContainer::addProperty(PropertySetContainer &&Cont) {
 }
 
 pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary(
-    const unsigned char *BinaryStart, size_t BinarySize, size_t AddressBits) {
+    const unsigned char *BinaryStart, size_t BinarySize, const char *TargetSpec,
+    pi_device_binary_type Format) {
   pi_device_binary_struct DeviceBinary;
   DeviceBinary.Version = PI_DEVICE_BINARY_VERSION;
   DeviceBinary.Kind = PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL;
+  DeviceBinary.Format = Format;
   DeviceBinary.CompileOptions = "";
   DeviceBinary.LinkOptions = "";
   DeviceBinary.ManifestStart = nullptr;
@@ -93,10 +95,7 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary(
   // the JITContext.
   DeviceBinary.BinaryStart = BinaryStart;
   DeviceBinary.BinaryEnd = BinaryStart + BinarySize;
-  DeviceBinary.Format = PI_DEVICE_BINARY_TYPE_SPIRV;
-  DeviceBinary.DeviceTargetSpec = (AddressBits == 32)
-                                      ? __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV32
-                                      : __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64;
+  DeviceBinary.DeviceTargetSpec = TargetSpec;
   DeviceBinary.EntriesBegin = PIOffloadEntries.data();
   DeviceBinary.EntriesEnd = PIOffloadEntries.data() + PIOffloadEntries.size();
   DeviceBinary.PropertySetsBegin = PIPropertySets.data();
@@ -108,14 +107,15 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary(
 void DeviceBinariesCollection::addDeviceBinary(DeviceBinaryContainer &&Cont,
                                                const unsigned char *BinaryStart,
                                                size_t BinarySize,
-                                               size_t AddressBits) {
+                                               const char *TargetSpec,
+                                               pi_device_binary_type Format) {
   // Adding to the vectors might trigger reallocation, which would invalidate
   // the pointers used for PI structs if a PI struct has already been created
   // via getPIDeviceStruct(). Forbid calls to this method after the first PI
   // struct has been created.
   assert(Fused && "Adding to container would invalidate existing PI structs");
   PIBinaries.push_back(
-      Cont.getPIDeviceBinary(BinaryStart, BinarySize, AddressBits));
+      Cont.getPIDeviceBinary(BinaryStart, BinarySize, TargetSpec, Format));
   Binaries.push_back(std::move(Cont));
 }
 
diff --git a/sycl/source/detail/jit_device_binaries.hpp b/sycl/source/detail/jit_device_binaries.hpp
index 7bf2c7d9fe07b..cecab17870650 100644
--- a/sycl/source/detail/jit_device_binaries.hpp
+++ b/sycl/source/detail/jit_device_binaries.hpp
@@ -111,7 +111,8 @@ class DeviceBinaryContainer {
 
   pi_device_binary_struct getPIDeviceBinary(const unsigned char *BinaryStart,
                                             size_t BinarySize,
-                                            size_t AddressBits);
+                                            const char *TargetSpec,
+                                            pi_device_binary_type Format);
 
 private:
   bool Fused = true;
@@ -138,7 +139,7 @@ class DeviceBinariesCollection {
 
   void addDeviceBinary(DeviceBinaryContainer &&Cont,
                        const unsigned char *BinaryStart, size_t BinarySize,
-                       size_t AddressBits);
+                       const char *TargetSpec, pi_device_binary_type Format);
   pi_device_binaries getPIDeviceStruct();
 
 private:
diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index 35df6ef614cc7..5f5f68ed1bf56 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -290,6 +290,17 @@ class Command {
   // Memory is allocated in this method and released in destructor.
   void copySubmissionCodeLocation();
 
+  /// Clear all dependency events This should only be used if a command is about
+  /// to be deleted without being executed before that. As of now, the only
+  /// valid use case for this function is in kernel fusion, where the fused
+  /// kernel commands are replaced by the fused command without ever being
+  /// executed.
+  void clearAllDependencies() {
+    MPreparedDepsEvents.clear();
+    MPreparedHostDepsEvents.clear();
+    MDeps.clear();
+  }
+
   /// Contains list of dependencies(edges)
   std::vector<DepDesc> MDeps;
   /// Contains list of commands that depend on the command.
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index 5dee68c6e69d2..c40b5c0f69fdc 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -1395,7 +1395,10 @@ void Scheduler::GraphBuilder::removeNodeFromGraph(
     Dep.MDepCommand->MUsers.erase(Node);
   }
 
-  Node->MDeps.clear();
+  // Clear all the dependencies to avoid cleanDepEventsThroughOneLevel, called
+  // from the destructor of the command to delete the dependencies of the
+  // command this command depends on.
+  Node->clearAllDependencies();
 }
 
 void Scheduler::GraphBuilder::cancelFusion(QueueImplPtr Queue,
diff --git a/sycl/test-e2e/KernelFusion/abort_fusion.cpp b/sycl/test-e2e/KernelFusion/abort_fusion.cpp
index 28f5e0d72b5b5..e0a5858b12250 100644
--- a/sycl/test-e2e/KernelFusion/abort_fusion.cpp
+++ b/sycl/test-e2e/KernelFusion/abort_fusion.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test fusion being aborted: Different scenarios causing the JIT compiler
diff --git a/sycl/test-e2e/KernelFusion/abort_internalization.cpp b/sycl/test-e2e/KernelFusion/abort_internalization.cpp
index b314bf3a0d64d..fdb7b3fa6b193 100644
--- a/sycl/test-e2e/KernelFusion/abort_internalization.cpp
+++ b/sycl/test-e2e/KernelFusion/abort_internalization.cpp
@@ -1,9 +1,11 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
-// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -O2 -fsycl-embed-ir %s -o %t.out
+// RUN: env SYCL_RT_WARNING_LEVEL=1 SYCL_ENABLE_FUSION_CACHING=0\
+// RUN: %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
-// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: env SYCL_RT_WARNING_LEVEL=1 SYCL_ENABLE_FUSION_CACHING=0\
+// RUN: %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test incomplete internalization: Different scenarios causing the JIT compiler
diff --git a/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp b/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp
index c3d49cea3c1ab..d05d53e19c68c 100644
--- a/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp
+++ b/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER --implicit-check-not "Computation error" --implicit-check-not "Internalized"
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER --implicit-check-not "Computation error" --implicit-check-not "Internalized"
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test pointers being stored are not internalized.
diff --git a/sycl/test-e2e/KernelFusion/barrier_local_internalization.cpp b/sycl/test-e2e/KernelFusion/barrier_local_internalization.cpp
index 8ff486812b921..3dee82d36006d 100644
--- a/sycl/test-e2e/KernelFusion/barrier_local_internalization.cpp
+++ b/sycl/test-e2e/KernelFusion/barrier_local_internalization.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with local internalization and a combination of kernels
diff --git a/sycl/test-e2e/KernelFusion/buffer_internalization.cpp b/sycl/test-e2e/KernelFusion/buffer_internalization.cpp
index 102441ed1b8c8..4be40a9389a2f 100644
--- a/sycl/test-e2e/KernelFusion/buffer_internalization.cpp
+++ b/sycl/test-e2e/KernelFusion/buffer_internalization.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/cancel_fusion.cpp b/sycl/test-e2e/KernelFusion/cancel_fusion.cpp
index 6c94f9902579a..9dc5ebe2c007c 100644
--- a/sycl/test-e2e/KernelFusion/cancel_fusion.cpp
+++ b/sycl/test-e2e/KernelFusion/cancel_fusion.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test cancel fusion
diff --git a/sycl/test-e2e/KernelFusion/complete_fusion.cpp b/sycl/test-e2e/KernelFusion/complete_fusion.cpp
index 0ffeca17a5e78..67c2fb3d05ec9 100644
--- a/sycl/test-e2e/KernelFusion/complete_fusion.cpp
+++ b/sycl/test-e2e/KernelFusion/complete_fusion.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion without any internalization
diff --git a/sycl/test-e2e/KernelFusion/device_info_descriptor.cpp b/sycl/test-e2e/KernelFusion/device_info_descriptor.cpp
index 91bd1622a5a5f..c7dc498ce2dd7 100644
--- a/sycl/test-e2e/KernelFusion/device_info_descriptor.cpp
+++ b/sycl/test-e2e/KernelFusion/device_info_descriptor.cpp
@@ -1,7 +1,7 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// XFAIL: cuda || hip
+// XFAIL: hip
 // REQUIRES: fusion
 
 // Test correct return from device information descriptor.
diff --git a/sycl/test-e2e/KernelFusion/diamond_shape.cpp b/sycl/test-e2e/KernelFusion/diamond_shape.cpp
index 54a513fbfc71a..73ce385e1c848 100644
--- a/sycl/test-e2e/KernelFusion/diamond_shape.cpp
+++ b/sycl/test-e2e/KernelFusion/diamond_shape.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/diamond_shape_local.cpp b/sycl/test-e2e/KernelFusion/diamond_shape_local.cpp
new file mode 100644
index 0000000000000..359c86086e8d7
--- /dev/null
+++ b/sycl/test-e2e/KernelFusion/diamond_shape_local.cpp
@@ -0,0 +1,111 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: hip
+// REQUIRES: fusion
+
+// Test complete fusion with local internalization specified on the
+// accessors for a combination of four kernels, forming a diamond-like shape and
+// repeating one of the kernels.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+struct AddKernel {
+  accessor<int, 1> accIn1;
+  accessor<int, 1> accIn2;
+  accessor<int, 1> accOut;
+
+  void operator()(id<1> i) const { accOut[i] = accIn1[i] + accIn2[i]; }
+};
+
+int main() {
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp1[dataSize],
+      tmp2[dataSize], tmp3[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp1[i] = -1;
+    tmp2[i] = -1;
+    tmp3[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer<int> bIn1{in1, range{dataSize}};
+    buffer<int> bIn2{in2, range{dataSize}};
+    buffer<int> bIn3{in3, range{dataSize}};
+    buffer<int> bTmp1{
+        tmp1,
+        range{dataSize},
+        {sycl::ext::codeplay::experimental::property::promote_local{}}};
+    buffer<int> bTmp2{
+        tmp2,
+        range{dataSize},
+        {sycl::ext::codeplay::experimental::property::promote_local{}}};
+    buffer<int> bTmp3{
+        tmp3,
+        range{dataSize},
+        {sycl::ext::codeplay::experimental::property::promote_local{}}};
+    buffer<int> bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access(cgh);
+      auto accIn2 = bIn2.get_access(cgh);
+      auto accTmp1 = bTmp1.get_access(cgh);
+      cgh.parallel_for<AddKernel>(nd_range<1>{{dataSize}, {16}},
+                                  AddKernel{accIn1, accIn2, accTmp1});
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp1 = bTmp1.get_access(cgh);
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accTmp2 = bTmp2.get_access(cgh);
+      cgh.parallel_for<class KernelOne>(
+          nd_range<1>{{dataSize}, {16}},
+          [=](id<1> i) { accTmp2[i] = accTmp1[i] * accIn3[i]; });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp1 = bTmp1.get_access(cgh);
+      auto accTmp3 = bTmp3.get_access(cgh);
+      cgh.parallel_for<class KernelTwo>(
+          nd_range<1>{{dataSize}, {16}},
+          [=](id<1> i) { accTmp3[i] = accTmp1[i] * 5; });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp2 = bTmp2.get_access(cgh);
+      auto accTmp3 = bTmp3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<AddKernel>(nd_range<1>{{dataSize}, {16}},
+                                  AddKernel{accTmp2, accTmp3, accOut});
+    });
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i + i * 25) && "Computation error");
+    assert(tmp1[i] == -1 && "tmp1 not internalized");
+    assert(tmp2[i] == -1 && "tmp2 not internalized");
+    assert(tmp3[i] == -1 && "tmp3 not internalized");
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/KernelFusion/event_wait_cancel.cpp b/sycl/test-e2e/KernelFusion/event_wait_cancel.cpp
index 1f4771d7d5b0d..aff44212531d8 100644
--- a/sycl/test-e2e/KernelFusion/event_wait_cancel.cpp
+++ b/sycl/test-e2e/KernelFusion/event_wait_cancel.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion, aspect-usm_shared_allocations
 
 // Test validity of events after cancel_fusion.
diff --git a/sycl/test-e2e/KernelFusion/event_wait_complete.cpp b/sycl/test-e2e/KernelFusion/event_wait_complete.cpp
index cf48a9c1ced43..20e547a09f64e 100644
--- a/sycl/test-e2e/KernelFusion/event_wait_complete.cpp
+++ b/sycl/test-e2e/KernelFusion/event_wait_complete.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion, aspect-usm_shared_allocations
 
 // Test validity of events after complete_fusion.
diff --git a/sycl/test-e2e/KernelFusion/existing_local_accessor.cpp b/sycl/test-e2e/KernelFusion/existing_local_accessor.cpp
new file mode 100644
index 0000000000000..a35473cc1fd51
--- /dev/null
+++ b/sycl/test-e2e/KernelFusion/existing_local_accessor.cpp
@@ -0,0 +1,78 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: hip
+// REQUIRES: fusion
+
+// Test complete fusion with local internalization and an local accessor that
+// already exists in one of the input kernels.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer<int> bIn1{in1, range{dataSize}};
+    buffer<int> bIn2{in2, range{dataSize}};
+    buffer<int> bIn3{in3, range{dataSize}};
+    buffer<int> bTmp{tmp, range{dataSize}};
+    buffer<int> bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access(cgh);
+      auto accIn2 = bIn2.get_access(cgh);
+      auto accTmp = bTmp.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_local{});
+      local_accessor<int> accLocal{16, cgh};
+      cgh.parallel_for<class KernelOne>(
+          nd_range<1>{{dataSize}, {16}}, [=](nd_item<1> i) {
+            size_t globalIdx = i.get_global_linear_id();
+            size_t localIdx = i.get_local_linear_id();
+            accLocal[localIdx] = accIn2[globalIdx];
+            accTmp[globalIdx] = accIn1[globalIdx] + accLocal[localIdx];
+          });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_local{});
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<class KernelTwo>(
+          nd_range<1>{{dataSize}, {16}},
+          [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+    });
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+    assert(tmp[i] == -1 && "Not internalized");
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/KernelFusion/internal_explicit_dependency.cpp b/sycl/test-e2e/KernelFusion/internal_explicit_dependency.cpp
index 22e3f5ba2b34d..3277ba5ee6f52 100644
--- a/sycl/test-e2e/KernelFusion/internal_explicit_dependency.cpp
+++ b/sycl/test-e2e/KernelFusion/internal_explicit_dependency.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion, aspect-usm_shared_allocations
 
 // Test complete fusion where one kernel in the fusion list specifies an
diff --git a/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp b/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp
index ba155e0e5ffac..2ebccb626ef23 100644
--- a/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp
+++ b/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test internalization of a nested array type.
diff --git a/sycl/test-e2e/KernelFusion/internalize_deep.cpp b/sycl/test-e2e/KernelFusion/internalize_deep.cpp
index 4a0c32e2683b6..2585728259499 100644
--- a/sycl/test-e2e/KernelFusion/internalize_deep.cpp
+++ b/sycl/test-e2e/KernelFusion/internalize_deep.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with internalization of a deep struct type.
diff --git a/sycl/test-e2e/KernelFusion/internalize_multi_ptr.cpp b/sycl/test-e2e/KernelFusion/internalize_multi_ptr.cpp
index 9455cad86900a..bd521155be6ed 100644
--- a/sycl/test-e2e/KernelFusion/internalize_multi_ptr.cpp
+++ b/sycl/test-e2e/KernelFusion/internalize_multi_ptr.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/internalize_vec.cpp b/sycl/test-e2e/KernelFusion/internalize_vec.cpp
index 765036ec98d9c..0536a8f15216a 100644
--- a/sycl/test-e2e/KernelFusion/internalize_vec.cpp
+++ b/sycl/test-e2e/KernelFusion/internalize_vec.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with internalization of a struct type.
diff --git a/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp b/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp
index bf8f177a24904..1a130a404ad71 100644
--- a/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp
+++ b/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/jit_caching.cpp b/sycl/test-e2e/KernelFusion/jit_caching.cpp
index 617b81bb4b48c..7c00a7cc61eee 100644
--- a/sycl/test-e2e/KernelFusion/jit_caching.cpp
+++ b/sycl/test-e2e/KernelFusion/jit_caching.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION"
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION"
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test caching for JIT fused kernels. Also test for debug messages being
diff --git a/sycl/test-e2e/KernelFusion/local_internalization.cpp b/sycl/test-e2e/KernelFusion/local_internalization.cpp
index b60bd29df394c..508ad08584f4a 100644
--- a/sycl/test-e2e/KernelFusion/local_internalization.cpp
+++ b/sycl/test-e2e/KernelFusion/local_internalization.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with local internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/math_function.cpp b/sycl/test-e2e/KernelFusion/math_function.cpp
new file mode 100644
index 0000000000000..ef9357672977e
--- /dev/null
+++ b/sycl/test-e2e/KernelFusion/math_function.cpp
@@ -0,0 +1,64 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: hip
+// REQUIRES: fusion
+
+// Test fusion of a kernel using a math function.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  float in1[dataSize], in2[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = 1;
+    in2[i] = i * 3;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer<float> bIn1{in1, range{dataSize}};
+    buffer<float> bIn2{in2, range{dataSize}};
+    buffer<float> bTmp{tmp, range{dataSize}};
+    buffer<float> bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access(cgh);
+      auto accTmp = bTmp.get_access(cgh);
+      cgh.parallel_for<class KernelOne>(
+          dataSize, [=](id<1> i) { accTmp[i] = sycl::cospi(accIn1[i]); });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(cgh);
+      auto accIn2 = bIn2.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<class KernelTwo>(
+          dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn2[i]; });
+    });
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (-1.0 * static_cast<float>(i * 3)) && "Computation error");
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/KernelFusion/non_unit_local_size.cpp b/sycl/test-e2e/KernelFusion/non_unit_local_size.cpp
index 917eda6e090c7..ffd08f918d414 100644
--- a/sycl/test-e2e/KernelFusion/non_unit_local_size.cpp
+++ b/sycl/test-e2e/KernelFusion/non_unit_local_size.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with local internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/pointer_arg_function.cpp b/sycl/test-e2e/KernelFusion/pointer_arg_function.cpp
index ffe5178cdaa9c..7c16d212d8485 100644
--- a/sycl/test-e2e/KernelFusion/pointer_arg_function.cpp
+++ b/sycl/test-e2e/KernelFusion/pointer_arg_function.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 // This test currently fails because InferAddressSpace is not able to remove all
 // address-space casts, causing internalization to fail.
diff --git a/sycl/test-e2e/KernelFusion/private_internalization.cpp b/sycl/test-e2e/KernelFusion/private_internalization.cpp
index bf7490e5cabab..ca0e8fdeb2f97 100644
--- a/sycl/test-e2e/KernelFusion/private_internalization.cpp
+++ b/sycl/test-e2e/KernelFusion/private_internalization.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/ranged_offset_accessor.cpp b/sycl/test-e2e/KernelFusion/ranged_offset_accessor.cpp
index 7325b26925349..bd2d2d0ea2a40 100644
--- a/sycl/test-e2e/KernelFusion/ranged_offset_accessor.cpp
+++ b/sycl/test-e2e/KernelFusion/ranged_offset_accessor.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization on accessors with different
diff --git a/sycl/test-e2e/KernelFusion/struct_with_array.cpp b/sycl/test-e2e/KernelFusion/struct_with_array.cpp
index 20d25a0d38f50..f79b51908bb49 100644
--- a/sycl/test-e2e/KernelFusion/struct_with_array.cpp
+++ b/sycl/test-e2e/KernelFusion/struct_with_array.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization on a kernel functor with an
diff --git a/sycl/test-e2e/KernelFusion/sync_acc_mem_op.cpp b/sycl/test-e2e/KernelFusion/sync_acc_mem_op.cpp
index 14643a3d81179..440de656d043b 100644
--- a/sycl/test-e2e/KernelFusion/sync_acc_mem_op.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_acc_mem_op.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 
 // Test fusion cancellation on an explicit memory operation on an accessor
 // happening before complete_fusion.
diff --git a/sycl/test-e2e/KernelFusion/sync_buffer_destruction.cpp b/sycl/test-e2e/KernelFusion/sync_buffer_destruction.cpp
index 627a8cdbfe43d..96375f18c41d2 100644
--- a/sycl/test-e2e/KernelFusion/sync_buffer_destruction.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_buffer_destruction.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 
 // Test fusion cancellation on buffer destruction happening before
 // complete_fusion.
diff --git a/sycl/test-e2e/KernelFusion/sync_event_wait.cpp b/sycl/test-e2e/KernelFusion/sync_event_wait.cpp
index d34393638e9b4..d077116412683 100644
--- a/sycl/test-e2e/KernelFusion/sync_event_wait.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_event_wait.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 
 // Test fusion cancellation on event::wait() happening before
 // complete_fusion.
diff --git a/sycl/test-e2e/KernelFusion/sync_host_accessor.cpp b/sycl/test-e2e/KernelFusion/sync_host_accessor.cpp
index 854803f34716d..d6f4cdc77456f 100644
--- a/sycl/test-e2e/KernelFusion/sync_host_accessor.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_host_accessor.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 
 // Test fusion cancellation on host accessor creation happening before
 // complete_fusion.
diff --git a/sycl/test-e2e/KernelFusion/sync_host_task.cpp b/sycl/test-e2e/KernelFusion/sync_host_task.cpp
index fc94fa9b3d9dc..4c2bc870e2919 100644
--- a/sycl/test-e2e/KernelFusion/sync_host_task.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_host_task.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 
 // Test fusion cancellation on host task submission happening before
 // complete_fusion.
diff --git a/sycl/test-e2e/KernelFusion/sync_queue_destruction.cpp b/sycl/test-e2e/KernelFusion/sync_queue_destruction.cpp
index 145fde97b5012..936b486c7741b 100644
--- a/sycl/test-e2e/KernelFusion/sync_queue_destruction.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_queue_destruction.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 
 // Test fusion cancellation on queue destruction happening before
 // complete_fusion.
diff --git a/sycl/test-e2e/KernelFusion/sync_queue_wait.cpp b/sycl/test-e2e/KernelFusion/sync_queue_wait.cpp
index 5fe768d60c551..71996ffed8cf8 100644
--- a/sycl/test-e2e/KernelFusion/sync_queue_wait.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_queue_wait.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 
 // Test fusion cancellation on queue::wait() happening before
 // complete_fusion.
diff --git a/sycl/test-e2e/KernelFusion/sync_second_queue.cpp b/sycl/test-e2e/KernelFusion/sync_second_queue.cpp
index 057c96935361b..5147a639196c0 100644
--- a/sycl/test-e2e/KernelFusion/sync_second_queue.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_second_queue.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 
 // Test fusion cancellation on submission of kernel with requirements to a
 // different queue happening before complete_fusion.
diff --git a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp
index bb33fcdcb8337..a2c9caa88cabc 100644
--- a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp
@@ -3,7 +3,7 @@
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // For this test, complete_fusion must be supported.
 // REQUIRES: fusion
 
diff --git a/sycl/test-e2e/KernelFusion/sync_two_queues_requirement.cpp b/sycl/test-e2e/KernelFusion/sync_two_queues_requirement.cpp
index 3ca9015c7ee22..d3526f2aba029 100644
--- a/sycl/test-e2e/KernelFusion/sync_two_queues_requirement.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_two_queues_requirement.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // For this test, complete_fusion must be supported.
 // REQUIRES: fusion
 
diff --git a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp
index 67af367316fa7..270645af15b40 100644
--- a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 
 // Test fusion cancellation on an explicit memory operation on an USM pointer
 // happening before complete_fusion.
@@ -61,6 +61,10 @@ int main() {
 
   fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
 
+  for (size_t i = 0; i < dataSize; ++i) {
+    std::cout << out[i] << ", ";
+  }
+  std::cout << "\n";
   // Check the results
   for (size_t i = 0; i < dataSize; ++i) {
     assert(out[i] == (20 * i * i) && "Computation error");
diff --git a/sycl/test-e2e/KernelFusion/three_dimensional.cpp b/sycl/test-e2e/KernelFusion/three_dimensional.cpp
index db0ea25ca3077..805f7f88d782d 100644
--- a/sycl/test-e2e/KernelFusion/three_dimensional.cpp
+++ b/sycl/test-e2e/KernelFusion/three_dimensional.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/two_dimensional.cpp b/sycl/test-e2e/KernelFusion/two_dimensional.cpp
index 2eafb1c1ccc0a..30359b4713dab 100644
--- a/sycl/test-e2e/KernelFusion/two_dimensional.cpp
+++ b/sycl/test-e2e/KernelFusion/two_dimensional.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/usm_no_dependencies.cpp b/sycl/test-e2e/KernelFusion/usm_no_dependencies.cpp
index 13290d06e25f1..85a19c4122202 100644
--- a/sycl/test-e2e/KernelFusion/usm_no_dependencies.cpp
+++ b/sycl/test-e2e/KernelFusion/usm_no_dependencies.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion, aspect-usm_shared_allocations
 
 // Test complete fusion using USM pointers.
diff --git a/sycl/test-e2e/KernelFusion/work_group_barrier.cpp b/sycl/test-e2e/KernelFusion/work_group_barrier.cpp
index 7141c37be8987..5d7e1a6f16806 100644
--- a/sycl/test-e2e/KernelFusion/work_group_barrier.cpp
+++ b/sycl/test-e2e/KernelFusion/work_group_barrier.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with a combination of kernels that require a work-group
diff --git a/sycl/test-e2e/KernelFusion/wrapped_usm.cpp b/sycl/test-e2e/KernelFusion/wrapped_usm.cpp
index aa112ca064ec0..a46c9199b31de 100644
--- a/sycl/test-e2e/KernelFusion/wrapped_usm.cpp
+++ b/sycl/test-e2e/KernelFusion/wrapped_usm.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion, aspect-usm_shared_allocations
 
 // Test complete fusion using an wrapped USM pointer as kernel functor argument.
diff --git a/sycl/test-e2e/README.md b/sycl/test-e2e/README.md
index b8dfb34406788..1a751973eaea3 100644
--- a/sycl/test-e2e/README.md
+++ b/sycl/test-e2e/README.md
@@ -183,6 +183,7 @@ unavailable.
  * **dump_ir**: - compiler can / cannot dump IR;
  * **llvm-spirv** - llvm-spirv tool availability;
  * **llvm-link** - llvm-link tool availability;
+ * **fusion**: - Runtime supports kernel fusion;
 
 ## llvm-lit parameters