[SYCL][CUDA] Implementation of matrix ext using new "unified" interface #7077

Merged
48 commits merged on Dec 12, 2022

Commits (48)
fdc4c42
Allow joint_matrix to be loaded from const.
JackAKirk Aug 5, 2022
68d3150
removed duplicates.
JackAKirk Aug 5, 2022
4949464
Layout accumulator is specified at load/store.
JackAKirk Aug 29, 2022
8c09910
joint_matrix_mad takes D matrix as argument.
JackAKirk Aug 30, 2022
e55e5f0
Add new mma cases enabled by joint_matrix_mad.
JackAKirk Sep 1, 2022
a881055
packed_a, packed_b -> packed
JackAKirk Sep 1, 2022
5b84434
Made interface compatible with intel backend.
Sep 14, 2022
75774f2
Merge branch 'sycl' into nvptx-matrix-const
Sep 21, 2022
5c03b3f
Merge branch 'nvptx-matrix-const' into update-matrix-interface
Oct 6, 2022
ccdb544
added unified header, moved nvptx specific impl.
JackAKirk Oct 7, 2022
331760a
Merge branch 'sycl' into update-matrix-interface
JackAKirk Oct 7, 2022
46e87a1
(very) draft updated interfaces.
Oct 7, 2022
766fd8c
cuda joint_matrix partial specializations in separate file.
JackAKirk Oct 10, 2022
32dafa3
Merge branch 'sycl' into unified-interface
JackAKirk Oct 14, 2022
24d3aa1
Refactoring and supporting loading from const.
JackAKirk Oct 14, 2022
b9a051f
Improve error msg and pass by ref.
JackAKirk Oct 17, 2022
3dbeadb
updated device code tests.
JackAKirk Oct 17, 2022
49147d3
Merge branch 'sycl' into unified-interface
JackAKirk Oct 17, 2022
ee1208e
format.
JackAKirk Oct 17, 2022
446c0a0
format
JackAKirk Oct 17, 2022
8da0aa7
fix failed tests.
JackAKirk Oct 18, 2022
36004a0
added access::decorated.
JackAKirk Oct 19, 2022
5f02a0c
Restrict Use to use::accumulator.
Oct 27, 2022
f64e861
Various changes for future Intel/AMD compatibility.
JackAKirk Nov 4, 2022
4b88d94
update joint_matrix constr in tests.
JackAKirk Nov 4, 2022
1f9a8d3
format
JackAKirk Nov 4, 2022
a52eb7c
Merge branch 'sycl' into unified-interface
JackAKirk Nov 4, 2022
4b83846
format
JackAKirk Nov 4, 2022
310fe1c
fix merge
JackAKirk Nov 4, 2022
65888e3
correct check.
JackAKirk Nov 4, 2022
08e4974
Removed group argument from joint_matrix.
JackAKirk Nov 7, 2022
5e8f8d7
Removed separate host get_wi_data def.
JackAKirk Nov 8, 2022
51cbf73
format
JackAKirk Nov 8, 2022
ad6621b
format
JackAKirk Nov 8, 2022
2c4898a
Move wi_data to unified header.
JackAKirk Nov 9, 2022
426e7b3
Add constructor taking Group arg.
JackAKirk Nov 10, 2022
57ddc6b
Removed unnecessary friend functs, moved wi_data.
JackAKirk Nov 16, 2022
d4607a8
format.
JackAKirk Nov 16, 2022
47b6714
Removed unused constructor, get_wi_marray.
JackAKirk Nov 22, 2022
0ca1223
Made get_wi_data free function, removed default group param.
JackAKirk Dec 2, 2022
cede44d
Merge branch 'sycl' into unified-interface
JackAKirk Dec 2, 2022
5abcca6
Take account of bfloat16 moving out of experimental.
JackAKirk Dec 2, 2022
8e46d78
format.
JackAKirk Dec 2, 2022
8bf5048
update device code checks.
JackAKirk Dec 2, 2022
eb51b53
format.
JackAKirk Dec 2, 2022
b9ca55c
get_wi_data no longer auto to host return removed.
JackAKirk Dec 5, 2022
bb6fc5e
Merge branch 'sycl' into unified-interface
JackAKirk Dec 9, 2022
68fcf9a
Merge branch 'sycl' into unified-interface
JackAKirk Dec 10, 2022
@@ -1,4 +1,4 @@
//===---- matrix-tensorcore.hpp - SYCL tensor cores matrix ----*- C++ -*---===//
//===-------------- matrix-tensorcores-legacy.hpp - -----------*- C++ -*---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
639 changes: 639 additions & 0 deletions sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcores.hpp

Large diffs are not rendered by default.

221 changes: 221 additions & 0 deletions sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
@@ -0,0 +1,221 @@
//===------- matrix-unified.hpp - SYCL matrix extension ----*- C++ -*------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// ===--------------------------------------------------------------------=== //

#pragma once
#include <sycl/ext/oneapi/matrix/matrix-tensorcores.hpp>

namespace sycl {
__SYCL_INLINE_VER_NAMESPACE(_V1) {
namespace ext {
namespace oneapi {
namespace experimental {
namespace matrix {

template <typename Group, typename T, use Use, size_t Rows, size_t Cols,
layout Layout>
struct joint_matrix {

#if defined(__SYCL_DEVICE_ONLY__) && defined(__SPIR__)
Contributor @yubingex007-a11y commented on Dec 14, 2022:

@AerialMantis @JackAKirk
Sorry, I remember that previously it was:

#if defined(__SYCL_DEVICE_ONLY__)
#if defined(__NVPTX__)
  sycl::ext::oneapi::detail::joint_matrix_cuda<T, Use, Rows, Cols, Layout>
      cuda_impl;
#else
  __spv::__spirv_JointMatrixINTEL<
      T, Rows, Cols, spv_matrix_layout_traits<Layout>::value,
      spv_scope_traits<Group>::value, spv_matrix_use_traits<Use>::value> *spvm;
#endif // defined(__SYCL_DEVICE_ONLY__)
#endif

On the Intel side, we can't let host compilation use sycl::ext::oneapi::detail::joint_matrix_cuda, so I went back to the previous code and the CUDA test cases still pass.

// TODO: Intel case here: we use the ext_oneapi_cuda case also for the host,
// because the Intel SPIRV functions will not be host compilable.
#else
sycl::ext::oneapi::detail::joint_matrix_cuda<T, Use, Rows, Cols, Layout>
cuda_impl;
#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__SPIR__)

joint_matrix() {
#ifndef __SYCL_DEVICE_ONLY__
throw runtime_error("joint matrix is not supported on host device.",
PI_ERROR_INVALID_DEVICE);
#endif
}
};

template <typename Group, typename T, use Use, size_t Rows, size_t Cols,
layout Layout>
inline __SYCL_ALWAYS_INLINE wi_data<Group, T, Use, Rows, Cols, Layout>
get_wi_data(Group sg, joint_matrix<Group, T, Use, Rows, Cols, Layout> &jm) {
#if defined(__SYCL_DEVICE_ONLY__)
#if defined(__NVPTX__)
std::ignore = sg;
return wi_data(jm);
#else
// TODO add Intel impl.
Contributor @yubingex007-a11y commented on Dec 14, 2022:

@AerialMantis @JackAKirk @dkhaldi
Since we can't provide wi_data in both the CUDA and Intel headers, I will make wi_data unified again and provide a host version of wi_data, so the return type should be "decltype(auto)".

#endif // defined(__NVPTX__)
#endif // defined(__SYCL_DEVICE_ONLY__)
}
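
On the CUDA path, get_wi_data gives each work item a view over its slice of the fragment. A minimal usage sketch (the sub-group sg, accumulator fragment sub_c, and alpha scale factor are illustrative names, not code from this PR; the wi_data element interface is assumed to allow read and assignment as below):

// Scale this work item's portion of the accumulator fragment.
auto wi_c = get_wi_data(sg, sub_c);
for (size_t i = 0; i < wi_c.length(); ++i)
  wi_c[i] = alpha * static_cast<float>(wi_c[i]);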

template <typename Group, typename T, size_t NumRows, size_t NumCols, use Use,
layout Layout, typename T2>
inline __SYCL_ALWAYS_INLINE void
joint_matrix_fill(Group sg,
Contributor:

I noticed you don't have any test with joint_matrix_fill. Is that on purpose?

Contributor Author:

We only have device code tests in intel/llvm for functions that call NVPTX builtins; joint_matrix_fill simply sets all elements to the given constant value.

joint_matrix<Group, T, Use, NumRows, NumCols, Layout> &res,
const T2 &v) {
#if defined(__SYCL_DEVICE_ONLY__)
#if defined(__NVPTX__)
std::ignore = sg;
res.cuda_impl.wi_marray = v;
#endif // defined(__NVPTX__)
#else
std::ignore = sg;
std::ignore = res;
std::ignore = v;
throw runtime_error(
"This version of the matrix extension is only currently supported on "
"Nvidia devices",
PI_ERROR_INVALID_DEVICE);
#endif // defined(__SYCL_DEVICE_ONLY__)
}
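
As the reply above notes, joint_matrix_fill just broadcasts a constant into every element of the fragment, so the usual pattern is to zero the accumulator before the k-loop. A sketch with assumed names (sg, M, N are illustrative):

joint_matrix<sycl::sub_group, float, use::accumulator, M, N, layout::dynamic>
    sub_c;
joint_matrix_fill(sg, sub_c, 0.0f);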

template <
typename Group, typename S, typename T, size_t NumRows, size_t NumCols,
access::address_space Space, access::decorated IsDecorated,
std::enable_if_t<std::is_same<S, std::remove_const_t<T>>::value, bool> =
true>
inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
Group sg,
joint_matrix<Group, S, use::accumulator, NumRows, NumCols,
sycl::ext::oneapi::experimental::matrix::layout::dynamic> &res,
multi_ptr<T, Space, IsDecorated> src, size_t stride,
sycl::ext::oneapi::experimental::matrix::layout Layout) {
#if defined(__SYCL_DEVICE_ONLY__)
#if defined(__NVPTX__)
std::ignore = sg;
sycl::ext::oneapi::detail::load_accumulator_cuda(res.cuda_impl, src, stride,
Layout);
#endif // defined(__NVPTX__)
#else
std::ignore = sg;
std::ignore = res;
std::ignore = src;
std::ignore = stride;
throw runtime_error(
"This version of the matrix extension is only currently supported on "
"Nvidia devices",
PI_ERROR_INVALID_DEVICE);
#endif // defined(__SYCL_DEVICE_ONLY__)
}

template <
typename Group, typename S, typename T, use Use, size_t NumRows,
size_t NumCols, matrix::layout Layout, access::address_space Space,
access::decorated IsDecorated,
std::enable_if_t<std::is_same<S, std::remove_const_t<T>>::value ||
(std::is_same<S, precision::tf32>::value &&
std::is_same<std::remove_const_t<T>, float>::value),
bool> = true>
inline __SYCL_ALWAYS_INLINE void
joint_matrix_load(Group sg,
joint_matrix<Group, S, Use, NumRows, NumCols, Layout> &res,
multi_ptr<T, Space, IsDecorated> src, size_t stride) {
#if defined(__SYCL_DEVICE_ONLY__)
#if defined(__NVPTX__)
std::ignore = sg;
sycl::ext::oneapi::detail::load_multiplicand_cuda<S, T, NumRows, NumCols, Use,
Layout, Space>(
res.cuda_impl, src, stride);
#endif // defined(__NVPTX__)
#else
std::ignore = sg;
std::ignore = res;
std::ignore = src;
std::ignore = stride;
throw runtime_error(
"This version of the matrix extension is only currently supported on "
"Nvidia devices",
PI_ERROR_INVALID_DEVICE);
#endif // defined(__SYCL_DEVICE_ONLY__)
}

template <typename Group, typename T, size_t NumRows, size_t NumCols,
access::address_space Space, access::decorated IsDecorated>
inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
Group sg,
joint_matrix<Group, T, use::accumulator, NumRows, NumCols,
sycl::ext::oneapi::experimental::matrix::layout::dynamic> &src,
multi_ptr<T, Space, IsDecorated> dst, size_t stride,
sycl::ext::oneapi::experimental::matrix::layout Layout) {
#if defined(__SYCL_DEVICE_ONLY__)
#if defined(__NVPTX__)
std::ignore = sg;
sycl::ext::oneapi::detail::joint_matrix_store_cuda<T, NumRows, NumCols,
Space>(src.cuda_impl, dst,
stride, Layout);
#endif // defined(__NVPTX__)
#else
std::ignore = sg;
std::ignore = src;
std::ignore = dst;
std::ignore = stride;
throw runtime_error(
"This version of the matrix extension is only currently supported on "
"Nvidia devices",
PI_ERROR_INVALID_DEVICE);
#endif // defined(__SYCL_DEVICE_ONLY__)
}

template <typename Group, typename Ta, typename Tb, typename Tc, std::size_t M,
std::size_t K, std::size_t N, layout LayoutA, layout LayoutB>
inline __SYCL_ALWAYS_INLINE
joint_matrix<Group, Tc, use::accumulator, M, N,
sycl::ext::oneapi::experimental::matrix::layout::dynamic>
joint_matrix_mad(
Group sg, joint_matrix<Group, Ta, use::a, M, K, LayoutA> &A,
joint_matrix<Group, Tb, use::b, K, N, LayoutB> &B,
joint_matrix<Group, Tc, use::accumulator, M, N,
sycl::ext::oneapi::experimental::matrix::layout::dynamic>
&C) {
#if defined(__SYCL_DEVICE_ONLY__)
#if defined(__NVPTX__)
std::ignore = sg;
if constexpr (std::is_same<Ta, Tb>::value) {
joint_matrix<Group, Tc, use::accumulator, M, N,
sycl::ext::oneapi::experimental::matrix::layout::dynamic>
D;
sycl::ext::oneapi::detail::joint_matrix_mad_cuda<Ta, Tc, M, K, N, LayoutA,
LayoutB>(
D.cuda_impl, A.cuda_impl, B.cuda_impl, C.cuda_impl);
return D;
} else {
assert(false && "Ta != Tb : In the CUDA backend joint_matrix_mad "
"requires that joint_matrix data types Ta and Tb match");
}
#endif // defined(__NVPTX__)
#else
std::ignore = sg;
std::ignore = A;
std::ignore = B;
std::ignore = C;
throw runtime_error(
"This version of the matrix extension is only currently supported on "
"Nvidia devices",
PI_ERROR_INVALID_DEVICE);
#endif // defined(__SYCL_DEVICE_ONLY__)
}

// This function rounds the bottom 13 bits up or down, and then zeros out the
// bottom bits
inline __SYCL_ALWAYS_INLINE float round_to_tf32(float &a) {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
int32_t tmp_int = __nvvm_f2tf32_rna(a);
return __nvvm_bitcast_i2f(tmp_int);
#else
uint32_t tmp_uint = reinterpret_cast<uint32_t &>(a);
tmp_uint += 0x1000u;
tmp_uint &= 0xFFFFE000u;
float ret = reinterpret_cast<float &>(tmp_uint);
return ret;
#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
}
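
The host fallback above adds 0x1000 (bit 12 of the mantissa) and then masks off the low 13 bits, which rounds to the 10 mantissa bits that tf32 keeps. A hand-checked illustration of that arithmetic (the device path uses __nvvm_f2tf32_rna instead, which may differ on ties):

float a = 1.00048828125f;     // 1 + 2^-11: mantissa bit 12 set
float b = 1.000244140625f;    // 1 + 2^-12: below the rounding threshold
float ra = round_to_tf32(a);  // rounds up to 1.0009765625 (1 + 2^-10)
float rb = round_to_tf32(b);  // rounds down to 1.0f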

} // namespace matrix
} // namespace experimental
} // namespace oneapi
} // namespace ext
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
} // namespace sycl
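
Putting the pieces of matrix-unified.hpp together, a sub-group tile update with the unified interface on the CUDA backend would look roughly like the sketch below. Tile shapes, element types, pointer and stride names, and the surrounding kernel are illustrative assumptions (a device supporting the 16x16x16 half shape is assumed), not code from this PR:

using namespace sycl::ext::oneapi::experimental::matrix;

// Per sub-group tile update: C (MxN) += A (MxK) * B (KxN).
constexpr size_t M = 16, K = 16, N = 16;
sycl::sub_group sg = item.get_sub_group();

joint_matrix<sycl::sub_group, sycl::half, use::a, M, K, layout::row_major> sub_a;
joint_matrix<sycl::sub_group, sycl::half, use::b, K, N, layout::row_major> sub_b;
joint_matrix<sycl::sub_group, float, use::accumulator, M, N, layout::dynamic> sub_c;

joint_matrix_fill(sg, sub_c, 0.0f);
for (size_t k = 0; k < big_k; k += K) {
  joint_matrix_load(sg, sub_a, ptr_a + /* A tile offset */ 0, stride_a);
  joint_matrix_load(sg, sub_b, ptr_b + /* B tile offset */ 0, stride_b);
  sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
}
joint_matrix_store(sg, sub_c, ptr_c + /* C tile offset */ 0, stride_c,
                   layout::row_major);
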
5 changes: 4 additions & 1 deletion sycl/include/sycl/ext/oneapi/matrix/matrix.hpp
@@ -27,5 +27,8 @@
#include <sycl/ext/oneapi/matrix/static-query-use.hpp>
#endif // SYCL_EXT_ONEAPI_MATRIX_VERSION
#if (SYCL_EXT_ONEAPI_MATRIX_VERSION == 3)
#include <sycl/ext/oneapi/matrix/matrix-tensorcore.hpp>
#include <sycl/ext/oneapi/matrix/matrix-tensorcores-legacy.hpp>
#endif // SYCL_EXT_ONEAPI_MATRIX_VERSION
#if (SYCL_EXT_ONEAPI_MATRIX_VERSION == 4)
#include <sycl/ext/oneapi/matrix/matrix-unified.hpp>
#endif // SYCL_EXT_ONEAPI_MATRIX_VERSION