intel · AlexeySachkov · Nov 19, 2024 · Oct 30, 2024 · Oct 30, 2024 · Oct 30, 2024
@@ -46,6 +46,15 @@ class __SYCL_SPECIAL_CLASS __SYCL_TYPE(work_group_memory) work_group_memory
   using value_type = std::remove_all_extents_t<DataT>;
 
 private:
+  // Properties have not been defined for work group memory yet, so any
+  // non-empty property list is a usage error. This helper diagnoses it at
+  // compile time; it performs no work at run time.
+  // TODO: Remove this function and its occurrences in this file once properties
+  // have been created for work group memory.
+  void check_props_empty() const {
+    static_assert(std::is_same_v<PropertyListT, empty_properties_t>,
+                  "Work group memory class does not support properties yet!");
+  }
   using decoratedPtr = typename sycl::detail::DecoratedType<
       value_type, access::address_space::local_space>::type *;
 
@@ -62,18 +71,22 @@ class __SYCL_SPECIAL_CLASS __SYCL_TYPE(work_group_memory) work_group_memory
 #endif
 
 public:
-  work_group_memory(const indeterminate_t &) {};
+  // Tag constructor: only performs the compile-time property-list check.
+  work_group_memory(const indeterminate_t &) { check_props_empty(); }
   work_group_memory(const work_group_memory &rhs) = default;
   work_group_memory &operator=(const work_group_memory &rhs) = default;
   template <typename T = DataT,
             typename = std::enable_if_t<!sycl::detail::is_unbounded_array_v<T>>>
   work_group_memory(handler &)
-      : sycl::detail::work_group_memory_impl(sizeof(DataT)) {}
+      : sycl::detail::work_group_memory_impl(sizeof(DataT)) { // DataT is not an unbounded array here: forward its full size
+    check_props_empty(); // compile-time check that the property list is empty
+  }
   template <typename T = DataT,
             typename = std::enable_if_t<sycl::detail::is_unbounded_array_v<T>>>
   work_group_memory(size_t num, handler &)
       : sycl::detail::work_group_memory_impl(
-            num * sizeof(std::remove_extent_t<DataT>)) {}
+            num * sizeof(std::remove_extent_t<DataT>)) { // unbounded array: num elements times the element size
+    check_props_empty(); // compile-time check that the property list is empty
+  }
   template <access::decorated IsDecorated = access::decorated::no>
   multi_ptr<value_type, access::address_space::local_space, IsDecorated>
   get_multi_ptr() const {

@@ -5,6 +5,8 @@
 #include <sycl/detail/core.hpp>
 #include <sycl/ext/oneapi/experimental/work_group_memory.hpp>
 #include <sycl/group_barrier.hpp>
+#include <sycl/half_type.hpp>
+
 namespace syclexp = sycl::ext::oneapi::experimental;
 
 sycl::queue q;
@@ -50,7 +52,9 @@ template <typename T> void swap_scalar(T &a, T &b) {
       sycl::nd_range<1> ndr{size, wgsize};
       cgh.parallel_for(ndr, [=](sycl::nd_item<1> it) {
         syclexp::work_group_memory<T> temp2{syclexp::indeterminate};
-        temp2 = temp; // temp and temp2 have the same underlying data
+        temp2 = temp;            // temp and temp2 have the same underlying data
+        assert(&temp2 == &temp); // check that both objects return same
+                                 // underlying address after assignment
         temp = acc_a[0];
         acc_a[0] = acc_b[0];
         acc_b[0] = temp2; // safe to use temp2
@@ -86,6 +90,8 @@ template <typename T> void swap_scalar(T &a, T &b) {
   assert(a == old_b && b == old_a && "Incorrect swap!");
 
   // Same as above but instead of using multi_ptr, use address-of operator.
+  // Also verify that get_multi_ptr() returns the same address as address-of
+  // operator.
   {
     sycl::buffer<T, 1> buf_a{&a, 1};
     sycl::buffer<T, 1> buf_b{&b, 1};
@@ -96,6 +102,7 @@ template <typename T> void swap_scalar(T &a, T &b) {
       syclexp::work_group_memory<T> temp2{cgh};
       sycl::nd_range<1> ndr{size, wgsize};
       cgh.parallel_for(ndr, [=](sycl::nd_item<> it) {
+        assert(&temp == temp.get_multi_ptr().get());
         temp = acc_a[0];
         acc_a[0] = acc_b[0];
         temp2 = *(&temp);
@@ -294,6 +301,8 @@ void swap_array_2d(T (&a)[N][N], T (&b)[N][N], size_t batch_size) {
         temp[i][j] = acc_a[i][j];
         acc_a[i][j] = acc_b[i][j];
         syclexp::work_group_memory<T[N][N]> temp2{temp};
+        assert(&temp2 == &temp); // check both objects return same underlying
+                                 // address after copy construction.
         acc_b[i][j] = temp2[i][j];
       });
     });
@@ -342,28 +351,28 @@ void swap_array_2d(T (&a)[N][N], T (&b)[N][N], size_t batch_size) {
 // so we can verify that each work-item sees the value written by its leader.
 // The test also is a sanity check that different work groups get different
 // work group memory locations as otherwise we'd have data races.
-void coherency(size_t size, size_t wgsize) {
+// Submits a 1-D nd-range kernel of `size` work-items in groups of `wgsize`.
+// Each group leader stores its group's index (global id / wgsize, converted
+// to T) into work group memory; after a group barrier every work-item asserts
+// that it reads that same value. Returns once the kernel has completed.
+template <typename T> void coherency(size_t size, size_t wgsize) {
   q.submit([&](sycl::handler &cgh) {
-    syclexp::work_group_memory<int> data{cgh};
+    syclexp::work_group_memory<T> data{cgh};
     sycl::nd_range<1> ndr{size, wgsize};
     cgh.parallel_for(ndr, [=](sycl::nd_item<1> it) {
       if (it.get_group().leader()) {
-        data = it.get_global_id() / wgsize;
+        data = T(it.get_global_id() / wgsize);
       }
       sycl::group_barrier(it.get_group());
-      assert(data == it.get_global_id() / wgsize);
+      assert(data == T(it.get_global_id() / wgsize));
     });
-  });
+  }).wait(); // submit() is asynchronous and nothing else here synchronizes with the kernel
 }
 
 constexpr size_t N = 32;
-int main() {
-  int intarr1[N][N];
-  int intarr2[N][N];
+template <typename T> void test() {
+  T intarr1[N][N];
+  T intarr2[N][N];
   for (int i = 0; i < N; ++i) {
     for (int j = 0; j < N; ++j) {
-      intarr1[i][j] = i + j;
-      intarr2[i][j] = i * j;
+      intarr1[i][j] = T(i) + T(j);
+      intarr2[i][j] = T(i) * T(j);
     }
   }
   for (int i = 0; i < N; ++i) {
@@ -373,10 +382,37 @@ int main() {
     swap_array_1d(intarr1[i], intarr2[i], 8);
   }
   swap_array_2d(intarr1, intarr2, 8);
-  coherency(N, N / 2);
-  coherency(N, N / 4);
-  coherency(N, N / 8);
-  coherency(N, N / 16);
-  coherency(N, N / 32);
+  coherency<T>(N, N / 2);
+  coherency<T>(N, N / 4);
+  coherency<T>(N, N / 8);
+  coherency<T>(N, N / 16);
+  coherency<T>(N, N / 32);
+}
+
+// Runs the scalar, 1-D and 2-D swap tests on arrays whose element type T is
+// an object pointer type. The pointer values are used purely as data to be
+// swapped (presumably never dereferenced by the swap helpers - confirm).
+template <typename T> void test_ptr() {
+  T arr1[N][N];
+  T arr2[N][N];
+  // Give every element a distinct, well-defined value before it is read. The
+  // swap helpers copy and compare the elements, which is undefined behavior
+  // for uninitialized pointers. Each element's own address is a convenient
+  // unique value.
+  for (size_t i = 0; i < N; ++i) {
+    for (size_t j = 0; j < N; ++j) {
+      arr1[i][j] = reinterpret_cast<T>(&arr1[i][j]);
+      arr2[i][j] = reinterpret_cast<T>(&arr2[i][j]);
+    }
+  }
+  for (size_t i = 0; i < N; ++i) {
+    for (size_t j = 0; j < N; ++j) {
+      swap_scalar(arr1[i][j], arr2[i][j]);
+    }
+    swap_array_1d(arr1[i], arr2[i], 8);
+  }
+  swap_array_2d(arr1, arr2, 8);
+}
+
+// Runs the value-type tests, then the pointer-type tests. The sycl::half
+// variants run only when the queue's device reports the fp16 aspect.
+int main() {
+  // Query the aspect once; both half-based tests depend on the same answer.
+  const bool has_fp16 = q.get_device().has(sycl::aspect::fp16);
+  test<int>();
+  test<char>();
+  test<uint16_t>();
+  if (has_fp16)
+    test<sycl::half>();
+  test_ptr<float *>();
+  test_ptr<int *>();
+  test_ptr<char *>();
+  test_ptr<uint16_t *>();
+  if (has_fp16)
+    test_ptr<sycl::half *>();
   return 0;
 }
@@ -0,0 +1,57 @@
+#pragma once
+
+#include <cassert>
+#include <iostream>
+#include <sycl/atomic_ref.hpp>
+#include <sycl/detail/core.hpp>
+#include <sycl/ext/oneapi/experimental/work_group_memory.hpp>
+#include <sycl/ext/oneapi/free_function_queries.hpp>
+#include <sycl/group_barrier.hpp>
+#include <sycl/marray.hpp>
+#include <sycl/usm.hpp>
+#include <sycl/vector.hpp>
+
+using namespace sycl;
+
+// Returns false, after printing a skip notice, when T is sycl::half and the
+// queue's device lacks the fp16 aspect. Returns true in every other case; the
+// device is not queried at all unless T is sycl::half.
+template <typename T> bool check_half_aspect(queue &q) {
+  if constexpr (std::is_same_v<sycl::half, T>) {
+    const bool fp16_supported = q.get_device().has(sycl::aspect::fp16);
+    if (!fp16_supported) {
+      std::cout
+          << "Device does not support fp16 aspect. Skipping all tests with "
+             "sycl::half type!"
+          << std::endl;
+      return false;
+    }
+  }
+  return true;
+}
+
+// Returns false, after printing a skip notice, when T is double and the
+// queue's device lacks the fp64 aspect. Returns true in every other case; the
+// device is not queried at all unless T is double.
+template <typename T> bool check_double_aspect(queue &q) {
+  if constexpr (std::is_same_v<T, double>) {
+    const bool fp64_supported = q.get_device().has(aspect::fp64);
+    if (!fp64_supported) {
+      std::cout
+          << "Device does not support fp64 aspect. Skipping all tests with "
+             "double type!"
+          << std::endl;
+      return false;
+    }
+  }
+  return true;
+}
+
+template <typename T> struct S { // aggregate holding a single value of type T
+  T val;
+};
+
+template <typename T> struct M { // same single member as S<T>, but a distinct type
+  T val;
+};
+
+union U { // overlays an S<int> and an M<int> in the same storage
+  S<int> s;
+  M<int> m;
+};
+
+// Adds the first WGSIZE elements of `mem` to the value held by `ret`.
+// `ret` is taken by value and updated through its assignment operator.
+// NOTE(review): this relies on a copy of a work_group_memory object referring
+// to the same underlying memory as the original - confirm.
+template <typename T>
+void sum_helper(sycl::ext::oneapi::experimental::work_group_memory<T[]> mem,
+                sycl::ext::oneapi::experimental::work_group_memory<T> ret,
+                size_t WGSIZE) {
+  // size_t index matches WGSIZE's type, avoiding a signed/unsigned comparison.
+  for (size_t i = 0; i < WGSIZE; ++i) {
+    ret = ret + mem[i];
+  }
+}
@@ -0,0 +1,121 @@
+#pragma once
+
+#include "common.hpp"
+#include "common_lambda.hpp"
+#include <cassert>
+#include <sycl/detail/core.hpp>
+#include <sycl/ext/oneapi/experimental/work_group_memory.hpp>
+#include <sycl/ext/oneapi/free_function_queries.hpp>
+#include <sycl/group_barrier.hpp>
+#include <sycl/usm.hpp>
+
+using namespace sycl;
+
+// Free-function 1-D nd-range kernel. Each work-item copies buf[local_id] into
+// work group memory; after a group barrier the group leader sums the first
+// WGSIZE elements (inline, or through sum_helper when UseHelper is true) and
+// asserts that the total equals `expected`.
+template <typename T>
+SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(
+    (ext::oneapi::experimental::nd_range_kernel<1>))
+void sum(sycl::ext::oneapi::experimental::work_group_memory<T[]> mem, T *buf,
+         sycl::ext::oneapi::experimental::work_group_memory<T> result,
+         T expected, size_t WGSIZE, bool UseHelper) {
+  const auto it = sycl::ext::oneapi::this_work_item::get_nd_item<1>();
+  size_t local_id = it.get_local_id();
+  mem[local_id] = buf[local_id];
+  // Synchronize the group so the leader reads `mem` only after every
+  // work-item has written its element.
+  group_barrier(it.get_group());
+  if (it.get_group().leader()) {
+    result = 0;
+    if (!UseHelper) {
+      // size_t index matches WGSIZE's type (no signed/unsigned comparison).
+      for (size_t i = 0; i < WGSIZE; ++i) {
+        result = result + mem[i];
+      }
+    } else {
+      sum_helper(mem, result, WGSIZE);
+    }
+    assert(result == expected);
+  }
+}
+
+// Explicit instantiations of sum<T> for each element type listed below.
+// These are needed because free function kernel support is not fully
+// implemented yet. SUM(T) already ends in ';', so uses take no semicolon.
+// TODO: Remove these once free function kernel support is fully there.
+#define SUM(T)                                                                 \
+  template void sum<T>(                                                        \
+      sycl::ext::oneapi::experimental::work_group_memory<T[]> mem, T * buf,    \
+      sycl::ext::oneapi::experimental::work_group_memory<T> result,            \
+      T expected, size_t WGSIZE, bool UseHelper);
+
+SUM(int)
+SUM(uint16_t)
+SUM(half)
+SUM(double)
+SUM(float)
+SUM(char)
+SUM(bool)
+
+// Free-function 1-D nd-range kernel. Each work-item copies buf[local_id] into
+// a 16-element marray held in work group memory; after a group barrier the
+// group leader sums all 16 elements into `result` and asserts that the
+// squared difference from `expected` does not exceed the tolerance.
+template <typename T>
+SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(
+    (ext::oneapi::experimental::nd_range_kernel<1>))
+void sum_marray(
+    sycl::ext::oneapi::experimental::work_group_memory<sycl::marray<T, 16>> mem,
+    T *buf, sycl::ext::oneapi::experimental::work_group_memory<T> result,
+    T expected) {
+  constexpr T tolerance = 0.0001;
+  const auto item = sycl::ext::oneapi::this_work_item::get_nd_item<1>();
+  const size_t lid = item.get_local_id();
+  sycl::marray<T, 16> &elems = mem;
+  elems[lid] = buf[lid];
+  group_barrier(item.get_group());
+  // Only the group leader accumulates and checks the result.
+  if (!item.get_group().leader())
+    return;
+  result = 0;
+  for (T &elem : elems) {
+    result = result + elem;
+  }
+  assert((result - expected) * (result - expected) <= tolerance);
+}
+
+// Explicit instantiations of sum_marray<T> for the floating-point types below.
+// SUM_MARRAY(T) already ends in ';', so uses take no trailing semicolon
+// (an extra one would be a stray empty declaration).
+#define SUM_MARRAY(T)                                                          \
+  template void sum_marray<T>(                                                 \
+      sycl::ext::oneapi::experimental::work_group_memory<sycl::marray<T, 16>>  \
+          mem,                                                                 \
+      T * buf, sycl::ext::oneapi::experimental::work_group_memory<T> result,   \
+      T expected);
+
+SUM_MARRAY(float)
+SUM_MARRAY(double)
+SUM_MARRAY(half)
+
+// Free-function 1-D nd-range kernel. Each work-item copies buf[local_id] into
+// a 16-element vec held in work group memory; after a group barrier the group
+// leader sums all 16 elements into `result` and asserts that the squared
+// difference from `expected` does not exceed the tolerance.
+template <typename T>
+SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(
+    (ext::oneapi::experimental::nd_range_kernel<1>))
+void sum_vec(
+    sycl::ext::oneapi::experimental::work_group_memory<sycl::vec<T, 16>> mem,
+    T *buf, sycl::ext::oneapi::experimental::work_group_memory<T> result,
+    T expected) {
+  constexpr T tolerance = 0.0001;
+  const auto item = sycl::ext::oneapi::this_work_item::get_nd_item<1>();
+  const size_t lid = item.get_local_id();
+  sycl::vec<T, 16> &lanes = mem;
+  lanes[lid] = buf[lid];
+  group_barrier(item.get_group());
+  // Only the group leader accumulates and checks the result.
+  if (!item.get_group().leader())
+    return;
+  result = 0;
+  for (int lane = 0; lane != 16; ++lane) {
+    result = result + lanes[lane];
+  }
+  assert((result - expected) * (result - expected) <= tolerance);
+}
+
+// Explicit instantiations of sum_vec<T> for the floating-point types below.
+// SUM_VEC(T) already ends in ';', so uses take no trailing semicolon
+// (an extra one would be a stray empty declaration).
+#define SUM_VEC(T)                                                             \
+  template void sum_vec<T>(                                                    \
+      sycl::ext::oneapi::experimental::work_group_memory<sycl::vec<T, 16>>     \
+          mem,                                                                 \
+      T * buf, sycl::ext::oneapi::experimental::work_group_memory<T> result,   \
+      T expected);
+
+SUM_VEC(float)
+SUM_VEC(double)
+SUM_VEC(half)
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <cassert>
+#include <sycl/atomic_ref.hpp>
+#include <sycl/detail/core.hpp>
+#include <sycl/ext/oneapi/experimental/work_group_memory.hpp>
+#include <sycl/ext/oneapi/free_function_queries.hpp>
+#include <sycl/group_barrier.hpp>
+#include <sycl/marray.hpp>
+#include <sycl/usm.hpp>
+#include <sycl/vector.hpp>
+
+using namespace sycl;
+
+// Adds the first WGSIZE elements of `mem` to the value held by `ret`.
+// `ret` is taken by value and updated through its assignment operator.
+// NOTE(review): this relies on a copy of a work_group_memory object referring
+// to the same underlying memory as the original - confirm.
+template <typename T>
+void sum_helper(sycl::ext::oneapi::experimental::work_group_memory<T[]> mem,
+                sycl::ext::oneapi::experimental::work_group_memory<T> ret,
+                size_t WGSIZE) {
+  // size_t index matches WGSIZE's type, avoiding a signed/unsigned comparison.
+  for (size_t i = 0; i < WGSIZE; ++i) {
+    ret = ret + mem[i];
+  }
+}