intel · abuyukku · Sep 17, 2021 · romanovvlad · Sep 19, 2021 · vladimirlaz
@@ -0,0 +1,97 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: env SYCL_PI_TRACE=2 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
+
+#include <sys/resource.h>
+#include <oneapi/dpl/execution>
+#include <oneapi/dpl/numeric>
+#include <iostream>
+#include <vector>
+#include "CL/sycl.hpp"
+
+// Computes the p x p matrix of weighted cross products of the rows of data_buf after
+// subtracting mean_buf[row] from each row, and stores it flat in cross_product_buf.
+// data_buf is read as p rows of n values (row r at [r*n, r*n + n)); w_buf supplies one
+// weight per position within a row. Prints '.' and waits on q before returning.
+template<typename RealType>
+void cross_product (sycl::queue& q, int n, int p,
+                    sycl::buffer<RealType, 1> data_buf, sycl::buffer<RealType, 1> w_buf,
+                    sycl::buffer<RealType, 1> mean_buf, sycl::buffer<RealType, 1> cross_product_buf)
+{
+    std::cout << '.';
+    const int total = n * p;
+    sycl::buffer<RealType, 1> centered_buf(total, sycl::no_init);
+    // Kernel: centered[k] = data[k] - mean[row of k], where the row of k is k / n.
+    q.submit([&] (sycl::handler &cgh) {
+        sycl::accessor src(data_buf, cgh, sycl::read_only);
+        sycl::accessor dst(centered_buf, cgh, sycl::write_only);
+        sycl::accessor row_mean(mean_buf, cgh, sycl::read_only);
+        cgh.parallel_for(sycl::range<1>(total), [=] (sycl::item<1> it) {
+            const int k = it.get_id();
+            dst[k] = src[k] - row_mean[k / n];
+        });
+    });
+    q.wait();
+    // One sub-buffer view per row of the centered data.
+    std::vector<sycl::buffer<RealType, 1>> row_views;
+    for (int r = 0; r < p; ++r) {
+        row_views.push_back(sycl::buffer<RealType, 1>(centered_buf, r * n, n));
+    }
+    // oneDPL device policy built from q; it is passed to every reduction below.
+    auto policy = oneapi::dpl::execution::make_device_policy(q);
+    auto weights = oneapi::dpl::begin(w_buf);
+    for (int i = 0; i < p; ++i) {
+        auto row_i = oneapi::dpl::begin(row_views[i]);
+        // Off-diagonal entries: reduce w * row_i * row_j and mirror the result.
+        for (int j = i + 1; j < p; ++j) {
+            auto row_j = oneapi::dpl::begin(row_views[j]);
+            auto first = oneapi::dpl::make_zip_iterator(weights, row_i, row_j);
+            RealType dot = std::transform_reduce(policy, first, first + n, RealType{0}, std::plus<>(),
+                           [](auto t) { return std::get<0>(t) * std::get<1>(t) * std::get<2>(t); });
+            // The host accessor goes out of scope at the end of this iteration.
+            sycl::host_accessor out(cross_product_buf);
+            out[i*p+j] = dot;
+            out[i+j*p] = dot;
+        }
+        // Diagonal entry: reduce w * row_i * row_i.
+        auto first = oneapi::dpl::make_zip_iterator(weights, row_i);
+        RealType dot = std::transform_reduce(policy, first, first + n, RealType{0}, std::plus<>(),
+                       [](auto t) { return std::get<0>(t) * std::get<1>(t) * std::get<1>(t); });
+        sycl::host_accessor out(cross_product_buf);
+        out[i*p+i] = dot;
+    }
+    q.wait();
+}
+
+// Runs cross_product n_runs times on a GPU queue over random data with unit weights and unit "means".
+int main() {
+    constexpr int p = 30;       // number of rows (dimensions)
+    constexpr int n = 100;      // values per row (observations)
+    constexpr int n_runs = 20;  // repetitions of the computation
+
+    // Row-major layout: element (i, j) is stored at data[i * n + j].
+    std::vector<float> data(n * p);
+    std::vector<float> w(n, 1.0f);
+    std::vector<float> not_mean(p, 1.0f);
+    for (float& value : data) {
+        value = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX);
+    }
+
+    sycl::buffer<float, 1> data_buf(data.data(), data.size());
+    sycl::buffer<float, 1> w_buf(w.data(), w.size());
+    sycl::buffer<float, 1> cross_product_buf(p * p);
+    sycl::buffer<float, 1> not_mean_buf(not_mean.data(), not_mean.size());
+    sycl::queue q(sycl::gpu_selector{});
+    std::cout << "Running on: " << q.get_device().get_info<sycl::info::device::name>() << std::endl;
+    std::cout << "Start of parallel computing ";
+    for (int run = 0; run < n_runs; ++run) {
+        cross_product(q, n, p, data_buf, w_buf, not_mean_buf, cross_product_buf);
+        q.wait();
+    }
+    std::cout << "SUCCESS" << std::endl;
+    return 0;
+}
+
+// CHECK:---> piQueueRelease
+// CHECK:---> piContextRelease
+// CHECK:---> piKernelRelease
+// CHECK:---> piProgramRelease