intel · bader · Mar 9, 2023 · Nov 15, 2022 · Nov 15, 2022 · Nov 15, 2022
@@ -0,0 +1,118 @@
+= sycl_ext_oneapi_cuda_tex_cache_read
+
+:source-highlighter: coderay
+:coderay-linenums-mode: table
+
+// This section needs to be after the document title.
+:doctype: book
+:toc2:
+:toc: left
+:encoding: utf-8
+:lang: en
+:dpcpp: pass:[DPC++]
+
+// Set the default source code type in this document to C++,
+// for syntax highlighting purposes.  This is needed because
+// docbook uses c++ and html5 uses cpp.
+:language: {basebackend@docbook:c++:cpp}
+
+
+== Notice
+
+[%hardbreaks]
+Copyright (C) 2022-2023 Intel Corporation.  All rights reserved.
+
+Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks
+of The Khronos Group Inc.  OpenCL(TM) is a trademark of Apple Inc. used by
+permission by Khronos.
+
+
+== Contact
+
+To report problems with this extension, please open a new issue at:
+
+https://github.com/intel/llvm/issues
+
+
+== Dependencies
+
+This extension is written against the SYCL 2020 revision 6 specification.  All
+references below to the "core SYCL specification" or to section numbers in the
+SYCL specification refer to that revision.
+
+
+== Status
+
+This is an experimental extension specification, intended to provide early
+access to features and gather community feedback.  Interfaces defined in this
+specification are implemented in {dpcpp}, but they are not finalized and may
+change incompatibly in future versions of {dpcpp} without prior notice.
+*Shipping software products should not rely on APIs defined in this
+specification.*
+
+== Backend support status
+
+This extension is supported by all backends. However, as described in the following sections, this extension is currently only functionally useful in the `ext_oneapi_cuda` backend.
+
+
+== Overview
+
+When used with the `ext_oneapi_cuda` backend only, this extension exposes the https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld-global-nc[ld-global-nc] ptx instruction so that users can load a register variable to the non-coherent read-only texture cache. The texture cache is designed for random access reads and is most performant when work-items within a sub-group read a set of addresses of texture memory that are close to one another in the cache. This extension introduces a free function, `ldg`, that is available for all backends and devices; however this function will only make use of a special hardware feature (the texture cache) when called from the `ext_oneapi_cuda` backend. The only reason that `ldg` is supported on backends other than `ext_oneapi_cuda` is to allow for code portability.
+
+== Specification
+
+=== Feature test macro
+
+This extension provides a feature-test macro as described in the core SYCL
+specification.  An implementation supporting this extension must predefine the
+macro `SYCL_EXT_ONEAPI_CUDA_TEX_CACHE_READ` to one of the values defined in the table
+below.  Applications can test for the existence of this macro to determine if
+the implementation supports this feature, or applications can test the macro's
+value to determine which of the extension's features the implementation
+supports.
+
+[%header,cols="1,5"]
+|===
+|Value
+|Description
+
+|1
+|The APIs of this experimental extension are not versioned, so the
+ feature-test macro always has this value.
+|===
+
+=== `ldg` free function
+
+This extension adds a single templated free function which may be called from device
+code. This function is not available in host code.
+
+```
+namespace sycl::ext::oneapi::experimental::cuda {
+
+template<typename T>
+T ldg(const T* ptr);
+
+} // namespace sycl::ext::oneapi::experimental::cuda
+```
+
+`ldg` returns the data of type `T` located at address `ptr`. When called from the `ext_oneapi_cuda` backend the data is cached in the read-only texture cache.
+When called from any other backend a copy of the data stored at address `ptr` is returned without using any special cache.
+
+The template parameter `T` can be one of `char`, `signed char`, `short`, `int`, `long`, `long long`, `unsigned char`, `unsigned short`, `unsigned int`, `unsigned long`, `unsigned long long`, `vec<char, 2>`, `vec<char, 4>`, `vec<short, 2>`, `vec<short, 4>`, `vec<int, 2>`, `vec<int, 4>`, `vec<long long, 2>`, `vec<uchar, 2>`, `vec<uchar, 4>`, `vec<ushort, 2>`, `vec<ushort, 4>`, `vec<uint, 2>`, `vec<uint, 4>`, `vec<unsigned long long, 2>`, `float`, `vec<float, 2>`, `vec<float, 4>`, `double`, or `vec<double, 2>`.
+
+=== Example of usage
+
+```
+    h.parallel_for<class kernel_name>(range, [=](sycl::nd_item<1> item) {
+      const int idx = item.get_global_id(0);
+      auto cachedA = sycl::ext::oneapi::experimental::cuda::ldg(&A[idx]);
+      auto cachedB = sycl::ext::oneapi::experimental::cuda::ldg(&B[idx]);
+      C[idx] = cachedA + cachedB;
+    });
+```
+
+IMPORTANT: Sometimes the compiler may not be sure that the read-only condition is satisfied. In such cases users can mark the declaration of the pointer used as the argument to `ldg` with both the `const` and `$$__$$restrict$$__$$` qualifiers, which may aid the compiler in detecting the read-only condition. Additionally, data returned from `ldg`, e.g. `cacheA` and `cacheB` in the above example, should not be written to at any point within the kernel. If such data is written to at any point in the kernel, the code will compile and execute correctly, however the texture cache will not be used.
+
+=== Issues
+
+- Investigate exposing this functionality through annotated_ptr once the sycl_ext_oneapi_annotated_ptr becomes finalized.
@@ -0,0 +1,225 @@
+//==--- builtins.hpp - SYCL_ONEAPI_CUDA experimental builtins  -------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#define SYCL_EXT_ONEAPI_CUDA_TEX_CACHE_READ 1
+
+#include <sycl/types.hpp>
+
+#if defined(_WIN32) || defined(_WIN64)
+#define ATTRIBUTE_EXT_VEC_TYPE(N) __declspec(ext_vector_type(N))
+#else
+#define ATTRIBUTE_EXT_VEC_TYPE(N) __attribute__((ext_vector_type(N)))
+#endif
+
+namespace sycl {
+__SYCL_INLINE_VER_NAMESPACE(_V1) {
+namespace ext {
+namespace oneapi {
+namespace experimental {
+namespace cuda {
+
+namespace detail {
+using ldg_vector_types = sycl::detail::type_list<
+    sycl::char2, sycl::char4, sycl::short2, sycl::short4, sycl::int2,
+    sycl::int4, sycl::longlong2, sycl::uchar2, sycl::uchar4, sycl::ushort2,
+    sycl::ushort4, sycl::uint2, sycl::uint4, sycl::ulonglong2, sycl::float2,
+    sycl::float4, sycl::double2>;
+
+using ldg_types =
+    sycl::detail::type_list<ldg_vector_types,
+                            sycl::detail::gtl::scalar_signed_basic_list,
+                            sycl::detail::gtl::scalar_unsigned_basic_list>;
+} // namespace detail
+
+template <typename T>
+inline __SYCL_ALWAYS_INLINE std::enable_if_t<
+    sycl::detail::is_contained<
+        T, sycl::ext::oneapi::experimental::cuda::detail::ldg_types>::value,
+    T>
+ldg(const T *ptr) {
+#if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__NVPTX__)
+  if constexpr (std::is_same_v<T, char>) {
+    return __nvvm_ldg_c(ptr);
+  } else if constexpr (std::is_same_v<T, short>) {
+    return __nvvm_ldg_s(ptr);
+  } else if constexpr (std::is_same_v<T, int>) {
+    return __nvvm_ldg_i(ptr);
+  } else if constexpr (std::is_same_v<T, long>) {
+    return __nvvm_ldg_l(ptr);
+  } else if constexpr (std::is_same_v<T, long long>) {
+    return __nvvm_ldg_ll(ptr);
+  } else if constexpr (std::is_same_v<T, unsigned char>) {
+    return __nvvm_ldg_uc(ptr);
+  } else if constexpr (std::is_same_v<T, unsigned short>) {
+    return __nvvm_ldg_us(ptr);
+  } else if constexpr (std::is_same_v<T, unsigned int>) {
+    return __nvvm_ldg_ui(ptr);
+  } else if constexpr (std::is_same_v<T, unsigned long>) {
+    return __nvvm_ldg_ul(ptr);
+  } else if constexpr (std::is_same_v<T, unsigned long long>) {
+    return __nvvm_ldg_ull(ptr);
+  } else if constexpr (std::is_same_v<T, float>) {
+    return __nvvm_ldg_f(ptr);
+  } else if constexpr (std::is_same_v<T, double>) {
+    return __nvvm_ldg_d(ptr);
+  } else if constexpr (std::is_same_v<T, sycl::char2>) {
+    // We can assume that ptr is aligned at least to char2's alignment, but the
+    // load will assume that ptr is aligned to char2's alignment.  This is only
+    // safe if alignof(f2) <= alignof(char2).
+    typedef char c2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
+    sycl::char2 ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::char4>) {
+    typedef char c4 ATTRIBUTE_EXT_VEC_TYPE(4);
+    c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
+    sycl::char4 ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    ret.z() = rv[2];
+    ret.w() = rv[3];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::short2>) {
+    typedef short s2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
+    sycl::short2 ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::short4>) {
+    typedef short s4 ATTRIBUTE_EXT_VEC_TYPE(4);
+    s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
+    sycl::short4 ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    ret.z() = rv[2];
+    ret.w() = rv[3];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::int2>) {
+    typedef int i2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
+    sycl::int2 ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::int4>) {
+    typedef int i4 ATTRIBUTE_EXT_VEC_TYPE(4);
+    i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
+    sycl::int4 ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    ret.z() = rv[2];
+    ret.w() = rv[3];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::longlong2>) {
+    typedef long long ll2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
+    sycl::longlong2 ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::uchar2>) {
+    typedef unsigned char uc2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
+    sycl::uchar2 ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::uchar4>) {
+    typedef unsigned char uc4 ATTRIBUTE_EXT_VEC_TYPE(4);
+    uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
+    sycl::uchar4 ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    ret.z() = rv[2];
+    ret.w() = rv[3];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::ushort2>) {
+    typedef unsigned short us2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
+    sycl::ushort2 ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::ushort4>) {
+    typedef unsigned short us4 ATTRIBUTE_EXT_VEC_TYPE(4);
+    us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
+    sycl::ushort4 ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    ret.z() = rv[2];
+    ret.w() = rv[3];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::uint2>) {
+    typedef unsigned int ui2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
+    sycl::uint2 ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::uint4>) {
+    typedef unsigned int ui4 ATTRIBUTE_EXT_VEC_TYPE(4);
+    ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
+    sycl::uint4 ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    ret.z() = rv[2];
+    ret.w() = rv[3];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::ulonglong2>) {
+    typedef unsigned long long ull2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
+    sycl::ulonglong2 ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::float2>) {
+    typedef float f2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
+    sycl::float2 ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::float4>) {
+    typedef float f4 ATTRIBUTE_EXT_VEC_TYPE(4);
+    f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
+    sycl::float4 ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    ret.z() = rv[2];
+    ret.w() = rv[3];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::double2>) {
+    typedef double d2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
+    sycl::double2 ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    return ret;
+  }
+#else
+  return *ptr;
+#endif
+#else
+  throw runtime_error("ldg is not supported on host.", PI_ERROR_INVALID_DEVICE);
+#endif
+}
+
+#undef ATTRIBUTE_EXT_VEC_TYPE
+
+} // namespace cuda
+} // namespace experimental
+} // namespace oneapi
+} // namespace ext
+} // __SYCL_INLINE_VER_NAMESPACE(_V1)
+} // namespace sycl