intel · kbobrovs · Feb 2, 2022 · Jan 28, 2022
@@ -10,13 +10,36 @@
 
 #pragma once
 
+// clang-format off
+///
 /// @defgroup sycl_esimd DPC++ Explicit SIMD API
 /// This is a low-level API providing direct access to Intel GPU hardware
 /// features. ESIMD overview can be found
 /// [here](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/SYCL_EXT_INTEL_ESIMD/SYCL_EXT_INTEL_ESIMD.md).
+/// Some terminology used in the API documentation:
+/// - *lane* -
+///       (or "vector lane") Individual "lane" of input and output elements
+///       in an ESIMD vector operation, such that all lanes combined form the
+///       input and output vectors of the operation. Lane is identified by
+///       an ordinal in the [0, N-1] range, where N is the size of the
+///       input/output vectors.
+/// - *mask* -
+///       a vector of predicates which can be used to enable/disable
+///       execution of a vector operation over the corresponding lane.
+///       \c 0 predicate value disables execution, non-zero - enables.
+/// - *word* - 2 bytes.
+/// - *dword* ("double word") - 4 bytes.
+/// - *qword* ("quad word") - 8 bytes.
+/// - *oword* ("octal word") - 16 bytes.
+/// - *pixel* - a 4 byte-aligned contiguous 128-bit chunk of memory logically
+///    divided into 4 32-bit channels - \c R, \c G, \c B, \c A. Multiple pixels
+///    can be accessed by ESIMD APIs, with ability to enable/disable access
+///    to each channel for all pixels.
+///
+// clang-format on
 
-///@{
-/// @ingroup sycl_esimd
+/// @addtogroup sycl_esimd
+/// @{
 
 /// @defgroup sycl_esimd_core ESIMD core.
 /// Core APIs defining main vector data types and their interfaces.
@@ -31,22 +54,17 @@
 /// @defgroup sycl_esimd_bitmanip Bit and mask manipulation APIs.
 
 /// @defgroup sycl_esimd_conv Explicit conversions.
-/// @ingroup sycl_esimd
 /// Defines explicit conversions (with and without saturation), truncation etc.
 /// between ESIMD vector types.
 
+/// @defgroup sycl_esimd_raw_send Raw send APIs.
+/// Implements the \c send instruction to send messages to various components
+/// of the Intel(R) processor graphics, as defined in the documentation at
+/// https://01.org/sites/default/files/documentation/intel-gfx-prm-osrc-icllp-vol02a-commandreference-instructions_2.pdf
+
 /// @defgroup sycl_esimd_misc Miscellaneous ESIMD convenience functions.
 
-/// The main components of the API are:
-///   - @ref sycl_esimd_core - core API defining main vector data types and
-///   their
-///     interfaces.
-///   - @ref sycl_esimd_memory
-///   - @ref sycl_esimd_math
-///   - @ref sycl_esimd_bitmanip
-///   - @ref sycl_esimd_conv
-///   - @ref sycl_esimd_misc
-///@}
+/// @} sycl_esimd
 
 #include <sycl/ext/intel/experimental/esimd/alt_ui.hpp>
 #include <sycl/ext/intel/experimental/esimd/common.hpp>

@@ -132,24 +132,48 @@ constexpr int get_num_channels_enabled(rgba_channel_mask M) {
          is_channel_enabled(M, rgba_channel::A);
 }
 
-/// Represents an atomic operation.
+/// Represents an atomic operation. Unless noted otherwise, operations return
+/// the old value(s) of the target memory location(s) as it was before the
+/// operation was applied.
+/// Each operation is annotated with a pseudocode illustrating its semantics,
+/// \c addr is a memory address (one of the many, as the atomic operation is
+/// vector) the operation is applied at, \c src0 is its first argument,
+/// \c src1 - second.
 enum class atomic_op : uint8_t {
+  /// Addition: <code>*addr = *addr + src0</code>.
   add = 0x0,
+  /// Subtraction: <code>*addr = *addr - src0</code>.
   sub = 0x1,
+  /// Increment: <code>*addr = *addr + 1</code>.
   inc = 0x2,
+  /// Decrement: <code>*addr = *addr - 1</code>.
   dec = 0x3,
+  /// Minimum: <code>*addr = min(*addr, src0)</code>.
   min = 0x4,
+  /// Maximum: <code>*addr = max(*addr, src0)</code>.
   max = 0x5,
+  /// Exchange: <code>*addr = src0</code>.
   xchg = 0x6,
+  /// Compare and exchange: <code>if (*addr == src0) *addr = src1;</code>
   cmpxchg = 0x7,
+  /// Bit \c and: <code>*addr = *addr & src0</code>.
   bit_and = 0x8,
+  /// Bit \c or: <code>*addr = *addr | src0</code>.
   bit_or = 0x9,
+  /// Bit \c xor: <code>*addr = *addr ^ src0</code>.
   bit_xor = 0xa,
+  /// Minimum (signed integer): <code>*addr = min(*addr, src0)</code>.
   minsint = 0xb,
+  /// Maximum (signed integer): <code>*addr = max(*addr, src0)</code>.
   maxsint = 0xc,
+  /// Maximum (floating point): <code>*addr = max(*addr, src0)</code>.
   fmax = 0x10,
+  /// Minimum (floating point): <code>*addr = min(*addr, src0)</code>.
   fmin = 0x11,
+  /// Compare and exchange (floating point).
+  /// <code>if (*addr == src0) *addr = src1;</code>
   fcmpwr = 0x12,
+  /// Decrement: <code>*addr = *addr - 1</code>. The only operation which
+  /// returns new value of the destination rather than old.
   predec = 0xff,
 };
 

@@ -31,10 +31,10 @@ namespace experimental {
 namespace esimd {
 namespace detail {
 // clang-format off
-/// @ingroup sycl_esimd_core
+/// @addtogroup sycl_esimd_core
 /// @{
+
 /// @defgroup sycl_esimd_core_binops C++ binary operators overloads for ESIMD.
-///
 /// Standard C++ binary operators overloads applicable to \c simd_obj_impl
 /// derivatives - \c simd , \c simd_mask , \c simd_view and their combinations.
 /// The following overloads are defined:

@@ -25,10 +25,14 @@ namespace intel {
 namespace experimental {
 namespace esimd {
 
+/// @addtogroup sycl_esimd_core
 /// @{
-/// @ingroup sycl_esimd_core
 
-/// @name Alignment type tags for use with simd load/store operations.
+/// @defgroup sycl_esimd_core_align Alignment control
+/// Alignment type tags and related APIs for use with ESIMD memory access
+/// operations.
+
+/// @addtogroup sycl_esimd_core_align
 /// @{
 /// element_aligned_tag type. Flag of this type should be used in load and store
 /// operations when memory address is aligned by simd object's element type.
@@ -60,7 +64,6 @@ inline constexpr element_aligned_tag element_aligned = {};
 inline constexpr vector_aligned_tag vector_aligned = {};
 
 template <unsigned N> inline constexpr overaligned_tag<N> overaligned = {};
-/// @}
 
 /// Checks if type is a simd load/store flag.
 template <typename T> struct is_simd_flag_type : std::false_type {};
@@ -77,6 +80,8 @@ struct is_simd_flag_type<overaligned_tag<N>> : std::true_type {};
 template <typename T>
 static inline constexpr bool is_simd_flag_type_v = is_simd_flag_type<T>::value;
 
+/// @} alignment tags
+
 /// @cond ESIMD_DETAIL
 
 namespace detail {
@@ -165,9 +170,9 @@ class simd_obj_impl {
   }
 
 public:
-  /// @{
-  /// Constructors.
   simd_obj_impl() = default;
+
+  /// Copy constructor.
   simd_obj_impl(const simd_obj_impl &other) {
     __esimd_dbg_print(simd_obj_impl(const simd_obj_impl &other));
     set(other.data());
@@ -234,8 +239,6 @@ class simd_obj_impl {
     copy_from(acc, offset, Flags{});
   }
 
-  /// @}
-
   // Load the object's value from array.
   template <int N1>
   std::enable_if_t<N1 == N> copy_from(const RawTy (&&Arr)[N1]) {

@@ -28,8 +28,8 @@ namespace intel {
 namespace experimental {
 namespace esimd {
 
+/// @addtogroup sycl_esimd_math
 /// @{
-/// @ingroup sycl_esimd_math
 
 /// Conversion of input vector elements of type \p T1 into vector of elements of
 /// type \p T0 with saturation.
@@ -157,8 +157,8 @@ abs(T1 src0, int flag = saturation_off) {
 
 /// @} sycl_esimd_math
 
+/// @addtogroup sycl_esimd_bitmanip
 /// @{
-/// @ingroup sycl_esimd_bitmanip
 
 /// Shift left operation (vector version)
 /// \tparam T0 element type of the returned vector. Must be any integer type.
@@ -498,8 +498,8 @@ asr(T1 src0, T2 src1, int flag = saturation_off) {
 }
 /// @} sycl_esimd_bitmanip
 
+/// @addtogroup sycl_esimd_math
 /// @{
-/// @ingroup sycl_esimd_math
 
 // imul
 #ifndef ESIMD_HAS_LONG_LONG
@@ -1323,8 +1323,8 @@ __ESIMD_API simd<float, SZ> pln(simd<float, 4> src0, simd<float, SZ> src1,
 }
 /// @} sycl_esimd_math
 
+/// @addtogroup sycl_esimd_bitmanip
 /// @{
-/// @ingroup sycl_esimd_bitmanip
 
 /// bf_reverse
 template <typename T0, typename T1, int SZ>
@@ -1402,8 +1402,8 @@ ESIMD_NODEBUG
 
 /// @} sycl_esimd_bitmanip
 
+/// @addtogroup sycl_esimd_math
 /// @{
-/// @ingroup sycl_esimd_math
 
 ////////////////////////////////////////////////////////////////////////////////
 // ESIMD arithmetic intrinsics:
@@ -1712,8 +1712,8 @@ ESIMD_NODEBUG ESIMD_INLINE T exp(T src0) {
 }
 /// @} sycl_esimd_math
 
+/// @addtogroup sycl_esimd_conv
 /// @{
-/// @ingroup sycl_esimd_conv
 
 ////////////////////////////////////////////////////////////////////////////////
 // Rounding intrinsics.
@@ -1748,8 +1748,8 @@ __ESIMD_INTRINSIC_DEF(rndz)
 #undef __ESIMD_INTRINSIC_DEF
 /// @} sycl_esimd_conv
 
+/// @addtogroup sycl_esimd_bitmanip
 /// @{
-/// @ingroup sycl_esimd_bitmanip
 
 template <int N>
 ESIMD_NODEBUG
@@ -1938,8 +1938,8 @@ fbh(simd_view<BaseTy, RegionTy> src) {
 
 /// @} sycl_esimd_bitmanip
 
+/// @addtogroup sycl_esimd_math
 /// @{
-/// @ingroup sycl_esimd_math
 
 /// \brief DP4A.
 ///