update

PaddlePaddle · TCChenlong · Oct 13, 2021 · Sep 17, 2021 · Sep 17, 2021 · Sep 18, 2021
commit 66621b92335c4d17e53f154ff1934ed0191c1cc9
diff --git a/docs/guides/11_kernel_primitives_api/case_elementwise_add.md b/docs/guides/11_kernel_primitives_api/case_elementwise_add.md
@@ -0,0 +1,52 @@
+## ElementwiseAdd
++ 案例功能说明：完成相同shape的两数相加，输入为InT类型，输出为OutT类型，根据Functor完成对应的计算.
+
+### Functor定义
+
+```
+AddFunctor:
+
+template <typename InT, typename OutT>
+struct AddFunctor {
+  HOSTDEVICE OutT operator()(const InT &a, const InT &b) const { return statice<OutT>(a + b); }
+};
+
+```
+### kernel 实现说明
+
+VecSize 表示每个线程连续读取VecSize个元素，根据剩余元素num与每个线程最大处理的元素个数VecSize x blockDim.x的关系，将数据处理分为2部分，第一部分，当VecSize * blockDim.x > num 表示当前数据处理需要进行边界处理，因此将IsBoundary设置为 true，避免访存越界，注意此处使用Init函数对寄存器arg0，arg1进行初始化，避免当arg0或者arg1作为分母时出现为0的情况。
+
+### kernel 代码
+
+```
+
+#include "kernel_primitives/kernel_primitives.h"
+template<int VecSize, typename InT, typename OutT, typename Functor, bool IsBoundary>
+__device__ void elementwiseImpl(InT _global_ptr_ *in0, InT _global_ptr_* in1, OutT _global_ptr_ * out, Functor func, int num) {
+  __local__ InT arg0[VecSize];
+  __local__ InT arg1[VecSize];
+  __local__ OutT result[VecSize];
+  Init<InT, VecSize>(arg0, static_cast<OutT>(1.0f));
+  Init<InT, VecSize>(arg1, static_cast<OutT>(1.0f));
+  ReadData<InT, VecSize, 1, 1, IsBoundary>(arg0, in0, num);
+  ReadData<InT, VecSize, 1, 1, IsBoundary>(arg1, in1, num);
+  ElementwiseBinary<InT, OutT, VecSize, 1, 1, Functor>(result, arg0, arg1, func);
+  WriteData<OutT, VecSize, 1, 1, IsBoundary>(out, result, num);
+}
+
+template<int VecSize, typename InT, typename OutT, typename Functor >
+__global__ void elementwise(InT *in0,
+                            InT *in1, OutT *out,
+                            int size, Functor func) {
+  int data_offset = VecSize * blockIdx.x * blockDim.x; // data offset of this block
+  int stride = gridDim.x * blockDim.x * VecSize;
+  for (int offset = data_offset; offset < size; offset += stride) {
+    if (offset + blockDim.x * VecSize < size) {
+      elementwiseImpl<VecSize, InT, OutT, Functor, false>(in0 + offset, in1 + offset, out + offset, func, size - offset);
+    } else {
+      elementwiseImpl<VecSize, InT, OutT, Functor, true>(in0 + offset, in1 + offset, out + offset, func, size - offset);
+    }
+  }
+}
+
+```
diff --git a/docs/guides/11_kernel_primitives_api/case_reduce.md b/docs/guides/11_kernel_primitives_api/case_reduce.md
@@ -0,0 +1,87 @@
+
+## Redcuce
++ 案例功能说明：完成单维Reduce操作，例如输入为x [N, H, W, C], axis 可以取值为，0 / 1/ 2。实现对非最低维的reduce操作。reduce类型可以为reduce sum / mean / any /prod 等，通过ReduceOp进行定义。
+
+### ReduceOp定义
+```
+template <typename T>
+struct DivideFunctor {
+  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}
+
+  HOSTDEVICE inline T operator()(const T* x) const { return x[0] * n_inv; }
+
+  HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
+
+  private:
+    T n_inv;
+};
+
+template <typename Tx, typename Ty = Tx>
+struct CustomMean {
+  using Transformer = kpds::DivideFunctor<Tx>;
+
+  inline Ty initial() { return static_cast<Ty>(0.0f); }
+
+  __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const {
+     return b + a;
+  }
+};
+```
+### kernel 实现说明
+
+完成最高维度的reduce，或者完成中间维度当然reduce操作，操作分为2部分，根据剩余数据的size与blockDim.x 间的关系，当size < blockDim.x时需要将IsBounary设置为true，表明需要进行访存边界判断，避免访问存储越界。
+
+### kernel 代码
+
+```
+template <typename Tx, typename Ty, typename MPType, typename ReduceOp, typename TransformOp, bool IsBoundary = false>
+__device__ void HigherDimImp(const Tx* x, Ty* y, ReduceOp reducer,
+                                     TransformOp transformer, MPType init,
+                                     int reduce_num, int left_num,
+                                     int block_size) {
+  const int NY = 1;
+  int idx = blockIdx.x * blockDim.x;
+  int idy = blockIdx.y * block_size; // block_offset of rows
+  Tx reduce_input[NY];
+  MPType reduce_compute[NY];
+  MPType result = init;
+  int block_offset = idy * left_num + idx + blockIdx.z * reduce_num * left_num; // the offset of this block
+  const Tx* input = x + block_offset;
+  int store_offset = blockIdx.y * left_num + blockIdx.z * gridDim.y * left_num + idx;
+  // how many columns left
+  int size = left_num - idx;
+  // how many rows have to be reduced
+  int loop = reduce_num - idy;
+  loop = loop > block_size ? block_size : loop;
+
+  for (int loop_index = 0; loop_index < loop; loop_index += NY) {
+    kps::ReadData<Tx, Tx, 1, NY, 1, IsBoundary>(&reduce_input[0], input + loop_index * left_num, size, NY, 1, left_num);
+    kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(&reduce_compute[0], &reduce_input[0], transformer);
+    kps::Reduce<MPType, NY, 1, 1, ReduceOp, kps::details::ReduceMode::kLocalMode>( &result, &reduce_compute[0], reducer, false);
+  }
+
+  Ty temp_data = static_cast<Ty>(result);
+  kps::WriteData<Ty, 1, 1, 1, IsBoundary>(y + store_offset, &temp_data, size);
+}
+
+template <typename Tx, typename Ty, typename MPType, typename ReduceOp, typename TransformOp>
+__global__ void ReduceHigherDimKernel(const Tx* x, Ty* y, ReduceOp reducer,
+                                      TransformOp transformer, MPType init,
+                                      int reduce_num, int left_num,
+                                      int blocking_size) {
+  // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this function will be used
+  // eg: x_dim = {nz, ny, nx}, nx != 1, axis can be 0 or 1
+  //     if axis = 1 then grid.z = nz, grid.y = ny / block_size, grid.x = nx / 32
+  //     else grid.z = 1, grid.y = ny / block_size, grid.x = nx /32
+
+  int size = left_num - blockIdx.x * blockDim.x;
+  if (size >= blockDim.x) {  // complete segment
+    HigherDimImp<Tx, Ty, MPType, ReduceOp, TransformOp>(
+        x, y, reducer, transformer, init, reduce_num, left_num, blocking_size);
+  } else {
+    HigherDimImp<Tx, Ty, MPType, ReduceOp, TransformOp, true>(
+        x, y, reducer, transformer, init, reduce_num, left_num, blocking_size);
+  }
+}
+
+```
diff --git a/docs/guides/11_kernel_primitives_api/compute_api_cn.md b/docs/guides/11_kernel_primitives_api/compute_api_cn.md
@@ -45,9 +45,11 @@ __device__ void ElementwiseBinary(OutT* out, const InT* in1, const InT* in2, OpF
 ```
 
 ###函数说明
+
 按照 compute 中的计算规则对i n1、in2 进行计算，将计算结果按照 OutT 类型存储到寄存器 out 中。
 
 ###模板参数
+
 ```
 InT : 输入数据的类型。
 OutT : 存储到out寄存器中的类型。
@@ -84,9 +86,11 @@ __device__ void CycleBinary(OutT* out, const InT* in1, const InT* in2, OpFunc co
 ```
 
 ###函数说明
+
 按照 compute 中的计算规则对 in1、in2 进行计算，将计算结果按照 OutT 类型存储到寄存器 out 中. in1 的 shape 为[1, NX], in2 的 shape 为 [NY, NX]，实现in1， in2的循环计算，out的shape是[NY, NX]。
 
 ###模板参数
+
 ```
 InT : 输入数据的类型。
 OutT : 存储到out寄存器中的类型。
@@ -125,9 +129,11 @@ template <typename InT, typename OutT, int NX, int NY, int BlockSize, class OpFu
 ```
 
 ###函数说明
+
 按照 compute 中的计算规则对 in1、in2、in3 进行计算，将计算结果按照OutT类型存储到寄存器out中。
 
 ###模板参数
+
 ```
 InT : 输入数据的类型。
 OutT : 存储到out寄存器中的类型。
@@ -164,9 +170,11 @@ __device__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
 ```
 
 ###函数说明
+
 按照 compute 中的计算规则对 ins 中的输入进行计算，将计算结果按照OutT类型存储到寄存器 out 中，所有输入输出的维度相同。
 
 ###模板参数
+
 ```
 InT : 输入数据的类型。
 OutT : 存储到out寄存器中的类型。
@@ -201,9 +209,11 @@ __device__ void Reduce(T* out, const T* in, ReduceFunctor reducer, bool reduce_l
 ```
 
 ###函数说明
+
 根据 reducer 对 in 中的数据进行数据规约，in数据size为[NY， NX]，当 ReduceMode = kLocalMode 时，对 in 沿着 NX 方向进行规约，完成线程内规约，out为[NY, 1]；当 ReduceMode = kGlobalMode 时，使用共享内存完成 block 内线程间的规约操作，in 和 out 的size相同，均为[NY, NX]。
 
 ###模板参数
+
 ```
 T : 输入数据的类型。
 NX : 每个线程需要计算 NX 列数据。
@@ -222,6 +232,7 @@ Mode: 规约模式，可以取值为 kGlobalMode、kLocalMode，当 ReduceMode =
 ```
 
 ###函数参数
+
 ```
 out : 输出寄存器指针，大小为 NX x NY。
 in : 输入寄存器指针，大小为 NX x NY。

diff --git a/docs/guides/11_kernel_primitives_api/index_cn.rst b/docs/guides/11_kernel_primitives_api/index_cn.rst
@@ -44,17 +44,11 @@ API 列表
 +--------------------------------------+-------------------------------------------------------+
 
 API 详细介绍
-############
+##############
 
 - `IO API <./io_api_cn.html>`_ : 介绍封装的IO类Primitive API。
 - `COMPUTE API <./compute_api_cn.html>`_ : 介绍封装的COMPUTE类Primitive API。
 
-..  toctree::
-    :hidden:
-
-    io_api_cn.md
-    compute_api_cn.md
-
 API 应用实例
 ############
 
@@ -64,7 +58,7 @@ API 应用实例
 ..  toctree::
     :hidden:
 
+    io_api_cn.md
+    compute_api_cn.md
     elementwise.md
     softmax.md
-
-
diff --git a/docs/guides/11_kernel_primitives_api/io_api_cn.md b/docs/guides/11_kernel_primitives_api/io_api_cn.md
@@ -1,4 +1,5 @@
 ##ReadData
+
 ###函数定义
 
 ```
@@ -10,9 +11,11 @@ __device__ void ReadData(Ty* dst, const Tx* __restrict__ src,
 ```
 
 ###函数说明
+
 将 Tx 类型的 2D 数据从全局内存中读取到寄存器，并按照 Ty 类型存储到寄存器 dst 中。每读取1列数据需要偏移 stride_nx 列数据，每读取 NX 列数据需要偏移 stride_ny 行数据，直到加载 NX * NY 个数据到寄存器 dst 中。当 IsBoundary == true 需要保证数据读取行数不超过 size_ny, 数据读取列数不超过 size_nx 列。
 
 ###模板参数
+
 ```
 Tx : 数据存储在全局内存中的数据类型。
 Ty : 数据加载到寄存器上存储的类型。
@@ -22,7 +25,9 @@ BlockSize : 设备属性，标识当前设备线程索引方式。对于GPU，th
 IsBoundary : 标识是否进行访存边界判断。当block处理的数据总数小于 NX x NY x blockDim 时，需要进行边界判断以避免访存越界。
 
 ```
+
 ###函数参数
+
 ```
 dst : 输出寄存器指针，大小为 NX x NY。
 src : 当前 block 的输入数据指针，通常为i nput + blockIdx.x x blockDim.x x NX。
@@ -35,17 +40,21 @@ stride_ny : 每读取 NX 列需要偏移 stride_nx 行。
 ------------------
 
 ##ReadData
+
 ###函数定义
 
+
 ```
 template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
 __device__ void ReadData(T* dst, const T* __restrict__ src, int num);
 ```
 
 ###函数说明
+
 将 T 类型的1D数据从全局内存 src 中读取到寄存器 dst 中。每次连续读取 NX 个数据，当前仅支持 NY = 1，直到加载 NX 个数据到寄存器 dst 中。当 IsBoundary = true 需要保证数据读取个数不超过 num ,以避免访存越界。当 (NX % 4 = 0 或 NX % 2 = 0) 且 IsBoundary = false 时，会有更高的访存效率。
 
 ###模板参数
+
 ```
 T : 元素类型
 NX : 每个线程读取 NX 列数据。
@@ -54,7 +63,9 @@ BlockSize : 设备属性，标识当前设备线程索引方式。对于 GPU，t
 IsBoundary : 标识是否进行访存边界判断。当block处理的数据总数小于 NX x NY x blockDim 时，需要进行边界判断以避免访存越界。
 
 ```
+
 ###函数参数
+
 ```
 dst : 输出寄存器指针，大小为 NX x NY。
 src : 当前 block 的输入数据指针，通常为 input + blockIdx.x x blockDim.x x NX。
@@ -64,6 +75,7 @@ num : 当前 block 对多读取 num 个元素，参数仅在 IsBoundary = true
 ------------------
 
 ##ReadDataBc
+
 ###函数定义
 
 ```
@@ -77,9 +89,11 @@ __device__ void ReadDataBc(T* dst, const T* __restrict__ src,
 ```
 
 ###函数说明
+
 将需要进行 brodcast 的 2D 数据按照T类型从全局内存 src 中读取到寄存器 dst 中，其中 src 为原始输入数据指针，根据 config 计算当前输出数据对应的输入数据坐标，将坐标对应的数据读取到寄存器中。
 
 ###模板参数
+
 ```
 T : 待读取的元素类型
 NX : 每个线程读取 NX 列数据。
@@ -88,7 +102,9 @@ BlockSize : 设备属性，标识当前设备线程索引方式。对于 GPU，t
 Rank : 原始输出数据的维度。
 IsBoundary : 标识是否进行访存边界判断。当block处理的数据总数小于 NX x NY x blockDim 时，需要进行边界判断以避免访存越界。
 ```
+
 ###函数参数
+
 ```
 dst : 输出寄存器指针，大小为 NX x NY。
 src : 原始输入数据指针。
@@ -103,6 +119,7 @@ stride_ny : 每读取 NX 列需要偏移 stride_nx 行。
 ------------------
 
 ##ReadDataReduce
+
 ###函数定义
 
 ```
@@ -118,9 +135,11 @@ __device__ void ReadDataReduce(T* dst, const T* __restrict__ src,
 ```
 
 ###函数说明
+
 将需要进行reduce操作的2D数据以T类型从全局内存src中读取到寄存器dst中，其中src为原始输入数据指针，根据index_cal计算当前输出数据对应的输入数据坐标，将坐标对应的数据读取到寄存器中。
 
 ###模板参数
+
 ```
 T : 待读取的元素类型
 NX : 每个线程读取 NX 列数据。
@@ -136,7 +155,9 @@ IndexCal : 输入输出坐标映射规则。定义方式如下：
 IsBoundary : 标识是否进行访存边界判断。当block处理的数据总数小于 NX x NY x blockDim 时，需要进行边界判断以避免访存越界。
 
 ```
+
 ###函数参数
+
 ```
 dst : 输出寄存器指针，大小为 NX x NY。
 src : 原始输入数据指针。
@@ -152,25 +173,31 @@ reduce_last_dim：原始输入数据的最低维是否进行reduce，当reduce_l
 ------------------
 
 ##WriteData
+
 ###函数定义
 
+
 ```
 template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
 __device__ void WriteData(T* dst, T* __restrict__ src, int num);
 ```
 
 ###函数说明
+
 将 T 类型的 1D 数据从寄存器 src 写到全局内存 dst 中。每次连续读取 NX 个数据，当前仅支持NY = 1，直到写 NX 个数据到全局内存 dst 中。当 IsBoundary = true 需要保证数据读取个数不超过 num，以避免访存越界。当 (NX % 4 = 0 或 NX % 2 = 0) 且 IsBoundary = false 时，会有更高的访存效率。
 
 ###模板参数
+
 ```
 T : 元素类型
 NX : 每个线程读取 NX 列数据。
 NY : 每个线程读取 NY 行数据， 当前仅支持为NY = 1。
 BlockSize : 设备属性，标识当前设备线程索引方式。对于 GPU，threadIdx.x 用作线程索引，而对于XPU，core_id() 用作线程索引。
 IsBoundary : 标识是否进行访存边界判断。当block处理的数据总数小于 NX x NY x blockDim 时，需要进行边界判断以避免访存越界。
 ```
+
 ###函数参数
+
 ```
 dst : 当前 block 的输出数据指针，通常为 input + blockIdx.x x blockDim.x x NX。
 src : 寄存器指针，大小为 NX x NY。，通常为 input + blockIdx.x * blockDim.x * NX。