Commit e370b16

feng_shuai authored and AnnaTrainingG committed

merge CMakeList.txt manual (PaddlePaddle#35378)

* merge CMakeList.txt manual
* add platform for changethreadnum
* repair some bugs according to make error
* do nothing just flush CI
* forget change thread num
* add inplace_atol param for check_output_with_place
* Windows
* std::min and std::max should be changed because of Windows

1 parent 58aa3eb commit e370b16
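The thread of the change: every CUDA launch site that hardcoded 1024 threads per block now routes the count through `platform::ChangeThreadNum`, which lowers it on Jetson boards (compute capability 5.3 for the Nano, 6.2 for the TX2) where a 1024-thread launch can fail. A minimal sketch of the pattern as it recurs in the diffs below; `kernel` and `n` are placeholders for the concrete kernels and work sizes, not Paddle identifiers:

```cpp
// Sketch only: `kernel` and `n` stand in for the real kernels and element
// counts below. ChangeThreadNum is the helper this commit adds to
// paddle/fluid/platform/gpu_launch_config.h.
int num_thread = 1024;                              // default threads per block
#ifdef WITH_NV_JETSON
platform::ChangeThreadNum(context, &num_thread);    // 512 on Nano/TX2
#endif
// Ceil-division so that every one of the n elements gets a thread.
int blocks = (n + num_thread - 1) / num_thread;
dim3 threads(num_thread, 1);
dim3 grid(blocks, 1);
kernel<T><<<grid, threads, 0, context.stream()>>>(/* ... */);
```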

File tree

9 files changed: +148 −45 lines


paddle/fluid/operators/math/im2col.cu

Lines changed: 13 additions & 4 deletions
```diff
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/operators/math/im2col.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_launch_config.h"
 
 namespace paddle {
 namespace operators {
@@ -104,10 +105,14 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
     int col_width = col->dims()[4];
 
     int num_outputs = im_channels * col_height * col_width;
-    int blocks = (num_outputs + 1024 - 1) / 1024;
+    int num_thread = 1024;
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(context, &num_thread);
+#endif
+    int blocks = (num_outputs + num_thread - 1) / num_thread;
     int block_x = 512;
     int block_y = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
+    dim3 threads(num_thread, 1);
     dim3 grid(block_x, block_y);
     im2col<T><<<grid, threads, 0, context.stream()>>>(
         im.data<T>(), num_outputs, im_height, im_width, dilation[0],
@@ -228,10 +233,14 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
 
     size_t num_kernels = im_channels * im_height * im_width;
 
-    size_t blocks = (num_kernels + 1024 - 1) / 1024;
+    int num_thread = 1024;
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(context, &num_thread);
+#endif
+    size_t blocks = (num_kernels + num_thread - 1) / num_thread;
     size_t block_x = 512;
     size_t block_y = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
+    dim3 threads(num_thread, 1);
     dim3 grid(block_x, block_y);
 
     // To avoid involving atomic operations, we will launch one kernel per
```
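For concreteness, the 2D-grid arithmetic above worked with illustrative numbers (not taken from the commit): with `num_thread` clamped to 512, a million outputs need 1954 blocks, which the code folds into a 512-wide grid four rows deep; the kernel's usual `index < num_outputs` guard discards the surplus threads.

```cpp
// Illustrative numbers only; mirrors the launch math in the diff above.
#include <cstdio>

int main() {
  const int num_outputs = 1000000;
  const int num_thread = 512;  // the Jetson-clamped value
  const int blocks = (num_outputs + num_thread - 1) / num_thread;  // 1954
  const int block_x = 512;
  const int block_y = (blocks + 512 - 1) / 512;  // 4
  // grid(512, 4) provides 2048 blocks >= the 1954 required.
  std::printf("blocks=%d grid=(%d,%d)\n", blocks, block_x, block_y);
  return 0;
}
```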

paddle/fluid/operators/math/pooling.cu

Lines changed: 46 additions & 17 deletions
```diff
@@ -17,6 +17,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/pooling.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_launch_config.h"
 
 namespace paddle {
 namespace operators {
@@ -254,8 +255,13 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
   const int padding_width = paddings[1];
 
   int nthreads = batch_size * output_channels * output_height * output_width;
-  int blocks = (nthreads + 1024 - 1) / 1024;
-  dim3 threads(1024, 1);
+  int thread_num = 1024;
+#ifdef WITH_NV_JETSON
+  // platform::ChangeThreadNum(context, &thread_num);
+  thread_num = 512;
+#endif
+  int blocks = (nthreads + thread_num - 1) / thread_num;
+  dim3 threads(thread_num, 1);
   dim3 grid(blocks, 1);
 
   KernelPool2D<PoolProcess, T><<<grid, threads, 0, stream>>>(
@@ -298,10 +304,13 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     T* output_data = output->mutable_data<T>(context.GetPlace());
 
     int nthreads = batch_size * output_channels * output_height * output_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
+    int thread_num = 1024;
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(context, &thread_num);
+#endif
+    int blocks = (nthreads + thread_num - 1) / thread_num;
+    dim3 threads(thread_num, 1);
     dim3 grid(blocks, 1);
-
     KernelPool2D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
         nthreads, input_data, input_channels, input_height, input_width,
         output_height, output_width, ksize_height, ksize_width, stride_height,
@@ -341,10 +350,13 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     T* output_data = output->mutable_data<T>(context.GetPlace());
 
     int nthreads = batch_size * output_channels * output_height * output_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
+    int thread_num = 1024;
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(context, &thread_num);
+#endif
+    int blocks = (nthreads + thread_num - 1) / thread_num;
+    dim3 threads(thread_num, 1);
     dim3 grid(blocks, 1);
-
     KernelPool2D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
         nthreads, input_data, input_channels, input_height, input_width,
         output_height, output_width, ksize_height, ksize_width, stride_height,
@@ -911,8 +923,12 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
 
     int nthreads = batch_size * output_channels * output_depth * output_height *
                    output_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
+    int thread_num = 1024;
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(context, &thread_num);
+#endif
+    int blocks = (nthreads + thread_num - 1) / thread_num;
+    dim3 threads(thread_num, 1);
     dim3 grid(blocks, 1);
 
     KernelPool3D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
@@ -962,8 +978,12 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
 
     int nthreads = batch_size * output_channels * output_depth * output_height *
                    output_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
+    int thread_num = 1024;
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(context, &thread_num);
+#endif
+    int blocks = (nthreads + thread_num - 1) / thread_num;
+    dim3 threads(thread_num, 1);
     dim3 grid(blocks, 1);
 
     KernelPool3D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
@@ -1377,10 +1397,14 @@ class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
     T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
 
     int nthreads = batch_size * output_channels * output_height * output_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
+    int thread_num = 1024;
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(context, &thread_num);
+#endif
 
+    int blocks = (nthreads + thread_num - 1) / thread_num;
+    dim3 threads(thread_num, 1);
+    dim3 grid(blocks, 1);
     KernelMaxPool2dWithIdx<T1, T2><<<grid, threads, 0, context.stream()>>>(
         nthreads, input_data, input_channels, input_height, input_width,
         output_height, output_width, ksize_height, ksize_width, stride_height,
@@ -1613,8 +1637,13 @@ class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
 
     int nthreads = batch_size * output_channels * output_depth * output_height *
                    output_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
+    int thread_num = 1024;
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(context, &thread_num);
+#endif
+
+    int blocks = (nthreads + thread_num - 1) / thread_num;
+    dim3 threads(thread_num, 1);
     dim3 grid(blocks, 1);
 
     KernelMaxPool3DWithIdx<T1, T2><<<grid, threads, 0, context.stream()>>>(
```
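One launch site above is handled differently: `Pool2dDirectCUDAFunctor` hardcodes `thread_num = 512` on Jetson, with the `ChangeThreadNum` call left commented out. The likely reason, inferred from the code rather than stated in the commit, is that this functor is handed a raw stream instead of a `CUDADeviceContext` (its launch uses `stream`, not `context.stream()`), so there is no context through which to query the compute capability, and the conservative Jetson value is applied unconditionally:

```cpp
// Abridged contrast, not the full Paddle signature: ChangeThreadNum needs a
// CUDADeviceContext to ask for the compute capability, which this
// stream-only entry point does not receive.
void LaunchDirect(cudaStream_t stream, int nthreads) {
  int thread_num = 1024;
#ifdef WITH_NV_JETSON
  thread_num = 512;  // no context available, so clamp unconditionally
#endif
  // ... launch with <<<grid, threads, 0, stream>>> ...
}
```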

paddle/fluid/operators/math/vol2col.cu

Lines changed: 16 additions & 4 deletions
```diff
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/operators/math/vol2col.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_launch_config.h"
 
 namespace paddle {
 namespace operators {
@@ -152,8 +153,14 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> {
     int num_outputs =
         input_channels * output_depth * output_height * output_width;
 
-    const int threads = 1024;
-    const int blocks = (num_outputs + 1024 - 1) / 1024;
+    int max_threads = 1024;
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(context, &max_threads);
+#endif
+
+    const int threads = max_threads;
+    const int blocks = (num_outputs + max_threads - 1) / max_threads;
+
     vol2col<T><<<blocks, threads, 0, context.stream()>>>(
         num_outputs, vol.data<T>(), input_depth, input_height, input_width,
         dilations[0], dilations[1], dilations[2], filter_depth, filter_height,
@@ -313,8 +320,13 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> {
 
     int num_kernels = input_channels * input_depth * input_height * input_width;
 
-    const int threads = 1024;
-    const int blocks = (num_kernels + 1024 - 1) / 1024;
+    int max_threads = 1024;
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(context, &max_threads);
+#endif
+
+    const int threads = max_threads;
+    const int blocks = (num_kernels + max_threads - 1) / max_threads;
 
     col2vol<T><<<blocks, threads, 0, context.stream()>>>(
         num_kernels, col.data<T>(), input_depth, input_height, input_width,
```

paddle/fluid/operators/roi_align_op.cu

Lines changed: 4 additions & 1 deletion
```diff
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/operators/roi_align_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_launch_config.h"
 
 namespace paddle {
 namespace operators {
@@ -261,7 +262,9 @@ class GPUROIAlignOpKernel : public framework::OpKernel<T> {
     int output_size = out->numel();
     int blocks = NumBlocks(output_size);
     int threads = kNumCUDAThreads;
-
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(ctx.cuda_device_context(), &threads, 256);
+#endif
     Tensor roi_batch_id_list;
     roi_batch_id_list.Resize({rois_num});
     auto cplace = platform::CPUPlace();
```
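Here the third argument overrides `ChangeThreadNum`'s default alternative of 512, so on Nano/TX2 the ROI Align launch drops to 256 threads per block. A sketch of the effect, assuming the helper's definition added in `gpu_launch_config.h` below; 512 is a stand-in for `kNumCUDAThreads`:

```cpp
int threads = 512;  // stand-in for kNumCUDAThreads
platform::ChangeThreadNum(ctx.cuda_device_context(), &threads, 256);
// threads == 256 on compute capability 53 (Nano) or 62 (TX2);
// unchanged on every other device.
```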

paddle/fluid/platform/for_range.h

Lines changed: 6 additions & 0 deletions
```diff
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/gpu_launch_config.h"
 
 namespace paddle {
 namespace platform {
@@ -65,6 +66,11 @@ struct ForRange<CUDADeviceContext> {
 #ifdef __HIPCC__
     // HIP will throw core dump when threads > 256
     constexpr int num_threads = 256;
+#elif WITH_NV_JETSON
+    // JETSON_NANO will throw core dump when threads > 128
+    int num_thread = 256;
+    platform::ChangeThreadNum(dev_ctx_, &num_thread, 128);
+    const int num_threads = num_thread;
 #else
     constexpr int num_threads = 1024;
 #endif
```
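Note why the new branch gives up `constexpr`: the Jetson thread count now depends on a runtime device query, so it is materialized as a `const int` instead. For orientation, a hedged sketch of how this struct is driven; `AddOne` is a hypothetical functor, but the `for_range(functor)` call shape matches the struct's `operator()`:

```cpp
// Hypothetical functor: ForRange invokes it once per index in [0, limit).
struct AddOne {
  float* data_;
  HOSTDEVICE void operator()(size_t i) const { data_[i] += 1.0f; }
};

// dev_ctx: platform::CUDADeviceContext; data: device pointer; limit: numel.
platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, limit);
for_range(AddOne{data});
```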

paddle/fluid/platform/gpu_launch_config.h

Lines changed: 31 additions & 10 deletions
```diff
@@ -23,6 +23,7 @@
 #else
 #include <hip/hip_runtime.h>
 #endif
+
 #include <stddef.h>
 #include <algorithm>
 #include <string>
@@ -33,6 +34,18 @@ namespace platform {
 
 inline int DivUp(int a, int b) { return (a + b - 1) / b; }
 
+#ifdef WITH_NV_JETSON
+// The number of threads cannot be assigned 1024 in some cases when the device
+// is nano or tx2.
+inline void ChangeThreadNum(const platform::CUDADeviceContext& context,
+                            int* num_thread, int alternative_num_thread = 512) {
+  if (context.GetComputeCapability() == 53 ||
+      context.GetComputeCapability() == 62) {
+    *num_thread = alternative_num_thread;
+  }
+}
+#endif
+
 struct GpuLaunchConfig {
   dim3 theory_thread_count = dim3(1, 1, 1);
   dim3 thread_per_block = dim3(1, 1, 1);
@@ -61,15 +74,22 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(
 
   // Compute physical threads we need, should small than max sm threads
   const int physical_thread_count =
-      std::min(max_physical_threads, theory_thread_count);
+      (std::min)(max_physical_threads, theory_thread_count);
+
+  // Get compute_capability
+  const int capability = context.GetComputeCapability();
+
+#ifdef WITH_NV_JETSON
+  if (capability == 53 || capability == 62) {
+    max_threads = 512;
+  }
+#endif
 
   // Need get from device
   const int thread_per_block =
-      std::min(max_threads, context.GetMaxThreadsPerBlock());
+      (std::min)(max_threads, context.GetMaxThreadsPerBlock());
   const int block_count =
-      std::min(DivUp(physical_thread_count, thread_per_block), sm);
-  // Get compute_capability
-  const int capability = context.GetComputeCapability();
+      (std::min)(DivUp(physical_thread_count, thread_per_block), sm);
 
   GpuLaunchConfig config;
   config.theory_thread_count.x = theory_thread_count;
@@ -91,19 +111,20 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(
                         y_dim));
 
   const int kThreadsPerBlock = 256;
-  int block_cols = std::min(x_dim, kThreadsPerBlock);
-  int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
+  int block_cols = (std::min)(x_dim, kThreadsPerBlock);
+  int block_rows = (std::max)(kThreadsPerBlock / block_cols, 1);
 
   int max_physical_threads = context.GetMaxPhysicalThreadCount();
-  const int max_blocks = std::max(max_physical_threads / kThreadsPerBlock, 1);
+  const int max_blocks = (std::max)(max_physical_threads / kThreadsPerBlock, 1);
 
   GpuLaunchConfig config;
   // Noticed, block size is not align to 32, if needed do it yourself.
   config.theory_thread_count = dim3(x_dim, y_dim, 1);
   config.thread_per_block = dim3(block_cols, block_rows, 1);
 
-  int grid_x = std::min(DivUp(x_dim, block_cols), max_blocks);
-  int grid_y = std::min(max_blocks / grid_x, std::max(y_dim / block_rows, 1));
+  int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks);
+  int grid_y =
+      (std::min)(max_blocks / grid_x, (std::max)(y_dim / block_rows, 1));
 
   config.block_per_grid = dim3(grid_x, grid_y, 1);
   return config;
```
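The `(std::min)`/`(std::max)` rewrites are the "Windows" item from the commit message: unless `NOMINMAX` is defined, `<windows.h>` declares function-style `min` and `max` macros, and `std::min(a, b)` gets mangled by the preprocessor. A function-style macro expands only when its name is immediately followed by `(`, so parenthesizing the name suppresses the expansion. A minimal illustration:

```cpp
#include <algorithm>
// #include <windows.h>  // on MSVC, defines min/max macros without NOMINMAX

int Smaller(int a, int b) {
  // std::min(a, b) would be macro-expanded and fail to compile here;
  // the extra parentheses keep the token after `min` from being `(`.
  return (std::min)(a, b);
}
```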
