Commit 0a96ec6

fix conv v7 workspace size limit error, test=develop (#17902)
1 parent 4d5f693 commit 0a96ec6
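
This commit fixes a workspace-size-limit error in the cuDNN convolution kernels. The cuDNN 7 algorithm-search API (cudnnGetConvolutionForwardAlgorithm_v7 and the matching backward-data and backward-filter calls) cannot be given a workspace limit, so the algorithm it returns may require more workspace than the configured workspace_size_limit. The fix queries the workspace needed by the v7 choice and, if it exceeds the limit, falls back to the pre-v7 heuristics (cudnnGetConvolution*Algorithm with the *_SPECIFY_WORKSPACE_LIMIT preference), which do honor a memory limit.

Below is a condensed sketch of the forward-path selection logic, not the kernel code itself: it assumes cuDNN 7.x (where the legacy heuristic API still exists), omits descriptor setup and PaddlePaddle's CUDNN_ENFORCE wrapper, and uses a hypothetical CheckCudnn helper for error handling.

#include <cudnn.h>

#include <cstddef>
#include <cstdio>
#include <cstdlib>

// Hypothetical helper: abort on any cuDNN error (the kernel uses CUDNN_ENFORCE).
static void CheckCudnn(cudnnStatus_t s) {
  if (s != CUDNN_STATUS_SUCCESS) {
    std::fprintf(stderr, "cuDNN error: %s\n", cudnnGetErrorString(s));
    std::abort();
  }
}

// Pick a forward algorithm whose workspace fits within workspace_limit.
cudnnConvolutionFwdAlgo_t ChooseFwdAlgo(cudnnHandle_t handle,
                                        cudnnTensorDescriptor_t x_desc,
                                        cudnnFilterDescriptor_t w_desc,
                                        cudnnConvolutionDescriptor_t conv_desc,
                                        cudnnTensorDescriptor_t y_desc,
                                        size_t workspace_limit,
                                        size_t* workspace_bytes) {
  // 1) Ask the v7 heuristic for its best candidate. It cannot be told about
  //    a workspace limit, so the candidate may need too much memory.
  cudnnConvolutionFwdAlgoPerf_t perf;
  int returned_count = 0;
  CheckCudnn(cudnnGetConvolutionForwardAlgorithm_v7(
      handle, x_desc, w_desc, conv_desc, y_desc, /*requestedAlgoCount=*/1,
      &returned_count, &perf));
  cudnnConvolutionFwdAlgo_t algo = perf.algo;

  // 2) Query how much workspace that candidate actually requires.
  CheckCudnn(cudnnGetConvolutionForwardWorkspaceSize(
      handle, x_desc, w_desc, conv_desc, y_desc, algo, workspace_bytes));

  // 3) If the request exceeds the limit, fall back to the pre-v7 heuristic,
  //    which accepts an explicit limit, and re-query the workspace size.
  if (*workspace_bytes > workspace_limit) {
    CheckCudnn(cudnnGetConvolutionForwardAlgorithm(
        handle, x_desc, w_desc, conv_desc, y_desc,
        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_limit, &algo));
    CheckCudnn(cudnnGetConvolutionForwardWorkspaceSize(
        handle, x_desc, w_desc, conv_desc, y_desc, algo, workspace_bytes));
  }
  return algo;
}

The actual kernel additionally records whether the workspace size has already been obtained (has_got_workspace_size and its backward counterparts) so it is not queried a second time further down, as the diff below shows.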

File tree

1 file changed: +103 -35 lines changed

paddle/fluid/operators/conv_cudnn_op.cu.cc

Lines changed: 103 additions & 35 deletions
@@ -165,6 +165,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
 
     // TODO(dangqingqing) simplify the following code by SearchAlgorithm in
     // conv_cudnn_helper.h
+    bool has_got_workspace_size = false;
     if ((!exhaustive_search) && (!half_float)) {
 #if CUDNN_VERSION >= 7001
       using perf_t = cudnnConvolutionFwdAlgoPerf_t;
@@ -176,11 +177,29 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
           cudnn_output_desc, kNUM_CUDNN_FWD_ALGS, &perf_count,
           perf_results.get()));
       algo = (perf_results.get())[best_algo_idx].algo;
-#else
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+
+      // get workspace size able to allocate
+      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
           handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
-          cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &algo));
+          cudnn_output_desc, algo, &workspace_size_in_bytes));
+
+      // NOTE(zjl): cudnnGetConvolutionForwardAlgorithm_v7 cannot limit
+      // workspace size. If the workspace size found by v7 exceeds the limit,
+      // we should fall back to the non-v7 method to find another algorithm.
+      if (workspace_size_in_bytes > workspace_size_limit) {
+        VLOG(1) << "Fallback to non-v7 method to find conv algorithm because "
+                   "the workspace size request("
+                << workspace_size_in_bytes << ") exceeds the limit("
+                << workspace_size_limit << ")";
+#endif
+        CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+            handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+            cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+            workspace_size_limit, &algo));
+#if CUDNN_VERSION >= 7001
+      } else {
+        has_got_workspace_size = true;
+      }
 #endif
 
     VLOG(3) << "cuDNN forward algo " << algo;
@@ -219,10 +238,13 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
           "cuDNN exhaustive search doesn't support half float.");
     }
 
-    // get workspace size able to allocate
-    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
-        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
-        cudnn_output_desc, algo, &workspace_size_in_bytes));
+    if (!has_got_workspace_size) {
+      // get workspace size able to allocate
+      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+          handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+          cudnn_output_desc, algo, &workspace_size_in_bytes));
+    }
+
     // It is possible for float16 on Volta GPU to allocate more memory than
     // the limit because the algo is overridden to use tensor core.
     PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
@@ -366,6 +388,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     auto x_dims = framework::vectorize(input->dims());
     auto f_dims = framework::vectorize(filter->dims());
     auto handle = dev_ctx.cudnn_handle();
+
+    bool has_got_bwd_data_ws_size = false;
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       if (exhaustive_search) {
@@ -431,28 +455,49 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                   CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
           data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
         }
-#else
+
         CUDNN_ENFORCE(
-            platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-                handle, cudnn_filter_desc,
-                // dyDesc: Handle to the previously initialized input
-                // differential
-                // tensor descriptor.
-                cudnn_output_grad_desc, cudnn_conv_desc,
-                // dxDesc: Handle to the previously initialized output tensor
-                // descriptor.
-                cudnn_input_desc,
-                CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-                workspace_size_limit, &data_algo));
+            platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
+                handle, cudnn_filter_desc, cudnn_output_grad_desc,
+                cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size));
+        auto new_workspace_size = std::max(workspace_size_in_bytes, tmp_size);
+
+        if (new_workspace_size > workspace_size_limit) {
+          VLOG(1) << "Fallback to non-v7 method to find conv algorithm because "
+                     "the workspace size request("
+                  << new_workspace_size << ") exceeds the limit("
+                  << workspace_size_limit << ")";
+#endif
+          CUDNN_ENFORCE(
+              platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+                  handle, cudnn_filter_desc,
+                  // dyDesc: Handle to the previously initialized input
+                  // differential
+                  // tensor descriptor.
+                  cudnn_output_grad_desc, cudnn_conv_desc,
+                  // dxDesc: Handle to the previously initialized output tensor
+                  // descriptor.
+                  cudnn_input_desc,
+                  CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+                  workspace_size_limit, &data_algo));
+#if CUDNN_VERSION >= 7001
+        } else {
+          workspace_size_in_bytes = new_workspace_size;
+          has_got_bwd_data_ws_size = true;
+        }
 #endif
       }
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
-              handle, cudnn_filter_desc, cudnn_output_grad_desc,
-              cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size));
-      workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
+
+      if (!has_got_bwd_data_ws_size) {
+        CUDNN_ENFORCE(
+            platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
+                handle, cudnn_filter_desc, cudnn_output_grad_desc,
+                cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size));
+        workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
+      }
     }
 
+    bool has_got_bwd_filter_ws_size = false;
     if (filter_grad) {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
       if (exhaustive_search) {
@@ -495,22 +540,45 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
             cudnn_conv_desc, cudnn_filter_desc, kNUM_CUDNN_BWD_FILTER_ALGS,
             &perf_count, perf_results.get()));
         filter_algo = (perf_results.get())[best_algo_idx].algo;
-#else
+
         CUDNN_ENFORCE(
-            platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+            platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
                 handle, cudnn_input_desc, cudnn_output_grad_desc,
-                cudnn_conv_desc, cudnn_filter_desc,
-                CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-                workspace_size_limit, &filter_algo));
+                cudnn_conv_desc, cudnn_filter_desc, filter_algo, &tmp_size));
+        auto new_workspace_size = std::max(workspace_size_in_bytes, tmp_size);
+
+        if (new_workspace_size > workspace_size_limit) {
+          VLOG(1) << "Fallback to non-v7 method to find conv algorithm because "
+                     "the workspace size request("
+                  << new_workspace_size << ") exceeds the limit("
+                  << workspace_size_limit << ")";
 #endif
+          CUDNN_ENFORCE(
+              platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+                  handle, cudnn_input_desc, cudnn_output_grad_desc,
+                  cudnn_conv_desc, cudnn_filter_desc,
+                  CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+                  workspace_size_limit, &filter_algo));
+#if CUDNN_VERSION >= 7001
+        } else {
+          workspace_size_in_bytes = new_workspace_size;
+          has_got_bwd_filter_ws_size = true;
+        }
+#endif
+      }
+
+      if (!has_got_bwd_filter_ws_size) {
+        CUDNN_ENFORCE(
+            platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
+                handle, cudnn_input_desc, cudnn_output_grad_desc,
+                cudnn_conv_desc, cudnn_filter_desc, filter_algo, &tmp_size));
+        workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
       }
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
-              handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
-              cudnn_filter_desc, filter_algo, &tmp_size));
-      workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
     }
 
+    PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
+                      "workspace_size to be allocated exceeds the limit");
+
     // ------------------- cudnn conv workspace ---------------------
     if (!cudnn_workspace_ptr) {
       cudnn_workspace =
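
In the backward kernel the data-gradient and filter-gradient algorithms share a single workspace buffer, so the kernel keeps the maximum of the two workspace requests rather than their sum, and the PADDLE_ENFORCE_LE added at the end of this diff verifies that the combined requirement still fits under the limit. A minimal, self-contained sketch of that accounting, with an illustrative function name and a plain exception standing in for PADDLE_ENFORCE_LE:

#include <algorithm>
#include <cstddef>
#include <stdexcept>

// Combine the backward-data and backward-filter workspace requests the way the
// kernel does: one shared buffer, so take the max of the two requests, then
// enforce the limit (mirroring the newly added PADDLE_ENFORCE_LE check).
std::size_t CombineBackwardWorkspace(std::size_t bwd_data_ws_bytes,
                                     std::size_t bwd_filter_ws_bytes,
                                     std::size_t workspace_size_limit) {
  std::size_t workspace_size_in_bytes = 0;
  workspace_size_in_bytes = std::max(workspace_size_in_bytes, bwd_data_ws_bytes);
  workspace_size_in_bytes = std::max(workspace_size_in_bytes, bwd_filter_ws_bytes);
  if (workspace_size_in_bytes > workspace_size_limit) {
    throw std::runtime_error("workspace_size to be allocated exceeds the limit");
  }
  return workspace_size_in_bytes;
}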
