Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 103 additions & 35 deletions paddle/fluid/operators/conv_cudnn_op.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {

// TODO(dangqingqing) simplify the following code by SearchAlgorithm in
// conv_cudnn_helper.h
bool has_got_workspace_size = false;
if ((!exhaustive_search) && (!half_float)) {
#if CUDNN_VERSION >= 7001
using perf_t = cudnnConvolutionFwdAlgoPerf_t;
Expand All @@ -176,11 +177,29 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnn_output_desc, kNUM_CUDNN_FWD_ALGS, &perf_count,
perf_results.get()));
algo = (perf_results.get())[best_algo_idx].algo;
#else
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(

// get workspace size able to allocate
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
cudnn_output_desc, algo, &workspace_size_in_bytes));

// NOTE(zjl): cudnnGetConvolutionForwardAlgorithm_v7 cannot limit
// workspace size. If the workspace size found by v7 exceeds the limit,
// we should fallback to non-v7 method to find another algorithm.
if (workspace_size_in_bytes > workspace_size_limit) {
VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<< workspace_size_in_bytes << ") exceeds the limit("
<< workspace_size_limit << ")";
#endif
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
#if CUDNN_VERSION >= 7001
} else {
has_got_workspace_size = true;
}
#endif

VLOG(3) << "cuDNN forward algo " << algo;
Expand Down Expand Up @@ -219,10 +238,13 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
"cuDNN exhaustive search doesn't support half float.");
}

// get workspace size able to allocate
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, algo, &workspace_size_in_bytes));
if (!has_got_workspace_size) {
// get workspace size able to allocate
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, algo, &workspace_size_in_bytes));
}

// It is possible for float16 on Volta GPU to allocate more memory than
// the limit because the algo is overrided to use tensor core.
PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
Expand Down Expand Up @@ -366,6 +388,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
auto x_dims = framework::vectorize(input->dims());
auto f_dims = framework::vectorize(filter->dims());
auto handle = dev_ctx.cudnn_handle();

bool has_got_bwd_data_ws_size = false;
if (input_grad) {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
if (exhaustive_search) {
Expand Down Expand Up @@ -431,28 +455,49 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
}
#else

CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
handle, cudnn_filter_desc,
// dyDesc: Handle to the previously initialized input
// differential
// tensor descriptor.
cudnn_output_grad_desc, cudnn_conv_desc,
// dxDesc: Handle to the previously initialized output tensor
// descriptor.
cudnn_input_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &data_algo));
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
handle, cudnn_filter_desc, cudnn_output_grad_desc,
cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size));
auto new_workspace_size = std::max(workspace_size_in_bytes, tmp_size);

if (new_workspace_size > workspace_size_limit) {
VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<< new_workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")";
#endif
CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
handle, cudnn_filter_desc,
// dyDesc: Handle to the previously initialized input
// differential
// tensor descriptor.
cudnn_output_grad_desc, cudnn_conv_desc,
// dxDesc: Handle to the previously initialized output tensor
// descriptor.
cudnn_input_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &data_algo));
#if CUDNN_VERSION >= 7001
} else {
workspace_size_in_bytes = new_workspace_size;
has_got_bwd_data_ws_size = true;
}
#endif
}
CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
handle, cudnn_filter_desc, cudnn_output_grad_desc,
cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size));
workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);

if (!has_got_bwd_data_ws_size) {
CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
handle, cudnn_filter_desc, cudnn_output_grad_desc,
cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size));
workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
}
}

bool has_got_bwd_filter_ws_size = false;
if (filter_grad) {
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
if (exhaustive_search) {
Expand Down Expand Up @@ -495,22 +540,45 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
cudnn_conv_desc, cudnn_filter_desc, kNUM_CUDNN_BWD_FILTER_ALGS,
&perf_count, perf_results.get()));
filter_algo = (perf_results.get())[best_algo_idx].algo;
#else

CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
handle, cudnn_input_desc, cudnn_output_grad_desc,
cudnn_conv_desc, cudnn_filter_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &filter_algo));
cudnn_conv_desc, cudnn_filter_desc, filter_algo, &tmp_size));
auto new_workspace_size = std::max(workspace_size_in_bytes, tmp_size);

if (new_workspace_size > workspace_size_limit) {
VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<< new_workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")";
#endif
CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
handle, cudnn_input_desc, cudnn_output_grad_desc,
cudnn_conv_desc, cudnn_filter_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &filter_algo));
#if CUDNN_VERSION >= 7001
} else {
workspace_size_in_bytes = new_workspace_size;
has_got_bwd_filter_ws_size = true;
}
#endif
}

if (!has_got_bwd_filter_ws_size) {
CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
handle, cudnn_input_desc, cudnn_output_grad_desc,
cudnn_conv_desc, cudnn_filter_desc, filter_algo, &tmp_size));
workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
}
CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
cudnn_filter_desc, filter_algo, &tmp_size));
workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
}

PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
"workspace_size to be allocated exceeds the limit");

// ------------------- cudnn conv workspace ---------------------
if (!cudnn_workspace_ptr) {
cudnn_workspace =
Expand Down