[cuDNN V8 API] (reopen 2) Allow the number of kernels profiled under torch.backends.cudnn.benchmark = True to be limited (pytorch#78299)

Reopen of pytorch#77002 to address comments by @malfet

CC @ngimel @ptrblck
Pull Request resolved: pytorch#78299
Approved by: https://github.com/ngimel
eqy authored and pytorchmergebot committed Jul 7, 2022
1 parent 7fd0cf5 commit ae6dd20
Showing 8 changed files with 80 additions and 6 deletions.
8 changes: 8 additions & 0 deletions aten/src/ATen/Context.cpp
@@ -144,6 +144,14 @@ void Context::setBenchmarkCuDNN(bool b) {
benchmark_cudnn = b;
}

int Context::benchmarkLimitCuDNN() const {
return benchmark_limit_cudnn;
}

void Context::setBenchmarkLimitCuDNN(int b) {
benchmark_limit_cudnn = b;
}

bool Context::allowTF32CuBLAS() const {
static bool allow_tf32_cublas_override = c10::utils::check_env("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") == true;
return allow_tf32_cublas_override || float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;
3 changes: 3 additions & 0 deletions aten/src/ATen/Context.h
@@ -121,6 +121,8 @@ class TORCH_API Context {
void setUserEnabledMkldnn(bool e);
bool benchmarkCuDNN() const;
void setBenchmarkCuDNN(bool);
int benchmarkLimitCuDNN() const;
void setBenchmarkLimitCuDNN(int);
bool deterministicCuDNN() const;
void setDeterministicCuDNN(bool);

@@ -254,6 +256,7 @@ class TORCH_API Context {
bool benchmark_cudnn = false;
Float32MatmulPrecision float32_matmul_precision =
at::Float32MatmulPrecision::HIGHEST;
int benchmark_limit_cudnn = 10;
bool allow_tf32_cudnn = true;
bool allow_fp16_reduction_cublas = true;
bool enabled_mkldnn = true;
8 changes: 5 additions & 3 deletions aten/src/ATen/native/cudnn/Conv_v8.cpp
@@ -344,7 +344,7 @@ void generate_and_filter_plans(const cudnnHandle_t handle, cudnn_frontend::Opera
remove_invalid = true;
}
}
if (remove_invalid) {
if (remove_invalid || max_plans) {
cudnn_frontend::executionPlans_t new_valid_plans;
unsigned int plan_count = 0;
for (auto &plan : valid_plans) {
@@ -370,7 +370,8 @@ auto get_plans_from_find(const cudnnHandle_t handle, const cudnnBackendDescripto
cudnn_frontend::executionPlans_t valid_plans;
c10::DeviceGuard g(x.options().device());
at::DataPtr workspace_ptr;
generate_and_filter_plans(handle, opGraph, generator, x, valid_plans, workspace_ptr);
auto benchmark_limit = at::globalContext().benchmarkLimitCuDNN();
generate_and_filter_plans(handle, opGraph, generator, x, valid_plans, workspace_ptr, benchmark_limit);
auto variantPack = cudnn_frontend::VariantPackBuilder()
.setDataPointers(3, data_ptrs)
.setUids(3, uids)
@@ -400,7 +401,8 @@ auto get_plans_from_find_fused(const cudnnHandle_t handle,
cudnn_frontend::executionPlans_t valid_plans;
c10::DeviceGuard g(x.options().device());
at::DataPtr workspace_ptr;
generate_and_filter_plans(handle, opGraph, generator, x, valid_plans, workspace_ptr);
auto benchmark_limit = at::globalContext().benchmarkLimitCuDNN();
generate_and_filter_plans(handle, opGraph, generator, x, valid_plans, workspace_ptr, benchmark_limit);
auto variantPack = cudnn_frontend::VariantPackBuilder()
.setDataPointers(5, data_ptrs)
.setUids(5, uids)
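In effect, generate_and_filter_plans now stops collecting candidate execution plans once the limit is reached, while a limit of 0 keeps the old behavior of trying every plan. A minimal Python sketch of that truncation logic (illustration only; the actual implementation is the C++ above, and filter_plans/is_valid are hypothetical stand-ins for the plan loop and cuDNN's validity check):

def filter_plans(candidate_plans, max_plans, is_valid):
    # Keep valid plans, stopping once max_plans have been collected;
    # max_plans == 0 means "no limit", matching benchmark_limit = 0.
    valid = []
    for plan in candidate_plans:
        if not is_valid(plan):
            continue
        valid.append(plan)
        if max_plans and len(valid) >= max_plans:
            break
    return valid

# With a limit of 3, only the first three valid candidates survive:
print(filter_plans(range(10), 3, lambda p: p % 2 == 0))  # [0, 2, 4]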
8 changes: 8 additions & 0 deletions docs/source/backends.rst
@@ -78,6 +78,14 @@ torch.backends.cudnn
A :class:`bool` that, if True, causes cuDNN to benchmark multiple convolution algorithms
and select the fastest.

.. attribute:: torch.backends.cudnn.benchmark_limit

An :class:`int` that specifies the maximum number of cuDNN convolution algorithms to try when
`torch.backends.cudnn.benchmark` is True. Set `benchmark_limit` to zero to try every
available algorithm. Note that this setting only affects convolutions dispatched via the
cuDNN v8 API.


torch.backends.mps
^^^^^^^^^^^^^^^^^^
.. automodule:: torch.backends.mps
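Taken together with the default of 10 set in Context.h, the new attribute can be used like any other cuDNN backend flag. A minimal usage sketch (assuming a CUDA build with the cuDNN v8 API enabled):

import torch

# Enable autotuning, but cap profiling at 5 cuDNN v8 execution plans per
# unique convolution configuration; 0 would mean "try every available plan".
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.benchmark_limit = 5

x = torch.randn(8, 3, 32, 32, device="cuda")
conv = torch.nn.Conv2d(3, 16, 3).cuda()
y = conv(x)  # the first call for this shape triggers the (limited) benchmarking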
4 changes: 4 additions & 0 deletions torch/CMakeLists.txt
@@ -142,6 +142,10 @@ if(USE_ROCM)
list(APPEND TORCH_PYTHON_INCLUDE_DIRECTORIES ${roctracer_INCLUDE_DIRS})
endif()

if(USE_EXPERIMENTAL_CUDNN_V8_API)
list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_EXPERIMENTAL_CUDNN_V8_API)
endif()

if(USE_CUDNN OR USE_ROCM)
list(APPEND TORCH_PYTHON_SRCS
${TORCH_SRC_DIR}/csrc/cuda/shared/cudnn.cpp
2 changes: 2 additions & 0 deletions torch/_C/__init__.pyi.in
@@ -1053,6 +1053,8 @@ def _cuda_jiterator_compile_and_launch_kernel(code_string: str,
num_outputs: _int,
tensors: Tuple,
kwargs: Dict[str, Union[_int, _float, _bool]]) -> Tensor: ...
def _cuda_get_cudnn_benchmark_limit() -> _int: ...
def _cuda_set_cudnn_benchmark_limit(arg: _int) -> None: ...
def _nccl_version() -> _int: ...
def _nccl_unique_id() -> bytes: ...
def _nccl_init_rank(nranks: _int, comm_id: bytes, rank: _int) -> object: ...
13 changes: 10 additions & 3 deletions torch/backends/cudnn/__init__.py
@@ -102,15 +102,18 @@ def is_acceptable(tensor):
return True


def set_flags(_enabled=None, _benchmark=None, _deterministic=None, _allow_tf32=None):
def set_flags(_enabled=None, _benchmark=None, _benchmark_limit=None, _deterministic=None, _allow_tf32=None):
orig_flags = (torch._C._get_cudnn_enabled(),
torch._C._get_cudnn_benchmark(),
None if not is_available() else torch._C._cuda_get_cudnn_benchmark_limit(),
torch._C._get_cudnn_deterministic(),
torch._C._get_cudnn_allow_tf32())
if _enabled is not None:
torch._C._set_cudnn_enabled(_enabled)
if _benchmark is not None:
torch._C._set_cudnn_benchmark(_benchmark)
if _benchmark_limit is not None and is_available():
torch._C._cuda_set_cudnn_benchmark_limit(_benchmark_limit)
if _deterministic is not None:
torch._C._set_cudnn_deterministic(_deterministic)
if _allow_tf32 is not None:
@@ -119,9 +122,9 @@ def set_flags(_enabled=None, _benchmark=None, _deterministic=None, _allow_tf32=N


@contextmanager
def flags(enabled=False, benchmark=False, deterministic=False, allow_tf32=True):
def flags(enabled=False, benchmark=False, benchmark_limit=10, deterministic=False, allow_tf32=True):
with __allow_nonbracketed_mutation():
orig_flags = set_flags(enabled, benchmark, deterministic, allow_tf32)
orig_flags = set_flags(enabled, benchmark, benchmark_limit, deterministic, allow_tf32)
try:
yield
finally:
@@ -141,6 +144,9 @@ def __init__(self, m, name):
enabled = ContextProp(torch._C._get_cudnn_enabled, torch._C._set_cudnn_enabled)
deterministic = ContextProp(torch._C._get_cudnn_deterministic, torch._C._set_cudnn_deterministic)
benchmark = ContextProp(torch._C._get_cudnn_benchmark, torch._C._set_cudnn_benchmark)
benchmark_limit = None
if is_available():
benchmark_limit = ContextProp(torch._C._cuda_get_cudnn_benchmark_limit, torch._C._cuda_set_cudnn_benchmark_limit)
allow_tf32 = ContextProp(torch._C._get_cudnn_allow_tf32, torch._C._set_cudnn_allow_tf32)

# This is the sys.modules replacement trick, see
@@ -152,3 +158,4 @@ def __init__(self, m, name):
deterministic: bool
benchmark: bool
allow_tf32: bool
benchmark_limit: int
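The flags() context manager above restores the previous settings on exit, so the limit can also be scoped to a region of code. A minimal sketch (assuming CUDA and cuDNN are available at runtime):

import torch

# Cap plan profiling at 4 candidates only inside this block; the prior
# benchmark/benchmark_limit values are restored when the block exits.
with torch.backends.cudnn.flags(enabled=True, benchmark=True, benchmark_limit=4):
    out = torch.nn.functional.conv2d(
        torch.randn(1, 3, 16, 16, device="cuda"),
        torch.randn(8, 3, 3, 3, device="cuda"),
    )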
40 changes: 40 additions & 0 deletions torch/csrc/cuda/Module.cpp
@@ -1,4 +1,10 @@
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAConfig.h>
#if AT_CUDNN_ENABLED()

#include <ATen/native/cudnn/Macros.h>

#endif
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAGeneratorImpl.h>
#include <ATen/cuda/CachingHostAllocator.h>
@@ -727,6 +733,32 @@ static PyObject* THCPModule_isCurrentStreamCapturing_wrap(
END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_setBenchmarkLimitCuDNN(PyObject* _unused, PyObject* arg) {
THPUtils_assert(
THPUtils_checkLong(arg),
"set_benchmark_limit_cudnn expects an int, "
"but got %s",
THPUtils_typename(arg));
auto benchmark_limit = static_cast<int>(THPUtils_unpackLong(arg));
#if defined(USE_ROCM)
TORCH_WARN_ONCE(
"cuDNN Benchmark limit is not supported in MIOpen and will have no effect.");
#endif
#if AT_CUDNN_ENABLED()
#if HAS_CUDNN_V8()
at::globalContext().setBenchmarkLimitCuDNN(benchmark_limit);
#else
TORCH_WARN_ONCE(
"cuDNN Benchmark limit is not supported with cuDNN v7 API and will have no effect.");
#endif
#endif
Py_RETURN_NONE;
}

PyObject* THCPModule_benchmarkLimitCuDNN(PyObject* _unused, PyObject* noargs) {
return THPUtils_packInt32(at::globalContext().benchmarkLimitCuDNN());
}

// NOLINTNEXTLINE(modernize-avoid-c-arrays,
// cppcoreguidelines-avoid-non-const-global-variables,
// cppcoreguidelines-avoid-c-arrays)
@@ -814,6 +846,14 @@ static struct PyMethodDef _THCPModule_methods[] = {
THCPModule_cudaJiteratorCompileAndLaunchKernel,
METH_VARARGS,
nullptr},
{"_cuda_get_cudnn_benchmark_limit",
THCPModule_benchmarkLimitCuDNN,
METH_NOARGS,
nullptr},
{"_cuda_set_cudnn_benchmark_limit",
THCPModule_setBenchmarkLimitCuDNN,
METH_O,
nullptr},
#ifdef USE_NCCL
{"_nccl_version", THCPModule_nccl_version, METH_NOARGS, nullptr},
{"_nccl_unique_id", THCPModule_nccl_unique_id, METH_NOARGS, nullptr},
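These private bindings back the public torch.backends.cudnn.benchmark_limit attribute via the ContextProp wiring shown earlier, so both spellings read the same global setting. A quick sketch (assuming a cuDNN v8 build; _cuda_get_cudnn_benchmark_limit is the binding added above):

import torch

# Both expressions read the same global value; 10 is the default from Context.h.
print(torch.backends.cudnn.benchmark_limit)        # e.g. 10
print(torch._C._cuda_get_cudnn_benchmark_limit())  # same value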
