Merge pull request #705 from JuliaGPU/tb/compat_cudnn
Backports for Julia 1.5
maleadt authored Feb 10, 2021
2 parents 02ac36a + b52b41d commit dad01a5
Showing 39 changed files with 3,426 additions and 1,209 deletions.
54 changes: 27 additions & 27 deletions .buildkite/pipeline.yml
@@ -3,11 +3,11 @@ steps:

- label: "Julia 1.5"
plugins:
- JuliaCI/julia#v0.5:
- JuliaCI/julia#v1:
version: 1.5
- JuliaCI/julia-test#v0.3:
- JuliaCI/julia-test#v1:
test_args: "--quickfail"
- JuliaCI/julia-coverage#v0.3:
- JuliaCI/julia-coverage#v1:
codecov: true
dirs:
- src
@@ -23,11 +23,11 @@ steps:

- label: "Julia 1.5 (debug)"
plugins:
- JuliaCI/julia#v0.4:
- JuliaCI/julia#v1:
version: 1.5
- JuliaCI/julia-test#v0.2:
- JuliaCI/julia-test#v1:
julia_args: "-g2"
- JuliaCI/julia-coverage#v0.2:
- JuliaCI/julia-coverage#v1:
codecov: true
dirs:
- src
@@ -50,10 +50,10 @@ steps:

- label: "CUDA 11.2"
plugins:
- JuliaCI/julia#v0.6:
- JuliaCI/julia#v1:
version: 1.5
- JuliaCI/julia-test#v0.3: ~
- JuliaCI/julia-coverage#v0.3:
- JuliaCI/julia-test#v1: ~
- JuliaCI/julia-coverage#v1:
codecov: true
dirs:
- src
@@ -70,10 +70,10 @@ steps:

- label: "CUDA 11.1"
plugins:
- JuliaCI/julia#v0.4:
- JuliaCI/julia#v1:
version: 1.5
- JuliaCI/julia-test#v0.2: ~
- JuliaCI/julia-coverage#v0.2:
- JuliaCI/julia-test#v1: ~
- JuliaCI/julia-coverage#v1:
codecov: true
dirs:
- src
@@ -92,10 +92,10 @@ steps:

- label: "CUDA 11.0"
plugins:
- JuliaCI/julia#v0.4:
- JuliaCI/julia#v1:
version: 1.5
- JuliaCI/julia-test#v0.2: ~
- JuliaCI/julia-coverage#v0.2:
- JuliaCI/julia-test#v1: ~
- JuliaCI/julia-coverage#v1:
codecov: true
dirs:
- src
@@ -114,10 +114,10 @@ steps:

- label: "CUDA 10.2"
plugins:
- JuliaCI/julia#v0.4:
- JuliaCI/julia#v1:
version: 1.5
- JuliaCI/julia-test#v0.2: ~
- JuliaCI/julia-coverage#v0.2:
- JuliaCI/julia-test#v1: ~
- JuliaCI/julia-coverage#v1:
codecov: true
dirs:
- src
@@ -136,10 +136,10 @@ steps:

- label: "CUDA 10.1"
plugins:
- JuliaCI/julia#v0.4:
- JuliaCI/julia#v1:
version: 1.5
- JuliaCI/julia-test#v0.2: ~
- JuliaCI/julia-coverage#v0.2:
- JuliaCI/julia-test#v1: ~
- JuliaCI/julia-coverage#v1:
codecov: true
dirs:
- src
@@ -199,10 +199,10 @@ steps:

- label: "Split memory pool"
plugins:
- JuliaCI/julia#v0.4:
- JuliaCI/julia#v1:
version: 1.5
- JuliaCI/julia-test#v0.2: ~
- JuliaCI/julia-coverage#v0.2:
- JuliaCI/julia-test#v1: ~
- JuliaCI/julia-coverage#v1:
codecov: true
dirs:
- src
@@ -225,7 +225,7 @@ steps:
# so they can run on any system in the juliagpu queue.
- label: "Benchmarks (dry run)"
plugins:
- JuliaCI/julia#v0.4:
- JuliaCI/julia#v1:
version: 1.5
command: |
julia --project -e '
@@ -249,7 +249,7 @@ steps:
# be running on the same system each time
- label: "Benchmarks"
plugins:
- JuliaCI/julia#v0.4:
- JuliaCI/julia#v1:
version: 1.5
env:
CODESPEED_PROJECT: "$BUILDKITE_PIPELINE_NAME"
@@ -278,7 +278,7 @@ steps:

- label: "Documentation"
plugins:
- JuliaCI/julia#v0.4:
- JuliaCI/julia#v1:
version: 1.5
command: |
julia --project -e '
19 changes: 19 additions & 0 deletions deps/discovery.jl
@@ -158,6 +158,21 @@ const cuda_releases = [v"1.0", v"1.1",
v"11.0", v"11.1"]

const cuda_library_versions = Dict(
v"11.0.1" => Dict(
# NOTE: encountered this version in a Docker container; not sure where it came from.
"cudart" => v"11.0.171",
"cupti" => "2020.1.0", # wtf
"nvrtc" => v"11.0.167",
"nvtx" => v"11.0.167",
"nvvp" => v"11.0.167",
"cublas" => v"11.0.0", #.191
"cufft" => v"10.1.3", #.191
"curand" => v"10.2.0", #.191
"cusolver" => v"10.4.0", #.191
"cusparse" => v"11.0.0", #.191
"npp" => v"11.0.0", #.191
"nvjpeg" => v"11.0.0", #.191
),
v"11.0.2" => Dict(
"cudart" => v"11.0.171",
"cupti" => "2020.1.0", # wtf
@@ -250,6 +265,10 @@ const cuda_library_names = Dict(

# only for nvdisasm, to discover the CUDA toolkit version
const cuda_binary_versions = Dict(
v"11.0.1" => Dict(
# NOTE: encountered this version in a Docker container; not sure where it came from.
"nvdisasm" => v"11.0.167"
),
v"11.0.2" => Dict(
"nvdisasm" => v"11.0.194"
),
12 changes: 8 additions & 4 deletions lib/cudnn/CUDNN.jl
@@ -19,20 +19,24 @@ include("libcudnn_deprecated.jl")
# low-level wrappers
include("util.jl")
include("base.jl")
include("descriptors.jl")
include("tensor.jl")
include("conv.jl")
include("inplace.jl")
include("optensor.jl")
include("reduce.jl")
include("convolution.jl")
include("pooling.jl")
include("activation.jl")
include("filter.jl")
include("softmax.jl")
include("batchnorm.jl")
include("dropout.jl")
include("rnn.jl")
include("multiheadattn.jl")
include("normalization.jl")

# high-level integrations
include("nnlib.jl")
include("batchnorm.jl")

include("compat.jl")

function math_mode(mode=CUDA.math_mode())
if mode == CUDA.PEDANTIC_MATH
91 changes: 91 additions & 0 deletions lib/cudnn/README.md
@@ -0,0 +1,91 @@
## High level interface to cuDNN functions
Deniz Yuret, Nov 6, 2020

The goal of the high-level interface is to map the low level cuDNN calls to more natural
Julia functions. Here are some design choices I followed:

**Naming:** We try to keep the same function, argument, and type names from the cuDNN
library in the high level interface. The wrappers for descriptors drop the `_t` suffix,
e.g. `cudnnPoolingDescriptor_t => cudnnPoolingDescriptor`.

**Descriptors:** The cuDNN functions take data and operator descriptors. Most of these
descriptors are relatively fast to create (~500 ns for a cudnnTensorDescriptor) so they may
not be worth preallocating for the user, but we provide keyword options anyway. We cache
descriptors (a cache lookup takes ~100 ns) so we can use them as hash keys for memoization,
which also saves a bit of memory and time. All descriptor fields are `isbits` types with the exception of the
`cudnnDropoutDescriptor` which points to a random number generator state and is used as a
field of some other descriptors.
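
For illustration, here is a minimal sketch of the caching idea, not the actual
implementation (the cache container and helper below are made up for this example):

    # Illustrative only: reuse a descriptor whose (isbits) construction arguments
    # have been seen before instead of recreating it on every call.
    const DESCRIPTOR_CACHE = Dict{Any,Any}()

    function cached_descriptor(constructor, args...)
        get!(DESCRIPTOR_CACHE, (constructor, args)) do
            constructor(args...)   # pay the ~500 ns construction cost only on a miss
        end
    end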

**Operator descriptors:** Descriptors such as `cudnnPoolingDescriptor` specify the options
for an operator such as stride and padding. For operators with descriptors we have one
method that takes keyword arguments with reasonable defaults to construct the descriptor and
another method that takes a pre-initialized descriptor as its last argument. This way a
casual user can call the first method without worrying about the descriptor format, only
specifying non-default options, whereas a layer architect can keep a preset descriptor in
the layer that gets passed to the function using the second method. We try to use generic
Julia types for keyword arguments that specify default descriptor fields and convert these
to the appropriate cudnn types during descriptor construction.
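
For example, with the activation wrapper documented later in this PR (a hedged usage
sketch: it assumes the `CUDA.CUDNN` submodule, a functional GPU, and that these names
are reachable as written):

    using CUDA
    import CUDA: CUDNN

    x = CUDA.rand(Float32, 8, 8, 4, 2)

    # casual use: pass options as keywords, the descriptor is built (and cached) internally
    y1 = CUDNN.cudnnActivationForward(x; mode=CUDNN.CUDNN_ACTIVATION_TANH)

    # layer-architect use: build the descriptor once and reuse it on every call
    d = CUDNN.cudnnActivationDescriptor(CUDNN.CUDNN_ACTIVATION_TANH,
                                        CUDNN.CUDNN_NOT_PROPAGATE_NAN, Cdouble(1))
    y2 = CUDNN.cudnnActivationForward(x, d)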

**Output arrays:** The low level cuDNN functions take pre-allocated output arrays. The high
level interface has one Julia function that allocates its own output array
(e.g. `cudnnPoolingForward`) and another with an exclamation mark that takes a pre-allocated
output array as its first argument (e.g. `cudnnPoolingForward!`).

**Methods:** Each cuDNN forward function may have up to four methods depending on whether
the descriptor and the output array are specified:

cudnnPoolingForward(x; kwargs...)
cudnnPoolingForward(x, d::cudnnPoolingDescriptor; kwargs...)
cudnnPoolingForward!(y, x; kwargs...)
cudnnPoolingForward!(y, x, d::cudnnPoolingDescriptor; kwargs...)

The conventional order of arguments for these public methods is:

([output], weights, inputs, [descriptor]; kwargs...)
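
Continuing the sketch above, the in-place variants take the pre-allocated output as their
first argument (again hedged, assuming the same names and setup):

    y = similar(x)
    CUDNN.cudnnActivationForward!(y, x; mode=CUDNN.CUDNN_ACTIVATION_TANH)  # caller-supplied output
    CUDNN.cudnnActivationForward!(y, x, d)                                 # same, with a preset descriptor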

**AD method:** Sometimes neither the high level nor the low level interface is
appropriate for gradient definitions: e.g. the low level API may not return a value, and the
high level API may take some gradient target parameters as keyword arguments. To solve this
issue the API exposes an intermediate function with an AD suffix,
e.g. `cudnnPoolingForwardAD`, that is called by the high level method and that in turn makes
the low level library call. These methods may not seem to do anything useful,
but they should not be removed: automatic gradient packages can hook into them.

**Backward functions:** The point of a high level interface is to give the user appropriate
defaults for the many options of typical cudnn functions. Backward functions do not have
meaningful defaults because they need to copy their options from the corresponding forward
function. Therefore we do not need high level APIs for backward functions unless they are
useful in some other way. See Knet/src/cudnn for example uses.

**Types:** Do not specify types for array arguments. Leave the high level functions generic
so they can be called with CuArray, KnetArray, AutoGrad.Param etc. Types can and should be
specified for non-array arguments. In the API we use `nothing` to indicate unspecified array
argument values and convert these to `C_NULL` or `CU_NULL` as appropriate only at the low-level
call. Similarly, for numbers the API should accept generic types like `Integer` or `Real` and
convert these to the appropriate specific type, e.g. `Cint` or `Cdouble`, only at the
low-level call.
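
A minimal sketch of this convention (the function names below are placeholders, not
actual CUDA.jl or cuDNN wrappers): the public signature stays generic and the narrowing
to C types happens only right before the library call.

    # placeholder standing in for a generated low-level ccall wrapper
    liblevel_call(ptr, n::Cint, coef::Cdouble) = (ptr, n, coef)

    function api_call(x, n::Integer, coef::Real)
        ptr = x === nothing ? C_NULL : pointer(x)   # `nothing` becomes C_NULL only here
        liblevel_call(ptr, Cint(n), Cdouble(coef))  # Integer/Real narrowed only here
    end

    api_call(nothing, 3, 1)         # unspecified array argument
    api_call(zeros(4), Int8(3), 1)  # generic Integer/Real inputs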

**Workspace:** Some functions need a temporary allocated workspace whose required size is
determined by another cudnn call. Unfortunately, the required size may depend on factors
other than the current inputs (see [this
issue](https://github.com/FluxML/Flux.jl/issues/923#issuecomment-558671966)), so the
`@workspace` macro is used at a point as close to the library call as possible. The one
exception is when the same workspace must be passed to the backward call, in
which case we allocate a regular CuArray.
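
A rough sketch of the pattern the macro captures, with stand-in names (`required_bytes`
and `library_call` are not real cudnn calls): the buffer is sized immediately before the
call and only lives for that call.

    required_bytes() = 1 << 20                   # stand-in for a cudnn *GetWorkspaceSize query
    library_call(workspace, nbytes) = nbytes     # stand-in for the actual cudnn call

    function with_workspace(f, nbytes::Integer)
        buf = Vector{UInt8}(undef, nbytes)       # in CUDA.jl this would be device memory
        f(buf)                                   # the macro additionally reclaims the buffer eagerly
    end

    with_workspace(required_bytes()) do ws
        library_call(ws, sizeof(ws))             # workspace handed straight to the library call
    end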

**Training vs Inference:** cuDNN does not distinguish training from inference calls in a consistent way:
* BatchNormalization and Normalization have two separate functions: `cudnnNormalizationForwardTraining / Inference`
* RNN has an indicator argument: `fwdMode` in `cudnnRNNForward`
* MultiHeadAttn looks at the `reserveSpace` argument to decide: if `NULL` inference mode, otherwise training mode
* Dropout always runs in training mode with a non-NULL `reserveSpace` (it doesn't make sense in inference mode)
* Activation, convolution, pooling, softmax, optensor, addtensor, reducetensor do not make a distinction between the two modes

In the high level API we assume inference by default and let the gradient packages override when necessary.
See the gradient implementations in Knet/src/cudnn for examples.

**TODO:**
* Keyword arg descriptor constructors.
* Test forw fns with descriptors: check for desc vs kwarg incompatibility.
* Find out about cudnnRNNSetClip_v8.
* Test with Knet.Ops20.
* Command used to test: julia17 --project -e 'using Pkg; Pkg.API.test(; test_args=`--memcheck --jobs=1 cudnn`)'
87 changes: 50 additions & 37 deletions lib/cudnn/activation.jl
@@ -1,44 +1,57 @@
# descriptor

mutable struct ActivationDesc
ptr::cudnnActivationDescriptor_t
"""
cudnnActivationForward(x; mode, nanOpt, coef, alpha)
cudnnActivationForward(x, d::cudnnActivationDescriptor; alpha)
cudnnActivationForward!(y, x; mode, nanOpt, coef, alpha, beta)
cudnnActivationForward!(y, x, d::cudnnActivationDescriptor; alpha, beta)
Return the result of the specified elementwise activation operation applied to `x`.
Optionally `y` holds the result and `d` specifies the operation. `y` should be similar to
`x` if specified. Keyword arguments `alpha=1, beta=0` can be used for scaling, i.e. `y .=
alpha*op.(x1) .+ beta*y`. The following keyword arguments specify the operation if `d` is
not given:
* `mode = CUDNN_ACTIVATION_RELU`: Options are SIGMOID, RELU, TANH, CLIPPED_RELU, ELU, IDENTITY
* `nanOpt = CUDNN_NOT_PROPAGATE_NAN`: NAN propagation policy, the other option is `CUDNN_PROPAGATE_NAN`
* `coef=1`: When the activation mode is set to CUDNN_ACTIVATION_CLIPPED_RELU, this input specifies the clipping threshold; and when the activation mode is set to CUDNN_ACTIVATION_ELU, this input specifies the α parameter.
"""
cudnnActivationForward, cudnnActivationForward!


# Public methods
cudnnActivationForward(x; o...) = cudnnActivationForwardWithDefaults(x; o...)
cudnnActivationForward!(y, x; o...) = cudnnActivationForwardWithDefaults(x; y, o...)
cudnnActivationForward(x, d::cudnnActivationDescriptor; o...) = cudnnActivationForwardWithDefaults(x; activationDesc=d, o...)
cudnnActivationForward!(y, x, d::cudnnActivationDescriptor; o...) = cudnnActivationForwardWithDefaults(x; y, activationDesc=d, o...)


# Private method
function cudnnActivationForwardWithDefaults(
x;
y = similar(x),
mode::cudnnActivationMode_t = CUDNN_ACTIVATION_RELU,
nanOpt::cudnnNanPropagation_t = CUDNN_NOT_PROPAGATE_NAN,
coef::Real=1,
activationDesc::cudnnActivationDescriptor = cudnnActivationDescriptor(mode, nanOpt, Cdouble(coef)),
alpha::Real=1,
beta::Real=0,
xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x),
yDesc::cudnnTensorDescriptor = xDesc,
)
T = eltype(x)
alpha, beta = scalingParameter(T,alpha), scalingParameter(T,beta)
cudnnActivationForwardAD(x; activationDesc, alpha, xDesc, beta, yDesc, y)
end

unsafe_free!(ad::ActivationDesc)=cudnnDestroyActivationDescriptor(ad.ptr)

Base.unsafe_convert(::Type{cudnnActivationDescriptor_t}, ad::ActivationDesc)=ad.ptr

function ActivationDesc(mode, coeff, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN)
ad = Ref{cudnnActivationDescriptor_t}()
cudnnCreateActivationDescriptor(ad)
cudnnSetActivationDescriptor(ad[],mode,reluNanOpt,coeff)
this = ActivationDesc(ad[])
finalizer(unsafe_free!, this)
return this
# AD method:
function cudnnActivationForwardAD(x; activationDesc, alpha, xDesc, beta, yDesc, y)
cudnnActivationForward(handle(), activationDesc, alpha, xDesc, x, beta, yDesc, y)
return y
end


# wrappers

function cudnnActivationForward(x::DenseCuArray{T,N}, y::DenseCuArray{T,N}=x;
mode=CUDNN_ACTIVATION_RELU, # CUDNN_ACTIVATION_IDENTITY will not work
coeff=false, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=true,
beta=false) where {T,N}
cudnnActivationForward(handle(), ActivationDesc(mode, T(coeff), reluNanOpt),
scalingParameter(T, alpha), TensorDesc(x), x,
scalingParameter(T, beta ), TensorDesc(y), y)
return y
end

function cudnnActivationBackward(x::DenseCuArray{T,N}, dx::DenseCuArray{T,N},
y::DenseCuArray{T,N}, dy::DenseCuArray{T,N}=dx;
mode=CUDNN_ACTIVATION_RELU, # CUDNN_ACTIVATION_IDENTITY will not work
coeff=false, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=1,
beta=false) where {T,N}
cudnnActivationBackward(handle(), ActivationDesc(mode, T(coeff), reluNanOpt),
scalingParameter(T, alpha), TensorDesc( y), y,
TensorDesc(dy), dy,
TensorDesc( x), x,
scalingParameter(T, beta ), TensorDesc(dx), dx)
return dx
# Deprecated:
function cudnnActivationForward(x::DenseCuArray{T,N}, y::DenseCuArray{T,N}; o...) where {T,N}
@warn "`cudnnActivationForward(x,y)` is deprecated, please use one of the methods in `@doc cudnnActivationForward`." maxlog=1
cudnnActivationForward!(y, x; o...)
end