
Commit d031cc9

denizyuret authored and maleadt committed
New high level interface for cuDNN
1 parent 2d5700c commit d031cc9

37 files changed: +3376 -1179 lines

lib/cudnn/CUDNN.jl

Lines changed: 8 additions & 4 deletions
@@ -19,20 +19,24 @@ include("libcudnn_deprecated.jl")
 # low-level wrappers
 include("util.jl")
 include("base.jl")
+include("descriptors.jl")
 include("tensor.jl")
-include("conv.jl")
+include("inplace.jl")
+include("optensor.jl")
+include("reduce.jl")
+include("convolution.jl")
 include("pooling.jl")
 include("activation.jl")
-include("filter.jl")
 include("softmax.jl")
-include("batchnorm.jl")
 include("dropout.jl")
 include("rnn.jl")
+include("multiheadattn.jl")
+include("normalization.jl")
 
 # high-level integrations
 include("nnlib.jl")
+include("batchnorm.jl")
 
-include("compat.jl")
 
 function math_mode(mode=CUDA.math_mode())
     if mode == CUDA.PEDANTIC_MATH

lib/cudnn/README.md

Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
## High level interface to cuDNN functions

Deniz Yuret, Nov 6, 2020

The goal of the high-level interface is to map the low level cuDNN calls to more natural Julia functions. Here are some design choices I followed:

**Naming:** We try to keep the same function, argument, and type names from the cuDNN library in the high level interface. The wrappers for descriptors drop the `_t` suffix, e.g. `cudnnPoolingDescriptor_t => cudnnPoolingDescriptor`.

**Descriptors:** The cuDNN functions take data and operator descriptors. Most of these descriptors are relatively fast to create (~500 ns for a cudnnTensorDescriptor), so they may not be worth preallocating for the user, but we provide keyword options anyway. We cache descriptors (~100 ns) so we can use them as hash keys for memoization, which also saves a bit of memory and time. All descriptor fields are `isbits` types, with the exception of `cudnnDropoutDescriptor`, which points to a random number generator state and is used as a field of some other descriptors.
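To make the caching idea concrete, here is a hypothetical sketch (not the actual `descriptors.jl` code; the cache variable and helper name are invented, and the `cudnnActivationDescriptor` constructor from this commit is assumed to be in scope):

```julia
# Hypothetical sketch of descriptor caching, not the code in descriptors.jl:
# memoize on the isbits option fields so repeated calls reuse one descriptor.
const ACTIVATION_DESC_CACHE = Dict{Tuple,cudnnActivationDescriptor}()

function cachedActivationDescriptor(mode, nanOpt, coef::Cdouble)
    get!(ACTIVATION_DESC_CACHE, (mode, nanOpt, coef)) do
        cudnnActivationDescriptor(mode, nanOpt, coef)  # constructor used later in this commit
    end
end
```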
**Operator descriptors:** Descriptors such as `cudnnPoolingDescriptor` specify the options for an operator such as stride and padding. For operators with descriptors we have one method that takes keyword arguments with reasonable defaults to construct the descriptor and another method that takes a pre-initialized descriptor as its last argument. This way a casual user can call the first method without worrying about the descriptor format, only specifying non-default options, whereas a layer architect can keep a preset descriptor in the layer that gets passed to the function using the second method. We try to use generic Julia types for keyword arguments that specify default descriptor fields and convert these to the appropriate cudnn types during descriptor construction.

**Output arrays:** The low level cuDNN functions take pre-allocated output arrays. The high level interface has one Julia function that allocates its own output array (e.g. `cudnnPoolingForward`) and another with an exclamation mark that takes a pre-allocated output array as its first argument (e.g. `cudnnPoolingForward!`).
**Methods:** Each cuDNN forward function may have up to four methods depending on whether the descriptor and the output array are specified:

    cudnnPoolingForward(x; kwargs...)
    cudnnPoolingForward(x, d::cudnnPoolingDescriptor; kwargs...)
    cudnnPoolingForward!(y, x; kwargs...)
    cudnnPoolingForward!(y, x, d::cudnnPoolingDescriptor; kwargs...)

The conventional order of arguments for these public methods is:

    ([output], weights, inputs, [descriptor]; kwargs...)
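As a concrete illustration of these conventions, here is a hedged usage sketch based on the activation methods added in this commit (it assumes the `CUDA.CUDNN` module; array sizes are illustrative only):

```julia
using CUDA, CUDA.CUDNN

x = CUDA.randn(Float32, 8, 8, 4, 2)    # a 4-D input tensor

# Casual use: options via keywords, output allocated by the call.
y1 = CUDNN.cudnnActivationForward(x; mode=CUDNN.CUDNN_ACTIVATION_TANH)

# Layer-style use: a preset descriptor and a pre-allocated output array.
d  = CUDNN.cudnnActivationDescriptor(CUDNN.CUDNN_ACTIVATION_TANH,
                                     CUDNN.CUDNN_NOT_PROPAGATE_NAN, Cdouble(1))
y2 = similar(x)
CUDNN.cudnnActivationForward!(y2, x, d)

@assert y1 ≈ y2
```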
**AD method:** Sometimes neither the high level nor the low level interface is appropriate for gradient definitions: the low level API may not return a value, and the high level API may have some gradient target parameters as keyword arguments. To solve this issue the API exposes an intermediate function with an AD suffix, e.g. `cudnnPoolingForwardAD`, that is called by the high level method and that makes the low level library call. These methods may not seem to do anything useful, but they should not be removed so that automatic gradient packages can make use of them.
**Backward functions:** The point of a high level interface is to give the user appropriate defaults for the many options of typical cudnn functions. Backward functions do not have meaningful defaults because they need to copy their options from the corresponding forward function. Therefore we do not need high level APIs for backward functions unless they are useful in some other way. See Knet/src/cudnn for example uses.
**Types:** Do not specify types for array arguments; leave the high level functions generic so they can be called with CuArray, KnetArray, AutoGrad.Param, etc. Types can and should be specified for non-array arguments. In the API we use `nothing` to indicate unspecified array argument values and convert these to `C_NULL` or `CU_NULL` as appropriate only at the low-level call. Similarly, for numbers the API should accept generic types like `Integer` or `Real` and convert these to the appropriate specific type, e.g. `Cint` or `Cdouble`, only at the low-level call.
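A hypothetical sketch of this convention (the names `someForward` and `lowLevelCall` are invented for illustration and are not part of the library):

```julia
using CUDA  # for CU_NULL

# Stand-in for a generated ccall wrapper that requires C types.
lowLevelCall(biasPtr, alpha::Cdouble) = (biasPtr, alpha)

# High level: generic argument types, `nothing` allowed for optional arrays.
function someForward(x; bias = nothing, alpha::Real = 1)
    biasPtr = bias === nothing ? CU_NULL : pointer(bias)  # nothing -> CU_NULL only here
    lowLevelCall(biasPtr, Cdouble(alpha))                 # Real -> Cdouble only here
end
```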
**Workspace:** Some functions need a temporary allocated workspace whose required size is determined by another cudnn call. Unfortunately, the required size may depend on factors other than the current inputs (see [this issue](https://github.com/FluxML/Flux.jl/issues/923#issuecomment-558671966)), so the `@workspace` macro is used at a point as close to the library call as possible. One exception to this is when the same workspace will be passed to the backward call, in which case we allocate a regular CuArray.
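A hedged sketch of the pattern for the convolution case, assuming CUDA.jl's `@workspace` macro and the `cudnnGetConvolutionForwardWorkspaceSize` wrapper are in scope; `doForward` is a placeholder closure, not a library function:

```julia
function withConvWorkspace(doForward, xDesc, wDesc, convDesc, yDesc, algo)
    # Query the required workspace size from cuDNN first.
    sz = Ref{Csize_t}(0)
    cudnnGetConvolutionForwardWorkspaceSize(handle(), xDesc, wDesc, convDesc, yDesc, algo, sz)
    # Allocate only around the library call; the buffer is released afterwards.
    @workspace size=sz[] workspace->begin
        doForward(workspace, sizeof(workspace))
    end
end
```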
**Training vs Inference:** cuDNN does not distinguish training from inference calls in a consistent way:
* BatchNormalization and Normalization have two separate functions: `cudnnNormalizationForwardTraining / Inference`
* RNN has an indicator argument: `fwdMode` in `cudnnRNNForward`
* MultiHeadAttn looks at the `reserveSpace` argument to decide: if `NULL`, inference mode; otherwise, training mode
* Dropout always runs in training mode with a non-NULL `reserveSpace` (it doesn't make sense in inference mode)
* Activation, convolution, pooling, softmax, optensor, addtensor, reducetensor do not make a distinction between the two modes

In the high level API we assume inference by default and let the gradient packages override this when necessary. See the gradient implementations in Knet/src/cudnn for examples.
**TODO:**
* Keyword arg descriptor constructors.
* Test forw fns with descriptors: check for desc vs kwarg incompatibility.
* Find out about cudnnRNNSetClip_v8.
* Test with Knet.Ops20.
* Command used to test: julia17 --project -e 'using Pkg; Pkg.API.test(; test_args=`--memcheck --jobs=1 cudnn`)'

lib/cudnn/activation.jl

Lines changed: 50 additions & 37 deletions
@@ -1,44 +1,57 @@
-# descriptor
-
-mutable struct ActivationDesc
-    ptr::cudnnActivationDescriptor_t
+"""
+    cudnnActivationForward(x; mode, nanOpt, coef, alpha)
+    cudnnActivationForward(x, d::cudnnActivationDescriptor; alpha)
+    cudnnActivationForward!(y, x; mode, nanOpt, coef, alpha, beta)
+    cudnnActivationForward!(y, x, d::cudnnActivationDescriptor; alpha, beta)
+
+Return the result of the specified elementwise activation operation applied to `x`.
+Optionally `y` holds the result and `d` specifies the operation. `y` should be similar to
+`x` if specified. Keyword arguments `alpha=1, beta=0` can be used for scaling, i.e. `y .=
+alpha*op.(x) .+ beta*y`. The following keyword arguments specify the operation if `d` is
+not given:
+
+* `mode = CUDNN_ACTIVATION_RELU`: Options are SIGMOID, RELU, TANH, CLIPPED_RELU, ELU, IDENTITY
+* `nanOpt = CUDNN_NOT_PROPAGATE_NAN`: NaN propagation policy, the other option is `CUDNN_PROPAGATE_NAN`
+* `coef=1`: When the activation mode is set to CUDNN_ACTIVATION_CLIPPED_RELU, this input specifies the clipping threshold; and when the activation mode is set to CUDNN_ACTIVATION_ELU, this input specifies the α parameter.
+"""
+cudnnActivationForward, cudnnActivationForward!
+
+
+# Public methods
+cudnnActivationForward(x; o...) = cudnnActivationForwardWithDefaults(x; o...)
+cudnnActivationForward!(y, x; o...) = cudnnActivationForwardWithDefaults(x; y, o...)
+cudnnActivationForward(x, d::cudnnActivationDescriptor; o...) = cudnnActivationForwardWithDefaults(x; activationDesc=d, o...)
+cudnnActivationForward!(y, x, d::cudnnActivationDescriptor; o...) = cudnnActivationForwardWithDefaults(x; y, activationDesc=d, o...)
+
+
+# Private method
+function cudnnActivationForwardWithDefaults(
+    x;
+    y = similar(x),
+    mode::cudnnActivationMode_t = CUDNN_ACTIVATION_RELU,
+    nanOpt::cudnnNanPropagation_t = CUDNN_NOT_PROPAGATE_NAN,
+    coef::Real=1,
+    activationDesc::cudnnActivationDescriptor = cudnnActivationDescriptor(mode, nanOpt, Cdouble(coef)),
+    alpha::Real=1,
+    beta::Real=0,
+    xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x),
+    yDesc::cudnnTensorDescriptor = xDesc,
+)
+    T = eltype(x)
+    alpha, beta = scalingParameter(T,alpha), scalingParameter(T,beta)
+    cudnnActivationForwardAD(x; activationDesc, alpha, xDesc, beta, yDesc, y)
 end
 
-unsafe_free!(ad::ActivationDesc)=cudnnDestroyActivationDescriptor(ad.ptr)
-
-Base.unsafe_convert(::Type{cudnnActivationDescriptor_t}, ad::ActivationDesc)=ad.ptr
 
-function ActivationDesc(mode, coeff, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN)
-    ad = Ref{cudnnActivationDescriptor_t}()
-    cudnnCreateActivationDescriptor(ad)
-    cudnnSetActivationDescriptor(ad[],mode,reluNanOpt,coeff)
-    this = ActivationDesc(ad[])
-    finalizer(unsafe_free!, this)
-    return this
+# AD method:
+function cudnnActivationForwardAD(x; activationDesc, alpha, xDesc, beta, yDesc, y)
+    cudnnActivationForward(handle(), activationDesc, alpha, xDesc, x, beta, yDesc, y)
+    return y
 end
 
 
-# wrappers
-
-function cudnnActivationForward(x::DenseCuArray{T,N}, y::DenseCuArray{T,N}=x;
-                                mode=CUDNN_ACTIVATION_RELU, # CUDNN_ACTIVATION_IDENTITY will not work
-                                coeff=false, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=true,
-                                beta=false) where {T,N}
-    cudnnActivationForward(handle(), ActivationDesc(mode, T(coeff), reluNanOpt),
-                           scalingParameter(T, alpha), TensorDesc(x), x,
-                           scalingParameter(T, beta ), TensorDesc(y), y)
-    return y
-end
-
-function cudnnActivationBackward(x::DenseCuArray{T,N}, dx::DenseCuArray{T,N},
-                                 y::DenseCuArray{T,N}, dy::DenseCuArray{T,N}=dx;
-                                 mode=CUDNN_ACTIVATION_RELU, # CUDNN_ACTIVATION_IDENTITY will not work
-                                 coeff=false, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=1,
-                                 beta=false) where {T,N}
-    cudnnActivationBackward(handle(), ActivationDesc(mode, T(coeff), reluNanOpt),
-                            scalingParameter(T, alpha), TensorDesc( y), y,
-                                                        TensorDesc(dy), dy,
-                                                        TensorDesc( x), x,
-                            scalingParameter(T, beta ), TensorDesc(dx), dx)
-    return dx
+# Deprecated:
+function cudnnActivationForward(x::DenseCuArray{T,N}, y::DenseCuArray{T,N}; o...) where {T,N}
+    @warn "`cudnnActivationForward(x,y)` is deprecated, please use one of the methods in `@doc cudnnActivationForward`." maxlog=1
+    cudnnActivationForward!(y, x; o...)
 end
lib/cudnn/batchnorm.jl

Lines changed: 7 additions & 7 deletions
@@ -36,9 +36,9 @@ function cudnnBNForward!(y::DenseCuArray{T}, g::DenseCuArray{T}, b::DenseCuArray
         # warn("eps ",eps," is too small for CuDNN so eps has been assigned the value ", CUDNN_BN_MIN_EPSILON)
         eps = CUDNN_BN_MIN_EPSILON
     end
-    xd = TensorDesc(x)
-    yd = TensorDesc(y)
-    gd = TensorDesc(T, dims)
+    xd = cudnnTensorDescriptor(x)
+    yd = cudnnTensorDescriptor(y)
+    gd = cudnnTensorDescriptor(CUDNN_TENSOR_NCHW, cudnnDataType(T), Cint(length(dims)), dim4(dims,Val(CUDNN_TENSOR_NCHW)))
 
     if training
 
@@ -91,10 +91,10 @@ function cudnnBNBackward!(dg::DenseCuArray{T}, g::DenseCuArray{T}, db::DenseCuAr
                           alpha = T(1), beta = T(0),
                           dalpha = T(1), dbeta = T(0), training = true) where T<:Union{Float32, Float64}
     if training
-        xd = TensorDesc(x)
-        dyd = TensorDesc(dy)
-        dxd = TensorDesc(dx)
-        gd = TensorDesc(T, _wsize(x))
+        xd = cudnnTensorDescriptor(x)
+        dyd = cudnnTensorDescriptor(dy)
+        dxd = cudnnTensorDescriptor(dx)
+        gd = cudnnTensorDescriptor(CUDNN_TENSOR_NCHW, cudnnDataType(T), Cint(length(_wsize(x))), dim4(_wsize(x),Val(CUDNN_TENSOR_NCHW)))
         if cache !== nothing
             mean, ivar = cache.mean, cache.ivar
             info("mean and ivar are fetched from the cache")

lib/cudnn/compat.jl

Lines changed: 0 additions & 21 deletions
This file was deleted.
