diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index d8b8619e80..acad431caa 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -3,11 +3,11 @@ steps: - label: "Julia 1.5" plugins: - - JuliaCI/julia#v0.5: + - JuliaCI/julia#v1: version: 1.5 - - JuliaCI/julia-test#v0.3: + - JuliaCI/julia-test#v1: test_args: "--quickfail" - - JuliaCI/julia-coverage#v0.3: + - JuliaCI/julia-coverage#v1: codecov: true dirs: - src @@ -23,11 +23,11 @@ steps: - label: "Julia 1.5 (debug)" plugins: - - JuliaCI/julia#v0.4: + - JuliaCI/julia#v1: version: 1.5 - - JuliaCI/julia-test#v0.2: + - JuliaCI/julia-test#v1: julia_args: "-g2" - - JuliaCI/julia-coverage#v0.2: + - JuliaCI/julia-coverage#v1: codecov: true dirs: - src @@ -50,10 +50,10 @@ steps: - label: "CUDA 11.2" plugins: - - JuliaCI/julia#v0.6: + - JuliaCI/julia#v1: version: 1.5 - - JuliaCI/julia-test#v0.3: ~ - - JuliaCI/julia-coverage#v0.3: + - JuliaCI/julia-test#v1: ~ + - JuliaCI/julia-coverage#v1: codecov: true dirs: - src @@ -70,10 +70,10 @@ steps: - label: "CUDA 11.1" plugins: - - JuliaCI/julia#v0.4: + - JuliaCI/julia#v1: version: 1.5 - - JuliaCI/julia-test#v0.2: ~ - - JuliaCI/julia-coverage#v0.2: + - JuliaCI/julia-test#v1: ~ + - JuliaCI/julia-coverage#v1: codecov: true dirs: - src @@ -92,10 +92,10 @@ steps: - label: "CUDA 11.0" plugins: - - JuliaCI/julia#v0.4: + - JuliaCI/julia#v1: version: 1.5 - - JuliaCI/julia-test#v0.2: ~ - - JuliaCI/julia-coverage#v0.2: + - JuliaCI/julia-test#v1: ~ + - JuliaCI/julia-coverage#v1: codecov: true dirs: - src @@ -114,10 +114,10 @@ steps: - label: "CUDA 10.2" plugins: - - JuliaCI/julia#v0.4: + - JuliaCI/julia#v1: version: 1.5 - - JuliaCI/julia-test#v0.2: ~ - - JuliaCI/julia-coverage#v0.2: + - JuliaCI/julia-test#v1: ~ + - JuliaCI/julia-coverage#v1: codecov: true dirs: - src @@ -136,10 +136,10 @@ steps: - label: "CUDA 10.1" plugins: - - JuliaCI/julia#v0.4: + - JuliaCI/julia#v1: version: 1.5 - - JuliaCI/julia-test#v0.2: ~ - - JuliaCI/julia-coverage#v0.2: + - JuliaCI/julia-test#v1: ~ + - JuliaCI/julia-coverage#v1: codecov: true dirs: - src @@ -199,10 +199,10 @@ steps: - label: "Split memory pool" plugins: - - JuliaCI/julia#v0.4: + - JuliaCI/julia#v1: version: 1.5 - - JuliaCI/julia-test#v0.2: ~ - - JuliaCI/julia-coverage#v0.2: + - JuliaCI/julia-test#v1: ~ + - JuliaCI/julia-coverage#v1: codecov: true dirs: - src @@ -225,7 +225,7 @@ steps: # so they can run on any system in the juliagpu queue. - label: "Benchmarks (dry run)" plugins: - - JuliaCI/julia#v0.4: + - JuliaCI/julia#v1: version: 1.5 command: | julia --project -e ' @@ -249,7 +249,7 @@ steps: # be running on the same system each time - label: "Benchmarks" plugins: - - JuliaCI/julia#v0.4: + - JuliaCI/julia#v1: version: 1.5 env: CODESPEED_PROJECT: "$BUILDKITE_PIPELINE_NAME" @@ -278,7 +278,7 @@ steps: - label: "Documentation" plugins: - - JuliaCI/julia#v0.4: + - JuliaCI/julia#v1: version: 1.5 command: | julia --project -e ' diff --git a/deps/discovery.jl b/deps/discovery.jl index 99246ad964..77cb22588f 100644 --- a/deps/discovery.jl +++ b/deps/discovery.jl @@ -158,6 +158,21 @@ const cuda_releases = [v"1.0", v"1.1", v"11.0", v"11.1"] const cuda_library_versions = Dict( + v"11.0.1" => Dict( + # NOTE: encountered this version in a Docker container; not sure where it came from. 
+ "cudart" => v"11.0.171", + "cupti" => "2020.1.0", # wtf + "nvrtc" => v"11.0.167", + "nvtx" => v"11.0.167", + "nvvp" => v"11.0.167", + "cublas" => v"11.0.0", #.191 + "cufft" => v"10.1.3", #.191 + "curand" => v"10.2.0", #.191 + "cusolver" => v"10.4.0", #.191 + "cusparse" => v"11.0.0", #.191 + "npp" => v"11.0.0", #.191 + "nvjpeg" => v"11.0.0", #.191 + ), v"11.0.2" => Dict( "cudart" => v"11.0.171", "cupti" => "2020.1.0", # wtf @@ -250,6 +265,10 @@ const cuda_library_names = Dict( # only for nvdisasm, to discover the CUDA toolkit version const cuda_binary_versions = Dict( + v"11.0.1" => Dict( + # NOTE: encountered this version in a Docker container; not sure where it came from. + "nvdisasm" => v"11.0.167" + ), v"11.0.2" => Dict( "nvdisasm" => v"11.0.194" ), diff --git a/lib/cudnn/CUDNN.jl b/lib/cudnn/CUDNN.jl index eac548924a..8a04117683 100644 --- a/lib/cudnn/CUDNN.jl +++ b/lib/cudnn/CUDNN.jl @@ -19,20 +19,24 @@ include("libcudnn_deprecated.jl") # low-level wrappers include("util.jl") include("base.jl") +include("descriptors.jl") include("tensor.jl") -include("conv.jl") +include("inplace.jl") +include("optensor.jl") +include("reduce.jl") +include("convolution.jl") include("pooling.jl") include("activation.jl") -include("filter.jl") include("softmax.jl") -include("batchnorm.jl") include("dropout.jl") include("rnn.jl") +include("multiheadattn.jl") +include("normalization.jl") # high-level integrations include("nnlib.jl") +include("batchnorm.jl") -include("compat.jl") function math_mode(mode=CUDA.math_mode()) if mode == CUDA.PEDANTIC_MATH diff --git a/lib/cudnn/README.md b/lib/cudnn/README.md new file mode 100644 index 0000000000..ddfccaffdf --- /dev/null +++ b/lib/cudnn/README.md @@ -0,0 +1,91 @@ +## High level interface to cuDNN functions +Deniz Yuret, Nov 6, 2020 + +The goal of the high-level interface is to map the low level cuDNN calls to more natural +Julia functions. Here are some design choices I followed: + +**Naming:** We try to keep the same function, argument, and type names from the cuDNN +library in the high level interface. The wrappers for descriptors drop the `_t` suffix, +e.g. `cudnnPoolingDescriptor_t => cudnnPoolingDescriptor`. + +**Descriptors:** The cuDNN functions take data and operator descriptors. Most of these +descriptors are relatively fast to create (~500 ns for a cudnnTensorDescriptor) so they may +not be worth preallocating for the user but we provide keyword options anyway. We cache +descriptors (~100 ns) so we can use them as hash keys for memoization, which also saves a +bit of memory and speed. All descriptor fields are `isbits` types with the exception of the +`cudnnDropoutDescriptor` which points to a random number generator state and is used as a +field of some other descriptors. + +**Operator descriptors:** Descriptors such as `cudnnPoolingDescriptor` specify the options +for an operator such as stride and padding. For operators with descriptors we have one +method that takes keyword arguments with reasonable defaults to construct the descriptor and +another method that takes a pre-initialized descriptor as its last argument. This way a +casual user can call the first method without worrying about the descriptor format, only +specifying non-default options, whereas a layer architect can keep a preset descriptor in +the layer that gets passed to the function using the second method. We try to use generic +Julia types for keyword arguments that specify default descriptor fields and convert these +to the appropriate cudnn types during descriptor construction. 
+
+**Output arrays:** The low level cuDNN functions take pre-allocated output arrays. The high
+level interface has one Julia function that allocates its own output array
+(e.g. `cudnnPoolingForward`) and another with an exclamation mark that takes a pre-allocated
+output array as its first argument (e.g. `cudnnPoolingForward!`).
+
+**Methods:** Each cuDNN forward function may have up to four methods depending on whether
+the descriptor and the output array are specified:
+
+    cudnnPoolingForward(x; kwargs...)
+    cudnnPoolingForward(x, d::cudnnPoolingDescriptor; kwargs...)
+    cudnnPoolingForward!(y, x; kwargs...)
+    cudnnPoolingForward!(y, x, d::cudnnPoolingDescriptor; kwargs...)
+
+The conventional order of arguments for these public methods is:
+
+    ([output], weights, inputs, [descriptor]; kwargs...)
+
+**AD method:** Sometimes neither the high level nor the low level interface is appropriate
+for gradient definitions: the low level API may not return a value, and the high level API
+may have some gradient target parameters as keyword arguments. To solve this issue the API
+exposes an intermediate function with an AD suffix, e.g. `cudnnPoolingForwardAD`, that is
+called by the high level method and that makes the low level library call. These methods may
+not seem like they are doing anything useful, but they should not be removed so that
+automatic gradient packages can make use of them.
+
+**Backward functions:** The point of a high level interface is to give the user appropriate
+defaults for the many options of typical cudnn functions. Backward functions do not have
+meaningful defaults because they need to copy their options from the corresponding forward
+function. Therefore we do not need high level APIs for backward functions unless they are
+useful in some other way. See Knet/src/cudnn for example uses.
+
+**Types:** Do not specify types for array arguments. Leave the high level functions generic
+so they can be called with CuArray, KnetArray, AutoGrad.Param etc. Types can and should be
+specified for non-array arguments. In the API we use `nothing` to indicate unspecified array
+argument values, and convert these to `C_NULL` or `CU_NULL` as appropriate only at the
+low-level call. Similarly for numbers the API should accept generic types like `Integer` or
+`Real` and convert these to the appropriate specific type, e.g. `Cint` or `Cdouble`, only at
+the low-level call.
+
+**Workspace:** Some functions need a temporary allocated workspace whose required size is
+determined by another cudnn call. Unfortunately, the required size may depend on factors
+other than the current inputs (see [this
+issue](https://github.com/FluxML/Flux.jl/issues/923#issuecomment-558671966)), so the
+`@workspace` macro is used at a point as close to the library call as possible. The one
+exception is when the same workspace will be passed to the backward call, in which case we
+allocate a regular CuArray.
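+
+As a sketch, the pattern (mirroring its use in `convolution.jl`, where `p` holds the chosen
+algorithm and its workspace requirement) looks like:
+
+    @workspace size=p.memory workspace->begin
+        cudnnConvolutionForward(handle(), alpha, xDesc, x, wDesc, w, convDesc, p.algo,
+                                workspace, sizeof(workspace), beta, yDesc, y)
+    end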
+
+**Training vs Inference:** cuDNN has no consistent way of distinguishing training from inference calls:
+* BatchNormalization and Normalization have two separate functions: `cudnnNormalizationForwardTraining / Inference`
+* RNN has an indicator argument: `fwdMode` in `cudnnRNNForward`
+* MultiHeadAttn looks at the `reserveSpace` argument to decide: `NULL` means inference mode, otherwise training mode
+* Dropout always runs in training mode with a non-NULL `reserveSpace` (it doesn't make sense in inference mode)
+* Activation, convolution, pooling, softmax, optensor, addtensor, reducetensor do not make a distinction between the two modes
+
+In the high level API we assume inference by default and let the gradient packages override when necessary.
+See the gradient implementations in Knet/src/cudnn for examples.
+
+**TODO:**
+* Keyword arg descriptor constructors.
+* Test forward functions with descriptors: check for desc vs kwarg incompatibility.
+* Find out about cudnnRNNSetClip_v8.
+* Test with Knet.Ops20.
+* Command used to test: julia17 --project -e 'using Pkg; Pkg.API.test(; test_args=`--memcheck --jobs=1 cudnn`)'
diff --git a/lib/cudnn/activation.jl b/lib/cudnn/activation.jl
index 1f0439bf51..fa9ae0d09d 100644
--- a/lib/cudnn/activation.jl
+++ b/lib/cudnn/activation.jl
@@ -1,44 +1,57 @@
-# descriptor
-
-mutable struct ActivationDesc
-    ptr::cudnnActivationDescriptor_t
+"""
+    cudnnActivationForward(x; mode, nanOpt, coef, alpha)
+    cudnnActivationForward(x, d::cudnnActivationDescriptor; alpha)
+    cudnnActivationForward!(y, x; mode, nanOpt, coef, alpha, beta)
+    cudnnActivationForward!(y, x, d::cudnnActivationDescriptor; alpha, beta)
+
+Return the result of the specified elementwise activation operation applied to `x`.
+Optionally `y` holds the result and `d` specifies the operation. `y` should be similar to
+`x` if specified. Keyword arguments `alpha=1, beta=0` can be used for scaling, i.e. `y .=
+alpha*op.(x) .+ beta*y`. The following keyword arguments specify the operation if `d` is
+not given:
+
+* `mode = CUDNN_ACTIVATION_RELU`: Options are SIGMOID, RELU, TANH, CLIPPED_RELU, ELU, IDENTITY
+* `nanOpt = CUDNN_NOT_PROPAGATE_NAN`: NaN propagation policy, the other option is `CUDNN_PROPAGATE_NAN`
+* `coef=1`: When the activation mode is set to CUDNN_ACTIVATION_CLIPPED_RELU, this input specifies the clipping threshold; when the activation mode is set to CUDNN_ACTIVATION_ELU, this input specifies the α parameter.
+"""
+cudnnActivationForward, cudnnActivationForward!
+
+
+# Public methods
+cudnnActivationForward(x; o...) = cudnnActivationForwardWithDefaults(x; o...)
+cudnnActivationForward!(y, x; o...) = cudnnActivationForwardWithDefaults(x; y, o...)
+cudnnActivationForward(x, d::cudnnActivationDescriptor; o...) = cudnnActivationForwardWithDefaults(x; activationDesc=d, o...)
+cudnnActivationForward!(y, x, d::cudnnActivationDescriptor; o...) = cudnnActivationForwardWithDefaults(x; y, activationDesc=d, o...)
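+
+# For example (an illustrative sketch, not part of the API; assumes `x::CuArray` and a
+# size-compatible `y`):
+#
+#   y = cudnnActivationForward(x; mode=CUDNN_ACTIVATION_CLIPPED_RELU, coef=6)  # new array
+#   cudnnActivationForward!(y, x; mode=CUDNN_ACTIVATION_TANH)                  # writes into y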
+ + +# Private method +function cudnnActivationForwardWithDefaults( + x; + y = similar(x), + mode::cudnnActivationMode_t = CUDNN_ACTIVATION_RELU, + nanOpt::cudnnNanPropagation_t = CUDNN_NOT_PROPAGATE_NAN, + coef::Real=1, + activationDesc::cudnnActivationDescriptor = cudnnActivationDescriptor(mode, nanOpt, Cdouble(coef)), + alpha::Real=1, + beta::Real=0, + xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x), + yDesc::cudnnTensorDescriptor = xDesc, +) + T = eltype(x) + alpha, beta = scalingParameter(T,alpha), scalingParameter(T,beta) + cudnnActivationForwardAD(x; activationDesc, alpha, xDesc, beta, yDesc, y) end -unsafe_free!(ad::ActivationDesc)=cudnnDestroyActivationDescriptor(ad.ptr) - -Base.unsafe_convert(::Type{cudnnActivationDescriptor_t}, ad::ActivationDesc)=ad.ptr -function ActivationDesc(mode, coeff, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN) - ad = Ref{cudnnActivationDescriptor_t}() - cudnnCreateActivationDescriptor(ad) - cudnnSetActivationDescriptor(ad[],mode,reluNanOpt,coeff) - this = ActivationDesc(ad[]) - finalizer(unsafe_free!, this) - return this +# AD method: +function cudnnActivationForwardAD(x; activationDesc, alpha, xDesc, beta, yDesc, y) + cudnnActivationForward(handle(), activationDesc, alpha, xDesc, x, beta, yDesc, y) + return y end -# wrappers - -function cudnnActivationForward(x::DenseCuArray{T,N}, y::DenseCuArray{T,N}=x; - mode=CUDNN_ACTIVATION_RELU, # CUDNN_ACTIVATION_IDENTITY will not work - coeff=false, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=true, - beta=false) where {T,N} - cudnnActivationForward(handle(), ActivationDesc(mode, T(coeff), reluNanOpt), - scalingParameter(T, alpha), TensorDesc(x), x, - scalingParameter(T, beta ), TensorDesc(y), y) - return y -end - -function cudnnActivationBackward(x::DenseCuArray{T,N}, dx::DenseCuArray{T,N}, - y::DenseCuArray{T,N}, dy::DenseCuArray{T,N}=dx; - mode=CUDNN_ACTIVATION_RELU, # CUDNN_ACTIVATION_IDENTITY will not work - coeff=false, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=1, - beta=false) where {T,N} - cudnnActivationBackward(handle(), ActivationDesc(mode, T(coeff), reluNanOpt), - scalingParameter(T, alpha), TensorDesc( y), y, - TensorDesc(dy), dy, - TensorDesc( x), x, - scalingParameter(T, beta ), TensorDesc(dx), dx) - return dx +# Deprecated: +function cudnnActivationForward(x::DenseCuArray{T,N}, y::DenseCuArray{T,N}; o...) where {T,N} + @warn "`cudnnActivationForward(x,y)` is deprecated, please use one of the methods in `@doc cudnnActivationForward`." maxlog=1 + cudnnActivationForward!(y, x; o...) 
end diff --git a/lib/cudnn/batchnorm.jl b/lib/cudnn/batchnorm.jl index 53fb4f647f..d18b77d301 100644 --- a/lib/cudnn/batchnorm.jl +++ b/lib/cudnn/batchnorm.jl @@ -36,9 +36,9 @@ function cudnnBNForward!(y::DenseCuArray{T}, g::DenseCuArray{T}, b::DenseCuArray # warn("eps ",eps," is too small for CuDNN so eps has been assigned the value ", CUDNN_BN_MIN_EPSILON) eps = CUDNN_BN_MIN_EPSILON end - xd = TensorDesc(x) - yd = TensorDesc(y) - gd = TensorDesc(T, dims) + xd = cudnnTensorDescriptor(x) + yd = cudnnTensorDescriptor(y) + gd = cudnnTensorDescriptor(CUDNN_TENSOR_NCHW, cudnnDataType(T), Cint(length(dims)), dim4(dims,Val(CUDNN_TENSOR_NCHW))) if training @@ -91,10 +91,10 @@ function cudnnBNBackward!(dg::DenseCuArray{T}, g::DenseCuArray{T}, db::DenseCuAr alpha = T(1), beta = T(0), dalpha = T(1), dbeta = T(0), training = true) where T<:Union{Float32, Float64} if training - xd = TensorDesc(x) - dyd = TensorDesc(dy) - dxd = TensorDesc(dx) - gd = TensorDesc(T, _wsize(x)) + xd = cudnnTensorDescriptor(x) + dyd = cudnnTensorDescriptor(dy) + dxd = cudnnTensorDescriptor(dx) + gd = cudnnTensorDescriptor(CUDNN_TENSOR_NCHW, cudnnDataType(T), Cint(length(_wsize(x))), dim4(_wsize(x),Val(CUDNN_TENSOR_NCHW))) if cache !== nothing mean, ivar = cache.mean, cache.ivar info("mean and ivar are fetched from the cache") diff --git a/lib/cudnn/compat.jl b/lib/cudnn/compat.jl deleted file mode 100644 index 9e7ac781c9..0000000000 --- a/lib/cudnn/compat.jl +++ /dev/null @@ -1,21 +0,0 @@ -# Compatibility shims until users upgrade to new NNlib format -function conv!(y::DenseCuArray{T}, x::DenseCuArray{T}, w::DenseCuArray{T}; pad=0, stride=1, flipkernel=0, dilation=1, kwargs...) where {T<:CUDNNFloat} - cdims = DenseConvDims(x, w; padding=pad, stride=stride, flipkernel=flipkernel, dilation=dilation) - return conv!(y, x, w, cdims; kwargs...) -end - -function ∇conv_filter!(dw::DenseCuArray{T}, dy::DenseCuArray{T}, x::DenseCuArray{T}; pad=0, stride=1, flipkernel=0, dilation=1, kwargs...) where {T<:CUDNNFloat} - cdims = DenseConvDims(x, dw; padding=pad, stride=stride, flipkernel=flipkernel, dilation=dilation) - # NOTE!!! This compat shim re-arranges the argument order! - return ∇conv_filter!(dw, x, dy, cdims; kwargs...) -end - -function maxpool!(y::DenseCuArray{T}, x::DenseCuArray{T}, k; pad=map(_->0,k), stride=k) where {T<:CUDNNFloat} - pdims = PoolDims(x, k; padding=pad, stride=stride) - return maxpool!(y, x, pdims) -end - -function meanpool!(y::DenseCuArray{T}, x::DenseCuArray{T}, k; pad=map(_->0,k), stride=k) where {T<:CUDNNFloat} - pdims = PoolDims(x, k; padding=pad, stride=stride) - return meanpool!(y, x, pdims) -end diff --git a/lib/cudnn/conv.jl b/lib/cudnn/conv.jl deleted file mode 100644 index 0a36235a68..0000000000 --- a/lib/cudnn/conv.jl +++ /dev/null @@ -1,378 +0,0 @@ -using NNlib: DenseConvDims - - -# descriptor - -mutable struct ConvDesc - ptr::cudnnConvolutionDescriptor_t -end - -unsafe_free!(cd::ConvDesc) = cudnnDestroyConvolutionDescriptor(cd.ptr) - -Base.unsafe_convert(::Type{cudnnConvolutionDescriptor_t}, cd::ConvDesc)=cd.ptr - -function cdsize(w, nd) - isa(w, Integer) && return Cint[fill(w,nd)...] - length(w) == nd && return Cint[reverse(w)...] - length(w) == 2*nd && return Cint[reverse(w[nd+1:end])...] - throw(DimensionMismatch()) -end - -pdsize(w, nd)=Cint[reverse(psize(w,nd))...] -function psize(w, nd) - isa(w, Integer) && return Cint[fill(w,nd)...] 
- length(w) == nd && return w - length(w) == 2*nd && return w[1:nd] - throw(DimensionMismatch()) -end - -Base.cconvert(::Type{cudnnConvolutionMode_t}, x::Bool) = x ? CUDNN_CROSS_CORRELATION : CUDNN_CONVOLUTION - -function ConvDesc(T, N, padding, stride, dilation, mode) - cd = Ref{cudnnConvolutionDescriptor_t}() - cudnnCreateConvolutionDescriptor(cd) - if version() >= v"4" - cudnnSetConvolutionNdDescriptor(cd[],N,cdsize(padding,N),cdsize(stride,N),cdsize(dilation,N),mode,cudnnDataType(T)) - elseif version() >= v"3" - cudnnSetConvolutionNdDescriptor_v3(cd[],N,cdsize(padding,N),cdsize(stride,N),cdsize(dilation,N),mode,cudnnDataType(T)) - else - cudnnSetConvolutionNdDescriptor(cd[],N,cdsize(padding,N),cdsize(stride,N),cdsize(dilation,N),mode) - end - cudnnSetConvolutionMathType(cd[], math_mode()) - this = ConvDesc(cd[]) - finalizer(unsafe_free!, this) - return this -end - -function ConvDesc(T, cdims::DenseConvDims) - pd = NNlib.padding(cdims) - if !all(pd[1:2:end] .== pd[2:2:end]) - @warn("CuDNN does not support asymmetric padding; defaulting to symmetric choice") - end - return ConvDesc(T, NNlib.spatial_dims(cdims), pd[1:2:end], NNlib.stride(cdims), - NNlib.dilation(cdims), NNlib.flipkernel(cdims)) -end - - -# wrappers - -# Forward - -function cudnnGetConvolutionForwardAlgorithmMaxCount() - count=@argout( - cudnnGetConvolutionForwardAlgorithmMaxCount( - handle(), - out(Ref{Cint}())) - )[] - return count -end - -# will be removed in cuDNN 8 -function cudnnGetConvolutionForwardAlgorithm(y::DenseCuArray{T,N}, x::DenseCuArray{T,N}, w::DenseCuArray{T,N}, - cdims::DenseConvDims; preference=0, workspacesize=1<<32) where {T,N} - algo=@argout( - cudnnGetConvolutionForwardAlgorithm( - handle(), TensorDesc(x), - FilterDesc(w), ConvDesc(T, cdims), - TensorDesc(y), - cudnnConvolutionFwdPreference_t(preference), - Csize_t(workspacesize), - out(Ref{cudnnConvolutionFwdAlgo_t}())) - )[] - return algo -end - -function cudnnGetConvolutionForwardAlgorithm_v7(y::DenseCuArray{T,N}, x::DenseCuArray{T,N}, w::DenseCuArray{T,N}, - cdims::DenseConvDims; count=-1) where {T,N} - if count < 0 - count = cudnnGetConvolutionForwardAlgorithmMaxCount() - end - perfResults = Array{cudnnConvolutionFwdAlgoPerf_t, 1}(undef, count) - returnedAlgoCount=@argout( - cudnnGetConvolutionForwardAlgorithm_v7( - handle(), TensorDesc(x), - FilterDesc(w), ConvDesc(T, cdims), - TensorDesc(y), - Cint(count), - out(Ref{Cint}()), - perfResults) - )[] - return returnedAlgoCount, perfResults -end - -function cudnnFindConvolutionForwardAlgorithm(y::DenseCuArray{T,N}, x::DenseCuArray{T,N}, w::DenseCuArray{T,N}, - cdims::DenseConvDims; count=-1) where {T,N} - if count < 0 - count = cudnnGetConvolutionForwardAlgorithmMaxCount() - end - perfResults = Array{cudnnConvolutionFwdAlgoPerf_t, 1}(undef, count) - returnedAlgoCount=@argout( - cudnnFindConvolutionForwardAlgorithm( - handle(), TensorDesc(x), - FilterDesc(w), ConvDesc(T, cdims), - TensorDesc(y), - Cint(count), - out(Ref{Cint}()), - perfResults) - )[] - return returnedAlgoCount, perfResults -end - -function cudnnFindConvolutionForwardAlgorithmEx(y::DenseCuArray{T,N}, x::DenseCuArray{T,N}, w::DenseCuArray{T,N}, - cdims::DenseConvDims; count=-1, workspacesize=1<<32) where {T,N} - if count < 0 - count = cudnnGetConvolutionForwardAlgorithmMaxCount() - end - @workspace size=workspacesize workspace->begin - perfResults = Array{cudnnConvolutionFwdAlgoPerf_t, 1}(undef, count) - returnedAlgoCount=@argout( - cudnnFindConvolutionForwardAlgorithmEx( - handle(), TensorDesc(x), x, - FilterDesc(w), w, ConvDesc(T, 
cdims), - TensorDesc(y), y, - Cint(count), - out(Ref{Cint}()), - perfResults, - workspace, - workspacesize) - )[] - return returnedAlgoCount, perfResults - end -end - -function cudnnConvolutionForward(y::DenseCuArray{T,N}, x::DenseCuArray{T,N}, w::DenseCuArray{T,N}, - cdims::DenseConvDims; algo=0, alpha=1, beta=0) where {T,N} - @workspace size=@argout( - cudnnGetConvolutionForwardWorkspaceSize( - handle(), TensorDesc(x), - FilterDesc(w), ConvDesc(T, cdims), - TensorDesc(y), - cudnnConvolutionFwdAlgo_t(algo), - out(Ref{Csize_t}())) - )[] workspace->begin - cudnnConvolutionForward( - handle(), scalingParameter(T, alpha), TensorDesc(x), x, FilterDesc(w), w, - ConvDesc(T,cdims), cudnnConvolutionFwdAlgo_t(algo), workspace, - sizeof(workspace), scalingParameter(T, beta), TensorDesc(y), y) - end - return y -end - -function cudnnConvolutionBiasActivationForward(y::DenseCuArray{T,N}, x::DenseCuArray{T,N}, w::DenseCuArray{T,N}, z::DenseCuArray{T,N}, bias::DenseCuArray{T,N}, - cdims::DenseConvDims; algo=0, alpha1=1, alpha2=1, - activationMode=CUDNN_ACTIVATION_RELU, activationCoeff=0.0, activationReluNanOpt=CUDNN_NOT_PROPAGATE_NAN) where {T,N} - @workspace size=@argout( - cudnnGetConvolutionForwardWorkspaceSize( - handle(), TensorDesc(x), - FilterDesc(w), ConvDesc(T, cdims), - TensorDesc(y), - cudnnConvolutionFwdAlgo_t(algo), - out(Ref{Csize_t}())) - )[] workspace->begin - cudnnConvolutionBiasActivationForward( - handle(), scalingParameter(T, alpha1), TensorDesc(x), x, FilterDesc(w), w, - ConvDesc(T, cdims), cudnnConvolutionFwdAlgo_t(algo), workspace, - sizeof(workspace), scalingParameter(T, alpha2), TensorDesc(z), z, TensorDesc(bias), bias, ActivationDesc(activationMode, activationCoeff, activationReluNanOpt), TensorDesc(y),y) - end - return y -end - -# Backward data - -function cudnnGetConvolutionBackwardDataAlgorithmMaxCount() - count=@argout( - cudnnGetConvolutionBackwardDataAlgorithmMaxCount( - handle(), - out(Ref{Cint}())) - )[] - return count -end - -# will be removed in cuDNN 8 -function cudnnGetConvolutionBackwardDataAlgorithm(dx::DenseCuArray{T,N}, w::DenseCuArray{T,N}, dy::DenseCuArray{T,N}, - cdims::DenseConvDims; preference=0, workspacesize=1<<32) where {T,N} - algo=@argout( - cudnnGetConvolutionBackwardDataAlgorithm( - handle(), FilterDesc(w), TensorDesc(dy), ConvDesc(T, cdims), - TensorDesc(dx), cudnnConvolutionBwdDataPreference_t(preference), - Csize_t(workspacesize), out(Ref{cudnnConvolutionBwdDataAlgo_t}())) - )[] - return algo -end - -function cudnnGetConvolutionBackwardDataAlgorithm_v7(dx::DenseCuArray{T,N}, w::DenseCuArray{T,N}, dy::DenseCuArray{T,N}, - cdims::DenseConvDims; count=-1) where {T,N} - if count < 0 - count = cudnnGetConvolutionBackwardDataAlgorithmMaxCount() - end - perfResults = Array{cudnnConvolutionBwdDataAlgoPerf_t, 1}(undef, count) - returnedAlgoCount=@argout( - cudnnGetConvolutionBackwardDataAlgorithm_v7( - handle(), FilterDesc(w), TensorDesc(dy), - ConvDesc(T, cdims), TensorDesc(dx), - Cint(count), - out(Ref{Cint}()), perfResults) - )[] - return returnedAlgoCount, perfResults -end - -function cudnnFindConvolutionBackwardDataAlgorithm(dx::DenseCuArray{T,N}, w::DenseCuArray{T,N}, dy::DenseCuArray{T,N}, - cdims::DenseConvDims; count=-1) where {T,N} - if count < 0 - count = cudnnGetConvolutionBackwardDataAlgorithmMaxCount() - end - perfResults = Array{cudnnConvolutionBwdDataAlgoPerf_t, 1}(undef, count) - returnedAlgoCount=@argout( - cudnnFindConvolutionBackwardDataAlgorithm( - handle(), FilterDesc(w), TensorDesc(dy), - ConvDesc(T, cdims), TensorDesc(dx), - 
Cint(count), - out(Ref{Cint}()), perfResults) - )[] - return returnedAlgoCount, perfResults -end - -function cudnnFindConvolutionBackwardDataAlgorithmEx(dx::DenseCuArray{T,N}, w::DenseCuArray{T,N}, dy::DenseCuArray{T,N}, - cdims::DenseConvDims; count=-1, workspacesize=1<<32) where {T,N} - if count < 0 - count = cudnnGetConvolutionBackwardDataAlgorithmMaxCount() - end - @workspace size=workspacesize workspace->begin - perfResults = Array{cudnnConvolutionBwdDataAlgoPerf_t, 1}(undef, count) - returnedAlgoCount=@argout( - cudnnFindConvolutionBackwardDataAlgorithmEx( - handle(), FilterDesc(w), w, TensorDesc(dy), dy, - ConvDesc(T, cdims), TensorDesc(dx), dx, - Cint(count), - out(Ref{Cint}()), - perfResults, workspace, - workspacesize) - )[] - return returnedAlgoCount, perfResults - end -end - -function cudnnConvolutionBackwardData(dx::DenseCuArray{T,N}, w::DenseCuArray{T,N}, dy::DenseCuArray{T,N}, - cdims::DenseConvDims; algo=0, alpha=1, beta=0) where {T,N} - @workspace size=@argout( - cudnnGetConvolutionBackwardDataWorkspaceSize( - handle(), FilterDesc(w), - TensorDesc(dy), ConvDesc(T, cdims), TensorDesc(dx), - cudnnConvolutionBwdDataAlgo_t(algo), - out(Ref{Csize_t}())) - )[] workspace->begin - cudnnConvolutionBackwardData( - handle(), scalingParameter(T, alpha), FilterDesc(w), w, - TensorDesc(dy), dy, ConvDesc(T, cdims), - cudnnConvolutionBwdDataAlgo_t(algo), - workspace, sizeof(workspace), - scalingParameter(T, beta), TensorDesc(dx), dx) - end - return dx -end - -# Backward filter - -function cudnnGetConvolutionBackwardFilterAlgorithmMaxCount() - count=@argout( - cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( - handle(), - out(Ref{Cint}())) - )[] - return count -end - -# will be removed in cuDNN 8 -function cudnnGetConvolutionBackwardFilterAlgorithm(dw::DenseCuArray{T,N}, x::DenseCuArray{T,N}, dy::DenseCuArray{T,N}, - cdims::DenseConvDims; preference=0, workspacesize=1<<32) where {T,N} - algo=@argout( - cudnnGetConvolutionBackwardFilterAlgorithm( - handle(), TensorDesc(x), TensorDesc(dy), - ConvDesc(T, cdims), FilterDesc(dw), cudnnConvolutionBwdFilterPreference_t(preference), - Csize_t(workspacesize), out(Ref{cudnnConvolutionBwdFilterAlgo_t}())) - )[] - return algo -end - -function cudnnGetConvolutionBackwardFilterAlgorithm_v7(dw::DenseCuArray{T,N}, x::DenseCuArray{T,N}, dy::DenseCuArray{T,N}, - cdims::DenseConvDims; count=-1) where {T,N} - if count < 0 - count = cudnnGetConvolutionBackwardFilterAlgorithmMaxCount() - end - perfResults = Array{cudnnConvolutionBwdFilterAlgoPerf_t, 1}(undef, count) - returnedAlgoCount=@argout( - cudnnGetConvolutionBackwardFilterAlgorithm_v7( - handle(), TensorDesc(x), TensorDesc(dy), - ConvDesc(T, cdims), FilterDesc(dw), - Cint(count), - out(Ref{Cint}()), - perfResults) - )[] - return returnedAlgoCount, perfResults -end - -function cudnnFindConvolutionBackwardFilterAlgorithm(dw::DenseCuArray{T,N}, x::DenseCuArray{T,N}, dy::DenseCuArray{T,N}, - cdims::DenseConvDims; count=-1) where {T,N} - if count < 0 - count = cudnnGetConvolutionBackwardFilterAlgorithmMaxCount() - end - perfResults = Array{cudnnConvolutionBwdFilterAlgoPerf_t, 1}(undef, count) - returnedAlgoCount=@argout( - cudnnFindConvolutionBackwardFilterAlgorithm( - handle(), TensorDesc(x), TensorDesc(dy), - ConvDesc(T, cdims), FilterDesc(dw), - Cint(count), - out(Ref{Cint}()), - perfResults) - )[] - return returnedAlgoCount, perfResults -end - -function cudnnFindConvolutionBackwardFilterAlgorithmEx(dw::DenseCuArray{T,N}, x::DenseCuArray{T,N}, dy::DenseCuArray{T,N}, - cdims::DenseConvDims; count=-1, 
workspacesize=1<<32) where {T,N} - if count < 0 - count = cudnnGetConvolutionBackwardFilterAlgorithmMaxCount() - end - @workspace size=workspacesize workspace->begin - perfResults = Array{cudnnConvolutionBwdFilterAlgoPerf_t, 1}(undef, count) - returnedAlgoCount=@argout( - cudnnFindConvolutionBackwardFilterAlgorithmEx( - handle(), TensorDesc(x), x, TensorDesc(dy), - dy, ConvDesc(T, cdims), FilterDesc(dw), dw, - Cint(count), - out(Ref{Cint}()), - perfResults, workspace, - workspacesize) - )[] - return returnedAlgoCount, perfResults - end -end - -function cudnnConvolutionBackwardFilter(dw::DenseCuArray{T,N}, x::DenseCuArray{T,N}, dy::DenseCuArray{T,N}, - cdims::DenseConvDims; algo=0, alpha=1, beta=0) where {T,N} - @workspace size=@argout( - cudnnGetConvolutionBackwardFilterWorkspaceSize( - handle(), TensorDesc(x), - TensorDesc(dy), - ConvDesc(T, cdims), - FilterDesc(dw), - cudnnConvolutionBwdFilterAlgo_t(algo), - out(Ref{Csize_t}())) - )[] workspace->begin - cudnnConvolutionBackwardFilter( - handle(), scalingParameter(T, alpha), TensorDesc(x), x, - TensorDesc(dy), dy, ConvDesc(T, cdims), - cudnnConvolutionBwdFilterAlgo_t(algo), workspace, - sizeof(workspace), scalingParameter(T, beta), FilterDesc(dw), dw) - end - return dw -end - -# Backward bias - -function cudnnConvolutionBackwardBias(db::DenseCuArray{T,N}, dy::DenseCuArray{T,N}; alpha=1, beta=0) where {T,N} - cudnnConvolutionBackwardBias(handle(), - scalingParameter(T, alpha), TensorDesc(dy), dy, - scalingParameter(T, beta), TensorDesc(db), db) - return db -end diff --git a/lib/cudnn/convolution.jl b/lib/cudnn/convolution.jl new file mode 100644 index 0000000000..c85e8dcb04 --- /dev/null +++ b/lib/cudnn/convolution.jl @@ -0,0 +1,253 @@ +# There is too much common code between cudnnConvolutionForward and cudnnConvolutionBiasActivationForward. +# We will have a single interface and call one or the other based on keyword arguments. + +""" + cudnnConvolutionForward(w, x; bias, activation, mode, padding, stride, dilation, group, mathType, reorderType, alpha, beta, z, format) + cudnnConvolutionForward(w, x, d::cudnnConvolutionDescriptor; bias, activation, alpha, beta, z, format) + cudnnConvolutionForward!(y, w, x; bias, activation, mode, padding, stride, dilation, group, mathType, reorderType, alpha, beta, z, format) + cudnnConvolutionForward!(y, w, x, d::cudnnConvolutionDescriptor; bias, activation, alpha, beta, z, format) + +Return the convolution of filter `w` with tensor `x`, overwriting `y` if provided, according +to keyword arguments or the convolution descriptor `d`. Optionally perform bias addition, +activation and/or scaling: + + y .= activation.(alpha * conv(w,x) + beta * z .+ bias) + +All tensors should have the same number of dimensions. If they are less than 4-D their +dimensions are assumed to be padded on the left with 1's. `x` has size `(X...,Cx,N)` where +`(X...)` are the spatial dimensions, `Cx` is the number of input channels, and `N` is the +number of instances. `y,z` have size `(Y...,Cy,N)` where `(Y...)` are the spatial dimensions +and `Cy` is the number of output channels (`y` and `z` can be the same array). Both `Cx` and +`Cy` have to be an exact multiple of `group`. `w` has size `(W...,Cx÷group,Cy)` where +`(W...)` are the filter dimensions. `bias` has size `(1...,Cy,1)`. + +The arguments `padding`, `stride` and `dilation` can be specified as `n-2` dimensional +vectors, tuples or a single integer which is assumed to be repeated `n-2` times. 
If any of +the entries is larger than the corresponding `x` dimension, the `x` dimension is used +instead. For a description of different types of convolution see: +https://towardsdatascience.com/a-comprehensive-introduction-to-different-types-of-convolutions-in-deep-learning-669281e58215 + +Keyword arguments: +* `activation = CUDNN_ACTIVATION_IDENTITY`: the only other supported option is `CUDNN_ACTIVATION_RELU` +* `bias = nothing`: add bias if provided +* `z = nothing`: add `beta*z`, `z` can be `nothing`, `y` or another array similar to `y` +* `alpha = 1, beta = 0`: scaling parameters +* `format = CUDNN_TENSOR_NCHW`: order of tensor dimensions, the other alternative is `CUDNN_TENSOR_NHWC`. Note that Julia dimensions will have the opposite order, i.e. WHCN or CWHN. + +Keyword arguments describing the convolution when `d` is not given: +* `mode = CUDNN_CONVOLUTION`: alternatively `CUDNN_CROSS_CORRELATION` +* `padding = 0`: padding assumed around `x` +* `stride = 1`: how far to shift the convolution window at each step +* `dilation = 1`: dilation factor +* `group = 1`: number of groups to be used +* `mathType = CUDNN.math_mode()`: whether or not the use of tensor op is permitted +* `reorderType = CUDNN_DEFAULT_REORDER`: convolution reorder type +""" +cudnnConvolutionForward, cudnnConvolutionForward! + + +# Public methods +cudnnConvolutionForward(w, x; o...) = cudnnConvolutionForwardWithDefaults(w, x; o...) +cudnnConvolutionForward!(y, w, x; o...) = cudnnConvolutionForwardWithDefaults(w, x; y, o...) +cudnnConvolutionForward(w, x, d::cudnnConvolutionDescriptor; o...) = cudnnConvolutionForwardWithDefaults(w, x; convDesc=d, o...) +cudnnConvolutionForward!(y, w, x, d::cudnnConvolutionDescriptor; o...) = cudnnConvolutionForwardWithDefaults(w, x; y, convDesc=d, o...) + + +# Private method +function cudnnConvolutionForwardWithDefaults( + w, x; + + # convDesc arguments + padding::Union{Integer,Vector{<:Integer},Tuple{<:Integer,Vararg{Int}}} = 0, # >= 0 + stride::Union{Integer,Vector{<:Integer},Tuple{<:Integer,Vararg{Int}}} = 1, # >= 1 + dilation::Union{Integer,Vector{<:Integer},Tuple{<:Integer,Vararg{Int}}} = 1, # >= 1 + mode::cudnnConvolutionMode_t = CUDNN_CONVOLUTION, + mathType::cudnnMathType_t = math_mode(), + reorderType::cudnnReorderType_t = CUDNN_DEFAULT_REORDER, # related to cudnnReorderFilterAndBias? + group::Integer = 1, + format::cudnnTensorFormat_t = CUDNN_TENSOR_NCHW, + convDesc::cudnnConvolutionDescriptor = cudnnConvolutionDescriptor(convdims(padding,size(x),format), convdims(stride,size(x),format), convdims(dilation,size(x),format), mode, cudnnDataType(eltype(x)), mathType, reorderType, Cint(group)), + + # output array, descriptors, scaling factors + xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x; format), + wDesc::cudnnFilterDescriptor = cudnnFilterDescriptor(w; format), + y = cudnnConvolutionForwardOutput(x, xDesc, wDesc, convDesc, format), + yDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(y; format), + alpha::Real = 1, + beta::Real = 0, + + # convbiasact arguments + bias = nothing, + z = nothing, + biasDesc::Union{Nothing,cudnnTensorDescriptor} = (bias===nothing ? nothing : cudnnTensorDescriptor(bias; format)), + zDesc::Union{Nothing,cudnnTensorDescriptor} = (z === nothing ? 
nothing : cudnnTensorDescriptor(z; format)), + activation::cudnnActivationMode_t = CUDNN_ACTIVATION_IDENTITY, # coef and nanOpt are not useful options for convbiasact which only supports relu + + # gradient buffers + dw = Ref{Any}(nothing), + dx = Ref{Any}(nothing), + dz = Ref{Any}(nothing), + dbias = Ref{Any}(nothing), +) + T = eltype(x) + alpha, beta = scalingParameter(T,alpha), scalingParameter(T,beta) + # Backward called separately on each variable. We will calculate all gradients on first call. Use `dready` to avoid subsequent calls. + dready = Ref{Bool}(false) # this will be turned to `true` by the first backward call. + cudnnConvolutionForwardAD(w, x, bias, z; y, activation, convDesc, wDesc, xDesc, yDesc, zDesc, biasDesc, alpha, beta, dw, dx, dz, dbias, dready) +end + + +# AD method +function cudnnConvolutionForwardAD(w, x, bias, z; y, activation, convDesc, wDesc, xDesc, yDesc, zDesc, biasDesc, alpha, beta, dw, dx, dz, dbias, dready) + p = cudnnConvolutionFwdAlgoPerf(xDesc, x, wDesc, w, convDesc, yDesc, y, biasDesc, activation) + @workspace size=p.memory workspace->begin + if bias === nothing && activation === CUDNN_ACTIVATION_IDENTITY && (z === y || beta[] == 0) + cudnnConvolutionForward(handle(), alpha, xDesc, x, wDesc, w, convDesc, p.algo, workspace, sizeof(workspace), beta, yDesc, y) + else + @assert activation === CUDNN_ACTIVATION_IDENTITY || activation === CUDNN_ACTIVATION_RELU "Only RELU and IDENTITY supported" + activationDesc = cudnnActivationDescriptor(activation, CUDNN_NOT_PROPAGATE_NAN, Cdouble(1.0)) + # bias and z cannot be null for cudnnConvolutionBiasActivationForward + if z === nothing; z, zDesc = y, yDesc; beta[] = 0; end + if bias === nothing + format = cudnnGetFilterDescriptor(wDesc)[3] + bdim = (format === CUDNN_TENSOR_NHWC ? 1 : ndims(y)-1) + bias = fill!(similar(w, ntuple(i->(i==bdim ? size(y,i) : 1), ndims(y))), 0) + biasDesc = cudnnTensorDescriptor(bias; format) + end + cudnnConvolutionBiasActivationForward(handle(), alpha, xDesc, x, wDesc, w, convDesc, p.algo, workspace, sizeof(workspace), beta, zDesc, z, biasDesc, bias, activationDesc, yDesc, y) + end + end + return y +end + + +# Deprecated methods +using NNlib: DenseConvDims + +function cudnnConvolutionForward(y::DenseCuArray{T,N}, x::DenseCuArray{T,N}, w::DenseCuArray{T,N}, + cdims::DenseConvDims; algo=0, alpha=1, beta=0) where {T,N} + @warn "`cudnnConvolutionForward(y,x,w,c::DenseConvDims)` is deprecated, please use one of the methods in `@doc cudnnConvolutionForward!`." maxlog=1 + cudnnConvolutionForward!(y, w, x; alpha, beta, padding=nnlibPadding(cdims), stride=NNlib.stride(cdims), dilation=NNlib.dilation(cdims), mode=(NNlib.flipkernel(cdims) ? CUDNN_CROSS_CORRELATION : CUDNN_CONVOLUTION)) +end + +function cudnnConvolutionBiasActivationForward(y::DenseCuArray{T,N}, x::DenseCuArray{T,N}, w::DenseCuArray{T,N}, z::DenseCuArray{T,N}, bias::DenseCuArray{T,N}, + cdims::DenseConvDims; algo=0, alpha1=1, alpha2=1, + activationMode=CUDNN_ACTIVATION_RELU, activationCoeff=0.0, activationReluNanOpt=CUDNN_NOT_PROPAGATE_NAN) where {T,N} + @warn "`cudnnConvolutionBiasActivationForward` is deprecated, please use one of the methods in `@doc cudnnConvolutionForward!`." maxlog=1 + cudnnConvolutionForward!(y, w, x; bias, activation=activationMode, z, alpha=alpha1, beta=alpha2, padding=nnlibPadding(cdims), stride=NNlib.stride(cdims), dilation=NNlib.dilation(cdims), mode=(NNlib.flipkernel(cdims) ? 
CUDNN_CROSS_CORRELATION : CUDNN_CONVOLUTION)) +end + + +# Helper for cudnnConvolutionDescriptor +function cudnnSetConvolutionDescriptor( + ptr::cudnnConvolutionDescriptor_t, + padding::Vector{Cint}, + stride::Vector{Cint}, + dilation::Vector{Cint}, + mode::cudnnConvolutionMode_t, + dataType::cudnnDataType_t, + mathType::cudnnMathType_t, + reorderType::cudnnReorderType_t, + groupCount::Cint, +) + cudnnSetConvolutionNdDescriptor(ptr, Cint(length(padding)), padding, stride, dilation, mode, dataType) + mathType != CUDNN_DEFAULT_MATH && cudnnSetConvolutionMathType(ptr, mathType) + reorderType != CUDNN_DEFAULT_REORDER && cudnnSetConvolutionReorderType(ptr, reorderType) + groupCount != 1 && cudnnSetConvolutionGroupCount(ptr, groupCount) +end + + +## cudnnConvolutionForward helpers: + +function cudnnConvolutionForwardOutput(x, xDesc, wDesc, convDesc, format) + d = Array{Cint}(undef, max(4, ndims(x))) # d = [N,C,Yn,...,Y1] no matter what format + cudnnGetConvolutionNdForwardOutputDim(convDesc, xDesc, wDesc, length(d), d) + if length(d) > ndims(x) # This happens when x is (X,C,N), xDesc is [N,C,X,1] + @assert all(d[ndims(x)+1:end] .== 1) + d = d[1:ndims(x)] + end + # ydims(NCHW)=(Y1,...,Yn,C,N) ydims(NHWC)=(C,Y1,...,Yn,N) + ydims = (format === CUDNN_TENSOR_NCHW ? reverse(d) : (d[2],d[end:-1:3]...,d[1])) + similar(x, ydims...) +end + + +# Convert the integer, tuple or array to convolution dims compatible with array size +function convdims(d, s::Dims{N}, format) where N + @assert d isa Integer || length(d) == N-2 "Cannot conv $(Base.dims2string(s)) array with $d convdims." + xdims = (format === CUDNN_TENSOR_NHWC ? (2:N-1) : (1:N-2)) + return Cint[reverse(min.(d,s[xdims]))...] +end + +convdims(d, s::Dims) = convdims(d, s, CUDNN_TENSOR_NCHW) +convdims(d, s::Dims{0}, format::cudnnTensorFormat_t) = convdims(d, (1,1,1,1), format) +convdims(d, s::Dims{1}, format::cudnnTensorFormat_t) = convdims(d, (1,1,1,s[1]), format) # (1,1,1,Cy) +convdims(d, s::Dims{2}, format::cudnnTensorFormat_t) = convdims(d, (1,1,s[1],s[2]), format) # (1,1,Cx,Cy) +convdims(d, s::Dims{3}, format::cudnnTensorFormat_t) = convdims(d, format === CUDNN_TENSOR_NHWC ? 
(s[1],1,s[2],s[3]) : (1, s...), format) + + +## Utilities to find a fast algorithm + +const cudnnConvolutionFwdAlgoPerfCache = Dict{Tuple,cudnnConvolutionFwdAlgoPerf_t}() +function cudnnConvolutionFwdAlgoPerf(xDesc, x, wDesc, w, convDesc, yDesc, y, biasDesc, activation) + get!(cudnnConvolutionFwdAlgoPerfCache, (xDesc, wDesc, convDesc, biasDesc, activation)) do + requestedAlgoCount = Int(CUDNN_CONVOLUTION_FWD_ALGO_COUNT) + returnedAlgoCount = Cint[0] + perfResults = Array{cudnnConvolutionFwdAlgoPerf_t}(undef,requestedAlgoCount) + @workspace size=cudnnFindConvolutionAlgorithmWorkspaceSize(x) workspace->begin + cudnnFindConvolutionForwardAlgorithmEx(handle(),xDesc,x,wDesc,w,convDesc,yDesc,y,requestedAlgoCount,returnedAlgoCount,perfResults,workspace,sizeof(workspace)) + end + cudnnConvolutionAlgoPerfChoose(perfResults, returnedAlgoCount[1]) + end +end + +const cudnnConvolutionBwdDataAlgoPerfCache = Dict{Tuple,cudnnConvolutionBwdDataAlgoPerf_t}() +function cudnnConvolutionBwdDataAlgoPerf(wDesc, w, dyDesc, dy, convDesc, dxDesc, dx) + get!(cudnnConvolutionBwdDataAlgoPerfCache, (wDesc, dyDesc, convDesc)) do + requestedAlgoCount = Int(CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT) + returnedAlgoCount = Cint[0] + perfResults = Array{cudnnConvolutionBwdDataAlgoPerf_t}(undef,requestedAlgoCount) + @workspace size=cudnnFindConvolutionAlgorithmWorkspaceSize(dx) workspace->begin + cudnnFindConvolutionBackwardDataAlgorithmEx(handle(),wDesc,w,dyDesc,dy,convDesc,dxDesc,dx,requestedAlgoCount,returnedAlgoCount,perfResults,workspace,sizeof(workspace)) + end + cudnnConvolutionAlgoPerfChoose(perfResults, returnedAlgoCount[1]) + end +end + +const cudnnConvolutionBwdFilterAlgoPerfCache = Dict{Tuple,cudnnConvolutionBwdFilterAlgoPerf_t}() +function cudnnConvolutionBwdFilterAlgoPerf(xDesc, x, dyDesc, dy, convDesc, dwDesc, dw) + get!(cudnnConvolutionBwdFilterAlgoPerfCache, (xDesc, dyDesc, convDesc)) do + requestedAlgoCount = Int(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT) + returnedAlgoCount = Cint[0] + perfResults = Array{cudnnConvolutionBwdFilterAlgoPerf_t}(undef,requestedAlgoCount) + @workspace size=cudnnFindConvolutionAlgorithmWorkspaceSize(x) workspace->begin + cudnnFindConvolutionBackwardFilterAlgorithmEx(handle(),xDesc,x,dyDesc,dy,convDesc,dwDesc,dw,requestedAlgoCount,returnedAlgoCount,perfResults,workspace,sizeof(workspace)) + end + cudnnConvolutionAlgoPerfChoose(perfResults, returnedAlgoCount[1]) + end +end + + +# Return algorithm with best memory that is within 10% of best time +function cudnnConvolutionAlgoPerfChoose(ps, n) + (ibest,mbest,tbest) = (0,Inf,Inf) + for i in 1:n + # These metrics are written in a sorted fashion where the first element has the lowest compute time. + if ps[i].status == CUDNN_STATUS_SUCCESS && ps[i].memory < mbest && ps[i].time < tbest * 1.1 + (ibest,mbest,tbest) = (i,ps[i].memory,ps[i].time) + end + end + if ibest == 0 + @warn "No valid algorithm found, probably bad params for convolution." 
maxlog=1 + ibest = findfirst(p->p.algo==0, ps) + ibest === nothing && error("Cannot find backup algorithm for convolution, giving up.") + end + return ps[ibest] +end + + +# Allocate the maximum reasonable amount of memory for algorithm discovery +function cudnnFindConvolutionAlgorithmWorkspaceSize(x) + gpufree = Mem.info()[1] + CUDA.cached_memory() + min(gpufree ÷ 10, sizeof(x) * 100) +end diff --git a/lib/cudnn/descriptors.jl b/lib/cudnn/descriptors.jl new file mode 100644 index 0000000000..ed6a9fa6f9 --- /dev/null +++ b/lib/cudnn/descriptors.jl @@ -0,0 +1,220 @@ +using Base: @__doc__ + + +""" + @cudnnDescriptor(XXX, setter=cudnnSetXXXDescriptor) + +Defines a new type `cudnnXXXDescriptor` with a single field `ptr::cudnnXXXDescriptor_t` and +its constructor. The second optional argument is the function that sets the descriptor +fields and defaults to `cudnnSetXXXDescriptor`. The constructor is memoized, i.e. when +called with the same arguments it returns the same object rather than creating a new one. + +The arguments of the constructor and thus the keys to the memoization cache depend on the +setter: If the setter has arguments `cudnnSetXXXDescriptor(ptr::cudnnXXXDescriptor_t, +args...)`, then the constructor has `cudnnXXXDescriptor(args...)`. The user can control +these arguments by defining a custom setter. +""" +macro cudnnDescriptor(x, set = Symbol("cudnnSet$(x)Descriptor")) + sname = Symbol("cudnn$(x)Descriptor") + tname = Symbol("cudnn$(x)Descriptor_t") + cache = Symbol("cudnn$(x)DescriptorCache") + create = Symbol("cudnnCreate$(x)Descriptor") + destroy = Symbol("cudnnDestroy$(x)Descriptor") + return quote + @__doc__ mutable struct $sname # needs to be mutable for finalizer + ptr::$tname + $sname(p::$tname) = new(p) # prevent $sname(::Any) default constructor + end + Base.unsafe_convert(::Type{<:Ptr}, d::$sname)=d.ptr # needed for ccalls + const $cache = Dict{Tuple,$sname}() # Dict is 3x faster than IdDict! + function $sname(args...) + get!($cache, args) do + ptr = $tname[C_NULL] + $create(ptr) + $set(ptr[1], args...) 
+ d = $sname(ptr[1]) + finalizer(x->$destroy(x.ptr), d) + return d + end + end + end |> esc +end + + +""" + cudnnActivationDescriptor(mode::cudnnActivationMode_t, + reluNanOpt::cudnnNanPropagation_t, + coef::Cfloat) +""" +@cudnnDescriptor(Activation) + + +""" + cudnnAttnDescriptor(attnMode::Cuint, + nHeads::Cint, + smScaler::Cdouble, + dataType::cudnnDataType_t, + computePrec::cudnnDataType_t, + mathType::cudnnMathType_t, + attnDropoutDesc::cudnnDropoutDescriptor_t, + postDropoutDesc::cudnnDropoutDescriptor_t, + qSize::Cint, + kSize::Cint, + vSize::Cint, + qProjSize::Cint, + kProjSize::Cint, + vProjSize::Cint, + oProjSize::Cint, + qoMaxSeqLength::Cint, + kvMaxSeqLength::Cint, + maxBatchSize::Cint, + maxBeamSize::Cint) +""" +@cudnnDescriptor(Attn) + + +""" + cudnnCTCLossDescriptor(compType::cudnnDataType_t, + normMode::cudnnLossNormalizationMode_t, + gradMode::cudnnNanPropagation_t, + maxLabelLength::Cint) +""" +@cudnnDescriptor(CTCLoss, cudnnSetCTCLossDescriptor_v8) + + +""" +cudnnConvolutionDescriptor(pad::Vector{Cint}, + stride::Vector{Cint}, + dilation::Vector{Cint}, + mode::cudnnConvolutionMode_t, + dataType::cudnnDataType_t, + groupCount::Cint, + mathType::cudnnMathType_t, + reorderType::cudnnReorderType_t) +""" +@cudnnDescriptor(Convolution, cudnnSetConvolutionDescriptor) + + +""" + cudnnDropoutDescriptor(dropout::Real) +""" +@cudnnDescriptor(Dropout, cudnnSetDropoutDescriptorFromFloat) + + +""" + cudnnFilterDescriptor(dataType::cudnnDataType_t, + format::cudnnTensorFormat_t, + nbDims::Cint, + filterDimA::Vector{Cint}) +""" +@cudnnDescriptor(Filter, cudnnSetFilterNdDescriptor) + + +""" + cudnnLRNDescriptor(lrnN::Cuint, + lrnAlpha::Cdouble, + lrnBeta::Cdouble, + lrnK::Cdouble) +""" +@cudnnDescriptor(LRN) + + +""" + cudnnOpTensorDescriptor(opTensorOp::cudnnOpTensorOp_t, + opTensorCompType::cudnnDataType_t, + opTensorNanOpt::cudnnNanPropagation_t) +""" +@cudnnDescriptor(OpTensor) + + +""" + cudnnPoolingDescriptor(mode::cudnnPoolingMode_t, + maxpoolingNanOpt::cudnnNanPropagation_t, + nbDims::Cint, + windowDimA::Vector{Cint}, + paddingA::Vector{Cint}, + strideA::Vector{Cint}) +""" +@cudnnDescriptor(Pooling, cudnnSetPoolingNdDescriptor) + + +""" + cudnnRNNDescriptor(algo::cudnnRNNAlgo_t, + cellMode::cudnnRNNMode_t, + biasMode::cudnnRNNBiasMode_t, + dirMode::cudnnDirectionMode_t, + inputMode::cudnnRNNInputMode_t, + dataType::cudnnDataType_t, + mathPrec::cudnnDataType_t, + mathType::cudnnMathType_t, + inputSize::Int32, + hiddenSize::Int32, + projSize::Int32, + numLayers::Int32, + dropoutDesc::cudnnDropoutDescriptor_t, + auxFlags::UInt32) +""" +@cudnnDescriptor(RNN, cudnnSetRNNDescriptor_v8) + + +""" + cudnnRNNDataDescriptor(dataType::cudnnDataType_t, + layout::cudnnRNNDataLayout_t, + maxSeqLength::Cint, + batchSize::Cint, + vectorSize::Cint, + seqLengthArray::Vector{Cint}, + paddingFill::Ptr{Cvoid}) +""" +@cudnnDescriptor(RNNData) + + +""" + cudnnReduceTensorDescriptor(reduceTensorOp::cudnnReduceTensorOp_t, + reduceTensorCompType::cudnnDataType_t, + reduceTensorNanOpt::cudnnNanPropagation_t, + reduceTensorIndices::cudnnReduceTensorIndices_t, + reduceTensorIndicesType::cudnnIndicesType_t) +""" +@cudnnDescriptor(ReduceTensor) + + +""" + cudnnSeqDataDescriptor(dataType::cudnnDataType_t, + nbDims::Cint, + dimA::Vector{Cint}, + axes::Vector{cudnnSeqDataAxis_t}, + seqLengthArraySize::Csize_t, + seqLengthArray::Vector{Cint}, + paddingFill::Ptr{Cvoid}) +""" +@cudnnDescriptor(SeqData) + + +""" + cudnnSpatialTransformerDescriptor(samplerType::cudnnSamplerType_t, + dataType::cudnnDataType_t, + 
nbDims::Cint, + dimA::Vector{Cint}) +""" +@cudnnDescriptor(SpatialTransformer, cudnnSetSpatialTransformerNdDescriptor) + + +""" + cudnnTensorDescriptor(format::cudnnTensorFormat_t, + dataType::cudnnDataType_t, + nbDims::Cint, + dimA::Vector{Cint}) +""" +@cudnnDescriptor(Tensor, cudnnSetTensorNdDescriptorEx) + + +""" + cudnnTensorTransformDescriptor(nbDims::UInt32, + destFormat::cudnnTensorFormat_t, + padBeforeA::Vector{Int32}, + padAfterA::Vector{Int32}, + foldA::Vector{UInt32}, + direction::cudnnFoldingDirection_t) +""" +@cudnnDescriptor(TensorTransform) diff --git a/lib/cudnn/dropout.jl b/lib/cudnn/dropout.jl index a683b7a680..fc76c18ae2 100644 --- a/lib/cudnn/dropout.jl +++ b/lib/cudnn/dropout.jl @@ -1,22 +1,103 @@ -# descriptor +""" + cudnnDropoutForward(x; dropout=0.5) + cudnnDropoutForward(x, d::cudnnDropoutDescriptor) + cudnnDropoutForward!(y, x; dropout=0.5) + cudnnDropoutForward!(y, x, d::cudnnDropoutDescriptor) -mutable struct DropoutDesc - ptr::Ptr{Nothing} - states::CuVector{UInt8} +Return a new array similar to `x` where approximately `dropout` fraction of the values are +replaced by a 0, and the rest are scaled by `1/(1-dropout)`. Optionally `y` holds the +result and `d` specifies the operation. `y` should be similar to `x` if specified. + +The user can set the global seed `cudnnDropoutSeed[]` to a positive number to always drop +the same values deterministically for debugging. Note that this slows down the operation by +about 40x. + +The global constant `cudnnDropoutState::Dict` holds the random number generator state for +each CUDNN handle. +""" +cudnnDropoutForward, cudnnDropoutForward!, cudnnDropoutSeed, cudnnDropoutState + + +# Public methods +cudnnDropoutForward(x; o...) = cudnnDropoutForwardWithDefaults(x; o...) +cudnnDropoutForward!(y, x; o...) = cudnnDropoutForwardWithDefaults(x; y, o...) +cudnnDropoutForward(x, d::cudnnDropoutDescriptor; o...) = cudnnDropoutForwardWithDefaults(x; dropoutDesc=d, o...) +cudnnDropoutForward!(y, x, d::cudnnDropoutDescriptor; o...) = cudnnDropoutForwardWithDefaults(x; y, dropoutDesc=d, o...) 
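+
+# For example (an illustrative sketch): each call normally uses a time-based seed; for
+# reproducible results while debugging, set the global seed first (~40x slower, see the
+# docstring above):
+#
+#   CUDA.CUDNN.cudnnDropoutSeed[] = 42
+#   y = cudnnDropoutForward(x; dropout=0.3)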
+ + +# Private method +function cudnnDropoutForwardWithDefaults( + x; + y = similar(x), + dropout::Real = 0.5, + dropoutDesc::cudnnDropoutDescriptor = cudnnDropoutDescriptor(Cfloat(dropout)), + xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x), + yDesc::cudnnTensorDescriptor = xDesc, + reserveSpace::CuArray = cudnnDropoutReserveSpace(xDesc) +) + if cudnnDropoutSeed[] >= 0 + # This is a very expensive call (40x dropout), so only use for debugging + @warn "CUDA.CUDNN.cudnnDropoutSeed[] >= 0: dropout operations will be deterministic but 40x more expensive" maxlog=1 + dropout, states, seed = cudnnGetDropoutDescriptor(dropoutDesc) + hstate = cudnnDropoutState[handle()] + @assert states == pointer(hstate) + @retry_reclaim(isequal(CUDNN_STATUS_ALLOC_FAILED), + cudnnSetDropoutDescriptor(dropoutDesc, handle(), dropout, hstate, sizeof(hstate), cudnnDropoutSeed[])) + end + cudnnDropoutForwardAD(x; xDesc, y, yDesc, dropoutDesc, reserveSpace) +end + +function cudnnDropoutReserveSpace(td::cudnnTensorDescriptor) + # reserveSpace is ~1/8 of tensor size and passes info between forw and back + rss = Csize_t[0]; cudnnDropoutGetReserveSpaceSize(td, rss) + return cudnnTempSpace(rss[1]) +end + + +# AD method +function cudnnDropoutForwardAD(x; xDesc, y, yDesc, dropoutDesc, reserveSpace) + cudnnDropoutForward(handle(), dropoutDesc, xDesc, x, yDesc, y, reserveSpace, sizeof(reserveSpace)) + return y +end + + +# Global RNG state: This should NOT be reallocated for each descriptor! However +# cudnnDropoutForward() doc says: "This function should not be running concurrently with +# another cudnnDropoutForward() function using the same states." So I am going to assume +# using a single buffer per handle is ok. + +const cudnnDropoutState = Dict{Ptr,CuArray}() # handle -> state + +# Global dropout seed: To debug gradients set cudnnDropoutSeed[] >= 0 which makes all +# dropout operations deterministic but about 40x more expensive. + +const cudnnDropoutSeed = Ref{Int}(-1) + + +# Helper for cudnnDropoutDescriptor constructor from float: +# Calls to cudnnDropoutDescriptor with identical Cfloats will return the same object thanks +# to caching. If the user wants to set the seed to replicate an experiment, that is taken +# care of during the forward call. + +function cudnnSetDropoutDescriptorFromFloat(ptr::cudnnDropoutDescriptor_t, dropout::Real) + hstate = get!(cudnnDropoutState, handle()) do + cudnnTempSpace(cudnnDropoutGetStatesSize()) + end + seed = floor(Culonglong,time()) + @retry_reclaim(isequal(CUDNN_STATUS_ALLOC_FAILED), + cudnnSetDropoutDescriptor(ptr, handle(), Cfloat(dropout), hstate, sizeof(hstate), seed)) +end + + +function cudnnGetDropoutDescriptor(d::cudnnDropoutDescriptor) + dropout, states, seed = Ref{Cfloat}(0), Ref{CuPtr{Nothing}}(0), Ref{Culonglong}(0) + cudnnGetDropoutDescriptor(d::cudnnDropoutDescriptor, handle(), dropout, states, seed) + return (dropout[], states[], seed[]) end -Base.unsafe_convert(::Type{Ptr{Nothing}}, dd::DropoutDesc) = dd.ptr - -function DropoutDesc(ρ::Real; seed::Integer=0) - d = [C_NULL] - s = Csize_t[0] - cudnnCreateDropoutDescriptor(d) - cudnnDropoutGetStatesSize(handle(), s) - states = CuArray{UInt8}(undef, s[]) # TODO: can we drop this when ρ=0? 
- desc = DropoutDesc(d[], states) - cudnnSetDropoutDescriptor(desc, handle(), ρ, states, length(states), seed) - finalizer(desc) do x - cudnnDestroyDropoutDescriptor(x) - end - return desc +function cudnnDropoutGetStatesSize() + ssize = Ref{Csize_t}(0) + cudnnDropoutGetStatesSize(handle(), ssize) + @assert ssize[] > 0 + return ssize[] end diff --git a/lib/cudnn/filter.jl b/lib/cudnn/filter.jl deleted file mode 100644 index b2113b992b..0000000000 --- a/lib/cudnn/filter.jl +++ /dev/null @@ -1,41 +0,0 @@ -# descriptor - -mutable struct FilterDesc - ptr::cudnnFilterDescriptor_t -end - -unsafe_free!(fd::FilterDesc) = cudnnDestroyFilterDescriptor(fd.ptr) - -Base.unsafe_convert(::Type{cudnnFilterDescriptor_t}, fd::FilterDesc) = fd.ptr - -function createFilterDesc() - d = Ref{cudnnFilterDescriptor_t}() - cudnnCreateFilterDescriptor(d) - return d[] -end - -function FilterDesc(T::Type, size::Tuple; format = CUDNN_TENSOR_NCHW) - # The only difference of a FilterDescriptor is no strides. - sz = Cint.(size) |> reverse |> collect - d = createFilterDesc() - version() >= v"5" ? - cudnnSetFilterNdDescriptor(d, cudnnDataType(T), format, length(sz), sz) : - version() >= v"4" ? - cudnnSetFilterNdDescriptor_v4(d, cudnnDataType(T), format, length(sz), sz) : - cudnnSetFilterNdDescriptor(d, cudnnDataType(T), length(sz), sz) - this = FilterDesc(d) - finalizer(unsafe_free!, this) - return this -end - -FilterDesc(a::DenseCuArray; format = CUDNN_TENSOR_NCHW) = FilterDesc(eltype(a), size(a), format = format) - -function Base.size(f::FilterDesc) - typ = Ref{Cuint}() - format = Ref{Cuint}() - ndims = Ref{Cint}() - dims = Vector{Cint}(undef, 8) - cudnnGetFilterNdDescriptor(f, 8, typ, format, ndims, dims) - @assert ndims[] ≤ 8 - return (dims[1:ndims[]]...,) |> reverse -end diff --git a/lib/cudnn/inplace.jl b/lib/cudnn/inplace.jl new file mode 100644 index 0000000000..322dd0f5b4 --- /dev/null +++ b/lib/cudnn/inplace.jl @@ -0,0 +1,68 @@ +""" + cudnnSetTensor!(x, s) + +Set all elements of tensor `x` to scalar `s` and return `x`. +""" +function cudnnSetTensor!( + x, s::Real; + format::cudnnTensorFormat_t = CUDNN_TENSOR_NCHW, + xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x; format) +) + cudnnSetTensor(handle(), xDesc, x, Ref(eltype(x)(s))) + return x +end + + +""" + cudnnScaleTensor(x, s) + cudnnScaleTensor!(y, x, s) + +Scale all elements of tensor `x` with scale `s` and return the result. `cudnnScaleTensor` +allocates a new array for the answer, `cudnnScaleTensor!` overwrites `y`. +""" +cudnnScaleTensor, cudnnScaleTensor! + +function cudnnScaleTensor!( + y, x, s::Real; + format::cudnnTensorFormat_t = CUDNN_TENSOR_NCHW, + xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x; format) +) + y === x || copyto!(y, x) + cudnnScaleTensor(handle(), xDesc, y, Ref(eltype(y)(s))) + return y +end + +cudnnScaleTensor(x, s::Real; o...) = cudnnScaleTensor!(similar(x), x, s; o...) + + +# cudnnAddTensor does not support all broadcasting dimensions, use cudnnOpTensor instead. +# Compared to libknet8 x .+ b it is ~2x slower for (1,1,100,100), ~30% faster for (14,14,256,32) +# CUDA.jl x .+ b is 2x slower than both + +""" + cudnnAddTensor(x, b; alpha) + cudnnAddTensor!(y, x, b; alpha, beta) + +Broadcast-add tensor `b` to tensor `x`. `alpha=1, beta=1` are used for scaling, i.e. `y .= +alpha * b .+ beta * x`. `cudnnAddTensor` allocates a new array for the answer, +`cudnnAddTensor!` overwrites `y`. Does not support all valid broadcasting dimensions. For +more flexible broadcast operations see `cudnnOpTensor`. 
+""" +cudnnAddTensor, cudnnAddTensor! + +function cudnnAddTensor!( + y, x, b; + alpha::Real=1, + beta::Real=1, + format::cudnnTensorFormat_t = CUDNN_TENSOR_NCHW, + bDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(b; format), + xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x; format), +) + T = eltype(x) + alpha, beta = scalingParameter(T, alpha), scalingParameter(T, beta) + y === x || copyto!(y, x) + cudnnAddTensor(handle(), alpha, bDesc, b, beta, xDesc, y) + return y +end + +cudnnAddTensor(x, b; o...) = cudnnAddTensor!(similar(x), x, b; o...) diff --git a/lib/cudnn/libcudnn.jl b/lib/cudnn/libcudnn.jl index da87bbcbaf..14ce1b4f9a 100644 --- a/lib/cudnn/libcudnn.jl +++ b/lib/cudnn/libcudnn.jl @@ -2306,8 +2306,7 @@ end @checked function cudnnGetRNNWeightParams(handle, rnnDesc, pseudoLayer, weightSpaceSize, weightSpace, linLayerID, mDesc, mAddr, bDesc, bAddr) initialize_api() - @runtime_ccall((:cudnnGetRNNWeightParams, libcudnn()), cudnnStatus_t, (cudnnHandle_t, cudnnRNNDescriptor_t, Int32, Csize_t, Ptr{Cvoid}, Int32, cudnnTensorDescriptor_t, Ptr{Ptr{Cvoid}}, cudnnTensorDescriptor_t, Ptr{Ptr{Cvoid}}), handle, rnnDesc, pseudoLayer, weightSpaceSize, weightSpace, linLayerID, mDesc, mAddr, bDesc, bAddr) - # not sure about memory residency here, isn't clearly documented + @runtime_ccall((:cudnnGetRNNWeightParams, libcudnn()), cudnnStatus_t, (cudnnHandle_t, cudnnRNNDescriptor_t, Int32, Csize_t, CuPtr{Cvoid}, Int32, cudnnTensorDescriptor_t, Ptr{CuPtr{Cvoid}}, cudnnTensorDescriptor_t, Ptr{CuPtr{Cvoid}}), handle, rnnDesc, pseudoLayer, weightSpaceSize, weightSpace, linLayerID, mDesc, mAddr, bDesc, bAddr) end @checked function cudnnGetNormalizationForwardTrainingWorkspaceSize(handle, mode, normOps, algo, xDesc, zDesc, yDesc, normScaleBiasDesc, activationDesc, normMeanVarDesc, sizeInBytes, groupCnt) @@ -2357,8 +2356,7 @@ end @checked function cudnnNormalizationForwardTraining(handle, mode, normOps, algo, alpha, beta, xDesc, xData, normScaleBiasDesc, normScale, normBias, exponentialAverageFactor, normMeanVarDesc, resultRunningMean, resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance, activationDesc, zDesc, zData, yDesc, yData, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes, groupCnt) initialize_api() - @runtime_ccall((:cudnnNormalizationForwardTraining, libcudnn()), cudnnStatus_t, (cudnnHandle_t, cudnnNormMode_t, cudnnNormOps_t, cudnnNormAlgo_t, Ptr{Cvoid}, Ptr{Cvoid}, cudnnTensorDescriptor_t, CuPtr{Cvoid}, cudnnTensorDescriptor_t, CuPtr{Cvoid}, CuPtr{Cvoid}, Cdouble, cudnnTensorDescriptor_t, Ptr{Cvoid}, Ptr{Cvoid}, Cdouble, Ptr{Cvoid}, Ptr{Cvoid}, cudnnActivationDescriptor_t, cudnnTensorDescriptor_t, CuPtr{Cvoid}, cudnnTensorDescriptor_t, CuPtr{Cvoid}, CuPtr{Cvoid}, Csize_t, CuPtr{Cvoid}, Csize_t, Cint), handle, mode, normOps, algo, alpha, beta, xDesc, xData, normScaleBiasDesc, normScale, normBias, exponentialAverageFactor, normMeanVarDesc, resultRunningMean, resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance, activationDesc, zDesc, zData, yDesc, yData, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes, groupCnt) - # not sure about residency of resultSaveMean and resultSaveInvVariance: host or device? 
+ @runtime_ccall((:cudnnNormalizationForwardTraining, libcudnn()), cudnnStatus_t, (cudnnHandle_t, cudnnNormMode_t, cudnnNormOps_t, cudnnNormAlgo_t, Ptr{Cvoid}, Ptr{Cvoid}, cudnnTensorDescriptor_t, CuPtr{Cvoid}, cudnnTensorDescriptor_t, CuPtr{Cvoid}, CuPtr{Cvoid}, Cdouble, cudnnTensorDescriptor_t, CuPtr{Cvoid}, CuPtr{Cvoid}, Cdouble, CuPtr{Cvoid}, CuPtr{Cvoid}, cudnnActivationDescriptor_t, cudnnTensorDescriptor_t, CuPtr{Cvoid}, cudnnTensorDescriptor_t, CuPtr{Cvoid}, CuPtr{Cvoid}, Csize_t, CuPtr{Cvoid}, Csize_t, Cint), handle, mode, normOps, algo, alpha, beta, xDesc, xData, normScaleBiasDesc, normScale, normBias, exponentialAverageFactor, normMeanVarDesc, resultRunningMean, resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance, activationDesc, zDesc, zData, yDesc, yData, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes, groupCnt) end @checked function cudnnGetCTCLossWorkspaceSize_v8(handle, algo, ctcLossDesc, probsDesc, gradientsDesc, sizeInBytes) @@ -2388,6 +2386,5 @@ end @checked function cudnnNormalizationBackward(handle, mode, normOps, algo, alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff, xDesc, xData, yDesc, yData, dyDesc, dyData, dzDesc, dzData, dxDesc, dxData, dNormScaleBiasDesc, normScaleData, normBiasData, dNormScaleData, dNormBiasData, epsilon, normMeanVarDesc, savedMean, savedInvVariance, activationDesc, workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes, groupCnt) initialize_api() - @runtime_ccall((:cudnnNormalizationBackward, libcudnn()), cudnnStatus_t, (cudnnHandle_t, cudnnNormMode_t, cudnnNormOps_t, cudnnNormAlgo_t, Ptr{Cvoid}, Ptr{Cvoid}, Ptr{Cvoid}, Ptr{Cvoid}, cudnnTensorDescriptor_t, CuPtr{Cvoid}, cudnnTensorDescriptor_t, CuPtr{Cvoid}, cudnnTensorDescriptor_t, CuPtr{Cvoid}, cudnnTensorDescriptor_t, CuPtr{Cvoid}, cudnnTensorDescriptor_t, CuPtr{Cvoid}, cudnnTensorDescriptor_t, CuPtr{Cvoid}, CuPtr{Cvoid}, CuPtr{Cvoid}, CuPtr{Cvoid}, Cdouble, cudnnTensorDescriptor_t, Ptr{Cvoid}, Ptr{Cvoid}, cudnnActivationDescriptor_t, CuPtr{Cvoid}, Csize_t, CuPtr{Cvoid}, Csize_t, Cint), handle, mode, normOps, algo, alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff, xDesc, xData, yDesc, yData, dyDesc, dyData, dzDesc, dzData, dxDesc, dxData, dNormScaleBiasDesc, normScaleData, normBiasData, dNormScaleData, dNormBiasData, epsilon, normMeanVarDesc, savedMean, savedInvVariance, activationDesc, workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes, groupCnt) - # savedMean and savedInvVariance in host or device memory? 
+    @runtime_ccall((:cudnnNormalizationBackward, libcudnn()), cudnnStatus_t, (cudnnHandle_t, cudnnNormMode_t, cudnnNormOps_t, cudnnNormAlgo_t, Ptr{Cvoid}, Ptr{Cvoid}, Ptr{Cvoid}, Ptr{Cvoid}, cudnnTensorDescriptor_t, CuPtr{Cvoid}, cudnnTensorDescriptor_t, CuPtr{Cvoid}, cudnnTensorDescriptor_t, CuPtr{Cvoid}, cudnnTensorDescriptor_t, CuPtr{Cvoid}, cudnnTensorDescriptor_t, CuPtr{Cvoid}, cudnnTensorDescriptor_t, CuPtr{Cvoid}, CuPtr{Cvoid}, CuPtr{Cvoid}, CuPtr{Cvoid}, Cdouble, cudnnTensorDescriptor_t, CuPtr{Cvoid}, CuPtr{Cvoid}, cudnnActivationDescriptor_t, CuPtr{Cvoid}, Csize_t, CuPtr{Cvoid}, Csize_t, Cint), handle, mode, normOps, algo, alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff, xDesc, xData, yDesc, yData, dyDesc, dyData, dzDesc, dzData, dxDesc, dxData, dNormScaleBiasDesc, normScaleData, normBiasData, dNormScaleData, dNormBiasData, epsilon, normMeanVarDesc, savedMean, savedInvVariance, activationDesc, workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes, groupCnt)
 end
diff --git a/lib/cudnn/multiheadattn.jl b/lib/cudnn/multiheadattn.jl
new file mode 100644
index 0000000000..559c159fa3
--- /dev/null
+++ b/lib/cudnn/multiheadattn.jl
@@ -0,0 +1,288 @@
+@doc raw"""
+    cudnnMultiHeadAttnForward(weights, queries, keys, values; o...)
+    cudnnMultiHeadAttnForward!(out, weights, queries, keys, values; o...)
+    cudnnMultiHeadAttnForward(weights, queries, keys, values, d::cudnnAttnDescriptor; o...)
+    cudnnMultiHeadAttnForward!(out, weights, queries, keys, values, d::cudnnAttnDescriptor; o...)
+
+Return the multi-head attention result with `weights`, `queries`, `keys`, and `values`,
+overwriting `out` if provided, according to keyword arguments or the attention descriptor
+`d`. The multi-head attention model can be described by the following equations:
+
+```math
+\begin{aligned}
+h_i &= (W_{V,i} V) \operatorname{softmax}(\operatorname{smScaler}(K^T W^T_{K,i}) (W_{Q,i} q)) \\
+\operatorname{MultiHeadAttn}(q,K,V,W_Q,W_K,W_V,W_O) &= \sum_{i=0}^{\operatorname{nHeads}-1} W_{O,i} h_i
+\end{aligned}
+```
+
+The input arguments are:
+* `out`: Optional output tensor.
+* `weights`: A weight buffer that contains ``W_Q, W_K, W_V, W_O``.
+* `queries`: A query tensor ``Q`` which may contain a batch of queries (the above equations are written for a single query vector ``q`` for simplicity).
+* `keys`: The keys tensor ``K``.
+* `values`: The values tensor ``V``.
+
+Keyword arguments describing the tensors:
+* `axes::Vector{cudnnSeqDataAxis_t} = [CUDNN_SEQDATA_VECT_DIM, CUDNN_SEQDATA_BATCH_DIM, CUDNN_SEQDATA_TIME_DIM, CUDNN_SEQDATA_BEAM_DIM]`: an array of length 4 that specifies the role of the (Julia) dimensions. VECT has to be the first dimension; all 6 permutations of the remaining three are supported.
+* `seqLengthsQO::Vector{<:Integer}`: sequence lengths in the queries and out containers. By default sequences are assumed to span the full length of the TIME dimension.
+* `seqLengthsKV::Vector{<:Integer}`: sequence lengths in the keys and values containers. By default sequences are assumed to span the full length of the TIME dimension.
+
+Keyword arguments describing the attention operation when `d` is not given:
+* `attnMode::Unsigned = CUDNN_ATTN_QUERYMAP_ALL_TO_ONE | CUDNN_ATTN_DISABLE_PROJ_BIASES`: bitwise flags indicating various attention options. See cudnn docs for details.
+* `nHeads::Integer = 1`: number of attention heads.
+* `smScaler::Real = 1`: softmax smoothing (1.0 >= smScaler >= 0.0) or sharpening (smScaler > 1.0) coefficient. Negative values are not accepted.
+* `mathType::cudnnMathType_t = math_mode()`: NVIDIA Tensor Core settings. +* `qProjSize, kProjSize, vProjSize, oProjSize`: vector lengths after projections, set to 0 by default which disables projections. +* `qoMaxSeqLength::Integer`: largest sequence length expected in queries and out, set to their TIME dim by default. +* `kvMaxSeqLength::Integer`: largest sequence length expected in keys and values, set to their TIME dim by default. +* `maxBatchSize::Integer`: largest batch size expected in any container, set to the BATCH dim of queries by default. +* `maxBeamSize::Integer`: largest beam size expected in any container, set to the BEAM dim of queries by default. + +Other keyword arguments: +* `residuals = nothing`: optional tensor with the same size as queries that can be used to implement residual connections (see figure in cudnn docs). When residual connections are enabled, the vector length in `queries` should match the vector length in `out`, so that a vector addition is feasible. +* `currIdx::Integer = -1`: Time-step (0-based) in queries to process. When the currIdx argument is negative, all Q time-steps are processed. When currIdx is zero or positive, the forward response is computed for the selected time-step only. The latter input can be used in inference mode only, to process one time-step while updating the next attention window and Q, R, K, V inputs in-between calls. +* `loWinIdx, hiWinIdx::Array{Cint}`: Two host integer arrays specifying the start and end (0-based) indices of the attention window for each Q time-step. The start index in K, V sets is inclusive, and the end index is exclusive. By default set at 0 and `kvMaxSeqLength` respectively. +""" +cudnnMultiHeadAttnForward, cudnnMultiHeadAttnForward! + + +# The axes argument in the functions below specifies the role of the axes using Julia order: VECT,BATCH,TIME,BEAM by default. Missing trailing dims assumed 1. +const cudnnSeqDataDefaultAxes = [CUDNN_SEQDATA_VECT_DIM, CUDNN_SEQDATA_BATCH_DIM, CUDNN_SEQDATA_TIME_DIM, CUDNN_SEQDATA_BEAM_DIM] + + +# Public methods +cudnnMultiHeadAttnForward(w,q,k,v; o...) = cudnnMultiHeadAttnForward(w,q,k,v, cudnnAttnDescriptor(q,k,v;o...); o...) +cudnnMultiHeadAttnForward(w,q,k,v, d::cudnnAttnDescriptor; o...) = cudnnMultiHeadAttnForward!(cudnnAttnOutput(q,k,v,d), w,q,k,v,d; o...) +cudnnMultiHeadAttnForward!(out, w,q,k,v; o...) = cudnnMultiHeadAttnForward!(out, w,q,k,v, cudnnAttnDescriptor(q,k,v;o...); o...) 
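+
+# A minimal smoke-test sketch of the public methods above. The sizes are hypothetical,
+# and the zero-filled opaque weight buffer (sized via cudnnMultiHeadAttnBuffers) is an
+# assumption for the default case where all projections are disabled:
+#
+#   Q = CUDA.rand(Float32, 64, 8, 10)   # (VECT,BATCH,TIME), BEAM assumed 1
+#   K = CUDA.rand(Float32, 64, 8, 12)   # key vector size must match query vector size
+#   V = CUDA.rand(Float32, 64, 8, 12)   # keys and values share BATCH/TIME/BEAM dims
+#   d = cudnnAttnDescriptor(Q, K, V)    # defaults: nHeads=1, all projections disabled
+#   weightSize, _, _ = cudnnMultiHeadAttnBuffers(d)
+#   W = CUDA.zeros(UInt8, weightSize)   # opaque buffer holding W_Q, W_K, W_V, W_O
+#   O = cudnnMultiHeadAttnForward(W, Q, K, V, d)   # size (64, 8, 10), like queries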
+ +function cudnnMultiHeadAttnForward!( + out, weights, queries, keys, values, attnDesc::cudnnAttnDescriptor; + + # Input tensor descriptors + axes::Vector{cudnnSeqDataAxis_t} = cudnnSeqDataDefaultAxes, + seqLengthsQO::Vector{<:Integer} = fill(Cint(sdim(queries,axes,CUDNN_SEQDATA_TIME_DIM)), sdim(queries,axes,CUDNN_SEQDATA_BATCH_DIM)*sdim(queries,axes,CUDNN_SEQDATA_BEAM_DIM)), + seqLengthsKV::Vector{<:Integer} = fill(Cint(sdim(keys,axes,CUDNN_SEQDATA_TIME_DIM)), sdim(keys,axes,CUDNN_SEQDATA_BATCH_DIM)*sdim(keys,axes,CUDNN_SEQDATA_BEAM_DIM)), + devSeqLengthsQO::CuVector{Cint} = convert(CuVector{Cint}, seqLengthsQO), + devSeqLengthsKV::CuVector{Cint} = convert(CuVector{Cint}, seqLengthsKV), + qDesc::cudnnSeqDataDescriptor = cudnnSeqDataDescriptor(queries; axes, seqLengthArray=seqLengthsQO), + kDesc::cudnnSeqDataDescriptor = cudnnSeqDataDescriptor(keys; axes, seqLengthArray=seqLengthsKV), + vDesc::cudnnSeqDataDescriptor = cudnnSeqDataDescriptor(values; axes, seqLengthArray=seqLengthsKV), + oDesc::cudnnSeqDataDescriptor = cudnnSeqDataDescriptor(out; axes, seqLengthArray=seqLengthsQO), + + # forw parameters + residuals = nothing, + currIdx::Integer = -1, + loWinIdx::Union{Array{Cint},Nothing} = nothing, + hiWinIdx::Union{Array{Cint},Nothing} = nothing, + workspace::Union{CuArray,Nothing} = nothing, + reserveSpace::Union{CuArray,Nothing} = nothing, + + # Buffers for gradients + dweights::Ref = Ref{Any}(), + dqueries::Ref = Ref{Any}(), + dkeys::Ref = Ref{Any}(), + dvalues::Ref = Ref{Any}(), + o... +) + d = cudnnGetAttnDescriptor(attnDesc) + dt = juliaDataType(d.dataType) + @assert dt == eltype(out) == eltype(queries) == eltype(keys) == eltype(values) + qSize = (d.qProjSize > 0 ? d.qProjSize : size(queries,1)) + kSize = (d.kProjSize > 0 ? d.kProjSize : size(keys,1)) + @assert kSize == qSize "key size $kSize does not match query size $qSize." + vSize = (d.vProjSize > 0 ? d.vProjSize : size(values,1)) + @assert size(keys)[2:end] == size(values)[2:end] "key tensor $(size(keys)) does not match value tensor $(size(values))" + oSize = (d.oProjSize > 0 ? d.oProjSize : d.nHeads * vSize) + oDims = (oSize, size(queries)[2:end]...) + @assert size(out) == oDims "output size should be $(oDims)" + @assert residuals === nothing || size(residuals) == oDims "residual size should be $(oDims)" + loWinIdx === nothing ? loWinIdx = fill(Cint(0), d.qoMaxSeqLength) : @assert length(loWinIdx) == d.qoMaxSeqLength + hiWinIdx === nothing ? 
hiWinIdx = fill(typemax(Cint), d.qoMaxSeqLength) : @assert length(hiWinIdx) == d.qoMaxSeqLength
+
+    @assert axes[1] == CUDNN_SEQDATA_VECT_DIM "The innermost dimension of the containers should be the vector dimension"
+    @assert d.smScaler >= 0 "smScaler should be non-negative"
+    @assert d.qoMaxSeqLength >= sdim(queries, axes, CUDNN_SEQDATA_TIME_DIM)
+    @assert d.kvMaxSeqLength >= sdim(keys, axes, CUDNN_SEQDATA_TIME_DIM)
+    @assert d.maxBatchSize >= sdim(queries, axes, CUDNN_SEQDATA_BATCH_DIM)
+    @assert d.maxBeamSize >= sdim(queries, axes, CUDNN_SEQDATA_BEAM_DIM)
+    @assert sdim(keys, axes, CUDNN_SEQDATA_BATCH_DIM) == sdim(queries, axes, CUDNN_SEQDATA_BATCH_DIM) "keys/values and queries have different batch sizes"
+    if d.attnMode & CUDNN_ATTN_QUERYMAP_ONE_TO_ONE > 0
+        @assert sdim(keys, axes, CUDNN_SEQDATA_BEAM_DIM) == sdim(queries, axes, CUDNN_SEQDATA_BEAM_DIM) "keys/values and queries have different beam sizes when attnMode is CUDNN_ATTN_QUERYMAP_ONE_TO_ONE"
+    else
+        @assert sdim(keys, axes, CUDNN_SEQDATA_BEAM_DIM) == 1 "keys/values should have beam=1 when attnMode is CUDNN_ATTN_QUERYMAP_ALL_TO_ONE"
+    end
+
+    # Backward called separately on each variable. We will calculate all gradients on first call. Use `dready` to avoid subsequent calls.
+    dready = Ref{Bool}(false) # this will be turned to `true` by the first backward call.
+
+    cudnnMultiHeadAttnForwardAD(
+        weights, queries, keys, values, residuals;
+        dready, dweights, dqueries, dkeys, dvalues, # dresiduals is equal to dout
+        attnDesc, currIdx, loWinIdx, hiWinIdx,
+        devSeqLengthsQO, devSeqLengthsKV,
+        qDesc, kDesc, vDesc, oDesc,
+        out, workspace, reserveSpace)
+end
+
+
+# AD method
+function cudnnMultiHeadAttnForwardAD(
+    weights, queries, keys, values, residuals;
+    dready, dweights, dqueries, dkeys, dvalues,
+    attnDesc, currIdx, loWinIdx, hiWinIdx,
+    devSeqLengthsQO, devSeqLengthsKV,
+    qDesc, kDesc, vDesc, oDesc,
+    out, workspace, reserveSpace
+)
+    # Cannot use @workspace here because it is shared between forw and back calls
+    (weightSize, workspaceSize, reserveSpaceSize) = cudnnMultiHeadAttnBuffers(attnDesc)
+    if workspaceSize > 0 && workspace === nothing; workspace = cudnnTempSpace(workspaceSize); end
+    if reserveSpaceSize > 0 && reserveSpace === nothing; reserveSpace = cudnnTempSpace(reserveSpaceSize); end
+    @assert sizeof(weights) >= weightSize "weights should be at least $weightSize bytes."
+    @assert sizeof(workspace) >= workspaceSize "workspace should be at least $workspaceSize bytes"
+    @assert sizeof(reserveSpace) >= reserveSpaceSize "reserveSpace should be at least $reserveSpaceSize bytes"
+
+    cudnnMultiHeadAttnForward(
+        handle(), attnDesc, currIdx,
+        loWinIdx, hiWinIdx,
+        devSeqLengthsQO, devSeqLengthsKV,
+        qDesc, queries, something(residuals, CU_NULL),
+        kDesc, keys,
+        vDesc, values,
+        oDesc, out,
+        sizeof(weights), something(weights, CU_NULL),
+        sizeof(workspace), something(workspace, CU_NULL),
+        sizeof(reserveSpace), something(reserveSpace, CU_NULL)
+    )
+    return out
+end
+
+
+# Helper methods
+
+
+function cudnnAttnDescriptor(
+    queries, keys, values;
+    axes = cudnnSeqDataDefaultAxes,
+    attnMode::Unsigned = CUDNN_ATTN_QUERYMAP_ALL_TO_ONE | CUDNN_ATTN_DISABLE_PROJ_BIASES |> Cuint,
+    nHeads::Integer = Cint(1),
+    smScaler::Real = Cdouble(1),
+    # dataType::DataType = eltype(queries),
+    # computePrec::DataType = eltype(queries),  ## No other option according to 8.0.2
+    mathType::cudnnMathType_t = math_mode(),
+    # attnDropout::Real = 0,  ## The dropout option is currently not supported by the multi-head attention API
+    # postDropout::Real = 0,  ## The dropout option is currently not supported by the multi-head attention API
+    qProjSize::Integer = 0, # Use zero to disable the corresponding projection
+    kProjSize::Integer = 0,
+    vProjSize::Integer = 0,
+    oProjSize::Integer = 0,
+    qoMaxSeqLength::Integer = sdim(queries,axes,CUDNN_SEQDATA_TIME_DIM),
+    kvMaxSeqLength::Integer = sdim(keys,axes,CUDNN_SEQDATA_TIME_DIM),
+    maxBatchSize::Integer = sdim(queries,axes,CUDNN_SEQDATA_BATCH_DIM),
+    maxBeamSize::Integer = sdim(queries,axes,CUDNN_SEQDATA_BEAM_DIM),
+    o...
+)
+    cudnnAttnDescriptor(
+        Cuint(attnMode),
+        Cint(nHeads),
+        Cdouble(smScaler),
+        cudnnDataType(eltype(queries)), # dataType
+        cudnnDataType(eltype(queries)), # computePrec
+        mathType,
+        C_NULL, # attnDropout
+        C_NULL, # postDropout
+        Cint(sdim(queries,axes,CUDNN_SEQDATA_VECT_DIM)), # qSize
+        Cint(sdim(keys,  axes,CUDNN_SEQDATA_VECT_DIM)), # kSize
+        Cint(sdim(values,axes,CUDNN_SEQDATA_VECT_DIM)), # vSize
+        Cint(qProjSize),
+        Cint(kProjSize),
+        Cint(vProjSize),
+        Cint(oProjSize),
+        Cint(qoMaxSeqLength),
+        Cint(kvMaxSeqLength),
+        Cint(maxBatchSize),
+        Cint(maxBeamSize)
+    )
+end
+
+function cudnnGetAttnDescriptor(d::cudnnAttnDescriptor)
+    (attnMode, nHeads, smScaler, dataType, computePrec, mathType, attnDropoutDesc, postDropoutDesc, qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize, qoMaxSeqLength, kvMaxSeqLength, maxBatchSize, maxBeamSize) = (Ref{Cuint}(), Ref{Cint}(), Ref{Cdouble}(), Ref{cudnnDataType_t}(), Ref{cudnnDataType_t}(), Ref{cudnnMathType_t}(), Ref{cudnnDropoutDescriptor_t}(), Ref{cudnnDropoutDescriptor_t}(), Ref{Cint}(), Ref{Cint}(), Ref{Cint}(), Ref{Cint}(), Ref{Cint}(), Ref{Cint}(), Ref{Cint}(), Ref{Cint}(), Ref{Cint}(), Ref{Cint}(), Ref{Cint}())
+    cudnnGetAttnDescriptor(d, attnMode, nHeads, smScaler, dataType, computePrec, mathType, attnDropoutDesc, postDropoutDesc, qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize, qoMaxSeqLength, kvMaxSeqLength, maxBatchSize, maxBeamSize)
+    (attnMode, nHeads, smScaler, dataType, computePrec, mathType, attnDropoutDesc, postDropoutDesc, qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize, qoMaxSeqLength, kvMaxSeqLength, maxBatchSize, maxBeamSize) = (x->x[]).((attnMode, nHeads, smScaler, dataType, computePrec, mathType, attnDropoutDesc, postDropoutDesc, qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize, qoMaxSeqLength,
kvMaxSeqLength, maxBatchSize, maxBeamSize)) + return (; attnMode, nHeads, smScaler, dataType, computePrec, mathType, attnDropoutDesc, postDropoutDesc, qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize, qoMaxSeqLength, kvMaxSeqLength, maxBatchSize, maxBeamSize) +end + + +function cudnnAttnOutput(queries, keys, values, attnDesc::cudnnAttnDescriptor) + d = cudnnGetAttnDescriptor(attnDesc) + vSize = (d.vProjSize > 0 ? d.vProjSize : size(values,1)) + oSize = (d.oProjSize > 0 ? d.oProjSize : d.nHeads * vSize) + oDims = (oSize, size(queries)[2:end]...) + out = similar(values, oDims) + out .= 0 # currIdx >= 0 only fills part of this, zero the rest for consistency + return out +end + + +function cudnnMultiHeadAttnBuffers(attnDesc::cudnnAttnDescriptor; training=false) + weightSize, workspaceSize = Ref{Csize_t}(0), Ref{Csize_t}(0) + # Assigning NULL to the reserveSpaceSizeInBytes argument indicates that the user does not plan to invoke multi-head attention gradient functions + reserveSpaceSize = training ? Ref{Csize_t}(0) : C_NULL + cudnnGetMultiHeadAttnBuffers(handle(), attnDesc, weightSize, workspaceSize, reserveSpaceSize) + return (weightSize[], workspaceSize[], reserveSpaceSize === C_NULL ? 0 : reserveSpaceSize[]) +end + + +# If we have less than four dims, assume they are padded by 1s on the right for Julia, on the left for cudnn. +# We keep `axes` in Julia order, e.g. axes[1] refers to the function of the first Julia dimension and is always VECT. +""" + sdim(x,axes,dim) + sdim(x,axes) + +The first form returns the size of `x` in the dimension specified with +`dim::cudnnSeqDataAxis_t` (e.g. CUDNN_SEQDATA_TIME_DIM), i.e. return `size(x,i)` such that +`axes[i]==dim`. + +The second form returns an array of length 4 `dims::Vector{Cint}` such that `dims[1+dim] == +sdim(x,axes,dim)` where `dim::cudnnSeqDataAxis_t` specifies the role of the dimension +(e.g. dims[CUDNN_SEQDATA_TIME_DIM]==5). + +The `axes::Vector{cudnnSeqDataAxis_t}` argument is an array of length 4 that +specifies the role of Julia dimensions, e.g. `axes[3]=CUDNN_SEQDATA_TIME_DIM`. +""" +function sdim(x,axes,dim) + for i in 1:length(axes) + if axes[i] === dim # axes[i] = CUDNN_SEQDATA_XXX_DIM + return size(x,i) + end + end + error("Cannot find $dim in axes") +end + +function sdim(x,axes) + dims = Array{Cint}(undef, 4) + for dim in (CUDNN_SEQDATA_VECT_DIM, CUDNN_SEQDATA_BATCH_DIM, CUDNN_SEQDATA_TIME_DIM, CUDNN_SEQDATA_BEAM_DIM) + dims[1+dim] = sdim(x,axes,dim) + end + return dims # dims[1+CUDNN_SEQDATA_XXX_DIM] = how many XXX +end + + +# Alternative cudnnSeqDataDescriptor constructor for array +function cudnnSeqDataDescriptor( + array; + axes::Vector{cudnnSeqDataAxis_t} = cudnnSeqDataDefaultAxes, + dimA::Vector{Cint} = sdim(array,axes), + seqLengthArray::Vector{<:Integer} = fill(Cint(sdim(array,axes,CUDNN_SEQDATA_TIME_DIM)), sdim(array,axes,CUDNN_SEQDATA_BATCH_DIM)*sdim(array,axes,CUDNN_SEQDATA_BEAM_DIM)), # cudnn-doc: The seqLengthArray[] must specify all sequence lengths in the container so the total size of this array should be dimA[CUDNN_SEQDATA_BATCH_DIM] * dimA[CUDNN_SEQDATA_BEAM_DIM]. + paddingFill::Ptr{Cvoid} = C_NULL, # cudnn-doc: Currently, the only supported value for paddingFill is NULL which means this option should be ignored. +) + nbDims::Cint = CUDNN_SEQDATA_DIM_COUNT # Currently, the value of this argument should be four. The actual size of the dimA[] and axes[] arrays should be declared using the CUDNN_SEQDATA_DIM_COUNT macro. 
+ @assert length(axes) == length(dimA) == CUDNN_SEQDATA_DIM_COUNT # cudnn-doc: The number of active dimensions in the dimA[] and axes[] arrays is defined by the nbDims argument. + seqLengthArraySize = Csize_t(sdim(array,axes,CUDNN_SEQDATA_BATCH_DIM) * sdim(array,axes,CUDNN_SEQDATA_BEAM_DIM)) + @assert length(seqLengthArray) == seqLengthArraySize + cudnnSeqDataDescriptor(cudnnDataType(eltype(array)), nbDims, dimA, reverse(axes), # cudnn uses reverse order for dims + seqLengthArraySize, convert(Vector{Cint}, seqLengthArray), + paddingFill) +end + +cudnnSeqDataDescriptor(::Nothing; o...) = nothing diff --git a/lib/cudnn/nnlib.jl b/lib/cudnn/nnlib.jl index 8624f33e81..1c24911ce8 100644 --- a/lib/cudnn/nnlib.jl +++ b/lib/cudnn/nnlib.jl @@ -1,10 +1,10 @@ # interfacing with NNlib.jl import NNlib: stride, padding, dilation, flipkernel, spatial_dims, kernel_size, - conv!, ∇conv_filter!, ∇conv_data!, - maxpool!, meanpool!, ∇maxpool!, ∇meanpool!, - softmax, softmax!, ∇softmax, ∇softmax!, - logsoftmax, logsoftmax!, ∇logsoftmax, ∇logsoftmax! + conv!, ∇conv_filter!, ∇conv_data!, + maxpool!, meanpool!, ∇maxpool!, ∇meanpool!, PoolDims, + softmax, softmax!, ∇softmax, ∇softmax!, + logsoftmax, logsoftmax!, ∇logsoftmax, ∇logsoftmax! import DataStructures: DefaultDict @@ -13,261 +13,310 @@ const CUDNNFloat = Union{Float16,Float32,Float64} # Since CUDNN does not support 1D convolution, Conv in Flux will give a CUDNNError if the size is 1-dimensional. fix1d(x) = x -fix1d(x::DenseCuArray{T, 3}) where T = reshape(x, size(x, 1), 1, size(x, 2), size(x, 3)) +fix1d(x::DenseCuArray{T, 3}) where T = reshape(x, 1, size(x, 1), size(x, 2), size(x, 3)) fix1d(cdims::DenseConvDims{1,K,C_in,C_out,S,P,D,F}) where {K,C_in,C_out,S,P,D,F} = - DenseConvDims{2,(K...,1),C_in,C_out,(S...,1),(P...,0,0),(D...,1),F}((cdims.I...,1)) + DenseConvDims{2,(1,K...),C_in,C_out,(1,S...),(0,0,P...),(1,D...),F}((1,cdims.I...)) fix1d(pdims::PoolDims{1,K,S,P,D}) where {K,S,P,D,F} = - PoolDims{2,(K...,1),(S...,1),(P...,0,0),(D...,1)}((pdims.I..., 1), pdims.C_in) - -# We have to reshape the CuArray/PoolDims/DenseConvDims to 4D before feeding to CUDNN. -reshape4D(x::AbstractVector) = reshape(x, 1, 1, length(x), 1) -reshape4D(x::AbstractMatrix) = reshape(x, 1, 1, size(x)...) -reshape4D(x::AbstractArray{T,3}) where T = reshape(x, size(x, 1), 1, size(x, 2), size(x, 3)) -reshape4D(x::AbstractArray{T}) where T = x - -workspacesize(x) = min(Mem.info()[1] ÷ 16, sizeof(x) * 2) - -function perfChoose(perfResults, returnedAlgoCount)::UInt32 - if perfResults[1].status != 0 - return 0 - else - (best_algo,best_time,best_memory) = (perfResults[1].algo,perfResults[1].time,perfResults[1].memory) - for i = 2:returnedAlgoCount - if perfResults[i].status == 0 && perfResults[i].memory < best_memory && perfResults[i].time < best_time * 1.1 - (best_algo,best_memory) = (perfResults[i].algo,perfResults[i].memory) - end - end - return best_algo - end + PoolDims{2,(1,K...),(1,S...),(0,0,P...),(1,D...)}((1,pdims.I...), pdims.C_in) + +# Softmax + +# @denizyuret: do not do inplace operations with softmax/logsoftmax when (1) cpu version is not, (2) one can use softmax! 
+function softmax(x::T; dims=1) where {T<:DenseCuArray} + softmax!(similar(x), x; dims) end +function ∇softmax(dy::T, x::T, y::T; dims=1) where {T<:DenseCuArray} + ∇softmax!(similar(x), dy, x, y; dims) +end -# Softmax +function logsoftmax(x::T; dims=1) where {T<:DenseCuArray} + logsoftmax!(similar(x), x; dims) +end -# in-place for x or dy -softmax(x::DenseCuArray{T}; dims=1) where T<:CUDNNFloat = - softmax!(x, x, dims=dims) +function ∇logsoftmax(dy::T, x::T, y::T; dims=1) where {T<:DenseCuArray} + ∇logsoftmax!(similar(x), dy, x, y; dims) +end -∇softmax(dy::DenseCuArray{T}, x::DenseCuArray{T}; dims=1) where T<:CUDNNFloat = - ∇softmax!(dy, dy, x, dims=dims) -logsoftmax(x::DenseCuArray{T}; dims=1) where T<:CUDNNFloat = - logsoftmax!(x, x, dims=dims) +# @denizyuret: recalculating y in ∇softmax! is a big waste, the nnlib API should be changed: +function ∇softmax(dy::T, x::T; dims=1) where {T<:DenseCuArray} + @warn "∇softmax(dy,x) should be deprecated, please use ∇softmax(dy,x,y)" maxlog=1 + ∇softmax!(similar(x), dy, x, softmax(x); dims) +end + +function ∇softmax!(dx::T, dy::T, x::T; dims=1) where {T<:DenseCuArray} + @warn "∇softmax!(dx,dy,x) should be deprecated, please use ∇softmax!(dx,dy,x,y)" maxlog=1 + ∇softmax!(dx, dy, x, softmax(x); dims) +end + +function ∇logsoftmax(dy::T, x::T; dims=1) where {T<:DenseCuArray} + @warn "∇logsoftmax(dy,x) should be deprecated, please use ∇logsoftmax(dy,x,y)" maxlog=1 + ∇logsoftmax!(similar(x), dy, x, logsoftmax(x); dims) +end + +function ∇logsoftmax!(dx::T, dy::T, x::T; dims=1) where {T<:DenseCuArray} + @warn "∇logsoftmax!(dx,dy,x) should be deprecated, please use ∇logsoftmax!(dx,dy,x,y)" maxlog=1 + ∇logsoftmax!(dx, dy, x, logsoftmax(x); dims) +end -∇logsoftmax(dy::DenseCuArray{T}, x::DenseCuArray{T}; dims=1) where T<:CUDNNFloat = - ∇logsoftmax!(dy, dy, x, dims=dims) -function softmax!(y::DenseCuArray{T}, x::DenseCuArray{T}; dims=1) where T<:CUDNNFloat - cudnnSoftmaxForward(reshape4D(x), reshape4D(y), - algo=CUDNN_SOFTMAX_FAST, mode=cudnnSoftmaxMode_t(dims-1)) - return y +# @denizyuret: backup implementations for unsupported/slow size/dims combinations: +function _softmax!(y::T, x::T; dims) where {T<:DenseCuArray} + y .= exp.(x .- maximum(x; dims)) + y ./= sum(y; dims) end -function ∇softmax!(dx::DenseCuArray{T}, dy::DenseCuArray{T}, x::DenseCuArray{T}; dims=1) where T<:CUDNNFloat - y = softmax(x, dims=dims) - cudnnSoftmaxBackward(reshape4D(y), reshape4D(dy), reshape4D(dx), - algo=CUDNN_SOFTMAX_FAST, mode=cudnnSoftmaxMode_t(dims-1)) - return dx +function _∇softmax!(dx::T, dy::T, x::T, y::T; dims) where {T<:DenseCuArray} + dx .= y .* (dy .- sum(dy .* y; dims)) end -function logsoftmax!(y::DenseCuArray{T}, x::DenseCuArray{T}; dims=1) where T<:CUDNNFloat - cudnnSoftmaxForward(reshape4D(x), reshape4D(y), - algo=CUDNN_SOFTMAX_LOG, mode=cudnnSoftmaxMode_t(dims-1)) - return y +function _logsoftmax!(y::T, x::T; dims) where {T<:DenseCuArray} + y .= x .- maximum(x; dims) + y .-= log.(sum(exp.(y); dims)) end -function ∇logsoftmax!(dx::DenseCuArray{T}, dy::DenseCuArray{T}, x::DenseCuArray{T}; dims=1) where T<:CUDNNFloat - y = logsoftmax(x, dims=dims) - cudnnSoftmaxBackward(reshape4D(y), reshape4D(dy), reshape4D(dx), - algo=CUDNN_SOFTMAX_LOG, mode=cudnnSoftmaxMode_t(dims-1)) - return dx +function _∇logsoftmax!(dx::T, dy::T, x::T, y::T; dims) where {T<:DenseCuArray} + dx .= dy .- sum(dy; dims) .* exp.(y) end +# Trick by @norci to use cudnn for softmax dims args that are contiguous: +# If dims=(dmin:dmax) then CUDNN_SOFTMAX_MODE_CHANNEL does the trick with reshape +# (1, 
prod(size(x)[1:dmin-1]), prod(size(x)[dmin:dmax]), :) +# softmaxdims returns nothing when the backup implementation should be used. + +function softmaxdims(x, dims) + dims === Colon() && return (1, 1, length(x), 1) + mind,maxd = minimum(dims),maximum(dims) + all(i in dims for i in mind:maxd) || return nothing # cannot handle if not contiguous + stride = dimsize = 1 + for i in 1:(mind-1); stride *= size(x,i); end # Using size(x,i) assumes trailing dims = 1, robust to maxd > ndims(x) + for i in mind:maxd; dimsize *= size(x,i); end + batchsize = length(x)÷(stride*dimsize) + # Here is a region where cudnn is slower, so we go with the backup: + batchsize == 1 && 64 <= stride <= 4096 && 64 <= dimsize <= 4096 && return nothing + return (1, stride, dimsize, batchsize) +end + +# Determine softmax algo based on math_mode + +softmaxalgo() = (CUDA.math_mode()===CUDA.FAST_MATH ? CUDNN_SOFTMAX_FAST : CUDNN_SOFTMAX_ACCURATE) + +# Main implementations: + +function softmax!(y::T, x::T = y; dims=1) where {T<:DenseCuArray} + s = softmaxdims(x, dims) + s === nothing && return _softmax!(y, x; dims) + cudnnSoftmaxForward!(reshape(y,s), reshape(x,s); mode = CUDNN_SOFTMAX_MODE_CHANNEL, algo = softmaxalgo()) + return y +end + +function ∇softmax!(dx::T, dy::T, x::T, y::T; dims=1) where {R,T<:DenseCuArray{R}} + s = softmaxdims(x, dims) + s === nothing && return _∇softmax!(dx, dy, x, y; dims) + xDesc = cudnnTensorDescriptor(reshape(x,s)) + alpha, beta = scalingParameter(R,1), scalingParameter(R,0) + cudnnSoftmaxBackward(handle(), softmaxalgo(), CUDNN_SOFTMAX_MODE_CHANNEL, + alpha, xDesc, y, xDesc, dy, beta, xDesc, dx) + return dx +end + +function logsoftmax!(y::T, x::T = y; dims=1) where {T<:DenseCuArray} + s = softmaxdims(x, dims) + s === nothing && return _logsoftmax!(y, x; dims) + cudnnSoftmaxForward!(reshape(y,s), reshape(x,s); mode = CUDNN_SOFTMAX_MODE_CHANNEL, algo = CUDNN_SOFTMAX_LOG) + return y +end + +function ∇logsoftmax!(dx::T, dy::T, x::T, y::T; dims=1) where {R,T<:DenseCuArray{R}} + s = softmaxdims(x, dims) + s === nothing && return _∇logsoftmax!(dx, dy, x, y; dims) + xDesc = cudnnTensorDescriptor(reshape(x,s)) + alpha, beta = scalingParameter(R,1), scalingParameter(R,0) + cudnnSoftmaxBackward(handle(), CUDNN_SOFTMAX_LOG, CUDNN_SOFTMAX_MODE_CHANNEL, + alpha, xDesc, y, xDesc, dy, beta, xDesc, dx) + return dx +end + + # Convolution -const conv_forward_algos = DefaultDict{Tuple, Int32}(Int32(-1)) +function cudnnConvolutionDescriptor(cdims::DenseConvDims, x::DenseCuArray{T}) where T + cdims, x = fix1d(cdims), fix1d(x) + mode=(NNlib.flipkernel(cdims) ? 
CUDNN_CROSS_CORRELATION : CUDNN_CONVOLUTION) + cudnnConvolutionDescriptor(convdims(nnlibPadding(cdims),size(x)), convdims(NNlib.stride(cdims),size(x)), convdims(NNlib.dilation(cdims),size(x)), mode, cudnnDataType(T), math_mode(), CUDNN_DEFAULT_REORDER, Cint(1)) +end + function conv!(y::DenseCuArray{T}, x::DenseCuArray{T}, w::DenseCuArray{T}, cdims::DenseConvDims; - alpha=1, algo=-1) where T<:CUDNNFloat - if version() < v"6" - all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6") - end - - if algo < 0 - global conv_forward_algos - key = (T, strides(x), strides(w), strides(y), cdims, size(x)[end]) - algo = conv_forward_algos[key] - if algo < 0 # not in conv_forward_algos - # algo = UInt32(cudnnGetConvolutionForwardAlgorithm(fix1d(y), fix1d(x), fix1d(w), fix1d(cdims), preference=2, workspacesize=workspacesize(x)) # will be removed in cuDNN 8 - # returnedAlgoCount, perfResults = cudnnGetConvolutionForwardAlgorithm_v7(fix1d(y), fix1d(x), fix1d(w), fix1d(cdims)) - # returnedAlgoCount, perfResults = cudnnFindConvolutionForwardAlgorithm(fix1d(y), fix1d(x), fix1d(w), fix1d(cdims)) - returnedAlgoCount, perfResults = cudnnFindConvolutionForwardAlgorithmEx(fix1d(y), fix1d(x), fix1d(w), fix1d(cdims), workspacesize=workspacesize(x)) - algo = perfChoose(perfResults, returnedAlgoCount) - conv_forward_algos[key] = algo + alpha=1, beta=0, algo=-1) where T<:CUDNNFloat + if version() < v"6" + all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6") end - end - - cudnnConvolutionForward(fix1d(y), fix1d(x), fix1d(w), fix1d(cdims), alpha=alpha, algo=algo) - return y + if algo != -1 + @warn "algo option has been deprecated, the fastest algo is computed automatically" maxlog=1 + end + d = cudnnConvolutionDescriptor(cdims, x) + cudnnConvolutionForward!(y, w, x, d; alpha, beta, z=y) end if isdefined(NNlib, :conv_bias_act!) 
-function NNlib.conv_bias_act!(y::DenseCuArray{T}, x::DenseCuArray{T}, w::DenseCuArray{T}, cdims::DenseConvDims, b::DenseCuArray{T}, σ=identity; - z::DenseCuArray{T}=y, alpha1=1, alpha2=0, algo=-1) where T<:CUDNNFloat - if version() < v"6" - all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6") - end - - if algo < 0 - global conv_forward_algos - key = (T, strides(x), strides(w), strides(y), cdims, size(x)[end]) - algo = conv_forward_algos[key] - if algo < 0 # not in conv_forward_algos - # algo = UInt32(cudnnGetConvolutionForwardAlgorithm(fix1d(y), fix1d(x), fix1d(w), fix1d(cdims), preference=2, workspacesize=workspacesize(x))) # will be removed in cuDNN 8 - # returnedAlgoCount, perfResults = cudnnGetConvolutionForwardAlgorithm_v7(fix1d(y), fix1d(x), fix1d(w), fix1d(cdims)) - # returnedAlgoCount, perfResults = cudnnFindConvolutionForwardAlgorithm(fix1d(y), fix1d(x), fix1d(w), fix1d(cdims)) - returnedAlgoCount, perfResults = cudnnFindConvolutionForwardAlgorithmEx(fix1d(y), fix1d(x), fix1d(w), fix1d(cdims), workspacesize=workspacesize(x)) - algo = perfChoose(perfResults, returnedAlgoCount) - conv_forward_algos[key] = algo + function NNlib.conv_bias_act!(y::DenseCuArray{T}, x::DenseCuArray{T}, w::DenseCuArray{T}, + cdims::DenseConvDims, bias::DenseCuArray{T}, σ=identity; + z::DenseCuArray{T}=y, alpha=1, beta=0, algo=-1) where T<:CUDNNFloat + if version() < v"6" + all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6") + end + if algo != -1 + @warn "The algo option has been deprecated, the fastest algo is computed automatically" maxlog=1 + end + d = cudnnConvolutionDescriptor(cdims, x) + # only relu and identity are supported by cudnnConvolutionForward! + activation = (σ == NNlib.relu ? 
CUDNN_ACTIVATION_RELU : CUDNN_ACTIVATION_IDENTITY) + cudnnConvolutionForward!(y, w, x, d; z, bias, activation, alpha, beta) + if activation === CUDNN_ACTIVATION_IDENTITY && σ ∉ (nothing, identity) + y = σ.(y) + end + return y end - end - - # only relu and identity are supported - if σ == NNlib.relu # always merge convolutions, bias, and relu, even when bias is turned off - cudnnConvolutionBiasActivationForward(fix1d(y), fix1d(x), fix1d(w), fix1d(z), fix1d(b), - fix1d(cdims), algo=algo, alpha1=alpha1, alpha2=alpha2, - activationMode=CUDNN_ACTIVATION_RELU, activationCoeff=0.0) - elseif algo == 1 && b != nothing # only merge convolution and bias if the fastest algorithm is also the only supported algorithm and the bias is not turned off - # algo must be CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM (1) when activationMode equals CUDNN_ACTIVATION_IDENTITY - cudnnConvolutionBiasActivationForward(fix1d(y), fix1d(x), fix1d(w), fix1d(z), fix1d(b), - fix1d(cdims), algo=algo, alpha1=alpha1, alpha2=alpha2, - activationMode=CUDNN_ACTIVATION_IDENTITY, activationCoeff=0.0) - σ.(y) - else # fallback - if b == nothing # bias is turned off - σ.(conv!(y, x, w, cdims, alpha=alpha1, algo=algo)) - else # bias is turned on - σ.(add_bias(conv!(y, x, w, cdims, alpha=alpha1, algo=algo), b)) - end - end - - return y -end end -const conv_data_algos = DefaultDict{Tuple, Int32}(Int32(-1)) function ∇conv_data!(dx::DenseCuArray{T}, dy::DenseCuArray{T}, w::DenseCuArray{T}, - cdims::DenseConvDims; alpha=1, algo=-1) where T<:CUDNNFloat - if version() < v"6" - all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6") - end - - if algo < 0 - global conv_data_algos - key = (T, strides(dx), strides(w), strides(dy), cdims, size(dx)[end]) - algo = conv_data_algos[key] - if algo < 0 # not in conv_data_algos - # algo = UInt32(cudnnGetConvolutionBackwardDataAlgorithm(fix1d(dx), fix1d(w), fix1d(dy), fix1d(cdims), preference=2, workspacesize=workspacesize(dx))) # will be removed in cuDNN 8 - # returnedAlgoCount, perfResults = cudnnGetConvolutionBackwardDataAlgorithm_v7(fix1d(dx), fix1d(w), fix1d(dy), fix1d(cdims)) - # returnedAlgoCount, perfResults = cudnnFindConvolutionBackwardDataAlgorithm(fix1d(dx), fix1d(w), fix1d(dy), fix1d(cdims)) - returnedAlgoCount, perfResults = cudnnFindConvolutionBackwardDataAlgorithmEx(fix1d(dx), fix1d(w), fix1d(dy), fix1d(cdims), workspacesize=workspacesize(dx)) - algo = perfChoose(perfResults, returnedAlgoCount) - conv_data_algos[key] = algo + cdims::DenseConvDims; alpha=1, beta=0, algo=-1) where T<:CUDNNFloat + if version() < v"6" + all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6") end - end - - cudnnConvolutionBackwardData(fix1d(dx), fix1d(w), fix1d(dy), fix1d(cdims), alpha=alpha, algo=algo) - return dx + if algo != -1 + @warn "The algo option has been deprecated, the fastest algo is computed automatically" maxlog=1 + end + alpha, beta = scalingParameter(T,alpha), scalingParameter(T,beta); + xDesc, yDesc, wDesc = cudnnTensorDescriptor(dx), cudnnTensorDescriptor(dy), cudnnFilterDescriptor(w) + convDesc = cudnnConvolutionDescriptor(cdims, dx) + p = cudnnConvolutionBwdDataAlgoPerf(wDesc, w, yDesc, dy, convDesc, xDesc, dx) + @workspace size=p.memory workspace->cudnnConvolutionBackwardData(handle(), alpha, wDesc, w, yDesc, dy, convDesc, p.algo, workspace, sizeof(workspace), beta, xDesc, dx) + return dx end -const conv_filter_algos = DefaultDict{Tuple, Int32}(Int32(-1)) function ∇conv_filter!(dw::DenseCuArray{T}, 
x::DenseCuArray{T}, dy::DenseCuArray{T}, - cdims::DenseConvDims; alpha=1, algo=-1) where T<:CUDNNFloat - if version() < v"6" - all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6") - end - - if algo < 0 - global conv_filter_algos - # (type, batchsize, conv descriptor) - key = (T, strides(x), strides(dw), strides(dy), cdims, size(x)[end]) - algo = conv_filter_algos[key] - if algo < 0 # not in conv_filter_algos - # algo = UInt32(cudnnGetConvolutionBackwardFilterAlgorithm(fix1d(dw), fix1d(x), fix1d(dy), fix1d(cdims), preference=2, workspacesize=workspacesize(x))) # will be removed in cuDNN 8 - # returnedAlgoCount, perfResults = cudnnGetConvolutionBackwardFilterAlgorithm_v7(fix1d(dw), fix1d(x), fix1d(dy), fix1d(cdims)) - # returnedAlgoCount, perfResults = cudnnFindConvolutionBackwardFilterAlgorithm(fix1d(dw), fix1d(x), fix1d(dy), fix1d(cdims)) - returnedAlgoCount, perfResults = cudnnFindConvolutionBackwardFilterAlgorithmEx(fix1d(dw), fix1d(x), fix1d(dy), fix1d(cdims), workspacesize=workspacesize(x)) - algo = perfChoose(perfResults, returnedAlgoCount) - conv_filter_algos[key] = algo + cdims::DenseConvDims; alpha=1, beta=0, algo=-1) where T<:CUDNNFloat + if version() < v"6" + all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6") end - end - - cudnnConvolutionBackwardFilter(fix1d(dw), fix1d(x), fix1d(dy), fix1d(cdims), alpha=alpha, algo=algo) - return dw + if algo != -1 + @warn "The algo option has been deprecated, the fastest algo is computed automatically" maxlog=1 + end + alpha, beta = scalingParameter(T,alpha), scalingParameter(T,beta); + xDesc, yDesc, wDesc = cudnnTensorDescriptor(x), cudnnTensorDescriptor(dy), cudnnFilterDescriptor(dw) + convDesc = cudnnConvolutionDescriptor(cdims, x) + p = cudnnConvolutionBwdFilterAlgoPerf(xDesc, x, yDesc, dy, convDesc, wDesc, dw); + @workspace size=p.memory workspace->cudnnConvolutionBackwardFilter(handle(), alpha, xDesc, x, yDesc, dy, convDesc, p.algo, workspace, sizeof(workspace), beta, wDesc, dw); + return dw end + # Bias # in-place for x (add b to x) +# @denizyuret: cudnnAddTensor only supports (a,b,c,d)+(1,1,c,1) and (a,b,c,d,e)+(1,1,1,d,1), use cudnnOpTensor instead. 
+# Compared to libknet8 x .+ b it is ~2x slower for (1,1,100,100), ~30% faster for (14,14,256,32)
+# CUDA.jl x .+ b is 2x slower than both
 add_bias(x::DenseCuArray{T}, b::DenseCuArray{T}) where {T<:CUDNNFloat} =
-    (cudnnAddTensor(reshape4D(x), reshape4D(b)); return x)
-
-∇conv_bias!(db::DenseCuArray{T}, dy::DenseCuArray{T}; alpha=1, beta=0) where T<:CUDNNFloat =
-    (cudnnConvolutionBackwardBias(fix1d(db), fix1d(dy), alpha=alpha, beta=beta); return db)
+    (cudnnAddTensor!(x, x, b); return x)  # cudnnAddTensor! takes (y, x, b); y === x makes this in-place
+
+function ∇conv_bias!(db::DenseCuArray{T}, dy::DenseCuArray{T}; alpha=1, beta=0) where T<:CUDNNFloat
+    alpha,beta = scalingParameter(T,alpha), scalingParameter(T,beta)
+    bDesc, yDesc = cudnnTensorDescriptor.((db,dy))
+    cudnnConvolutionBackwardBias(handle(), alpha, yDesc, dy, beta, bDesc, db)
+    return db
+end

 # Pooling

-maxpool!(y::DenseCuArray{T}, x::DenseCuArray{T}, pdims::PoolDims) where T<:CUDNNFloat =
-    (cudnnPoolingForward(fix1d(y), fix1d(x), fix1d(pdims); mode=0); return y)
+function cudnnPoolingDescriptor(pdims::PoolDims, x::DenseCuArray{T}, mode::cudnnPoolingMode_t) where T
+    pdims, x = fix1d(pdims), fix1d(x)
+    window, padding, stride = NNlib.kernel_size(pdims), nnlibPadding(pdims), NNlib.stride(pdims)
+    nanOpt = CUDNN_NOT_PROPAGATE_NAN
+    cudnnPoolingDescriptor(mode, nanOpt, Cint(max(2,ndims(x)-2)), pooldims(window,size(x)), pooldims(padding,size(x)), pooldims(stride,size(x)))
+end

-∇maxpool!(dx::DenseCuArray{T}, dy::DenseCuArray{T}, y::DenseCuArray{T}, x::DenseCuArray{T}, pdims::PoolDims) where T<:CUDNNFloat =
-    (cudnnPoolingBackward(fix1d(dx), fix1d(dy), fix1d(x), fix1d(y), fix1d(pdims), mode=0); return dx)
+function maxpool!(y::DenseCuArray{T}, x::DenseCuArray{T}, pdims::PoolDims) where T<:CUDNNFloat
+    d = cudnnPoolingDescriptor(pdims, x, CUDNN_POOLING_MAX)
+    cudnnPoolingForward!(y, x, d)
+end

-meanpool!(y::DenseCuArray{T}, x::DenseCuArray{T}, pdims::PoolDims) where T<:CUDNNFloat =
-    (cudnnPoolingForward(fix1d(y), fix1d(x), fix1d(pdims), mode=1); return y)
+function ∇maxpool!(dx::DenseCuArray{T}, dy::DenseCuArray{T}, y::DenseCuArray{T}, x::DenseCuArray{T}, pdims::PoolDims) where T<:CUDNNFloat
+    xDesc, yDesc = cudnnTensorDescriptor.((x, y))
+    d = cudnnPoolingDescriptor(pdims, x, CUDNN_POOLING_MAX)
+    alpha, beta = scalingParameter(T,1), scalingParameter(T,0)
+    cudnnPoolingBackward(handle(), d, alpha, yDesc, y, yDesc, dy, xDesc, x, beta, xDesc, dx)
+    return dx
+end

-∇meanpool!(dx::DenseCuArray{T}, dy::DenseCuArray{T}, y::DenseCuArray{T}, x::DenseCuArray{T}, pdims::PoolDims) where T<:CUDNNFloat =
-    (cudnnPoolingBackward(fix1d(dx), fix1d(dy), fix1d(x), fix1d(y), fix1d(pdims), mode=1); return dx)
+function meanpool!(y::DenseCuArray{T}, x::DenseCuArray{T}, pdims::PoolDims) where T<:CUDNNFloat
+    d = cudnnPoolingDescriptor(pdims, x, CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING)
+    cudnnPoolingForward!(y, x, d)
+end

+function ∇meanpool!(dx::DenseCuArray{T}, dy::DenseCuArray{T}, y::DenseCuArray{T}, x::DenseCuArray{T}, pdims::PoolDims) where T<:CUDNNFloat
+    xDesc, yDesc = cudnnTensorDescriptor.((x, y))
+    d = cudnnPoolingDescriptor(pdims, x, CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING)
+    alpha, beta = scalingParameter(T,1), scalingParameter(T,0)
+    cudnnPoolingBackward(handle(), d, alpha, yDesc, y, yDesc, dy, xDesc, x, beta, xDesc, dx)
+    return dx
+end

 # Activation

 using Base.Broadcast

 for (f, op) in [
-    CUDA.tanh => (src,dst)->cudnnActivationForward(reshape4D(src), reshape4D(dst),
-                                                   mode=CUDNN_ACTIVATION_TANH),
-    NNlib.σ => (src,dst)->cudnnActivationForward(reshape4D(src), reshape4D(dst),
-
mode=CUDNN_ACTIVATION_SIGMOID), - NNlib.elu => (src,dst)->cudnnActivationForward(reshape4D(src), reshape4D(dst), - mode=CUDNN_ACTIVATION_ELU), - NNlib.relu => (src,dst)->cudnnActivationForward(reshape4D(src), reshape4D(dst), - mode=CUDNN_ACTIVATION_RELU), - NNlib.relu6 => (src,dst)->cudnnActivationForward(reshape4D(src), reshape4D(dst), - mode=CUDNN_ACTIVATION_CLIPPED_RELU, - coeff=6.0), - NNlib.leakyrelu => (src,dst)->cudnnOpTensor(CUDNN_OP_TENSOR_MAX, reshape4D(src), - reshape4D(src), reshape4D(dst), - alpha1=0.01)] - @eval begin - # in-place - function Base.materialize!(dst::DenseCuArray{<:CUDNNFloat}, - bc::Broadcast.Broadcasted{<:Any,<:Any,typeof($f),<:Tuple{DenseCuArray}}) - $op(bc.args[1], dst) - return dst + CUDA.tanh => (src,dst)->cudnnActivationForward!(dst, src, mode=CUDNN_ACTIVATION_TANH), + NNlib.σ => (src,dst)->cudnnActivationForward!(dst, src, mode=CUDNN_ACTIVATION_SIGMOID), + NNlib.elu => (src,dst)->cudnnActivationForward!(dst, src, mode=CUDNN_ACTIVATION_ELU), + NNlib.relu => (src,dst)->cudnnActivationForward!(dst, src, mode=CUDNN_ACTIVATION_RELU), + NNlib.relu6 => (src,dst)->cudnnActivationForward!(dst, src, mode=CUDNN_ACTIVATION_CLIPPED_RELU, coef=6.0), + NNlib.leakyrelu => (src,dst)->cudnnOpTensor!(dst, src, src; op=CUDNN_OP_TENSOR_MAX, alpha1=0.01)] + @eval begin + # in-place + function Base.materialize!(dst::DenseCuArray{<:CUDNNFloat}, + bc::Broadcast.Broadcasted{<:Any,<:Any,typeof($f),<:Tuple{DenseCuArray}}) + $op(bc.args[1], dst) + return dst + end + + # out of place + function Base.materialize(bc::Broadcast.Broadcasted{<:Any,<:Any,typeof($f),<:Tuple{DenseCuArray}}) + ElType = Broadcast.combine_eltypes(bc.f, bc.args) + dst = similar(bc, ElType) + $op(bc.args[1], dst) + return dst + end end - - # out of place - function Base.materialize(bc::Broadcast.Broadcasted{<:Any,<:Any,typeof($f),<:Tuple{DenseCuArray}}) - ElType = Broadcast.combine_eltypes(bc.f, bc.args) - dst = similar(bc, ElType) - $op(bc.args[1], dst) - return dst - end - end end # CUDNN_ACTIVATION_IDENTITY does not work with cudnnActivationForward # FIXME: put this optimization in GPUArrays' `copyto!` (like Base.Broadcast's `copyto!`) Base.broadcasted(::typeof(identity), x::DenseCuArray{T}) where {T<:CUDNNFloat} = x + + +# Compatibility shims until users upgrade to new NNlib format +function conv!(y::DenseCuArray{T}, x::DenseCuArray{T}, w::DenseCuArray{T}; pad=0, stride=1, flipkernel=0, dilation=1, kwargs...) where {T<:CUDNNFloat} + cdims = DenseConvDims(x, w; padding=pad, stride=stride, flipkernel=(flipkernel!=0), dilation=dilation) + return conv!(y, x, w, cdims; kwargs...) +end + +function ∇conv_filter!(dw::DenseCuArray{T}, dy::DenseCuArray{T}, x::DenseCuArray{T}; pad=0, stride=1, flipkernel=0, dilation=1, kwargs...) where {T<:CUDNNFloat} + cdims = DenseConvDims(x, dw; padding=pad, stride=stride, flipkernel=(flipkernel!=0), dilation=dilation) + # NOTE!!! This compat shim re-arranges the argument order! + return ∇conv_filter!(dw, x, dy, cdims; kwargs...) 
+end
+
+function maxpool!(y::DenseCuArray{T}, x::DenseCuArray{T}, k; pad=map(_->0,k), stride=k) where {T<:CUDNNFloat}
+    pdims = PoolDims(x, k; padding=pad, stride=stride)
+    return maxpool!(y, x, pdims)
+end
+
+function meanpool!(y::DenseCuArray{T}, x::DenseCuArray{T}, k; pad=map(_->0,k), stride=k) where {T<:CUDNNFloat}
+    pdims = PoolDims(x, k; padding=pad, stride=stride)
+    return meanpool!(y, x, pdims)
+end
diff --git a/lib/cudnn/normalization.jl b/lib/cudnn/normalization.jl
new file mode 100644
index 0000000000..7ba6882ede
--- /dev/null
+++ b/lib/cudnn/normalization.jl
@@ -0,0 +1,118 @@
+"""
+    cudnnNormalizationForward(x, xmean, xvar, bias, scale; o...)
+    cudnnNormalizationForward!(y, x, xmean, xvar, bias, scale; o...)
+
+Return batch normalization applied to `x`:
+
+    y .= ((x .- mean(x; dims)) ./ sqrt.(epsilon .+ var(x; dims))) .* scale .+ bias # training
+    y .= ((x .- xmean) ./ sqrt.(epsilon .+ xvar)) .* scale .+ bias                 # inference
+
+Bias and scale are trainable parameters; xmean and xvar are modified to collect statistics
+during training and treated as constants during inference. Note that during inference the
+values given by the xmean and xvar arguments are used in the formula, whereas during
+training the actual mean and variance of the minibatch are used: the xmean/xvar arguments
+then only serve to collect statistics. In the original paper bias is referred to as beta and
+scale as gamma (Batch Normalization: Accelerating Deep Network Training by Reducing Internal
+Covariate Shift, S. Ioffe, C. Szegedy, 2015).
+
+Keyword arguments:
+* `epsilon = 1e-5`: epsilon value used in the normalization formula
+* `exponentialAverageFactor = 0.1`: factor used in running mean/variance calculation: `runningMean = runningMean*(1-factor) + newMean*factor`
+* `training = false`: boolean indicating training vs inference mode
+* `mode::cudnnNormMode_t = CUDNN_NORM_PER_CHANNEL`: Per-channel normalization as described in the paper; in this mode `scale` etc. have dimensions `(1,1,C,1)`. The other alternative is `CUDNN_NORM_PER_ACTIVATION` where `scale` etc. have dimensions `(W,H,C,1)`.
+* `algo::cudnnNormAlgo_t = CUDNN_NORM_ALGO_STANDARD`: The other alternative, `CUDNN_NORM_ALGO_PERSIST`, triggers the new semi-persistent NHWC kernel when certain conditions are met (see cudnn docs).
+* `normOps::cudnnNormOps_t = CUDNN_NORM_OPS_NORM`: Currently the other alternatives, `CUDNN_NORM_OPS_NORM_ACTIVATION` and `CUDNN_NORM_OPS_NORM_ADD_ACTIVATION`, are not supported.
+* `z = nothing`: for residual addition to the result of the normalization operation, prior to the activation (will be supported when CUDNN_NORM_OPS_NORM_ADD_ACTIVATION is supported)
+* `groupCnt = 1`: Placeholder for future work; should be set to 1 for now
+* `alpha = 1; beta = 0`: scaling parameters: return `alpha * new_y + beta * old_y`
+"""
+cudnnNormalizationForward, cudnnNormalizationForward!
+
+
+# Public methods
+cudnnNormalizationForward(x, xmean, xvar, bias, scale; o...) = cudnnNormalizationForwardWithDefaults(x, xmean, xvar, bias, scale; o...)
+cudnnNormalizationForward!(y, x, xmean, xvar, bias, scale; o...) = cudnnNormalizationForwardWithDefaults(x, xmean, xvar, bias, scale; y, o...)
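+
+# A minimal usage sketch of the public methods above (hypothetical sizes; per-channel
+# mode is the default, so the parameter tensors all have shape (1,1,C,1)):
+#
+#   x     = CUDA.randn(Float32, 5, 5, 3, 2)   # (W,H,C,N)
+#   scale = CUDA.ones(Float32, 1, 1, 3, 1)
+#   bias  = CUDA.zeros(Float32, 1, 1, 3, 1)
+#   xmean = CUDA.zeros(Float32, 1, 1, 3, 1)   # running statistics, updated when training=true
+#   xvar  = CUDA.ones(Float32, 1, 1, 3, 1)
+#   y = cudnnNormalizationForward(x, xmean, xvar, bias, scale; training = true)
+#   y = cudnnNormalizationForward(x, xmean, xvar, bias, scale)   # inference: uses xmean/xvar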
+
+
+# Private method
+function cudnnNormalizationForwardWithDefaults(
+    x, mean, variance, bias, scale;
+
+    # Inference parameters:
+    y = similar(x),
+    z = nothing, # for residual addition to the result of the normalization operation, prior to the activation
+    mode::cudnnNormMode_t = CUDNN_NORM_PER_CHANNEL, # Per-channel normalization is based on the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift, S. Ioffe, C. Szegedy, 2015.
+    normOps::cudnnNormOps_t = CUDNN_NORM_OPS_NORM, # Currently CUDNN_NORM_OPS_NORM_ACTIVATION and CUDNN_NORM_OPS_NORM_ADD_ACTIVATION are only supported in the NHWC layout for training/backward and are not supported for inference
+    algo::cudnnNormAlgo_t = CUDNN_NORM_ALGO_STANDARD, # CUDNN_NORM_ALGO_PERSIST triggers the new semi-persistent NHWC kernel
+    alpha::Real = 1,
+    beta::Real = 0,
+    epsilon::Real = Cdouble(1e-5), # Has to be >= 0. Should be the same in forward and backward functions.
+    groupCnt::Integer = Cint(1), # Placeholder for future work, should be set to 1 for now
+
+    # Training-only parameters:
+    training = false,
+    exponentialAverageFactor::Real = Cdouble(0.1),
+    savedMean = nothing, # Optionally save intermediate results from the forward pass here - can be reused to speed up backward pass. NULL if unused.
+    savedInvVariance = nothing,
+
+    # Activation parameters:
+    activationMode::cudnnActivationMode_t = CUDNN_ACTIVATION_IDENTITY,
+    activationReluNanOpt::cudnnNanPropagation_t = CUDNN_NOT_PROPAGATE_NAN,
+    activationCoef::Real = 1,
+    activationDesc::Union{Nothing,cudnnActivationDescriptor} = (normOps == CUDNN_NORM_OPS_NORM ? nothing : cudnnActivationDescriptor(activationMode, activationReluNanOpt, Cdouble(activationCoef))),
+
+    # Tensor descriptors:
+    format::cudnnTensorFormat_t = CUDNN_TENSOR_NCHW,
+    xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x; format),
+    yDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(y; format),
+    zDesc::Union{Nothing,cudnnTensorDescriptor} = (z === nothing ? nothing : cudnnTensorDescriptor(z; format)),
+    normScaleBiasDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(scale; format),
+    normMeanVarDesc::Union{Nothing,cudnnTensorDescriptor} = (mean === nothing ? nothing : cudnnTensorDescriptor(mean; format)),
+
+    # Temporary space used in training:
+    workspace = nothing,
+    reserveSpace = nothing,
+    dx = Ref{Any}(),
+    dscale = Ref{Any}(),
+    dbias = Ref{Any}(),
+    dz = Ref{Any}(),
+)
+    @assert epsilon >= 0 && exponentialAverageFactor >= 0 "epsilon and exponentialAverageFactor should be non-negative."
+    @assert groupCnt == 1 "Currently only groupCnt=1 is supported."
+    @assert normOps === CUDNN_NORM_OPS_NORM "Currently only normOps=CUDNN_NORM_OPS_NORM is supported."
+    alpha, beta = (a->scalingParameter(eltype(x),a)).((alpha, beta))
+    # Backward called separately on each variable. We will calculate all gradients on first call. Use `dready` to avoid subsequent calls.
+    dready = Ref{Bool}(false) # this will be turned to `true` by the first backward call.
+ cudnnNormalizationForwardAD(x, scale, bias, z; training, mean, variance, y, mode, normOps, algo, alpha, beta, epsilon, groupCnt, exponentialAverageFactor, savedMean, savedInvVariance, activationDesc, xDesc, yDesc, zDesc, normScaleBiasDesc, normMeanVarDesc, workspace, reserveSpace, dx, dscale, dbias, dz, dready) +end + + +# AD method: +function cudnnNormalizationForwardAD(x, scale, bias, z; training, mean, variance, y, mode, normOps, algo, alpha, beta, epsilon, groupCnt, exponentialAverageFactor, savedMean, savedInvVariance, activationDesc, xDesc, yDesc, zDesc, normScaleBiasDesc, normMeanVarDesc, workspace, reserveSpace, dx, dscale, dbias, dz, dready) + issimilar(x,y) = (typeof(x) === typeof(y) && (x === nothing || size(x) === size(y))) + if training + mean === nothing ? savedMean = nothing : savedMean === nothing ? savedMean = similar(mean) : @assert issimilar(mean, savedMean) + variance === nothing ? savedInvVariance = nothing : savedInvVariance === nothing ? savedInvVariance = similar(variance) : @assert issimilar(variance, savedInvVariance) + workspaceSize, reserveSpaceSize = cudnnNormalizationTempSpaceSizes(mode, normOps, algo, xDesc, zDesc, yDesc, normScaleBiasDesc, activationDesc, normMeanVarDesc, groupCnt) + if reserveSpaceSize > 0 && reserveSpace === nothing; reserveSpace = cudnnTempSpace(reserveSpaceSize); end + @assert sizeof(reserveSpace) >= reserveSpaceSize "reserveSpace should be at least $(reserveSpaceSize) bytes" + if workspaceSize > 0 && workspace === nothing; workspace = cudnnTempSpace(workspaceSize); end + @assert sizeof(workspace) >= workspaceSize "workspace should be at least $(workspaceSize) bytes" + cudnnNormalizationForwardTraining(handle(), mode, normOps, algo, alpha, beta, xDesc, x, normScaleBiasDesc, scale, bias, exponentialAverageFactor, something(normMeanVarDesc,C_NULL), something(mean,CU_NULL), something(variance,CU_NULL), epsilon, something(savedMean,CU_NULL), something(savedInvVariance,CU_NULL), something(activationDesc,C_NULL), something(zDesc,C_NULL), something(z,CU_NULL), yDesc, y, something(workspace,CU_NULL), sizeof(workspace), something(reserveSpace,CU_NULL), sizeof(reserveSpace), groupCnt) + else + @assert mean !== nothing && variance !== nothing && normMeanVarDesc !== nothing "normalization mean and variance are required in inference mode." 
+ cudnnNormalizationForwardInference(handle(), mode, normOps, algo, alpha, beta, xDesc, x, normScaleBiasDesc, scale, bias, normMeanVarDesc, mean, variance, something(zDesc,C_NULL), something(z,CU_NULL), something(activationDesc,C_NULL), yDesc, y, epsilon, groupCnt) + end + return y +end + + +# Helper functions +function cudnnNormalizationTempSpaceSizes(mode, normOps, algo, xDesc, zDesc, yDesc, normScaleBiasDesc, activationDesc, normMeanVarDesc, groupCnt) + workspaceSize, reserveSpaceSize = Ref{Csize_t}(0), Ref{Csize_t}(0) + cudnnGetNormalizationForwardTrainingWorkspaceSize(handle(), mode, normOps, algo, xDesc, something(zDesc,C_NULL), yDesc, normScaleBiasDesc, something(activationDesc,C_NULL), something(normMeanVarDesc,C_NULL), workspaceSize, groupCnt) + cudnnGetNormalizationTrainingReserveSpaceSize(handle(), mode, normOps, algo, something(activationDesc,C_NULL), xDesc, reserveSpaceSize, groupCnt) + workspaceSize[], reserveSpaceSize[] +end diff --git a/lib/cudnn/optensor.jl b/lib/cudnn/optensor.jl new file mode 100644 index 0000000000..bb5ecb2c71 --- /dev/null +++ b/lib/cudnn/optensor.jl @@ -0,0 +1,72 @@ +# Compared to cudnnAddTensor!(copy(a),b), cudnnOpTensor is ~50% faster on +# (14,14,256,32)+(1,1,256,1), ~50% slower on (1,1,100,100)+(1,1,100,1) Unlike cudnnAddTensor +# it supports all broadcasting shapes up to ndims=5 as described in the documentation. + +""" + cudnnOpTensor(x1, x2; op, compType, nanOpt, alpha1, alpha2) + cudnnOpTensor(x1, x2, d::cudnnOpTensorDescriptor; alpha1, alpha2) + cudnnOpTensor!(y, x1, x2; op, compType, nanOpt, alpha1, alpha2, beta) + cudnnOpTensor!(y, x1, x2, d::cudnnOpTensorDescriptor; alpha1, alpha2, beta) + +Return the result of the specified broadcasting operation applied to `x1` and `x2`. +Optionally `y` holds the result and `d` specifies the operation. Each dimension of the input +tensor `x1` must match the corresponding dimension of the destination tensor `y`, and each +dimension of the input tensor `x2` must match the corresponding dimension of the destination +tensor `y` or must be equal to 1. Keyword arguments: + +* `alpha1=1, alpha2=1, beta=0` are used for scaling, i.e. `y .= beta*y .+ op.(alpha1*x1, alpha2*x2)` + +Keyword arguments used when `cudnnOpTensorDescriptor` is not specified: + +* `op = CUDNN_OP_TENSOR_ADD`, ADD can be replaced with MUL, MIN, MAX, SQRT, NOT; SQRT and NOT performed only on x1; NOT computes 1-x1 +* `compType = (eltype(x1) <: Float64 ? Float64 : Float32)`: Computation datatype (see cudnn docs for available options) +* `nanOpt = CUDNN_NOT_PROPAGATE_NAN`: NAN propagation policy. The other option is `CUDNN_PROPAGATE_NAN`. +""" +cudnnOpTensor, cudnnOpTensor! + + +# Public methods: +cudnnOpTensor(x1,x2; o...) = cudnnOpTensorWithDefaults(x1,x2; o...) +cudnnOpTensor!(y,x1,x2; o...) = cudnnOpTensorWithDefaults(x1,x2; y, o...) +cudnnOpTensor(x1,x2,d::cudnnOpTensorDescriptor; o...) = cudnnOpTensorWithDefaults(x1,x2; opTensorDesc=d, o...) +cudnnOpTensor!(y,x1,x2,d::cudnnOpTensorDescriptor; o...) = cudnnOpTensorWithDefaults(x1,x2; y, opTensorDesc=d, o...) + + +# Private method: +function cudnnOpTensorWithDefaults( + x1, x2; + y = similar(x1), + op::cudnnOpTensorOp_t = CUDNN_OP_TENSOR_ADD, + compType::DataType = (eltype(x1) <: Float64 ? 
Float64 : Float32),
+    nanOpt::cudnnNanPropagation_t = CUDNN_NOT_PROPAGATE_NAN,
+    opTensorDesc::cudnnOpTensorDescriptor = cudnnOpTensorDescriptor(op, cudnnDataType(compType), nanOpt),
+    alpha1::Real = 1,
+    alpha2::Real = 1,
+    beta::Real = 0,
+    x1Desc::cudnnTensorDescriptor = cudnnTensorDescriptor(x1),
+    x2Desc::cudnnTensorDescriptor = cudnnTensorDescriptor(x2),
+    yDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(y)
+)
+    @assert ndims(x1) <= 5
+    @assert size(y) == size(x1)
+    @assert all(size(x2,i) == size(x1,i) || size(x2,i) == 1 for i in 1:ndims(x2))
+    T = eltype(x1)
+    alpha1, alpha2, beta = scalingParameter(T,alpha1), scalingParameter(T,alpha2), scalingParameter(T,beta)
+    cudnnOpTensorAD(x1, x2; opTensorDesc, alpha1, x1Desc, alpha2, x2Desc, beta, yDesc, y)
+end
+
+
+# AD method: This method aids gradient definition, please do not remove!
+function cudnnOpTensorAD(x1, x2; opTensorDesc, alpha1, x1Desc, alpha2, x2Desc, beta, yDesc, y)
+    cudnnOpTensor(handle(), opTensorDesc, alpha1, x1Desc, x1, alpha2, x2Desc, x2, beta, yDesc, y)
+    return y
+end
+
+
+# Deprecated:
+function cudnnOpTensor(op::cudnnOpTensorOp_t,
+                       A::DenseCuArray{T,N}, B::DenseCuArray{T,N}, C::DenseCuArray{T,N};
+                       alpha1=true, alpha2=true, beta=false) where {T,N}
+    @warn "cudnnOpTensor(op,A,B,C) is deprecated, please use one of the methods in `@doc cudnnOpTensor`." maxlog=1
+    cudnnOpTensorWithDefaults(A, B; y=C, op, alpha1, alpha2, beta)
+end
diff --git a/lib/cudnn/pooling.jl b/lib/cudnn/pooling.jl
index 44ab30d8d5..535767ad3e 100644
--- a/lib/cudnn/pooling.jl
+++ b/lib/cudnn/pooling.jl
@@ -1,53 +1,103 @@
-using NNlib: PoolDims
+"""
+    cudnnPoolingForward(x; mode, nanOpt, window, padding, stride, alpha)
+    cudnnPoolingForward(x, d::cudnnPoolingDescriptor; alpha)
+    cudnnPoolingForward!(y, x; mode, nanOpt, window, padding, stride, alpha, beta)
+    cudnnPoolingForward!(y, x, d::cudnnPoolingDescriptor; alpha, beta)
+Return pooled `x`, overwriting `y` if provided, according to keyword arguments or the
+pooling descriptor `d`. Please see the [cuDNN
+docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnPoolingForward) for
+details.
 
-# descriptor
+The dimensions of `x,y` tensors that are less than 4-D are assumed to be padded on the left
+with 1's. The first `n-2` are spatial dimensions, the last two are always assumed to be
+channel and batch.
+
+The arguments `window`, `padding`, and `stride` can be specified as `n-2` dimensional
+vectors, tuples or a single integer which is assumed to be repeated `n-2` times. If any of
+the entries is larger than the corresponding `x` dimension, the `x` dimension is used
+instead.
+
+Arguments:
+* `mode = CUDNN_POOLING_MAX`: Pooling method, other options are `CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING`, `CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING`, `CUDNN_POOLING_MAX_DETERMINISTIC`
+* `nanOpt = CUDNN_NOT_PROPAGATE_NAN`: NAN propagation policy, the other option is `CUDNN_PROPAGATE_NAN`
+* `window = 2`: Pooling window size
+* `padding = 0`: Padding assumed around `x`
+* `stride = window`: How far to shift pooling window at each step
+* `alpha=1, beta=0` can be used for scaling, i.e. `y .= alpha*op(x) .+ beta*y`
+"""
+cudnnPoolingForward, cudnnPoolingForward!
 
-mutable struct PoolDesc
-    ptr::cudnnPoolingDescriptor_t
-end
-unsafe_free!(pd::PoolDesc)=cudnnDestroyPoolingDescriptor(pd.ptr)
+# Public methods
+cudnnPoolingForward(x; o...) = cudnnPoolingForwardWithDefaults(x; o...)
+cudnnPoolingForward!(y, x; o...) = cudnnPoolingForwardWithDefaults(x; y, o...)
+cudnnPoolingForward(x, d::cudnnPoolingDescriptor; o...) = cudnnPoolingForwardWithDefaults(x; poolingDesc=d, o...) +cudnnPoolingForward!(y, x, d::cudnnPoolingDescriptor; o...) = cudnnPoolingForwardWithDefaults(x; y, poolingDesc=d, o...) -Base.unsafe_convert(::Type{cudnnPoolingDescriptor_t}, pd::PoolDesc)=pd.ptr -function PoolDesc(nd, window, padding, stride, mode, maxpoolingNanOpt=CUDNN_NOT_PROPAGATE_NAN) - pd = Ref{cudnnPoolingDescriptor_t}() - cudnnCreatePoolingDescriptor(pd) - cudnnSetPoolingNdDescriptor(pd[],cudnnPoolingMode_t(mode),maxpoolingNanOpt,nd,pdsize(window,nd),pdsize(padding,nd),pdsize(stride,nd)) - this = PoolDesc(pd[]) - finalizer(unsafe_free!, this) - return this +# Private method +function cudnnPoolingForwardWithDefaults( + x; # no type for x, could be AutoGrad.Value + mode::cudnnPoolingMode_t = CUDNN_POOLING_MAX, + nanOpt::cudnnNanPropagation_t = CUDNN_NOT_PROPAGATE_NAN, + window::Union{Integer,Vector{<:Integer},Tuple{<:Integer,Vararg{Int}}} = 2, + padding::Union{Integer,Vector{<:Integer},Tuple{<:Integer,Vararg{Int}}} = 0, + stride::Union{Integer,Vector{<:Integer},Tuple{<:Integer,Vararg{Int}}} = window, + poolingDesc::cudnnPoolingDescriptor = cudnnPoolingDescriptor(mode, nanOpt, Cint(max(2,ndims(x)-2)), pooldims(window,size(x)), pooldims(padding,size(x)), pooldims(stride,size(x))), + format::cudnnTensorFormat_t = CUDNN_TENSOR_NCHW, + xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x; format), + y = cudnnPoolingForwardOutput(x, xDesc, poolingDesc, format), + yDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(y; format), + alpha::Real = 1, + beta::Real = 0, +) + T = eltype(x) + alpha, beta = scalingParameter(T,alpha), scalingParameter(T,beta) + cudnnPoolingForwardAD(x; poolingDesc, alpha, beta, xDesc, yDesc, y) end -function PoolDesc(pdims::PoolDims, mode, maxpoolingNanOpt=CUDNN_NOT_PROPAGATE_NAN) - pd = NNlib.padding(pdims) - if !all(pd[1:2:end] .== pd[2:2:end]) - @warn("CuDNN does not support asymmetric padding; defaulting to symmetric choice") + +# Convert the integer, tuple or array to pooling dims compatible with array size +function pooldims(d, s::Dims{N}) where N + if d isa Integer || length(d) == N-2 + Cint[reverse(min.(d,s[1:N-2]))...] + else + throw(DimensionMismatch("Cannot pool $(Base.dims2string(s)) array with $d pooldims.")) end - return PoolDesc(NNlib.spatial_dims(pdims), NNlib.kernel_size(pdims), pd[1:2:end], - NNlib.stride(pdims), mode, maxpoolingNanOpt) end +pooldims(d, s::Dims{3}) = pooldims(d, (1,s...)) +pooldims(d, s::Dims{2}) = pooldims(d, (1,1,s...)) +pooldims(d, s::Dims{1}) = pooldims(d, (1,1,1,s...)) +pooldims(d, s::Dims{0}) = pooldims(d, (1,1,1,1)) -# wrappers -function cudnnPoolingForward(y::DenseCuArray{T,N}, x::DenseCuArray{T,N}, pdims::PoolDims; - alpha=1, beta=0, mode=0) where {T,N} - cudnnPoolingForward(handle(), PoolDesc(pdims, mode), - scalingParameter(T, alpha), TensorDesc(x), x, - scalingParameter(T, beta ), TensorDesc(y), y) +function cudnnPoolingForwardOutput(x, xDesc, poolingDesc, format) + d = Array{Cint}(undef, max(4, ndims(x))) # d = [N,C,Yn,...,Y1] no matter what format + cudnnGetPoolingNdForwardOutputDim(poolingDesc, xDesc, length(d), d) + if length(d) > ndims(x) # This happens when x is (X,C,N), its TD is [N,C,X,1] + @assert all(d[ndims(x)+1:end] .== 1) + d = d[1:ndims(x)] + end + # ydims(NCHW)=(Y1,...,Yn,C,N) ydims(NHWC)=(C,Y1,...,Yn,N) + ydims = (format === CUDNN_TENSOR_NCHW ? reverse(d) : (d[2],d[end:-1:3]...,d[1])) + similar(x, ydims...) 
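+    # e.g. max-pooling a (28,28,3,N) NCHW input with window=2, stride=2 allocates a
+    # (14,14,3,N) output here (illustrative sizes; cuDNN computed the dims above)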
+end
+
+
+# AD method
+function cudnnPoolingForwardAD(x; poolingDesc, alpha, beta, xDesc, yDesc, y)
+    cudnnPoolingForward(handle(), poolingDesc, alpha, xDesc, x, beta, yDesc, y)
     return y
 end
 
-function cudnnPoolingBackward(dx::DenseCuArray{T,N}, dy::DenseCuArray{T,N}, x::DenseCuArray{T,N}, y::DenseCuArray{T,N}, pdims::PoolDims;
-                              alpha=1, mode=0) where {T,N}
-    if alpha!=1 && mode==0; error("Gradient of pool(alpha!=1,mode=0) broken in CUDNN"); end
-    beta = 0
-    cudnnPoolingBackward(handle(), PoolDesc(pdims, mode),
-                         scalingParameter(T, alpha), TensorDesc( y), y,
-                                                     TensorDesc(dy), dy,
-                                                     TensorDesc( x), x,
-                         scalingParameter(T, beta ), TensorDesc(dx), dx)
-    return dx
+
+# Deprecated methods
+function cudnnPoolingForward(y::DenseCuArray{T,N}, x::DenseCuArray{T,N}, pdims::NNlib.PoolDims;
+                             alpha=1, beta=0, mode=CUDNN_POOLING_MAX) where {T,N}
+    @warn "`cudnnPoolingForward(y,x,d::PoolDims)` is deprecated, please use one of the methods in `@doc cudnnPoolingForward`." maxlog=1
+    cudnnPoolingForward!(y, x; window=NNlib.kernel_size(pdims), padding=nnlibPadding(pdims), stride=NNlib.stride(pdims), mode, alpha, beta)
 end
+
+
diff --git a/lib/cudnn/reduce.jl b/lib/cudnn/reduce.jl
new file mode 100644
index 0000000000..3ae778b10b
--- /dev/null
+++ b/lib/cudnn/reduce.jl
@@ -0,0 +1,86 @@
+# This is unfortunately 10x slower than libknet8, 2x slower than CUDA.jl
+
+"""
+    cudnnReduceTensor(x; dims, op, compType, nanOpt, indices, alpha)
+    cudnnReduceTensor(x, d::cudnnReduceTensorDescriptor; dims, indices, alpha)
+    cudnnReduceTensor!(y, x; op, compType, nanOpt, indices, alpha, beta)
+    cudnnReduceTensor!(y, x, d::cudnnReduceTensorDescriptor; indices, alpha, beta)
+
+Return the result of the specified reduction operation applied to `x`. Optionally `y` holds
+the result and `d` specifies the operation. Each dimension of the output tensor `y` must
+match the corresponding dimension of the input tensor `x` or must be equal to 1. The
+dimensions equal to 1 indicate the dimensions of `x` to be reduced. Keyword arguments:
+
+* `dims = ntuple(i->1,ndims(x))`: specifies the shape of the output when `y` is not given
+* `indices = nothing`: previously allocated space for writing indices which can be generated for min and max ops only, can be a `CuArray` of `UInt8`, `UInt16`, `UInt32` or `UInt64`
+* `alpha=1, beta=0` are used for scaling, i.e. `y .= alpha*op.(x) .+ beta*y`
+
+Keyword arguments that can be used when `reduceTensorDesc` is not specified:
+* `op = CUDNN_REDUCE_TENSOR_ADD`: Reduction operation, ADD can be replaced with MUL, MIN, MAX, AMAX, AVG, NORM1, NORM2, MUL_NO_ZEROS
+* `compType = (eltype(x) <: Float64 ? Float64 : Float32)`: Computation datatype
+* `nanOpt = CUDNN_NOT_PROPAGATE_NAN`: NAN propagation policy, the other option is `CUDNN_PROPAGATE_NAN`
+"""
+cudnnReduceTensor, cudnnReduceTensor!
+
+
+# Public methods
+cudnnReduceTensor(x; o...) = cudnnReduceTensorWithDefaults(x; o...)
+cudnnReduceTensor!(y, x; o...) = cudnnReduceTensorWithDefaults(x; y, o...)
+cudnnReduceTensor(x, d::cudnnReduceTensorDescriptor; o...) = cudnnReduceTensorWithDefaults(x; reduceTensorDesc=d, o...)
+cudnnReduceTensor!(y, x, d::cudnnReduceTensorDescriptor; o...) = cudnnReduceTensorWithDefaults(x; y, reduceTensorDesc=d, o...)
+
+
+# Private method
+function cudnnReduceTensorWithDefaults(
+    x;
+    op::cudnnReduceTensorOp_t = CUDNN_REDUCE_TENSOR_ADD,
+    compType::DataType = (eltype(x) <: Float64 ?
Float64 : Float32), + nanOpt::cudnnNanPropagation_t = CUDNN_NOT_PROPAGATE_NAN, + indices::Union{Vector{<:Unsigned},Nothing} = nothing, + reduceTensorDesc::cudnnReduceTensorDescriptor = cudnnReduceTensorDescriptor(op, cudnnDataType(compType), nanOpt, cudnnReduceTensorIndices(op, indices), cudnnIndicesType(indices)), + dims::Dims = ntuple(i->1,ndims(x)), + y = similar(x, dims), + alpha::Real = 1, + beta::Real = 0, + xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x), + yDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(y), +) + T = eltype(x) + alpha, beta = scalingParameter(T,alpha), scalingParameter(T,beta) + cudnnReduceTensorAD(x; reduceTensorDesc, alpha, xDesc, beta, yDesc, y, indices) +end + +function cudnnReduceTensorIndices(op, indices) + if indices !== nothing && op in (CUDNN_REDUCE_TENSOR_MIN, CUDNN_REDUCE_TENSOR_MAX) + CUDNN_REDUCE_TENSOR_FLATTENED_INDICES + else + CUDNN_REDUCE_TENSOR_NO_INDICES + end +end + +cudnnIndicesType(::Nothing)=CUDNN_32BIT_INDICES +cudnnIndicesType(::Vector{UInt8})=CUDNN_8BIT_INDICES +cudnnIndicesType(::Vector{UInt16})=CUDNN_16BIT_INDICES +cudnnIndicesType(::Vector{UInt32})=CUDNN_32BIT_INDICES +cudnnIndicesType(::Vector{UInt64})=CUDNN_64BIT_INDICES +cudnnIndicesType(x)=error("Bad type $x for cudnnIndices, use Vector{UInt8, 16, 32 or 64}.") + + +# AD method +function cudnnReduceTensorAD(x; reduceTensorDesc, alpha, xDesc, beta, yDesc, y, indices) + @workspace size=@argout( + cudnnGetReductionWorkspaceSize(handle(), reduceTensorDesc, xDesc, yDesc, out(Ref{Csize_t}())) + )[] workspace->begin + cudnnReduceTensor(handle(), reduceTensorDesc, something(indices, C_NULL), sizeof(indices), workspace, sizeof(workspace), alpha, xDesc, x, beta, yDesc, y) + end + return y +end + + +# Deprecated +function cudnnReduceTensor(op::cudnnReduceTensorOp_t, + A::DenseCuArray{T,N}, C::DenseCuArray{T,N}; + alpha=true, beta=false) where {T,N} + @warn "cudnnReduceTensor(op,A,C) is deprecated, please use one of the methods in `@doc cudnnReduceTensor`." maxlog=1 + cudnnReduceTensor(A; y=C, op, alpha, beta) +end diff --git a/lib/cudnn/rnn.jl b/lib/cudnn/rnn.jl index 64886e0c2a..d7ea033e61 100644 --- a/lib/cudnn/rnn.jl +++ b/lib/cudnn/rnn.jl @@ -1,204 +1,325 @@ -# CUDNN_RNN_RELU: Stock RNN with ReLu activation -# CUDNN_RNN_TANH: Stock RNN with tanh activation -# CUDNN_LSTM: LSTM with no peephole connections -# CUDNN_GRU: Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1) - -# param layout: -# RNN: [weight, bias] × [input, hidden] -# GRU: [weight, bias] × [input, hidden] × [reset, update, newmem] -# LSTM: [weight, bias] × [input, hidden] × [input, forget, newmem, output] - -using LinearAlgebra - -function params(w::CuVector, input, hidden, n = 1) - slice(offset, shape) = reshape(view(w, offset.+(1:prod(shape))), shape) - wx = slice(0, (input, hidden*n)) - wh = slice(length(wx), (hidden, hidden*n)) - bias = view(w, length(wx)+length(wh) .+ (1:hidden*n)) - (wx, wh), bias -end +""" + cudnnRNNForward(w, x; hiddenSize, o...) + cudnnRNNForward!(y, w, x; hiddenSize, o...) + cudnnRNNForward(w, x, d::cudnnRNNDescriptor; o...) + cudnnRNNForward!(y, w, x, d::cudnnRNNDescriptor; o...) -mutable struct RNNDesc{T} - mode::cudnnRNNMode_t - input::Int - hidden::Int - params::CuVector{T} - weights::NTuple{2,CuMatrix{T}} - bias::CuVector{T} - ptr::Ptr{Nothing} -end +Apply the RNN specified with weights `w` and configuration given by `d` or keyword options +to input `x`. 
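+By default a fresh output `y` is allocated and returned; the `cudnnRNNForward!` methods
+write the result into a preallocated `y` instead.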
-Base.unsafe_convert(::Type{Ptr{Nothing}}, d::RNNDesc) = d.ptr
+Keyword arguments for hidden input/output:
+* `hx=nothing`: initialize the hidden vector if specified (by default initialized to 0).
+* `cx=nothing`: initialize the cell vector (only in LSTMs) if specified (by default initialized to 0).
+* `hy=nothing`: return the final hidden vector in hy if set to `Ref{Any}()`.
+* `cy=nothing`: return the final cell vector in cy (only in LSTMs) if set to `Ref{Any}()`.
 
-function rnnParamSize(T, r, input)
-    size = Csize_t[0]
-    cudnnGetRNNParamsSize(handle(), r, TensorDesc(T, (1,input,1)), size, cudnnDataType(T))
-    return Int(size[])÷sizeof(T)
-end
+
+Keyword arguments specifying the RNN when `d::cudnnRNNDescriptor` is not given:
+* `hiddenSize::Integer`: hidden vector size, which must be supplied when `d` is not given
+* `algo::cudnnRNNAlgo_t = CUDNN_RNN_ALGO_STANDARD`: RNN algo (CUDNN_RNN_ALGO_STANDARD, CUDNN_RNN_ALGO_PERSIST_STATIC, or CUDNN_RNN_ALGO_PERSIST_DYNAMIC).
+* `cellMode::cudnnRNNMode_t = CUDNN_LSTM`: Specifies the RNN cell type in the entire model (CUDNN_RNN_RELU, CUDNN_RNN_TANH, CUDNN_LSTM, CUDNN_GRU).
+* `biasMode::cudnnRNNBiasMode_t = CUDNN_RNN_DOUBLE_BIAS`: Sets the number of bias vectors (CUDNN_RNN_NO_BIAS, CUDNN_RNN_SINGLE_INP_BIAS, CUDNN_RNN_SINGLE_REC_BIAS, CUDNN_RNN_DOUBLE_BIAS). The two single bias settings are functionally the same for RELU, TANH and LSTM cell types. For differences in GRU cells, see the description of CUDNN_GRU in the cuDNN docs.
+* `dirMode::cudnnDirectionMode_t = CUDNN_UNIDIRECTIONAL`: Specifies the recurrence pattern: CUDNN_UNIDIRECTIONAL or CUDNN_BIDIRECTIONAL. In bidirectional RNNs, the hidden states passed between physical layers are concatenations of forward and backward hidden states.
+* `inputMode::cudnnRNNInputMode_t = CUDNN_LINEAR_INPUT`: Specifies how the input to the RNN model is processed by the first layer. When inputMode is CUDNN_LINEAR_INPUT, original input vectors of size inputSize are multiplied by the weight matrix to obtain vectors of hiddenSize. When inputMode is CUDNN_SKIP_INPUT, the original input vectors to the first layer are used as is without multiplying them by the weight matrix.
+* `mathPrec::DataType = eltype(x)`: This parameter is used to control the compute math precision in the RNN model. For Float16 input/output it can be Float16 or Float32; for Float32 or Float64 input/output it must match the input/output type.
+* `mathType::cudnnMathType_t = math_mode()`: Sets the preferred option to use NVIDIA Tensor Cores accelerators on Volta (SM 7.0) or higher GPUs. When dataType is CUDNN_DATA_HALF, the mathType parameter can be CUDNN_DEFAULT_MATH or CUDNN_TENSOR_OP_MATH. The ALLOW_CONVERSION setting is treated the same as CUDNN_TENSOR_OP_MATH for this data type. When dataType is CUDNN_DATA_FLOAT, the mathType parameter can be CUDNN_DEFAULT_MATH or CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION. When the latter settings are used, original weights and intermediate results will be down-converted to CUDNN_DATA_HALF before they are used in another recursive iteration. When dataType is CUDNN_DATA_DOUBLE, the mathType parameter can be CUDNN_DEFAULT_MATH.
+* `inputSize::Integer = size(x,1)`: Size of the input vector in the RNN model. When inputMode=CUDNN_SKIP_INPUT, the inputSize should match the hiddenSize value.
+* `projSize::Integer = hiddenSize`: The size of the LSTM cell output after the recurrent projection. This value should not be larger than hiddenSize. It is legal to set projSize equal to hiddenSize; however, in this case, the recurrent projection feature is disabled. The recurrent projection is an additional matrix multiplication in the LSTM cell that projects hidden state vectors `h[t]` into smaller vectors `r[t] = Wr * h[t]`, where `Wr` is a rectangular matrix with projSize rows and hiddenSize columns. When the recurrent projection is enabled, the output of the LSTM cell (both to the next layer and unrolled in time) is `r[t]` instead of `h[t]`. The recurrent projection can be enabled for LSTM cells and CUDNN_RNN_ALGO_STANDARD only.
+* `numLayers::Integer = 1`: Number of stacked, physical layers in the deep RNN model. When dirMode=CUDNN_BIDIRECTIONAL, each physical layer consists of two pseudo-layers corresponding to the forward and backward directions.
+* `dropout::Real = 0`: When non-zero, a dropout operation will be applied between physical layers. A single layer network will have no dropout applied. Dropout is used in the training mode only.
+* `auxFlags::Integer = CUDNN_RNN_PADDED_IO_ENABLED`: Miscellaneous switches that do not require additional numerical values to configure the corresponding feature. In future cuDNN releases, this parameter will be used to extend the RNN functionality without adding new API functions (applicable options should be bitwise OR-ed). Currently, this parameter is used to enable or disable padded input/output (CUDNN_RNN_PADDED_IO_DISABLED, CUDNN_RNN_PADDED_IO_ENABLED). When the padded I/O is enabled, layouts CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED and CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED are permitted in RNN data descriptors.
 
-ngates(mode) = [1, 1, 4, 3][mode+1]
-ngates(r::RNNDesc) = ngates(r.mode)
-
-function RNNDesc{T}(mode::cudnnRNNMode_t, input::Int, hidden::Int; layers = 1) where T
-    d = [C_NULL]
-    cudnnCreateRNNDescriptor(d)
-
-    dropoutDesc = DropoutDesc(0)
-    inputMode = CUDNN_LINEAR_INPUT
-    direction = CUDNN_UNIDIRECTIONAL
-    algo = CUDNN_RNN_ALGO_STANDARD
-    cudnnSetRNNDescriptor_v6(handle(),d[],hidden,layers,dropoutDesc,inputMode,direction,mode,algo,cudnnDataType(T))
-
-    w = CUDA.zeros(T, rnnParamSize(T, d[], input))
-    # TODO: avoid reserve allocation here
-    rd = RNNDesc{T}(mode, input, hidden, w, params(w, input, hidden, ngates(mode))..., d[])
-    finalizer(rd) do x
-        cudnnDestroyRNNDescriptor(x)
-    end
-    return rd
-end
+
+Other keyword arguments:
+* `layout::cudnnRNNDataLayout_t = CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED`: The memory layout of the RNN data tensor. Options are CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED: Data layout is padded, with outer stride from one time-step to the next; CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED: The sequence length is sorted and packed as in the basic RNN API; CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED: Data layout is padded, with outer stride from one batch to the next.
+* `seqLengthArray::Vector{Cint} = nothing`: An integer array with batchSize number of elements. Describes the length (number of time-steps) of each sequence. Each element in seqLengthArray must be greater than or equal to 0 but less than or equal to maxSeqLength. In the packed layout, the elements should be sorted in descending order, similar to the layout required by the non-extended RNN compute functions. The default value `nothing` assumes uniform seqLengths, no padding.
+* `devSeqLengths::CuVector{Cint} = nothing`: Device copy of seqLengthArray
+* `fwdMode::cudnnForwardMode_t = CUDNN_FWD_MODE_INFERENCE`: set to `CUDNN_FWD_MODE_TRAINING` when training
+"""
+cudnnRNNForward, cudnnRNNForward!
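+
+# A minimal usage sketch (sizes are hypothetical; `w` must hold at least
+# cudnnRNNWeightSpaceSize bytes for the chosen configuration):
+#
+#   x = CUDA.rand(Float32, 4, 16, 10)        # (inputSize, batch, seqLength) in the default SEQ_MAJOR layout
+#   w = CUDA.rand(Float32, 1 << 20)          # generously sized weight buffer
+#   y = cudnnRNNForward(w, x; hiddenSize=8)  # LSTM by default; y has size (8, 16, 10)
+#
+# Pass `fwdMode=CUDNN_FWD_MODE_TRAINING` and `hy=Ref{Any}()` to also obtain the final
+# hidden state during training.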
-function setweights!(d::RNNDesc, Wi, Wh, b) - transpose!(d.weights[1], Wi) - transpose!(d.weights[2], Wh) - copyto!(d.bias, b) - return -end -function cudnnGetRNNTrainingReserveSize(r::RNNDesc, seqlen, xdesc) - size = Csize_t[0] - cudnnGetRNNTrainingReserveSize(handle(), r, seqlen, xdesc, size) - return Int(size[]) -end +# Public methods +cudnnRNNForward(w, x; hiddenSize, o...) = cudnnRNNForwardWithDefaults(w, x; hiddenSize, o...) +cudnnRNNForward!(y, w, x; hiddenSize, o...) = cudnnRNNForwardWithDefaults(w, x; y, hiddenSize, o...) +cudnnRNNForward(w, x, d::cudnnRNNDescriptor; o...) = cudnnRNNForwardWithDefaults(w, x; rnnDesc=d, o...) +cudnnRNNForward!(y, w, x, d::cudnnRNNDescriptor; o...) = cudnnRNNForwardWithDefaults(w, x; y, rnnDesc=d, o...) + + +# Private method +function cudnnRNNForwardWithDefaults( + w, x; + + # input hidden vectors + hx = nothing, + cx = nothing, + + # output buffers + y = nothing, + hy = nothing, + cy = nothing, + + # rnnDescriptor parameters + # TODO: look into GetClip, SetClip + algo::cudnnRNNAlgo_t = CUDNN_RNN_ALGO_STANDARD, + cellMode::cudnnRNNMode_t = CUDNN_LSTM, + biasMode::cudnnRNNBiasMode_t = CUDNN_RNN_DOUBLE_BIAS, + dirMode::cudnnDirectionMode_t = CUDNN_UNIDIRECTIONAL, + inputMode::cudnnRNNInputMode_t = CUDNN_LINEAR_INPUT, + dataType::DataType = eltype(x), + mathPrec::DataType = dataType, # has to match dataType with one extra possibility dt=Float16 => mp=Float16|Float32 + mathType::cudnnMathType_t = math_mode(), + inputSize::Integer = size(x,1), + hiddenSize::Integer = 0, + projSize::Integer = hiddenSize, + numLayers::Integer = 1, + dropout::Real = 0, + auxFlags::Integer = CUDNN_RNN_PADDED_IO_ENABLED, # When the padded I/O is enabled, layouts CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED and CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED are permitted in RNN data descriptors. 
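+
+    # NOTE: the keyword options above are only consulted when `rnnDesc` below is left at
+    # its default; an explicitly supplied descriptor takes precedence.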
+ + # rnnDescriptor + rnnDesc::cudnnRNNDescriptor = cudnnRNNDescriptor(algo, cellMode, biasMode, dirMode, inputMode, cudnnDataType(dataType), cudnnDataType(mathPrec), mathType, Int32(inputSize), checkHidden(hiddenSize), Int32(projSize), Int32(numLayers), cudnnDropoutDescriptor(Cfloat(dropout)), UInt32(auxFlags)), -function cudnnRNNForward(rnn::RNNDesc{T}, seqlen, xd, x, hd, h, cd, c, wd, w, yd, y, hod, - ho, cod, co, reserve=nothing) where T - @workspace size=@argout( - cudnnGetRNNWorkspaceSize(handle(), rnn, seqlen, xd, - out(Ref{Csize_t}())) - )[] workspace->begin - if reserve == nothing - cudnnRNNForwardInference(handle(), rnn, seqlen, xd, x, hd, h, cd, c, wd, w, yd, y, - hod, ho, cod, co, workspace, sizeof(workspace)) - else - cudnnRNNForwardTraining(handle(), rnn, seqlen, xd, x, hd, h, cd, c, wd, w, yd, y, - hod, ho, cod, co, workspace, sizeof(workspace), - reserve, sizeof(reserve)) - end + # rnnData parameters: + layout::cudnnRNNDataLayout_t = CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED, # padded [X,B,T] array + seqLengthArray::Union{Nothing,Vector{Cint}} = nothing, # assume no padding by default + paddingFill::Ptr{Cvoid} = C_NULL, + + # rnnForward parameters + fwdMode::cudnnForwardMode_t = CUDNN_FWD_MODE_INFERENCE, # set to CUDNN_FWD_MODE_TRAINING when training + devSeqLengths::Union{Nothing,CuArray{Cint,1}} = nothing, + reserveSpace::Union{CuArray,Nothing} = nothing, + workspace::Union{CuArray,Nothing} = nothing, + + # gradient buffers: layer designers may want to preallocate, so leave them as kwargs + dw = Ref{Any}(nothing), + dx = Ref{Any}(nothing), + dhx = Ref{Any}(nothing), + dcx = Ref{Any}(nothing), +) + # Verify all inputs: they should be compatible with rnnDesc (in case it is supplied), not necessarily with kwargs: + rd = cudnnGetRNNDescriptor_v8(rnnDesc) + @assert rd.hiddenSize > 0 "hiddenSize > 0 must be provided" + @assert cudnnDataType(eltype(x)) == rd.dataType "Input x type not compatible with RNN" + @assert size(x,1) == rd.inputSize "Input x size not compatible with RNN" + ydims = (rd.projSize << (rd.dirMode === CUDNN_BIDIRECTIONAL), size(x)[2:end]...) 
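+    # (`n << Bool` above doubles the first dim for bidirectional RNNs, whose outputs concatenate both directions)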
+ if y !== nothing + @assert cudnnDataType(eltype(y)) == rd.dataType "Output y type not compatible with RNN" + @assert size(y) == ydims "Output y size not compatible with RNN or input x" + else + y = similar(x, ydims) end -end + if layout === CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED + if seqLengthArray === nothing; seqLengthArray = fill(Cint(size(x,3)), size(x,2)); end + @assert size(x,2) == length(seqLengthArray) "Input x batchsize not compatible with seqLengthArray" + @assert size(x,3) >= maximum(seqLengthArray) "Input x seqLength not compatible with seqLengthArray" + xDesc = cudnnRNNDataDescriptor(rd.dataType, layout, Cint(size(x,3)), Cint(size(x,2)), Cint(size(x,1)), seqLengthArray, paddingFill) + yDesc = cudnnRNNDataDescriptor(rd.dataType, layout, Cint(size(y,3)), Cint(size(y,2)), Cint(size(y,1)), seqLengthArray, paddingFill) + elseif layout === CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED + if seqLengthArray === nothing; seqLengthArray = fill(Cint(size(x,2)), size(x,3)); end + @assert size(x,3) == length(seqLengthArray) "Input x batchsize not compatible with seqLengthArray" + @assert size(x,2) >= maximum(seqLengthArray) "Input x seqLength not compatible with seqLengthArray" + xDesc = cudnnRNNDataDescriptor(rd.dataType, layout, Cint(size(x,2)), Cint(size(x,3)), Cint(size(x,1)), seqLengthArray, paddingFill) + yDesc = cudnnRNNDataDescriptor(rd.dataType, layout, Cint(size(y,2)), Cint(size(y,3)), Cint(size(y,1)), seqLengthArray, paddingFill) + elseif layout === CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED + if seqLengthArray === nothing; seqLengthArray = fill(Cint(size(x,3)), size(x,2)); end + @assert length(x)÷size(x,1) == sum(seqLengthArray) "Input x size not compatible with seqLengthArray" + xDesc = cudnnRNNDataDescriptor(rd.dataType, layout, maximum(seqLengthArray), Cint(length(seqLengthArray)), Cint(size(x,1)), seqLengthArray, paddingFill) + yDesc = cudnnRNNDataDescriptor(rd.dataType, layout, maximum(seqLengthArray), Cint(length(seqLengthArray)), Cint(size(y,1)), seqLengthArray, paddingFill) + else + error("Unknown layout $layout") + end + if devSeqLengths === nothing; devSeqLengths = CuArray(seqLengthArray); end + + hdims = (rd.projSize, length(seqLengthArray), rd.numLayers << (rd.dirMode === CUDNN_BIDIRECTIONAL)) + cdims = (rd.hiddenSize, length(seqLengthArray), rd.numLayers << (rd.dirMode === CUDNN_BIDIRECTIONAL)) + if hx !== nothing + @assert cudnnDataType(eltype(hx)) == rd.dataType "Hidden hx type not compatible with RNN" + @assert size(hx) == hdims "Hidden hx size not compatible with RNN" + end + if hy !== nothing + @assert hy isa Ref{Any} + if isassigned(hy) && hy[] !== nothing + @assert cudnnDataType(eltype(hy[])) == rd.dataType "Hidden hy type not compatible with RNN" + @assert size(hy[]) == hdims "Hidden hy size not compatible with RNN" + else + hy[] = similar(y, hdims) + end + end + if rd.cellMode === CUDNN_LSTM + if cx !== nothing + @assert cudnnDataType(eltype(cx)) == rd.dataType "Hidden cx type not compatible with RNN" + @assert size(cx) == cdims "Hidden cx size not compatible with RNN" + end + if cy !== nothing + @assert cy isa Ref{Any} + if isassigned(cy) && cy[] !== nothing + @assert cudnnDataType(eltype(cy[])) == rd.dataType "Hidden cy type not compatible with RNN" + @assert size(cy[]) == cdims "Hidden cy size not compatible with RNN" + else + cy[] = similar(y, cdims) + end + end + end + hDesc = cudnnTensorDescriptor(CUDNN_TENSOR_NCHW, rd.dataType, Cint(3), Cint[reverse(hdims)...]) + cDesc = cudnnTensorDescriptor(CUDNN_TENSOR_NCHW, rd.dataType, Cint(3), 
Cint[reverse(cdims)...]) -xDesc(x) = [TensorDesc(eltype(x), (1, size(x, 1), size(x, 2)))] + weightSpaceSize = cudnnRNNWeightSpaceSize(rnnDesc) + @assert sizeof(w) >= weightSpaceSize "RNN weights should be at least $weightSpaceSize bytes." -hDesc(h::Nothing) = C_NULL, CU_NULL -hDesc(x::Integer) = (@assert x == 0; hDesc(nothing)) -function hDesc(h::DenseCuArray) - TensorDesc(eltype(h), (size(h, 1), size(h, 2), 1)), h -end + # Backward called separately on each variable. We will calculate all gradients on first call. Use `dready` to avoid subsequent calls. + dready = Ref{Bool}(false) # this will be turned to `true` by the first backward call. -# TODO: can we just manipulate strides here? -# TODO: should use repmat, but this isn't implemented. -hBatch(x::AbstractVector, h::CuVector) = h -hBatch(x::AbstractMatrix, h::CuVector) = h .* CUDA.ones(1, size(x, 2)) -hBatch(x::AbstractMatrix, h::CuMatrix) = h .* CUDA.ones(1, size(h,2) == 1 ? size(x,2) : 1) - -function forward(rnn::RNNDesc{T}, x::DenseCuArray{T}, h_::DenseCuArray{T}, c_ = nothing, train = Val{false}) where T - h = hBatch(x, h_) - c = c_ == nothing ? nothing : hBatch(x, c_) - @assert size(x, 1) == rnn.input - @assert size(h, 1) == rnn.hidden - @assert size(x, 2) == size(h, 2) - seqLength = 1 - xdesc = xDesc(x) - y = x isa AbstractVector ? similar(x, rnn.hidden) : similar(x, rnn.hidden, size(x, 2)) - ho = similar(h) - ydesc = xDesc(y) - reserve = train == Val{true} ? - CuVector{UInt8}(undef, cudnnGetRNNTrainingReserveSize(rnn, seqLength, xdesc)) : - nothing - co = c == nothing ? c : similar(c) - cudnnRNNForward(rnn, seqLength, - xdesc, x, - hDesc(h)..., - hDesc(c)..., - FilterDesc(T, (1, 1, length(rnn.params))), rnn.params, - ydesc, y, - hDesc(ho)..., - hDesc(co)..., - reserve) - result = c == nothing ? (y, ho) : (y, ho, co) - return train == Val{true} ? (reserve, result) : result + y_h_c = cudnnRNNForwardAD(w, x, hx, cx; rnnDesc, fwdMode, devSeqLengths, xDesc, yDesc, y, hDesc, hy=(hy isa Ref ? hy[] : hy), cDesc, cy=(cy isa Ref ? cy[] : cy), workspace, reserveSpace, dw, dx, dhx, dcx, dready) + if hy isa Ref; hy[] = y_h_c[2]; end + if cy isa Ref && rd.cellMode === CUDNN_LSTM; cy[] = y_h_c[3]; end + return y_h_c[1] # only return y; hy and cy can be accessed through keyword arguments. They still need to be in AutoGrad return value to be included in gradient calc. 
end -forwardTrain(rnn::RNNDesc{T}, x::DenseCuArray{T}, h::DenseCuArray{T}, c = nothing) where T = - forward(rnn, x, h, c, Val{true}) - -function cudnnRNNBackwardData(rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, - dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, - dx, dhxDesc, dhx, dcxDesc, dcx, reserve) - @workspace size=@argout( - cudnnGetRNNWorkspaceSize(handle(), rnnDesc, seqLength, dxDesc, - out(Ref{Csize_t}())) - )[] workspace->begin - cudnnRNNBackwardData(handle(), rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, - dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, - dx, dhxDesc, dhx, dcxDesc, dcx, workspace, sizeof(workspace), - reserve, sizeof(reserve)) - end + +# AD method + +function cudnnRNNForwardAD(w, x, hx, cx; rnnDesc, fwdMode, devSeqLengths, xDesc, yDesc, y, hDesc, hy, cDesc, cy, workspace, reserveSpace, dw, dx, dhx, dcx, dready) + (workspaceSize, reserveSpaceSize) = cudnnRNNTempSpaceSizes(rnnDesc, fwdMode, xDesc) + if reserveSpaceSize > 0 && reserveSpace === nothing; reserveSpace = cudnnTempSpace(reserveSpaceSize); end + @assert sizeof(reserveSpace) >= reserveSpaceSize "reserveSpace should be at least $reserveSpaceSize bytes" + # Cannot use @workspace here because it is shared between forw and back calls + if workspaceSize > 0 && workspace === nothing; workspace = cudnnTempSpace(workspaceSize); end + @assert sizeof(workspace) >= workspaceSize "workspace should be at least $workspaceSize bytes" + cudnnRNNForward(handle(), rnnDesc, fwdMode, devSeqLengths, xDesc, x, yDesc, y, hDesc, something(hx, CU_NULL), something(hy, CU_NULL), cDesc, something(cx, CU_NULL), something(cy, CU_NULL), sizeof(w), w, sizeof(workspace), something(workspace, CU_NULL), sizeof(reserveSpace), something(reserveSpace, CU_NULL)) + return (y, hy, cy) end -function backwardData(rnn::RNNDesc{T}, y, dy_, dho, dco, h, c, reserve) where T - # Same as above, any more efficient way? - dy = dy_ isa Integer ? zero(y) : dy_ - yd = xDesc(y) - dx = y isa AbstractVector ? similar(dy, rnn.input) : similar(dy, rnn.input, size(dy, 2)) - dh = similar(h) - dc = c == nothing ? nothing : similar(c) - cudnnRNNBackwardData(rnn, 1, yd, y, yd, dy, hDesc(dho)..., hDesc(dco)..., - FilterDesc(T, (1, 1, length(rnn.params))), rnn.params, hDesc(h)..., - hDesc(c)..., xDesc(dx), dx, hDesc(dh)..., hDesc(dc)..., reserve) - return c == nothing ? 
(dx, dh) : (dx, dh, dc) + +# Helper methods + +function cudnnRNNWeightSpaceSize(rnnDesc::cudnnRNNDescriptor) + ws = Csize_t[0] + cudnnGetRNNWeightSpaceSize(handle(), rnnDesc, ws) + ws[1] end -backwardData(rnn, y, dy, dho, hx, reserve) = - backwardData(rnn, y, dy, dho, nothing, hx, nothing, reserve) - -function cudnnRNNBackwardWeights(rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, - y, dwDesc, dw, reserve) - @workspace size=@argout( - cudnnGetRNNWorkspaceSize(handle(), rnnDesc, seqLength, xDesc, - out(Ref{Csize_t}())) - )[] workspace->begin - cudnnRNNBackwardWeights(handle(), rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, - y, workspace, sizeof(workspace), dwDesc, dw, - reserve, sizeof(reserve)) - end +function cudnnRNNTempSpaceSizes(rnnDesc::cudnnRNNDescriptor, fwdMode::cudnnForwardMode_t, xDesc::cudnnRNNDataDescriptor) + ws = Csize_t[0]; rs = Csize_t[0] + cudnnGetRNNTempSpaceSizes(handle(), rnnDesc, fwdMode, xDesc, ws, rs) + ws[1], rs[1] end -function backwardWeights(rnn::RNNDesc{T}, x, h, y, reserve) where T - dw = zero(rnn.params) - cudnnRNNBackwardWeights(rnn, 1, xDesc(x), x, hDesc(h)..., xDesc(y), y, - FilterDesc(T, (1, 1, length(dw))), dw, reserve) - return params(dw, rnn.input, rnn.hidden, ngates(rnn)) +function cudnnGetRNNDescriptor_v8(rnnDesc::cudnnRNNDescriptor) + (algo, cellMode, biasMode, dirMode, inputMode, dataType, mathPrec, mathType, inputSize, hiddenSize, projSize, numLayers, dropout, auxFlags) = (Ref{cudnnRNNAlgo_t}(), Ref{cudnnRNNMode_t}(), Ref{cudnnRNNBiasMode_t}(), Ref{cudnnDirectionMode_t}(), Ref{cudnnRNNInputMode_t}(), Ref{cudnnDataType_t}(), Ref{cudnnDataType_t}(), Ref{cudnnMathType_t}(), Ref{Int32}(), Ref{Int32}(), Ref{Int32}(), Ref{Int32}(), Ref{Ptr{Nothing}}(), Ref{UInt32}()) + cudnnGetRNNDescriptor_v8(rnnDesc, algo, cellMode, biasMode, dirMode, inputMode, dataType, mathPrec, mathType, inputSize, hiddenSize, projSize, numLayers, dropout, auxFlags) + (algo, cellMode, biasMode, dirMode, inputMode, dataType, mathPrec, mathType, inputSize, hiddenSize, projSize, numLayers, dropout, auxFlags) = (algo[], cellMode[], biasMode[], dirMode[], inputMode[], dataType[], mathPrec[], mathType[], inputSize[], hiddenSize[], projSize[], numLayers[], dropout[], auxFlags[]) + (; rnnDesc, algo, cellMode, biasMode, dirMode, inputMode, dataType, mathPrec, mathType, inputSize, hiddenSize, projSize, numLayers, dropout, auxFlags) end -function pullback(rnn::RNNDesc{T}, x::DenseCuArray{T}, h::DenseCuArray{T}) where T <: Union{Float32,Float64} - reserve, (y, ho) = CUDNN.forwardTrain(rnn, x, h) - return (y, ho), function (dy, dho) - h_ = CUDNN.hBatch(x, h) - dx, dh = CUDNN.backwardData(rnn, y, dy, dho, h_, reserve) - (dWi, dWh), db = CUDNN.backwardWeights(rnn, x, h_, y, reserve) - return (x = dx, h = dh, Wi = dWi, Wh = dWh, b = db) - end +checkHidden(h) = (h > 0 ? Int32(h) : error("hiddenSize > 0 is required")) + + +""" + cudnnGetRNNWeightParams(w, d::cudnnRNNDescriptor) + cudnnGetRNNWeightParams(w; hiddenSize, o...) + +Return an array of weight matrices and bias vectors of an RNN specified by `d` or keyword +options as views into `w`. The keyword arguments and defaults in the second form are the +same as those in cudnnRNNForward specifying the RNN. + +In the returned array `a[1,l,p]` and `a[2,l,p]` give the weight matrix and bias vector for +the l'th layer and p'th parameter or `nothing` if the specified matrix/vector does not +exist. Note that the matrices should be transposed for left multiplication, e.g. `a[1,l,p]' +* x` + +The `l` index refers to the pseudo-layer number. 
In uni-directional RNNs, a pseudo-layer is
+the same as a physical layer (pseudoLayer=1 is the RNN input layer, pseudoLayer=2 is the
+first hidden layer). In bi-directional RNNs, there are twice as many pseudo-layers in
+comparison to physical layers:
+
+    pseudoLayer=1 refers to the forward direction sub-layer of the physical input layer
+    pseudoLayer=2 refers to the backward direction sub-layer of the physical input layer
+    pseudoLayer=3 is the forward direction sub-layer of the first hidden layer, and so on
+
+The `p` index refers to the weight matrix or bias vector linear ID index.
+
+If cellMode in rnnDesc was set to CUDNN_RNN_RELU or CUDNN_RNN_TANH:
+
+    Value 1 references the weight matrix or bias vector used in conjunction with the input from the previous layer or input to the RNN model.
+    Value 2 references the weight matrix or bias vector used in conjunction with the hidden state from the previous time step or the initial hidden state.
+
+If cellMode in rnnDesc was set to CUDNN_LSTM:
+
+    Values 1, 2, 3 and 4 reference weight matrices or bias vectors used in conjunction with the input from the previous layer or input to the RNN model.
+    Values 5, 6, 7 and 8 reference weight matrices or bias vectors used in conjunction with the hidden state from the previous time step or the initial hidden state.
+    Value 9 corresponds to the projection matrix, if enabled (there is no bias in this operation).
+
+Values and their LSTM gates:
+
+    Values 1 and 5 correspond to the input gate.
+    Values 2 and 6 correspond to the forget gate.
+    Values 3 and 7 correspond to the new cell state calculations with hyperbolic tangent.
+    Values 4 and 8 correspond to the output gate.
+
+If cellMode in rnnDesc was set to CUDNN_GRU:
+
+    Values 1, 2 and 3 reference weight matrices or bias vectors used in conjunction with the input from the previous layer or input to the RNN model.
+    Values 4, 5 and 6 reference weight matrices or bias vectors used in conjunction with the hidden state from the previous time step or the initial hidden state.
+
+Values and their GRU gates:
+
+    Values 1 and 4 correspond to the reset gate.
+    Values 2 and 5 correspond to the update gate.
+    Values 3 and 6 correspond to the new hidden state calculations with hyperbolic tangent.
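+
+For illustration, a sketch for a hypothetical single-layer unidirectional LSTM with
+inputSize=4 and hiddenSize=8 whose weights live in `w`:
+
+    a = cudnnGetRNNWeightParams(w; hiddenSize=8, inputSize=4)
+    Wi, bi = a[1,1,1], a[2,1,1]   # input-gate weight matrix and bias of the first layer
+    # Wi is stored for left multiplication after transposing: Wi' * x .+ bi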
+ +""" +function cudnnGetRNNWeightParams( + w; + hiddenSize::Integer, + inputSize::Integer = hiddenSize, + projSize::Integer = hiddenSize, + algo::cudnnRNNAlgo_t = CUDNN_RNN_ALGO_STANDARD, + cellMode::cudnnRNNMode_t = CUDNN_LSTM, + biasMode::cudnnRNNBiasMode_t = CUDNN_RNN_DOUBLE_BIAS, + dirMode::cudnnDirectionMode_t = CUDNN_UNIDIRECTIONAL, + inputMode::cudnnRNNInputMode_t = CUDNN_LINEAR_INPUT, + dataType::DataType = Float32, + mathPrec::DataType = dataType, + mathType::cudnnMathType_t = math_mode(), + numLayers::Integer = 1, + dropout::Real = 0, + auxFlags::Integer = CUDNN_RNN_PADDED_IO_ENABLED, +) + cudnnGetRNNWeightParams(w, cudnnRNNDescriptor(algo, cellMode, biasMode, dirMode, inputMode, cudnnDataType(dataType), cudnnDataType(mathPrec), mathType, Int32(inputSize), checkHidden(hiddenSize), Int32(projSize), Int32(numLayers), cudnnDropoutDescriptor(Cfloat(dropout)), UInt32(auxFlags))) end -function pullback(rnn::RNNDesc{T}, x::DenseCuArray{T}, h::DenseCuArray{T}, c::DenseCuArray{T}) where T <: Union{Float32,Float64} - reserve, (y, ho, co) = CUDNN.forwardTrain(rnn, x, h, c) - return (y, ho, co), function (dy, dho, dco) - h_ = CUDNN.hBatch(x, h) - c_ = CUDNN.hBatch(x, c) - dx, dh, dc = CUDNN.backwardData(rnn, y, dy, dho, dco, h_, c_, reserve) - (dWi, dWh), db = CUDNN.backwardWeights(rnn, x, h_, y, reserve) - return (x = dx, h = dh, c = dc, Wi = dWi, Wh = dWh, b = db) - end + +function cudnnGetRNNWeightParams(w, rnnDesc::cudnnRNNDescriptor) + d = cudnnGetRNNDescriptor_v8(rnnDesc) + T = juliaDataType(d.dataType) + weightSpace = reinterpret(T, w) + nlayers = d.numLayers << (d.dirMode === CUDNN_BIDIRECTIONAL) + nparams = (d.cellMode === CUDNN_RNN_RELU || d.cellMode === CUDNN_RNN_TANH ? 2 : + d.cellMode === CUDNN_LSTM ? 9 : d.cellMode === CUDNN_GRU ? 6 : + error("Unknown cellMode $(d.cellMode)")) + a = Array{Any}(undef, 2, nlayers, nparams) + p = Ref{Ptr{Cvoid}}(0) + cudnnCreateTensorDescriptor(p); mDesc = cudnnTensorDescriptor(p[]) + cudnnCreateTensorDescriptor(p); bDesc = cudnnTensorDescriptor(p[]) + mAddr = Ref{CuPtr{Cvoid}}(0) + bAddr = Ref{CuPtr{Cvoid}}(0) + for l in 1:nlayers, p in 1:nparams + cudnnGetRNNWeightParams(handle(), rnnDesc, l-1, sizeof(weightSpace), weightSpace, p-1, mDesc, mAddr, bDesc, bAddr) + mT,mD,mS = cudnnGetTensorDescriptor(mDesc) + bT,bD,bS = cudnnGetTensorDescriptor(bDesc) + @assert mT === bT === T + if mAddr[] === CU_NULL + a[1,l,p] = nothing + else + m0 = (mAddr[] - pointer(weightSpace)) ÷ sizeof(T) |> Int + a[1,l,p] = reshape(view(weightSpace, (m0+1):(m0+prod(mD))), (mD[1],mD[2])) + end + if bAddr[] === CU_NULL + a[2,l,p] = nothing + else + b0 = (bAddr[] - pointer(weightSpace)) ÷ sizeof(T) |> Int + a[2,l,p] = view(weightSpace, (b0+1):(b0+prod(bD))) + end + end + cudnnDestroyTensorDescriptor.((mDesc,bDesc)) + return a end + diff --git a/lib/cudnn/softmax.jl b/lib/cudnn/softmax.jl index f46520e1a0..c8f60c5abc 100644 --- a/lib/cudnn/softmax.jl +++ b/lib/cudnn/softmax.jl @@ -1,22 +1,51 @@ -# wrappers - -function cudnnSoftmaxForward(x::DenseCuArray{T,4}, y::DenseCuArray{T,4}=x; - algo=CUDNN_SOFTMAX_FAST, # or CUDNN_SOFTMAX_ACCURATE - mode=CUDNN_SOFTMAX_MODE_INSTANCE, # or CUDNN_SOFTMAX_MODE_CHANNEL - alpha=1.0, beta=0.0) where T - cudnnSoftmaxForward(handle(), algo, mode, - scalingParameter(T, alpha), TensorDesc(x), x, - scalingParameter(T, beta ), TensorDesc(y), y) +""" + cudnnSoftmaxForward(x; algo, mode, alpha) + cudnnSoftmaxForward!(y, x; algo, mode, alpha, beta) + +Return the softmax or logsoftmax of the input `x` depending on the `algo` keyword argument. 
+If specified, the `y` argument holds the result and must be similar to `x` (same size and
+element type). Keyword arguments:
+
+* `algo = (CUDA.math_mode()===CUDA.FAST_MATH ? CUDNN_SOFTMAX_FAST : CUDNN_SOFTMAX_ACCURATE)`: Options are `CUDNN_SOFTMAX_ACCURATE` which subtracts max from every point to avoid overflow, `CUDNN_SOFTMAX_FAST` which doesn't, and `CUDNN_SOFTMAX_LOG` which returns logsoftmax.
+* `mode = CUDNN_SOFTMAX_MODE_INSTANCE`: Compute softmax per image (N) across the dimensions C,H,W. `CUDNN_SOFTMAX_MODE_CHANNEL` computes softmax per spatial location (H,W) per image (N) across the dimension C.
+* `alpha=1, beta=0` can be used for scaling, i.e. `y .= alpha*op(x) .+ beta*y`
+"""
+cudnnSoftmaxForward, cudnnSoftmaxForward!
+
+
+# Public methods
+cudnnSoftmaxForward(x; o...) = cudnnSoftmaxForwardWithDefaults(x; o...)
+cudnnSoftmaxForward!(y, x; o...) = cudnnSoftmaxForwardWithDefaults(x; y, o...)
+
+
+# Private method
+function cudnnSoftmaxForwardWithDefaults(
+    x;
+    y = similar(x),
+    algo::cudnnSoftmaxAlgorithm_t = (CUDA.math_mode()===CUDA.FAST_MATH ? CUDNN_SOFTMAX_FAST : CUDNN_SOFTMAX_ACCURATE),
+    mode::cudnnSoftmaxMode_t = CUDNN_SOFTMAX_MODE_INSTANCE,
+    alpha::Real = 1,
+    beta::Real = 0,
+    format::cudnnTensorFormat_t = CUDNN_TENSOR_NCHW,
+    xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x; format),
+    yDesc::cudnnTensorDescriptor = xDesc,
+)
+    @assert size(y) == size(x)
+    T = eltype(x)
+    alpha, beta = scalingParameter(T,alpha), scalingParameter(T,beta)
+    cudnnSoftmaxForwardAD(x; algo, mode, alpha, xDesc, beta, yDesc, y)
+end
+
+
+# AD method
+function cudnnSoftmaxForwardAD(x; algo, mode, alpha, xDesc, beta, yDesc, y)
+    cudnnSoftmaxForward(handle(), algo, mode, alpha, xDesc, x, beta, yDesc, y)
     return y
 end
 
-function cudnnSoftmaxBackward(y::DenseCuArray{T,4}, dy::DenseCuArray{T,4}, dx::DenseCuArray{T,4}=dy;
-                              algo=CUDNN_SOFTMAX_FAST, # or CUDNN_SOFTMAX_ACCURATE
-                              mode=CUDNN_SOFTMAX_MODE_INSTANCE, # or CUDNN_SOFTMAX_MODE_CHANNEL
-                              alpha=1.0, beta=0.0) where T
-    cudnnSoftmaxBackward(handle(), algo, mode,
-                         scalingParameter(T, alpha), TensorDesc(y), y,
-                         TensorDesc(dy), dy,
-                         scalingParameter(T, beta ), TensorDesc(dx), dx)
-    return dx
+
+# Deprecated methods
+function cudnnSoftmaxForward(x::DenseCuArray{T,4}, y::DenseCuArray{T,4}; o...) where T
+    @warn "`cudnnSoftmaxForward(x,y)` is deprecated, please use one of the methods in `@doc cudnnSoftmaxForward`." maxlog=1
+    cudnnSoftmaxForward!(y, x; o...)
end + diff --git a/lib/cudnn/tensor.jl b/lib/cudnn/tensor.jl index 9a2cb9fd17..c98a020e23 100644 --- a/lib/cudnn/tensor.jl +++ b/lib/cudnn/tensor.jl @@ -1,132 +1,105 @@ -# TensorDesc - -# descriptor - -mutable struct TensorDesc - ptr::cudnnTensorDescriptor_t +# Alternative constructors for cudnnTensorDescriptor and cudnnFilterDescriptor + +function cudnnTensorDescriptor( # alternative constructor from array; main one in descriptors.jl + array; + format::cudnnTensorFormat_t=CUDNN_TENSOR_NCHW, + dims::Vector{Cint}=dim4(size(array),Val(format)) +) + @assert length(dims) <= CUDNN_DIM_MAX # length(dims) may not be N + cudnnTensorDescriptor(format, cudnnDataType(eltype(array)), Cint(length(dims)), dims) end -unsafe_free!(td::TensorDesc) = cudnnDestroyTensorDescriptor(td.ptr) - -Base.unsafe_convert(::Type{cudnnTensorDescriptor_t}, td::TensorDesc) = td.ptr -function TensorDesc(T::Type, size::NTuple{N,Integer}, strides::NTuple{N,Integer} = tuple_strides(size)) where N - sz = Cint.(size) |> reverse |> collect - st = Cint.(strides) |> reverse |> collect - td = Ref{cudnnTensorDescriptor_t}() - cudnnCreateTensorDescriptor(td) - cudnnSetTensorNdDescriptor(td[], cudnnDataType(T), length(sz), sz, st) - this = TensorDesc(td[]) - finalizer(unsafe_free!, this) - return this +function cudnnFilterDescriptor( # alternative constructor from array; main one in descriptors.jl + array; + format::cudnnTensorFormat_t=CUDNN_TENSOR_NCHW, + dims::Vector{Cint}=dim4(size(array),Val(format)) +) + @assert length(dims) <= CUDNN_DIM_MAX # length(dims) may not be N + cudnnFilterDescriptor(cudnnDataType(eltype(array)), format, Cint(length(dims)), dims) end -TensorDesc(a::DenseCuArray) = TensorDesc(eltype(a), size(a), strides(a)) - -# wrappers -function cudnnAddTensor(C::DenseCuArray{T,N}, A::DenseCuArray{T,N}; - alpha=1, beta=1) where {T,N} - cudnnAddTensor(handle(), - scalingParameter(T, alpha), TensorDesc(A), A, - scalingParameter(T, beta ), TensorDesc(C), C) - return C +# From cuDNN docs: Due to historical reasons, the minimum number of dimensions in the filter +# descriptor is three, and at most CUDNN_DIM_MAX dimensions (defined in cudnn.h = 8). +# However many operations only support 4 and 5. So we will pad dims to 4. +# Note also the order of dims reverse from Julia to cuDNN. +# RNN and multiHeadAttn do use 3D descriptors so they do not use dim4. +# Note on formats: even when using the NHWC format the dims are given in NCHW order! + +dim4(s::Dims{0}, ::Val{CUDNN_TENSOR_NCHW}) = Cint[1,1,1,1] +dim4(s::Dims{0}, ::Val{CUDNN_TENSOR_NHWC}) = Cint[1,1,1,1] +dim4(s::Dims{1}, ::Val{CUDNN_TENSOR_NCHW}) = Cint[s[1],1,1,1] # Cy -> Cy,1,1,1 +dim4(s::Dims{1}, ::Val{CUDNN_TENSOR_NHWC}) = Cint[s[1],1,1,1] # Cy -> Cy,1,1,1 +dim4(s::Dims{2}, ::Val{CUDNN_TENSOR_NCHW}) = Cint[s[2],s[1],1,1] # Cx,Cy -> Cy,Cx,1,1 +dim4(s::Dims{2}, ::Val{CUDNN_TENSOR_NHWC}) = Cint[s[2],s[1],1,1] # Cx,Cy -> Cy,Cx,1,1 +dim4(s::Dims{3}, ::Val{CUDNN_TENSOR_NCHW}) = Cint[s[3],s[2],s[1],1] # Xn,Cx,Cy -> Cy,Cx,Xn,1 +dim4(s::Dims{3}, ::Val{CUDNN_TENSOR_NHWC}) = Cint[s[3],s[1],s[2],1] # Cx,Xn,Cy -> Cy,Cx,Xn,1 +dim4(s::Dims{N}, ::Val{CUDNN_TENSOR_NCHW}) where {N} = Cint[reverse(s)...] # X1,...,Xn,Cx,Cy -> Cy,Cx,Xn,...,X1 +dim4(s::Dims{N}, ::Val{CUDNN_TENSOR_NHWC}) where {N} = Cint[s[N],s[1],s[N-1:-1:2]...] # Cx,X1,...,Xn,Cy -> Cy,Cx,Xn,...,X1 + + +# If array is nothing, return nothing for descriptor +cudnnTensorDescriptor(::Nothing; o...) = nothing +cudnnFilterDescriptor(::Nothing; o...) 
= nothing + + +# In case we need to get info about a descriptor + +function cudnnGetTensorDescriptor(d::cudnnTensorDescriptor) + nbDimsRequested = CUDNN_DIM_MAX + dataType = Ref{cudnnDataType_t}(CUDNN_DATA_FLOAT) + nbDims = Ref{Cint}(0) + dimA = Array{Cint}(undef, CUDNN_DIM_MAX) + strideA = Array{Cint}(undef, CUDNN_DIM_MAX) + cudnnGetTensorNdDescriptor(d, nbDimsRequested, dataType, nbDims, dimA, strideA) + T = juliaDataType(dataType[]) + D = (dimA[nbDims[]:-1:1]...,) + S = (strideA[nbDims[]:-1:1]...,) + return T,D,S end - -# OpTensorDesc - -# descriptor - -mutable struct OpTensorDesc - ptr::cudnnOpTensorDescriptor_t +function cudnnGetFilterDescriptor(d::cudnnFilterDescriptor) + nbDimsRequested = CUDNN_DIM_MAX + dataType = Ref{cudnnDataType_t}(CUDNN_DATA_FLOAT) + format = Ref{cudnnTensorFormat_t}(CUDNN_TENSOR_NCHW) + nbDims = Ref{Cint}(0) + dimA = Array{Cint}(undef, CUDNN_DIM_MAX) + cudnnGetFilterNdDescriptor(d, nbDimsRequested, dataType, format, nbDims, dimA) + T = juliaDataType(dataType[]) + D = (dimA[nbDims[]:-1:1]...,) + return T,D,format[] end -unsafe_free!(otd::OpTensorDesc) = cudnnDestroyOpTensorDescriptor(otd.ptr) - -Base.unsafe_convert(::Type{cudnnOpTensorDescriptor_t}, otd::OpTensorDesc) = otd.ptr -function OpTensorDesc(op::cudnnOpTensorOp_t, T::Type; - opTensorNanOpt=CUDNN_NOT_PROPAGATE_NAN) - otd = Ref{cudnnOpTensorDescriptor_t}() - cudnnCreateOpTensorDescriptor(otd) - cudnnSetOpTensorDescriptor(otd[], op, cudnnDataType(T), opTensorNanOpt) - this = OpTensorDesc(otd[]) - finalizer(unsafe_free!, this) - return this +# Deprecated +function TensorDesc(ptr::cudnnTensorDescriptor_t) + @warn "TensorDesc is deprecated, use cudnnTensorDescriptor instead." maxlog=1 + cudnnTensorDescriptor(ptr) end -OpTensorDesc(op::cudnnOpTensorOp_t, a::DenseCuArray) = OpTensorDesc(op, eltype(a)) - -# wrappers - -function cudnnOpTensor(op::cudnnOpTensorOp_t, - A::DenseCuArray{T,N}, B::DenseCuArray{T,N}, C::DenseCuArray{T,N}; - alpha1=true, alpha2=true, beta=false) where {T,N} - cudnnOpTensor(handle(), OpTensorDesc(op, T), - scalingParameter(T, alpha1), TensorDesc(A), A, - scalingParameter(T, alpha2), TensorDesc(B), B, - scalingParameter(T, beta ), TensorDesc(C), C) - return C +function TensorDesc(a::DenseCuArray) + @warn "TensorDesc is deprecated, use cudnnTensorDescriptor instead." maxlog=1 + cudnnTensorDescriptor(a) end - -# ReduceTensorDesc - -# descriptor - -mutable struct ReduceTensorDesc - ptr::cudnnReduceTensorDescriptor_t +function TensorDesc(T::Type, size::NTuple{N,Integer}, strides::NTuple{N,Integer} = tuple_strides(size)) where N + @warn "TensorDesc is deprecated, use cudnnTensorDescriptor instead." 
maxlog=1 + cudnnTensorDescriptor(CUDNN_TENSOR_NCHW, cudnnDataType(T), Cint(N), dim4(size,Val(CUDNN_TENSOR_NCHW))) end -unsafe_free!(rtd::ReduceTensorDesc) = cudnnDestroyReduceTensorDescriptor(rtd.ptr) - -Base.unsafe_convert(::Type{cudnnReduceTensorDescriptor_t}, rtd::ReduceTensorDesc) = rtd.ptr - -function ReduceTensorDesc(op::cudnnReduceTensorOp_t, T::Type; - reduceTensorNanOpt=CUDNN_NOT_PROPAGATE_NAN, - reduceTensorIndices=CUDNN_REDUCE_TENSOR_NO_INDICES, - reduceTensorIndicesType=CUDNN_32BIT_INDICES) - rtd = Ref{cudnnReduceTensorDescriptor_t}() - cudnnCreateReduceTensorDescriptor(rtd) - cudnnSetReduceTensorDescriptor(rtd[], op, cudnnDataType(T), reduceTensorNanOpt, - reduceTensorIndices, reduceTensorIndicesType) - this = ReduceTensorDesc(rtd[]) - finalizer(unsafe_free!, this) - return this +function FilterDesc(ptr::cudnnFilterDescriptor_t) + @warn "FilterDesc is deprecated, use cudnnFilterDescriptor instead." maxlog=1 + cudnnFilterDescriptor(ptr) end -ReduceTensorDesc(op::cudnnReduceTensorOp_t, a::DenseCuArray) = ReduceTensorDesc(op, eltype(a)) - -# wrappers - -function cudnnGetReductionIndicesSize(op::cudnnReduceTensorOp_t, - A::DenseCuArray{T,N}, C::DenseCuArray{T,N}) where {T,N} - size=@argout( - cudnnGetReductionIndicesSize( - handle(), ReduceTensorDesc(op, A), - TensorDesc(A), TensorDesc(C), - out(Ref{Csize_t}())) - )[] - return size +function FilterDesc(a::DenseCuArray; format = CUDNN_TENSOR_NCHW) + @warn "FilterDesc is deprecated, use cudnnFilterDescriptor instead." maxlog=1 + cudnnFilterDescriptor(a; format) end -function cudnnReduceTensor(op::cudnnReduceTensorOp_t, - A::DenseCuArray{T,N}, C::DenseCuArray{T,N}; - alpha=true, beta=false) where {T,N} - # indices = Array{UInt64, 1}(undef, N) - indicesSizeInBytes = cudnnGetReductionIndicesSize(op, A, C) - @workspace size=@argout( - cudnnGetReductionWorkspaceSize( - handle(), ReduceTensorDesc(op, A), - TensorDesc(A), TensorDesc(C), - out(Ref{Csize_t}())) - )[] workspace->begin - cudnnReduceTensor(handle(), ReduceTensorDesc(op, A), - C_NULL, indicesSizeInBytes, - workspace, sizeof(workspace), - scalingParameter(T, alpha), TensorDesc(A), A, - scalingParameter(T, beta ), TensorDesc(C), C) - end - return C +function FilterDesc(T::Type, size::Tuple; format = CUDNN_TENSOR_NCHW) + @warn "FilterDesc is deprecated, use cudnnFilterDescriptor instead." maxlog=1 + dims = dim4(size, Val(format)) + cudnnFilterDescriptor(cudnnDataType(T), format, Cint(length(dims)), dims) end diff --git a/lib/cudnn/util.jl b/lib/cudnn/util.jl index aab1a3e3a1..008d75001a 100644 --- a/lib/cudnn/util.jl +++ b/lib/cudnn/util.jl @@ -7,9 +7,19 @@ cptr(x,a::DenseCuArray{Float16})=Float32[x] cudnnDataType(::Type{Float16})=CUDNN_DATA_HALF cudnnDataType(::Type{Float32})=CUDNN_DATA_FLOAT cudnnDataType(::Type{Float64})=CUDNN_DATA_DOUBLE +cudnnDataType(::Type{Int8}) = CUDNN_DATA_INT8 +cudnnDataType(::Type{UInt8}) = CUDNN_DATA_UINT8 +cudnnDataType(::Type{Int32}) = CUDNN_DATA_INT32 +# The following are 32-bit elements each composed of 4 8-bit integers, only supported with CUDNN_TENSOR_NCHW_VECT_C +# CUDNN_DATA_INT8x4, +# CUDNN_DATA_UINT8x4, +# CUDNN_DATA_INT8x32, juliaDataType(a)=(a==CUDNN_DATA_HALF ? Float16 : a==CUDNN_DATA_FLOAT ? Float32 : - a==CUDNN_DATA_DOUBLE ? Float64 : error()) + a==CUDNN_DATA_DOUBLE ? Float64 : + a==CUDNN_DATA_INT8 ? Int8 : + a==CUDNN_DATA_UINT8 ? UInt8 : + a==CUDNN_DATA_INT32 ? 
+                  a==CUDNN_DATA_INT32 ? Int32 : error())
 
 tuple_strides(A::Tuple) = _strides((1,), A)
 _strides(out::Tuple{Int}, A::Tuple{}) = ()
@@ -26,3 +36,18 @@ scalingParameter(T, val) = error("Unknown tensor type $T")
 scalingParameter(::Type{Float16}, val) = Ref{Float32}(val)
 scalingParameter(::Type{Float32}, val) = Ref{Float32}(val)
 scalingParameter(::Type{Float64}, val) = Ref{Float64}(val)
+
+
+# Create temporary reserveSpace. Use 128 to avoid alignment issues.
+function cudnnTempSpace(nbytes)
+    nbytes == 0 ? nothing : CuArray{Int128}(undef, (nbytes-1)÷sizeof(Int128)+1)
+end
+
+
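+# cuDNN only takes a single padding value per spatial dimension, while NNlib
+# tracks (lo,hi) pairs; keep the lo entries and warn if any pair is asymmetric.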
+function nnlibPadding(dims)
+    pd = NNlib.padding(dims)
+    if !all(pd[1:2:end] .== pd[2:2:end])
+        @warn "cuDNN does not support asymmetric padding; defaulting to symmetric choice" maxlog=1
+    end
+    return pd[1:2:end]
+end
diff --git a/test/cudnn.jl b/test/cudnn.jl
deleted file mode 100644
index 6e61831d6d..0000000000
--- a/test/cudnn.jl
+++ /dev/null
@@ -1,111 +0,0 @@
-using CUDA.CUDNN
-
-@test has_cudnn()
-@test CUDNN.version() isa VersionNumber
-
-@testset "NNlib" begin
-    using NNlib
-    using NNlib: ∇conv_data, ∇conv_filter,
-                 maxpool, meanpool, ∇maxpool, ∇meanpool,
-                 softmax, ∇softmax, logsoftmax, ∇logsoftmax
-    a, b, c = rand(Float64, 10, 10, 3, 1), rand(Float64, 2, 2, 3, 4), rand(Float64, 9, 9, 4, 1)
-    da, db, dc = CuArray(a), CuArray(b), CuArray(c)
-    cdims = DenseConvDims(a, b)
-    @test NNlib.conv(a, b, cdims) ≈ collect(NNlib.conv(da, db, cdims))
-    @test ∇conv_data(c, b, cdims) ≈ collect(∇conv_data(dc, db, cdims))
-    @test ∇conv_filter(a, c, cdims) ≈ collect(∇conv_filter(da, dc, cdims))
-
-    # Test for agreement between CPU NNlib and CuDNN versions, across a variety of kwargs
-    for num_spatial_dims in (1, 2, 3)
-        # Initialize data we'll run our tests over
-        C_in = 3
-        C_out = 4
-        batch_size = 1
-        x = rand(Float64, fill(8, num_spatial_dims)..., C_in, batch_size)
-        w = rand(Float64, fill(2, num_spatial_dims)..., C_in, C_out)
-        b = rand(Float64, fill(1, num_spatial_dims)..., C_in, C_out)
-        options = (Dict(), Dict(:dilation => 2), Dict(:flipkernel => true), Dict(:stride => 2),)
-        algos = (1, 0, 1, 1,)
-
-        for (opts, algo) in zip(options, algos)
-            cdims = DenseConvDims(x, w; opts...)
-            y = NNlib.conv(x, w, cdims)
-
-            # Test that basic convolution is equivalent across GPU/CPU
-            @test testf((x, w) -> NNlib.conv(x, w, cdims), x, w)
-            @test testf((y, w) -> ∇conv_data(y, w, cdims), y, w)
-            @test testf((x, y) -> ∇conv_filter(x, y, cdims), x, y)
-
-            # Test that we can use an alternative algorithm without dying
-            @test_nowarn NNlib.conv!(CuArray{Float32}(y), CuArray{Float32}(x), CuArray{Float32}(w), cdims; algo=algo)
-            @test_nowarn NNlib.∇conv_data!(CuArray{Float32}(x), CuArray{Float32}(y), CuArray{Float32}(w), cdims; algo=algo)
-            @test_nowarn NNlib.∇conv_filter!(CuArray{Float32}(w), CuArray{Float32}(x), CuArray{Float32}(y), cdims; algo=algo)
-        end
-
-        # Test that pooling is equivalent across GPU/CPU
-        pdims = PoolDims(x, 2)
-        y = maxpool(x, pdims)
-        dy = ones(size(y))
-        @test testf(x -> maxpool(x, pdims), x)
-        @test testf((dy, y, x) -> ∇maxpool(dy, y, x, pdims), dy, y, x)
-        @test testf(x -> maxpool(x, pdims), x)
-        @test testf((dy, y, x) -> ∇maxpool(dy, y, x, pdims), dy, y, x)
-
-        # CPU implementation of ∇conv_bias!
-        db = zeros(Float64, 1, 1, 3, 1)
-        function CUDNN.∇conv_bias!(db, y)
-            db .= sum(y, dims=(1:(ndims(y)-2)))
-            return db
-        end
-        #@test testf(CUDNN.∇conv_bias!, db, y)
-    end
-
-    for dims in [(5,5), (5,)]
-        @test testf(softmax, rand(Float64, dims))
-        @test testf(∇softmax, rand(Float64, dims), rand(Float64, dims))
-        @test testf(logsoftmax, rand(Float64, dims))
-        @test testf(∇logsoftmax, rand(Float64, dims), rand(Float64, dims))
-    end
-end
-
-@testset "Activations and Other Ops" begin
-    @test testf(CUDNN.cudnnAddTensor, CUDA.rand(Float32, 10, 10, 3, 1), CUDA.rand(Float32, 10, 10, 3, 1))
-    @test testf(CUDNN.cudnnActivationForward, CUDA.rand(Float32, 10, 10, 3, 1), CUDA.rand(Float32, 10, 10, 3, 1))
-    @test testf(CUDNN.cudnnActivationBackward, CUDA.rand(Float32, 10, 10, 3, 1), CUDA.rand(Float32, 10, 10, 3, 1), CUDA.rand(Float32, 10, 10, 3, 1), CUDA.rand(Float32, 10, 10, 3, 1))
-
-    # activations defined in src/nnlib.jl
-    ACTIVATION_FUNCTIONS = [σ, logσ, hardσ, hardtanh, relu, leakyrelu, relu6, rrelu,
-                            elu, gelu, celu, swish, lisht, selu, trelu, softplus,
-                            softsign, logcosh, mish, tanhshrink, softshrink];
-    for dims in ((5,5), (5,))
-        for f in filter(x -> x != rrelu, ACTIVATION_FUNCTIONS)
-            @test testf(x -> f.(x), rand(Float64, dims))
-        end
-    end
-
-    # softplus does not give `Inf` for large arguments
-    x = CuArray([1000.])
-    @test all(softplus.(x) .== x)
-
-    # optimized activation overwrote inputs
-    let
-        x = CUDA.ones(1)
-        @test Array(x) == [1f0]
-        tanh.(x)
-        @test Array(x) == [1f0]
-        y = tanh.(x)
-        @test Array(x) == [1f0]
-        @test Array(y) == [tanh(1f0)]
-        x .= tanh.(y)
-        @test Array(y) == [tanh(1f0)]
-        @test Array(x) == [tanh(tanh(1f0))]
-    end
-end
-
-@testset "Batchnorm" begin
-    v = CUDA.rand(Float32, 2)
-    m = CUDA.rand(Float32, 2, 5)
-    for training in (false, true)
-        CUDNN.batchnorm(v, v, m, v, v, 1.0; training=training)
-    end
-end
diff --git a/test/cudnn/activation.jl b/test/cudnn/activation.jl
new file mode 100644
index 0000000000..e3c30e8873
--- /dev/null
+++ b/test/cudnn/activation.jl
@@ -0,0 +1,66 @@
+using CUDA, Test, Random
+using CUDA.CUDNN:
+    cudnnActivationForward,
+    cudnnActivationForward!,
+    cudnnActivationBackward,
+    cudnnActivationDescriptor,
+    cudnnActivationDescriptor_t,
+    cudnnCreateActivationDescriptor,
+    cudnnSetActivationDescriptor,
+    cudnnGetActivationDescriptor,
+    cudnnDestroyActivationDescriptor,
+    cudnnActivationMode_t,
+        CUDNN_ACTIVATION_SIGMOID,      # 0
+        CUDNN_ACTIVATION_RELU,         # 1
+        CUDNN_ACTIVATION_TANH,         # 2
+        CUDNN_ACTIVATION_CLIPPED_RELU, # 3
+        CUDNN_ACTIVATION_ELU,          # 4
+        CUDNN_ACTIVATION_IDENTITY,     # 5
+    cudnnNanPropagation_t,
+        CUDNN_NOT_PROPAGATE_NAN, # 0
+        CUDNN_PROPAGATE_NAN,     # 1
+    handle
+
+
+@testset "cudnn/activation" begin
+
+    @test cudnnActivationDescriptor(C_NULL) isa cudnnActivationDescriptor
+    @test Base.unsafe_convert(Ptr, cudnnActivationDescriptor(C_NULL)) isa Ptr
+    @test cudnnActivationDescriptor(CUDNN_ACTIVATION_RELU,CUDNN_NOT_PROPAGATE_NAN,0) isa cudnnActivationDescriptor
+
+    (ax,ay) = randn.((10,10))
+    (cx,cy) = CuArray.((ax,ay))
+
+    function activationtest(
+        ;mode=CUDNN_ACTIVATION_SIGMOID,
+        nanOpt=CUDNN_NOT_PROPAGATE_NAN,
+        coef=1,
+        alpha=1,
+        beta=0,
+        )
+        fx = (mode === CUDNN_ACTIVATION_SIGMOID ? 1 ./ (1 .+ exp.(-ax)) :
+              mode === CUDNN_ACTIVATION_RELU ? max.(0,ax) :
+              mode === CUDNN_ACTIVATION_TANH ? tanh.(ax) :
+              mode === CUDNN_ACTIVATION_CLIPPED_RELU ? clamp.(ax,0,coef) :
+              mode === CUDNN_ACTIVATION_ELU ? (x->(x >= 0 ? x : coef*(exp(x)-1))).(ax) :
+              error("Unknown activation"))
+        d = cudnnActivationDescriptor(mode,nanOpt,Cfloat(coef))
+        y0 = alpha * fx
+        y1 = y0 .+ beta * ay
+        ((y0 ≈ cudnnActivationForward(cx; mode, nanOpt, coef, alpha) |> Array) &&
+         (y0 ≈ cudnnActivationForward(cx, d; alpha) |> Array) &&
+         (y1 ≈ cudnnActivationForward!(copy(cy), cx; mode, nanOpt, coef, alpha, beta) |> Array) &&
+         (y1 ≈ cudnnActivationForward!(copy(cy), cx, d; alpha, beta) |> Array))
+    end
+
+    @test activationtest(mode=CUDNN_ACTIVATION_SIGMOID)
+    @test activationtest(mode=CUDNN_ACTIVATION_RELU)
+    @test activationtest(mode=CUDNN_ACTIVATION_TANH)
+    @test activationtest(mode=CUDNN_ACTIVATION_CLIPPED_RELU)
+    @test activationtest(mode=CUDNN_ACTIVATION_ELU)
+    @test activationtest(nanOpt=CUDNN_PROPAGATE_NAN)
+    @test activationtest(coef=2,mode=CUDNN_ACTIVATION_CLIPPED_RELU)
+    @test activationtest(coef=2,mode=CUDNN_ACTIVATION_ELU)
+    @test activationtest(alpha=2)
+    @test activationtest(beta=2)
+end
diff --git a/test/cudnn/convolution.jl b/test/cudnn/convolution.jl
new file mode 100644
index 0000000000..401da95122
--- /dev/null
+++ b/test/cudnn/convolution.jl
@@ -0,0 +1,184 @@
+using Test, CUDA, Random
+import NNlib
+using CUDA.CUDNN:
+    cudnnConvolutionForward,
+    cudnnConvolutionForward!,
+    cudnnConvolutionBackwardFilter,
+    cudnnConvolutionBackwardData,
+    cudnnGetConvolutionNdForwardOutputDim,
+    cudnnSetConvolutionMathType,
+    cudnnSetConvolutionReorderType,
+    cudnnSetConvolutionGroupCount,
+    cudnnFindConvolutionForwardAlgorithmEx,
+        cudnnConvolutionFwdAlgoPerf_t,
+    cudnnFindConvolutionBackwardFilterAlgorithmEx,
+        cudnnConvolutionBwdFilterAlgoPerf_t,
+    cudnnFindConvolutionBackwardDataAlgorithmEx,
+        cudnnConvolutionBwdDataAlgoPerf_t,
+    cudnnConvolutionDescriptor,
+        cudnnConvolutionDescriptor_t,
+        cudnnCreateConvolutionDescriptor,
+        cudnnSetConvolutionNdDescriptor,
+        cudnnDestroyConvolutionDescriptor,
+    cudnnConvolutionMode_t,
+        CUDNN_CONVOLUTION,       # 0
+        CUDNN_CROSS_CORRELATION, # 1
+    cudnnActivationMode_t,
+        CUDNN_ACTIVATION_SIGMOID,      # 0
+        CUDNN_ACTIVATION_RELU,         # 1
+        CUDNN_ACTIVATION_TANH,         # 2
+        CUDNN_ACTIVATION_CLIPPED_RELU, # 3
+        CUDNN_ACTIVATION_ELU,          # 4
+        CUDNN_ACTIVATION_IDENTITY,     # 5
+    cudnnNanPropagation_t,
+        CUDNN_NOT_PROPAGATE_NAN, # 0
+        CUDNN_PROPAGATE_NAN,     # 1
+    cudnnMathType_t,
+        CUDNN_DEFAULT_MATH,                    # 0
+        CUDNN_TENSOR_OP_MATH,                  # 1
+        CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION, # 2
+        CUDNN_FMA_MATH,                        # 3
+    cudnnReorderType_t,
+        CUDNN_DEFAULT_REORDER, # 0
+        CUDNN_NO_REORDER,      # 1
+    cudnnConvolutionFwdAlgo_t,
+        CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,         # 0
+        CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, # 1
+        CUDNN_CONVOLUTION_FWD_ALGO_GEMM,                  # 2
+        CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,                # 3
+        CUDNN_CONVOLUTION_FWD_ALGO_FFT,                   # 4
+        CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,            # 5
+        CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,              # 6
+        CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED,     # 7
+        CUDNN_CONVOLUTION_FWD_ALGO_COUNT,                 # 8
+    cudnnConvolutionBwdFilterAlgo_t,
+        CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0,                 # 0, /* non-deterministic */
+        CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1,                 # 1,
+        CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT,               # 2,
+        CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3,                 # 3, /* non-deterministic */
+        CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD,          # 4, /* not implemented */
+        CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED, # 5,
+        CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING,        # 6,
+        CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT,             # 7
+    cudnnConvolutionBwdDataAlgo_t,
+        CUDNN_CONVOLUTION_BWD_DATA_ALGO_0,                 # 0, /* non-deterministic */
+        CUDNN_CONVOLUTION_BWD_DATA_ALGO_1,                 # 1,
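+        # az0 stands in for the optional residual input z and ay0 for the initial
+        # contents of the output buffer, so the alpha/beta blending below can be
+        # checked against a reference computed in plain Julia.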
+        cy0, cy1 = CuArray.((ay0,ay0))
+        if blendz
+            cz0 = cz1 = CuArray(az0)
+            ay2 = act.(ay .+ beta * az0)
+        else
+            cz0, cz1 = cy0, cy1
+            ay2 = act.(ay .+ beta * ay0)
+        end
+        d = cudnnConvolutionDescriptor(convdims(padding,size(ax)), convdims(stride,size(ax)), convdims(dilation,size(ax)), mode, cudnnDataType(dataType), mathType, reorderType, Cint(group))
+        ((ay1 ≈ cudnnConvolutionForward(cw0, cx; bias, activation, mode, padding, stride, dilation, group, mathType, reorderType, alpha) |> Array) &&
+         (ay1 ≈ cudnnConvolutionForward(cw0, cx, d; bias, activation, alpha) |> Array) &&
+         (ay2 ≈ cudnnConvolutionForward!(cy0, cw0, cx; z=cz0, bias, activation, mode, padding, stride, dilation, group, mathType, reorderType, alpha, beta) |> Array) &&
+         (ay2 ≈ cudnnConvolutionForward!(cy1, cw0, cx, d; z=cz1, bias, activation, alpha, beta) |> Array))
+    end
+
+    # These call cudnnConvolutionForward
+    @test convtest()
+    @test convtest(padding=1)
+    @test convtest(stride=2)
+    @test convtest(dilation=2)
+    @test convtest(group=2) # See https://blog.yani.ai/filter-group-tutorial/
+    @test convtest(mathType=CUDNN_DEFAULT_MATH)
+    @test convtest(mathType=CUDNN_TENSOR_OP_MATH)
+    @test convtest(mathType=CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION)
+    @test convtest(reorderType=CUDNN_NO_REORDER)
+    @test convtest(alpha=2)
+    @test convtest(beta=2)
+
+    # These call cudnnConvolutionBiasActivationForward
+    @test convtest(bias=cb)
+    @test convtest(blendz=true)
+    @test convtest(activation=CUDNN_ACTIVATION_RELU)
+    @test convtest(bias=cb,blendz=true)
+    @test convtest(bias=cb,activation=CUDNN_ACTIVATION_RELU)
+    @test convtest(bias=cb,padding=1)
+    @test convtest(bias=cb,stride=2)
+    @test convtest(bias=cb,dilation=2)
+    @test convtest(bias=cb,group=2)
+    @test convtest(bias=cb,mathType=CUDNN_DEFAULT_MATH)
+    @test convtest(bias=cb,mathType=CUDNN_TENSOR_OP_MATH)
+    @test convtest(bias=cb,mathType=CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION)
+    @test convtest(bias=cb,reorderType=CUDNN_NO_REORDER)
+    @test convtest(bias=cb,alpha=2)
+    @test convtest(bias=cb,beta=2)
+    @test convtest(bias=cb,beta=2,blendz=true)
+
+    # Test tensor format
+    cx2,cw2,cb2 = (x->permutedims(x,(3,1,2,4))).((cx,cw,cb))
+    whcn = cudnnConvolutionForward(cw,cx)
+    cwhn = cudnnConvolutionForward(cw2,cx2,format=CUDNN_TENSOR_NHWC)
+    @test cwhn ≈ permutedims(whcn,(3,1,2,4))
+    whcn = cudnnConvolutionForward(cw,cx;bias=cb)
+    cwhn = cudnnConvolutionForward(cw2,cx2;bias=cb2,format=CUDNN_TENSOR_NHWC)
+    @test cwhn ≈ permutedims(whcn,(3,1,2,4))
+end
diff --git a/test/cudnn/dropout.jl b/test/cudnn/dropout.jl
new file mode 100644
index 0000000000..16e1c8a037
--- /dev/null
+++ b/test/cudnn/dropout.jl
@@ -0,0 +1,35 @@
+using Test, CUDA, Statistics
+using CUDA.CUDNN:
+    cudnnDropoutForward,
+    cudnnDropoutForward!,
+    cudnnDropoutBackward,
+    cudnnDropoutSeed,
+    cudnnDropoutDescriptor,
+        cudnnDropoutDescriptor_t,
+        cudnnCreateDropoutDescriptor,
+        cudnnSetDropoutDescriptor,
+        cudnnGetDropoutDescriptor,
+        cudnnRestoreDropoutDescriptor,
+        cudnnDestroyDropoutDescriptor,
+    cudnnDropoutGetStatesSize,
+    cudnnDropoutGetReserveSpaceSize,
+    handle
+
+@testset "cudnn/dropout" begin
+
+    @test cudnnDropoutDescriptor(C_NULL) isa cudnnDropoutDescriptor
+    @test Base.unsafe_convert(Ptr, cudnnDropoutDescriptor(C_NULL)) isa Ptr
+    @test cudnnDropoutDescriptor(0.5) isa cudnnDropoutDescriptor
+
+    N,P = 1000, 0.7
+    x = CUDA.rand(N)
+    d = cudnnDropoutDescriptor(P)
+    cudnnDropoutSeed[] = 1
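+    # cudnnDropoutSeed is a global Ref: giving it a non-negative value makes the
+    # next forward call reseed the RNG so the sampled mask is reproducible (at
+    # some cost); it is reset to -1 below to restore random seeding.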
+    y = cudnnDropoutForward(x; dropout = P)
+    @test isapprox(mean(Array(y).==0), P; atol = 3/sqrt(N))
+    @test y == cudnnDropoutForward(x, d)
+    @test y == cudnnDropoutForward!(similar(x), x; dropout = P)
+    @test y == cudnnDropoutForward!(similar(x), x, d)
+    cudnnDropoutSeed[] = -1
+
+end
diff --git a/test/cudnn/inplace.jl b/test/cudnn/inplace.jl
new file mode 100644
index 0000000000..ab065c1067
--- /dev/null
+++ b/test/cudnn/inplace.jl
@@ -0,0 +1,32 @@
+using Test, CUDA, Random
+import CUDA.CUDNN:
+    cudnnSetTensor!,
+    cudnnScaleTensor!,
+    cudnnScaleTensor,
+    cudnnAddTensor!,
+    cudnnAddTensor,
+    CUDNN_TENSOR_NHWC
+
+
+@testset "cudnn/inplace" begin
+    x = CUDA.rand(10)
+    cudnnSetTensor!(x, 7)
+    @test all(isequal(7), Array(x))
+    ax = rand(10)
+    cx = CuArray(ax)
+    @test 7*ax ≈ cudnnScaleTensor(cx, 7) |> Array
+    @test 7*ax ≈ cudnnScaleTensor!(similar(cx), cx, 7) |> Array
+    ax,ab = rand(5,4,3,2),rand(1,1,3,1)
+    cx,cb = CuArray.((ax,ab))
+    @test ax .+ ab ≈ cudnnAddTensor(cx, cb) |> Array
+    @test ax .+ 7*ab ≈ cudnnAddTensor(cx, cb, alpha=7) |> Array
+    @test 7*ax .+ ab ≈ cudnnAddTensor(cx, cb, beta=7) |> Array
+    @test ax .+ ab ≈ cudnnAddTensor!(similar(cx), cx, cb) |> Array
+    @test ax .+ 7*ab ≈ cudnnAddTensor!(similar(cx), cx, cb, alpha=7) |> Array
+    @test 7*ax .+ ab ≈ cudnnAddTensor!(similar(cx), cx, cb, beta=7) |> Array
+    @test ax .+ ab ≈ cudnnAddTensor!(cx, cx, cb) |> Array
+    @test ax .+ ab ≈ cx |> Array
+    ax,ab = rand(3,5,4,2),rand(3,1,1,1)
+    cx,cb = CuArray.((ax,ab))
+    @test ax .+ ab ≈ cudnnAddTensor(cx, cb, format=CUDNN_TENSOR_NHWC) |> Array
+end
diff --git a/test/cudnn/multiheadattn.jl b/test/cudnn/multiheadattn.jl
new file mode 100644
index 0000000000..d4f8a75e10
--- /dev/null
+++ b/test/cudnn/multiheadattn.jl
@@ -0,0 +1,164 @@
+using Test, Random, CUDA
+
+using CUDA.CUDNN:
+    cudnnMultiHeadAttnForward,
+    cudnnMultiHeadAttnForward!,
+    cudnnMultiHeadAttnBackwardData,
+    cudnnMultiHeadAttnBackwardWeights,
+    cudnnGetMultiHeadAttnBuffers,
+    cudnnGetMultiHeadAttnWeights,
+    cudnnAttnDescriptor,
+        cudnnAttnDescriptor_t,
+        cudnnCreateAttnDescriptor,
+        cudnnDestroyAttnDescriptor,
+        cudnnSetAttnDescriptor,
+        cudnnGetAttnDescriptor,
+    cudnnDataType_t,
+    cudnnDropoutDescriptor_t,
+    cudnnAttnQueryMap_t,
+        CUDNN_ATTN_QUERYMAP_ALL_TO_ONE, # 0         /* multiple Q-s map to a single (K,V) set when beam size > 1, beam sizes for (K,V) = 1 */
+        CUDNN_ATTN_QUERYMAP_ONE_TO_ONE, # (1U << 0) /* multiple Q-s map to multiple (K,V) sets when beam size > 1, beam sizes for (K,V) = beam size for (Q) */
+        CUDNN_ATTN_DISABLE_PROJ_BIASES, # 0         /* no biases in attention input and output projections */
+        CUDNN_ATTN_ENABLE_PROJ_BIASES,  # (1U << 1) /* use biases in attention input and output projections */
+    cudnnMultiHeadAttnWeightKind_t,
+        CUDNN_MH_ATTN_Q_WEIGHTS, # 0, /* input projection weights for 'queries' */
+        CUDNN_MH_ATTN_K_WEIGHTS, # 1, /* input projection weights for 'keys' */
+        CUDNN_MH_ATTN_V_WEIGHTS, # 2, /* input projection weights for 'values' */
+        CUDNN_MH_ATTN_O_WEIGHTS, # 3, /* output projection weights */
+        CUDNN_MH_ATTN_Q_BIASES,  # 4, /* input projection bias tensor for 'queries' */
+        CUDNN_MH_ATTN_K_BIASES,  # 5, /* input projection bias for 'keys' */
+        CUDNN_MH_ATTN_V_BIASES,  # 6, /* input projection bias for 'values' */
+        CUDNN_MH_ATTN_O_BIASES,  # 7, /* output projection biases */
+    cudnnMathType_t,
+        CUDNN_DEFAULT_MATH,                    # 0,
+        CUDNN_TENSOR_OP_MATH,                  # 1,
+        CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION, # 2,
+        CUDNN_FMA_MATH,                        # 3,
+    cudnnWgradMode_t,
+        CUDNN_WGRAD_MODE_ADD, # 0,
+        CUDNN_WGRAD_MODE_SET, # 1,
+    cudnnSeqDataDescriptor,
+        cudnnSeqDataDescriptor_t,
+        cudnnCreateSeqDataDescriptor,
+        cudnnDestroySeqDataDescriptor,
+        cudnnSetSeqDataDescriptor,
+        cudnnGetSeqDataDescriptor,
+    cudnnSeqDataAxis_t,
+        CUDNN_SEQDATA_TIME_DIM,  # 0, /* index in time */
+        CUDNN_SEQDATA_BATCH_DIM, # 1, /* index in batch */
+        CUDNN_SEQDATA_BEAM_DIM,  # 2, /* index in beam */
+        CUDNN_SEQDATA_VECT_DIM,  # 3  /* index in vector */
+        CUDNN_SEQDATA_DIM_COUNT, # 4
+    cudnnDataType,
+    cudnnSeqDataDefaultAxes,
+    math_mode,
+    sdim,
+    handle
+
+
+@testset "cudnn/multiheadattn" begin
+
+    function mhatest(
+        # Input tensor descriptors
+        ;axes::Vector{cudnnSeqDataAxis_t} = cudnnSeqDataDefaultAxes,
+        seqLengthsQO::Vector{<:Integer} = fill(Cint(sdim(queries,axes,CUDNN_SEQDATA_TIME_DIM)), sdim(queries,axes,CUDNN_SEQDATA_BATCH_DIM)*sdim(queries,axes,CUDNN_SEQDATA_BEAM_DIM)),
+        seqLengthsKV::Vector{<:Integer} = fill(Cint(sdim(keys,axes,CUDNN_SEQDATA_TIME_DIM)), sdim(keys,axes,CUDNN_SEQDATA_BATCH_DIM)*sdim(keys,axes,CUDNN_SEQDATA_BEAM_DIM)),
+        #devSeqLengthsQO::CuVector{Cint} = convert(CuVector{Cint}, seqLengthsQO),
+        #devSeqLengthsKV::CuVector{Cint} = convert(CuVector{Cint}, seqLengthsKV),
+        #qDesc::cudnnSeqDataDescriptor = cudnnSeqDataDescriptor(queries; axes, seqLengthArray=seqLengthsQO),
+        #kDesc::cudnnSeqDataDescriptor = cudnnSeqDataDescriptor(keys; axes, seqLengthArray=seqLengthsKV),
+        #vDesc::cudnnSeqDataDescriptor = cudnnSeqDataDescriptor(values; axes, seqLengthArray=seqLengthsKV),
+
+        # attnDesc parameters
+        attnMode::Unsigned = CUDNN_ATTN_QUERYMAP_ALL_TO_ONE | CUDNN_ATTN_DISABLE_PROJ_BIASES |> Cuint,
+        nHeads::Integer = Cint(1),
+        smScaler::Real = Cdouble(1),
+        # dataType::DataType = eltype(queries),
+        # computePrec::DataType = eltype(queries),  ## No other option according to 8.0.2
+        mathType::cudnnMathType_t = math_mode(),
+        # attnDropout::Real = 0,  ## The dropout option is currently not supported by the multi-head attention API
+        # postDropout::Real = 0,  ## The dropout option is currently not supported by the multi-head attention API
+        qProjSize::Integer = 0,  # Use zero to disable the corresponding projection
+        kProjSize::Integer = 0,
+        vProjSize::Integer = 0,
+        oProjSize::Integer = 0,
+        qoMaxSeqLength::Integer = sdim(queries,axes,CUDNN_SEQDATA_TIME_DIM),
+        kvMaxSeqLength::Integer = sdim(keys,axes,CUDNN_SEQDATA_TIME_DIM),
+        maxBatchSize::Integer = sdim(queries,axes,CUDNN_SEQDATA_BATCH_DIM),
+        maxBeamSize::Integer = sdim(queries,axes,CUDNN_SEQDATA_BEAM_DIM),
+
+        # forw parameters
+        residuals = nothing,
+        currIdx::Integer = -1,
+        loWinIdx::Array{Cint} = fill(Cint(0), qoMaxSeqLength),
+        hiWinIdx::Array{Cint} = fill(Cint(kvMaxSeqLength), qoMaxSeqLength),
+        #workspace::Union{CuArray,Nothing} = nothing,
+        #reserveSpace::Union{CuArray,Nothing} = nothing,
+        )
+        attnDesc::cudnnAttnDescriptor = cudnnAttnDescriptor(
+            Cuint(attnMode),
+            Cint(nHeads),
+            Cdouble(smScaler),
+            cudnnDataType(eltype(queries)), # dataType
+            cudnnDataType(eltype(queries)), # computePrec
+            mathType,
+            C_NULL, # attnDropout
+            C_NULL, # postDropout
+            Cint(sdim(queries,axes,CUDNN_SEQDATA_VECT_DIM)), # qSize
+            Cint(sdim(keys,   axes,CUDNN_SEQDATA_VECT_DIM)), # kSize
+            Cint(sdim(values, axes,CUDNN_SEQDATA_VECT_DIM)), # vSize
+            Cint(qProjSize),
+            Cint(kProjSize),
+            Cint(vProjSize),
+            Cint(oProjSize),
+            Cint(qoMaxSeqLength),
+            Cint(kvMaxSeqLength),
+            Cint(maxBatchSize),
+            Cint(maxBeamSize)
+        )
+        y = cudnnMultiHeadAttnForward(weights, queries, keys, values; axes, seqLengthsQO, seqLengthsKV, attnMode, nHeads, smScaler, mathType, qProjSize, kProjSize, vProjSize, oProjSize, qoMaxSeqLength, kvMaxSeqLength, maxBatchSize, maxBeamSize, residuals, currIdx, loWinIdx, hiWinIdx)
+        (y ≈ cudnnMultiHeadAttnForward!(zero(y), weights, queries, keys, values; axes, seqLengthsQO, seqLengthsKV, attnMode, nHeads, smScaler, mathType, qProjSize, kProjSize, vProjSize, oProjSize, qoMaxSeqLength, kvMaxSeqLength, maxBatchSize, maxBeamSize, residuals, currIdx, loWinIdx, hiWinIdx) &&
+         y ≈ cudnnMultiHeadAttnForward(weights, queries, keys, values, attnDesc; axes, seqLengthsQO, seqLengthsKV, residuals, currIdx, loWinIdx, hiWinIdx) &&
+         y ≈ cudnnMultiHeadAttnForward!(zero(y), weights, queries, keys, values, attnDesc; axes, seqLengthsQO, seqLengthsKV, residuals, currIdx, loWinIdx, hiWinIdx))
+    end
+
+    Q,K,V,B,T,F = 6,6,5,4,3,Float32
+
+    weights, queries, keys, values = (CUDA.randn(x...) for x in ((F,100),(F,Q,B,T),(F,K,B,T),(F,V,B,T)))
+    @test mhatest()
+    @test mhatest(attnMode = CUDNN_ATTN_QUERYMAP_ALL_TO_ONE | CUDNN_ATTN_ENABLE_PROJ_BIASES |> Cuint, vProjSize=7)
+    @test mhatest(seqLengthsQO = Cint[1,2,3,1])
+    @test mhatest(seqLengthsKV = Cint[1,2,3,1])
+    @test mhatest(nHeads = 2)
+    @test mhatest(smScaler = 2)
+    @test mhatest(mathType = CUDNN_DEFAULT_MATH)
+    @test mhatest(mathType = CUDNN_TENSOR_OP_MATH)
+    @test mhatest(mathType = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION)
+    @test mhatest(mathType = CUDNN_FMA_MATH)
+    @test mhatest(kProjSize = 7, qProjSize = 7) # k and q have to match
+    @test mhatest(vProjSize = 7)
+    @test mhatest(oProjSize = 7)
+    @test mhatest(qoMaxSeqLength = 7)
+    @test mhatest(kvMaxSeqLength = 7)
+    @test mhatest(maxBatchSize = 7)
+    @test mhatest(maxBeamSize = 7)
+    @test mhatest(loWinIdx = fill(Cint(1),T))
+    @test mhatest(hiWinIdx = fill(Cint(1),T))
+    @test mhatest(currIdx = 0)
+
+    # Test residuals: residuals and output (and thus values unless oProjSize>0) must match queries in vector size
+    values, residuals = (CUDA.randn(x...) for x in ((F,Q,B,T),(F,Q,B,T)))
+    @test mhatest(residuals = residuals)
+
+    # Test nonstandard axes order
+    weights, queries, keys, values = (CUDA.randn(x...) for x in ((F,100),(F,Q,T,B),(F,K,T,B),(F,V,T,B)))
+    @test mhatest(axes = [CUDNN_SEQDATA_VECT_DIM, CUDNN_SEQDATA_TIME_DIM, CUDNN_SEQDATA_BATCH_DIM, CUDNN_SEQDATA_BEAM_DIM])
+
+    # Test beam handling
+    weights, queries, keys, values = (CUDA.randn(x...) for x in ((F,100),(F,Q,B,T,2),(F,K,B,T,1),(F,V,B,T,1)))
+    @test mhatest()
+    # CUDNN_ATTN_QUERYMAP_ONE_TO_ONE does not seem to be supported
+    # weights, queries, keys, values = (CUDA.randn(x...) for x in ((F,100),(F,Q,B,T,M),(F,K,B,T,M),(F,V,B,T,M)))
+    # @test mhatest(attnMode = CUDNN_ATTN_QUERYMAP_ONE_TO_ONE | CUDNN_ATTN_DISABLE_PROJ_BIASES |> Cuint) ## Not supported
+
+end
diff --git a/test/cudnn/nnlib.jl b/test/cudnn/nnlib.jl
new file mode 100644
index 0000000000..d993dce73b
--- /dev/null
+++ b/test/cudnn/nnlib.jl
@@ -0,0 +1,138 @@
+#include("../setup.jl") # need for testf
+using Test, Random, CUDA, NNlib
+using CUDA.CUDNN
+using NNlib
+using NNlib: ∇conv_data, ∇conv_filter,
+             maxpool, meanpool, ∇maxpool, ∇meanpool,
+             softmax, ∇softmax, logsoftmax, ∇logsoftmax
+
+@test has_cudnn()
+@test CUDNN.version() isa VersionNumber
+
+@testset "NNlib" begin
+    a, b, c = rand(Float64, 10, 10, 3, 1), rand(Float64, 2, 2, 3, 4), rand(Float64, 9, 9, 4, 1)
+    da, db, dc = CuArray(a), CuArray(b), CuArray(c)
+    cdims = DenseConvDims(a, b)
+    @test NNlib.conv(a, b, cdims) ≈ collect(NNlib.conv(da, db, cdims))
+    @test ∇conv_data(c, b, cdims) ≈ collect(∇conv_data(dc, db, cdims))
+    @test ∇conv_filter(a, c, cdims) ≈ collect(∇conv_filter(da, dc, cdims))
+
+    # Test for agreement between CPU NNlib and CuDNN versions, across a variety of kwargs
+    for num_spatial_dims in (1, 2, 3)
+        # Initialize data we'll run our tests over
+        C_in = 3
+        C_out = 4
+        batch_size = 1
+        x = rand(Float64, fill(8, num_spatial_dims)..., C_in, batch_size)
+        w = rand(Float64, fill(2, num_spatial_dims)..., C_in, C_out)
+        b = rand(Float64, fill(1, num_spatial_dims)..., C_in, C_out)
+        options = (Dict(), Dict(:dilation => 2), Dict(:flipkernel => true), Dict(:stride => 2), Dict(:padding => 1))
+
+        # @denizyuret: algo option deprecated for nnlib, handling in cudnn
+        # algos = (1, 0, 1, 1,)
+        # for (opts, algo) in zip(options, algos)
+
+        for opts in options
+            cdims = DenseConvDims(x, w; opts...)
+            y = NNlib.conv(x, w, cdims)
+
+            # Test that basic convolution is equivalent across GPU/CPU
+            @test testf((x, w) -> NNlib.conv(x, w, cdims), x, w)
+            @test testf((y, w) -> NNlib.∇conv_data(y, w, cdims), y, w)
+            @test testf((x, y) -> NNlib.∇conv_filter(x, y, cdims), x, y)
+
+            # Scaling factors
+            @test testf((x, w) -> NNlib.conv(x, w, cdims; alpha=2.0), x, w)
+            @test testf((y, w) -> NNlib.∇conv_data(y, w, cdims; alpha=2.0), y, w)
+            @test testf((x, y) -> NNlib.∇conv_filter(x, y, cdims; alpha=2.0), x, y)
+
+            @test testf((y, x, w) -> NNlib.conv!(copy(y), x, w, cdims; beta=2.0), y, x, w)
+            # @test testf((x, y, w) -> NNlib.∇conv_data!(copy(x), y, w, cdims; beta=2.0), x, y, w)
+            @test testf((w, x, y) -> NNlib.∇conv_filter!(copy(w), x, y, cdims; beta=2.0), w, x, y)
+
+            # Test the compatibility shims
+            cy,cx,cw = CuArray{Float32}.((y,x,w))
+            opts2 = Dict((k==:padding ? :pad : k)=>v for (k,v) in opts)
+            @test NNlib.conv!(similar(cy),cx,cw; opts2...) ≈ NNlib.conv!(similar(cy),cx,cw,cdims)
+            @test NNlib.∇conv_filter!(similar(cw),cy,cx; opts2...) ≈ NNlib.∇conv_filter!(similar(cw),cx,cy,cdims)
+        end
+
+        # Test that pooling is equivalent across GPU/CPU
+        pdims = PoolDims(x, 2)
+        y = maxpool(x, pdims)
+        dy = ones(size(y))
+        @test testf(x -> maxpool(x, pdims), x)
+        @test testf((dy, y, x) -> ∇maxpool(dy, y, x, pdims), dy, y, x)
+        @test testf(x -> maxpool(x, pdims), x)
+        @test testf((dy, y, x) -> ∇maxpool(dy, y, x, pdims), dy, y, x)
+
+        # Test the compatibility shims for pooling
+        cx,cy,cdy = CuArray{Float32}.((x,y,dy))
+        win,pad=2,1
+        @test maxpool!(similar(cy), cx, win; pad=pad, stride=win) ≈ maxpool!(similar(cy), cx, PoolDims(cx, win; padding=pad, stride=win))
+        @test meanpool!(similar(cy), cx, win; pad=pad, stride=win) ≈ meanpool!(similar(cy), cx, PoolDims(cx, win; padding=pad, stride=win))
+
+        # CPU implementation of ∇conv_bias!
+        db = zeros(Float64, 1, 1, 3, 1)
+        dy = randn(Float64, 8, 8, 3, 1)
+        function CUDNN.∇conv_bias!(db, dy)
+            db .= sum(dy, dims=(1:(ndims(dy)-2)))
+            return db
+        end
+        @test testf(CUDNN.∇conv_bias!, db, dy)
+    end
+
+    for dims in [(5,5), (5,)]
+        x = randn(Float64,dims)
+        y = softmax(x)
+        dy = randn(Float64,dims)
+        @test testf(softmax, x)
+        @test testf(∇softmax, dy, x) # add y when NNlib implements it
+        y = logsoftmax(x)
+        @test testf(logsoftmax, x)
+        @test testf(∇logsoftmax, dy, x) # add y when NNlib implements it
+    end
+end
+
+@testset "Activations and Other Ops" begin
+    @test testf(CUDNN.cudnnAddTensor, CUDA.rand(Float32, 10, 10, 3, 1), CUDA.rand(Float32, 10, 10, 3, 1))
+    @test testf(CUDNN.cudnnActivationForward!, CUDA.rand(Float32, 10, 10, 3, 1), CUDA.rand(Float32, 10, 10, 3, 1))
+    # @denizyuret: no high level api for backward functions, see CUDA/lib/cudnn/README.md
+    # @test testf(CUDNN.cudnnActivationBackward, CUDA.rand(Float32, 10, 10, 3, 1), CUDA.rand(Float32, 10, 10, 3, 1), CUDA.rand(Float32, 10, 10, 3, 1), CUDA.rand(Float32, 10, 10, 3, 1))
+
+    # activations defined in src/nnlib.jl
+    ACTIVATION_FUNCTIONS = [σ, logσ, hardσ, hardtanh, relu, leakyrelu, relu6, rrelu,
+                            elu, gelu, celu, swish, lisht, selu, trelu, softplus,
+                            softsign, logcosh, mish, tanhshrink, softshrink];
+    for dims in ((5,5), (5,))
+        for f in filter(x -> x != rrelu, ACTIVATION_FUNCTIONS)
+            @test testf(x -> f.(x), rand(Float64, dims))
+        end
+    end
+
+    # softplus does not give `Inf` for large arguments
+    x = CuArray([1000.])
+    @test all(softplus.(x) .== x)
+
+    # optimized activation overwrote inputs
+    let
+        x = CUDA.ones(1)
+        @test Array(x) == [1f0]
+        tanh.(x)
+        @test Array(x) == [1f0]
+        y = tanh.(x)
+        @test Array(x) == [1f0]
+        @test Array(y) == [tanh(1f0)]
+        x .= tanh.(y)
+        @test Array(y) == [tanh(1f0)]
+        @test Array(x) == [tanh(tanh(1f0))]
+    end
+end
+
+@testset "Batchnorm" begin
+    v = CUDA.rand(Float32, 2)
+    m = CUDA.rand(Float32, 2, 5)
+    for training in (false, true)
+        CUDNN.batchnorm(v, v, m, v, v, 1.0; training=training)
+    end
+end
diff --git a/test/cudnn/normalization.jl b/test/cudnn/normalization.jl
new file mode 100644
index 0000000000..0df78d2697
--- /dev/null
+++ b/test/cudnn/normalization.jl
@@ -0,0 +1,113 @@
+using Test, Random, Statistics, CUDA
+
+using CUDA.CUDNN:
+    cudnnNormalizationForward,
+    cudnnNormalizationForward!,
+    cudnnNormalizationForwardInference,
+    cudnnNormalizationForwardTraining,
+    cudnnNormalizationBackward,
+    cudnnActivationDescriptor,
+    cudnnNormMode_t,
+        CUDNN_NORM_PER_ACTIVATION, # 0, bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice)
+        CUDNN_NORM_PER_CHANNEL,    # 1, bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors)
+    cudnnNormOps_t,
+        CUDNN_NORM_OPS_NORM,                # 0, /* do normalization only */
+        CUDNN_NORM_OPS_NORM_ACTIVATION,     # 1, /* do Norm, then activation */
+        CUDNN_NORM_OPS_NORM_ADD_ACTIVATION, # 2, /* do Norm, then elemWiseAdd, then activation */
+    cudnnNormAlgo_t,
+        CUDNN_NORM_ALGO_STANDARD, # 0
+        CUDNN_NORM_ALGO_PERSIST,  # 1
+    cudnnActivationMode_t,
+        CUDNN_ACTIVATION_SIGMOID,      # 0
+        CUDNN_ACTIVATION_RELU,         # 1
+        CUDNN_ACTIVATION_TANH,         # 2
+        CUDNN_ACTIVATION_CLIPPED_RELU, # 3
+        CUDNN_ACTIVATION_ELU,          # 4
+        CUDNN_ACTIVATION_IDENTITY,     # 5
+    cudnnNanPropagation_t,
+        CUDNN_NOT_PROPAGATE_NAN, # 0
+        CUDNN_PROPAGATE_NAN,     # 1
+    cudnnTensorFormat_t,
+        CUDNN_TENSOR_NCHW,        # 0, /* row major (wStride = 1, hStride = w) */
+        CUDNN_TENSOR_NHWC,        # 1, /* feature maps interleaved ( cStride = 1 )*/
+        CUDNN_TENSOR_NCHW_VECT_C, # 2, /* each image point is vector of element of C, vector length in data type */
+    handle
+
+
+@testset "cudnn/normalization" begin
+
+    function normtest(
+        x;
+
+        training = false,
+
+        # Inference parameters:
+        z = nothing, # for residual addition to the result of the normalization operation, prior to the activation
+        mode::cudnnNormMode_t = CUDNN_NORM_PER_CHANNEL, # Per-channel layer is based on the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift, S. Ioffe, C. Szegedy, 2015.
+        normOps::cudnnNormOps_t = CUDNN_NORM_OPS_NORM, # Currently CUDNN_NORM_OPS_NORM_ACTIVATION and CUDNN_NORM_OPS_NORM_ADD_ACTIVATION are only supported in the NHWC layout (training,backward), not supported (inference)
+        algo::cudnnNormAlgo_t = CUDNN_NORM_ALGO_STANDARD, # trigger the new semi-persistent NHWC kernel when CUDNN_NORM_ALGO_PERSIST
+        alpha::Real = 1,
+        beta::Real = 0,
+        epsilon::Real = 1e-5, # Has to be >= 0. Should be the same in forward and backward functions.
+        groupCnt::Integer = 1, # Place hold for future work, should be set to 1 now
+
+        # Main argument defaults:
+        format::cudnnTensorFormat_t = CUDNN_TENSOR_NCHW, # or NHWC
+        _sdims = (mode == CUDNN_NORM_PER_CHANNEL && format == CUDNN_TENSOR_NCHW ? (1,1,size(x,3),1) :
+                  mode == CUDNN_NORM_PER_CHANNEL && format == CUDNN_TENSOR_NHWC ? (size(x,1),1,1,1) :
+                  mode == CUDNN_NORM_PER_ACTIVATION && format == CUDNN_TENSOR_NCHW ? (size(x)[1:3]...,1) :
+                  mode == CUDNN_NORM_PER_ACTIVATION && format == CUDNN_TENSOR_NHWC ? (size(x)[1:3]...,1) :
+                  error("Unknown mode $mode and format $format")),
+        scale = fill!(similar(x, _sdims), 1),
+        bias = fill!(similar(x, _sdims), 0),
+        xmean = fill!(similar(x, _sdims), 0),
+        xvar = fill!(similar(x, _sdims), 1),
+
+        # Training-only parameters:
+        exponentialAverageFactor::Real = 0.1,
+        savedMean = nothing, # Optionally save intermediate results from the forward pass here - can be reused to speed up backward pass. NULL if unused.
+        savedInvVariance = nothing,
+
+        # Activation parameters:
+        activationMode::cudnnActivationMode_t = CUDNN_ACTIVATION_IDENTITY,
+        activationReluNanOpt::cudnnNanPropagation_t = CUDNN_NOT_PROPAGATE_NAN,
+        activationCoef::Real = 1,
+        activationDesc::Union{Nothing,cudnnActivationDescriptor} = (normOps == CUDNN_NORM_OPS_NORM ? nothing : cudnnActivationDescriptor(activationMode, activationReluNanOpt, Cdouble(activationCoef))),
+    )
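+        # Reference computation: in training mode cuDNN normalizes with statistics
+        # of the current batch; in inference mode it uses the supplied running
+        # xmean/xvar.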
+        if training
+            dims = findall(size(xmean) .== 1)
+            m = mean(x; dims)
+            v = var(x; dims, mean=m, corrected=false)
+            y = bias .+ scale .* (x .- m) ./ sqrt.(epsilon .+ v)
+        else
+            y = bias .+ scale .* (x .- xmean) ./ sqrt.(epsilon .+ xvar)
+        end
+        y0 = randn!(similar(x))
+        y1 = alpha * y
+        y2 = y1 + beta * y0
+        (y1 ≈ cudnnNormalizationForward(x, xmean, xvar, bias, scale; training, z, mode, normOps, algo, alpha, epsilon, groupCnt, format, exponentialAverageFactor, savedMean, savedInvVariance, activationDesc) &&
+         y2 ≈ cudnnNormalizationForward!(copy(y0), x, xmean, xvar, bias, scale; training, z, mode, normOps, algo, alpha, beta, epsilon, groupCnt, format, exponentialAverageFactor, savedMean, savedInvVariance, activationDesc))
+    end
+
+    x, z, s = (CUDA.randn(x...) for x in ((5,4,3,2),(5,4,3,2),(1,1,3,1)))
+    @test normtest(x)
+    @test normtest(x; training = true)
+    @test normtest(x; mode = CUDNN_NORM_PER_ACTIVATION)
+    @test normtest(x; algo = CUDNN_NORM_ALGO_PERSIST)
+    @test normtest(x; algo = CUDNN_NORM_ALGO_PERSIST, format = CUDNN_TENSOR_NHWC)
+    @test normtest(x; alpha = 2)
+    @test normtest(x; beta = 2)
+    @test normtest(x; epsilon = 0)
+    @test normtest(x; format = CUDNN_TENSOR_NHWC)
+    @test normtest(x; scale = fill!(s, 2))
+    @test normtest(x; bias = fill!(s, 2))
+    @test normtest(x; xmean = fill!(s, 2))
+    @test normtest(x; xvar = fill!(s, 2))
+    @test normtest(x; exponentialAverageFactor = 0.01)
+    @test normtest(x; savedMean = similar(s))
+    @test normtest(x; savedInvVariance = similar(s))
+    # cudnn-8.0.5: Currently, CUDNN_NORM_OPS_NORM_ACTIVATION and CUDNN_NORM_OPS_NORM_ADD_ACTIVATION are not supported in inference.
+    #@test normtest(x; normOps = CUDNN_NORM_OPS_NORM_ACTIVATION, activationMode = CUDNN_ACTIVATION_RELU, format = CUDNN_TENSOR_NHWC)
+    #@test normtest(x; normOps = CUDNN_NORM_OPS_NORM_ADD_ACTIVATION, activationMode = CUDNN_ACTIVATION_RELU, z, format = CUDNN_TENSOR_NHWC)
+    #@test normtest(x; groupCnt = 2) # cudnn-8.0.5: Currently only groupCnt=1 is supported
+end
diff --git a/test/cudnn/optensor.jl b/test/cudnn/optensor.jl
new file mode 100644
index 0000000000..4bf4df7dec
--- /dev/null
+++ b/test/cudnn/optensor.jl
@@ -0,0 +1,67 @@
+using CUDA, Test, Random
+using CUDA.CUDNN:
+    cudnnOpTensor,
+    cudnnOpTensor!,
+    cudnnOpTensorDescriptor,
+        cudnnOpTensorDescriptor_t,
+        cudnnCreateOpTensorDescriptor,
+        cudnnSetOpTensorDescriptor,
+        cudnnGetOpTensorDescriptor,
+        cudnnDestroyOpTensorDescriptor,
+    cudnnOpTensorOp_t,
+        CUDNN_OP_TENSOR_ADD,  # 0,
+        CUDNN_OP_TENSOR_MUL,  # 1,
+        CUDNN_OP_TENSOR_MIN,  # 2,
+        CUDNN_OP_TENSOR_MAX,  # 3,
+        CUDNN_OP_TENSOR_SQRT, # 4, performed only on first arg
+        CUDNN_OP_TENSOR_NOT,  # 5, performed only on first arg
+    cudnnNanPropagation_t,
+        CUDNN_NOT_PROPAGATE_NAN, # 0
+        CUDNN_PROPAGATE_NAN,     # 1
+    cudnnDataType,
+    handle
+
+
+@testset "cudnn/optensor" begin
+
+    @test cudnnOpTensorDescriptor(C_NULL) isa cudnnOpTensorDescriptor
+    @test Base.unsafe_convert(Ptr, cudnnOpTensorDescriptor(C_NULL)) isa Ptr
+    @test cudnnOpTensorDescriptor(CUDNN_OP_TENSOR_ADD,cudnnDataType(Float32),CUDNN_NOT_PROPAGATE_NAN) isa cudnnOpTensorDescriptor
+
+    (ax1,ax2,ay) = rand.((10,10,10))
+    (cx1,cx2,cy) = CuArray.((ax1,ax2,ay))
+
+    function optensortest(
+        ;op=CUDNN_OP_TENSOR_ADD,
+        nanOpt=CUDNN_NOT_PROPAGATE_NAN,
+        compType=(eltype(ax1) <: Float64 ? Float64 : Float32),
+        alpha1=1,
+        alpha2=1,
+        beta=0,
+        )
+        f1 = (op === CUDNN_OP_TENSOR_ADD ? alpha1*ax1 .+ alpha2*ax2 :
+              op === CUDNN_OP_TENSOR_MUL ? (alpha1*ax1) .* (alpha2*ax2) :
+              op === CUDNN_OP_TENSOR_MIN ? min.(alpha1*ax1, alpha2*ax2) :
+              op === CUDNN_OP_TENSOR_MAX ? max.(alpha1*ax1, alpha2*ax2) :
+              op === CUDNN_OP_TENSOR_SQRT ? sqrt.(alpha1*ax1) :
+              op === CUDNN_OP_TENSOR_NOT ? 1 .- ax1 :
+              error("Unknown optensor"))
+        f2 = f1 .+ beta * ay
+        d = cudnnOpTensorDescriptor(op,cudnnDataType(compType),nanOpt)
+        ((f1 ≈ cudnnOpTensor(cx1, cx2; op, compType, nanOpt, alpha1, alpha2) |> Array) &&
+         (f1 ≈ cudnnOpTensor(cx1, cx2, d; alpha1, alpha2) |> Array) &&
+         (f2 ≈ cudnnOpTensor!(copy(cy), cx1, cx2; op, compType, nanOpt, alpha1, alpha2, beta) |> Array) &&
+         (f2 ≈ cudnnOpTensor!(copy(cy), cx1, cx2, d; alpha1, alpha2, beta) |> Array))
+    end
+
+    @test optensortest(op = CUDNN_OP_TENSOR_ADD)
+    @test optensortest(op = CUDNN_OP_TENSOR_MUL)
+    @test optensortest(op = CUDNN_OP_TENSOR_MIN)
+    @test optensortest(op = CUDNN_OP_TENSOR_MAX)
+    @test optensortest(op = CUDNN_OP_TENSOR_SQRT)
+    @test optensortest(op = CUDNN_OP_TENSOR_NOT)
+    @test optensortest(nanOpt = CUDNN_PROPAGATE_NAN)
+    @test optensortest(alpha1 = 2)
+    @test optensortest(alpha2 = 2)
+    @test optensortest(beta = 2)
+end
diff --git a/test/cudnn/pooling.jl b/test/cudnn/pooling.jl
new file mode 100644
index 0000000000..6c18a62142
--- /dev/null
+++ b/test/cudnn/pooling.jl
@@ -0,0 +1,97 @@
+using Test, CUDA, Random
+import NNlib
+using CUDA.CUDNN:
+    cudnnPoolingForward,
+    cudnnPoolingForward!,
+    cudnnPoolingBackward,
+    cudnnGetPoolingNdForwardOutputDim,
+    cudnnPoolingDescriptor,
+        cudnnPoolingDescriptor_t,
+        cudnnCreatePoolingDescriptor,
+        cudnnSetPoolingNdDescriptor,
+        cudnnDestroyPoolingDescriptor,
+    cudnnPoolingMode_t,
+        CUDNN_POOLING_MAX,                           # 0,
+        CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, # 1, /* count for average includes padded values */
+        CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING, # 2, /* count for average does not include padded values */
+        CUDNN_POOLING_MAX_DETERMINISTIC,             # 3
+    cudnnNanPropagation_t,
+        CUDNN_NOT_PROPAGATE_NAN, # 0
+        CUDNN_PROPAGATE_NAN,     # 1
+    cudnnTensorFormat_t,
+        CUDNN_TENSOR_NCHW,        # 0, /* row major (wStride = 1, hStride = w) */
+        CUDNN_TENSOR_NHWC,        # 1, /* feature maps interleaved ( cStride = 1 )*/
+        CUDNN_TENSOR_NCHW_VECT_C, # 2, /* each image point is vector of element of C, vector length in data type */
+    pooldims,
+    handle
+
+
+@testset "cudnn/pooling" begin
+
+    function pooltest(;
+        mode = CUDNN_POOLING_MAX,
+        nanOpt = CUDNN_NOT_PROPAGATE_NAN,
+        window = 2,
+        padding = 0,
+        stride = window,
+        format = CUDNN_TENSOR_NCHW,
+        dataType = Float32,
+        alpha = 1,
+        beta = 0)
+        ax = randn(dataType,12,6,4,2)
+        N = ndims(ax)
+        window = expand(Val(N-2), window)
+        stride = expand(Val(N-2), stride)
+        padding = expand(Val(N-2), padding)
+        pdims = NNlib.PoolDims(ax, window; padding = padding, stride = stride)
+        #=
+        if mode == CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING
+            @warn "Pool mode=$mode not yet implemented in NNlib, using INCLUDE instead. See https://github.com/FluxML/NNlib.jl/issues/218" maxlog=1
+        end
+        if mode == CUDNN_POOLING_MAX_DETERMINISTIC
+            @warn "Pool mode=$mode not yet implemented in NNlib, using MAX instead." maxlog=1
+        end
+        if nanOpt == CUDNN_NOT_PROPAGATE_NAN
+            @warn "Pool nanOpt=$nanOpt not yet implemented in NNlib, using PROPAGATE instead. See https://github.com/FluxML/NNlib.jl/issues/218" maxlog=1
+        end
+        =#
+        ay1 = (mode == CUDNN_POOLING_MAX ? NNlib.maxpool(ax, pdims) :
+               mode == CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING ? NNlib.meanpool(ax, pdims) :
+               mode == CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING ? NNlib.meanpool(ax, pdims) :
+               mode == CUDNN_POOLING_MAX_DETERMINISTIC ? NNlib.maxpool(ax, pdims) :
+               error("mode=$mode is not supported."))
+        ay1 = alpha * ay1
+        ay = randn!(similar(ay1))
+        ay2 = ay1 .+ beta * ay
+        d = cudnnPoolingDescriptor(mode, nanOpt, Cint(max(2,ndims(ax)-2)), pooldims(window,size(ax)), pooldims(padding,size(ax)), pooldims(stride,size(ax)))
+        nhwc(a) = permutedims(a,(3,1,2,4))
+        if format === CUDNN_TENSOR_NCHW
+            cx, cy = CuArray.((ax, ay))
+        else
+            cx, cy = CuArray.(nhwc.((ax,ay)))
+            ay1, ay2 = nhwc.((ay1, ay2))
+        end
+        ((ay1 ≈ cudnnPoolingForward(cx; mode, nanOpt, window, padding, stride, format, alpha) |> Array) &&
+         (ay1 ≈ cudnnPoolingForward(cx, d; format, alpha) |> Array) &&
+         (ay2 ≈ cudnnPoolingForward!(copy(cy), cx; mode, nanOpt, window, padding, stride, format, alpha, beta) |> Array) &&
+         (ay2 ≈ cudnnPoolingForward!(copy(cy), cx, d; format, alpha, beta) |> Array))
+    end
+
+    expand(::Val{N}, i::NTuple{N}) where {N} = i
+    expand(::Val{N}, i::Integer) where {N} = ntuple(_ -> i, N)
+
+
+    @test pooltest()
+    @test pooltest(mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING)
+    @test pooltest(mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING)
+    @test pooltest(mode = CUDNN_POOLING_MAX_DETERMINISTIC)
+    @test pooltest(nanOpt = CUDNN_PROPAGATE_NAN)
+    @test pooltest(window = 3)
+    @test pooltest(padding = 1)
+    @test pooltest(stride = 1)
+    @test pooltest(format = CUDNN_TENSOR_NHWC)
+    @test pooltest(dataType = Float16)
+    @test pooltest(alpha = 2)
+    @test pooltest(beta = 2)
+
+end
diff --git a/test/cudnn/reduce.jl b/test/cudnn/reduce.jl
new file mode 100644
index 0000000000..8a1ab0fa5a
--- /dev/null
+++ b/test/cudnn/reduce.jl
@@ -0,0 +1,89 @@
+using CUDA, Test, Random, Statistics
+using CUDA.CUDNN:
+    cudnnReduceTensor,
+    cudnnReduceTensor!,
+    cudnnGetReductionIndicesSize,
+    cudnnGetReductionWorkspaceSize,
+    cudnnReduceTensorDescriptor,
+        cudnnReduceTensorDescriptor_t,
+        cudnnCreateReduceTensorDescriptor,
+        cudnnSetReduceTensorDescriptor,
+        cudnnGetReduceTensorDescriptor,
+        cudnnDestroyReduceTensorDescriptor,
+    cudnnReduceTensorOp_t,
+        CUDNN_REDUCE_TENSOR_ADD,          # 0,
+        CUDNN_REDUCE_TENSOR_MUL,          # 1,
+        CUDNN_REDUCE_TENSOR_MIN,          # 2,
+        CUDNN_REDUCE_TENSOR_MAX,          # 3,
+        CUDNN_REDUCE_TENSOR_AMAX,         # 4,
+        CUDNN_REDUCE_TENSOR_AVG,          # 5,
+        CUDNN_REDUCE_TENSOR_NORM1,        # 6,
+        CUDNN_REDUCE_TENSOR_NORM2,        # 7,
+        CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS, # 8,
+    cudnnNanPropagation_t,
+        CUDNN_NOT_PROPAGATE_NAN, # 0
+        CUDNN_PROPAGATE_NAN,     # 1
+    cudnnReduceTensorIndices,
+        cudnnReduceTensorIndices_t,
+        CUDNN_REDUCE_TENSOR_NO_INDICES,        # 0,
+        CUDNN_REDUCE_TENSOR_FLATTENED_INDICES, # 1,
+    cudnnIndicesType,
+        cudnnIndicesType_t,
+        CUDNN_32BIT_INDICES, # 0,
+        CUDNN_64BIT_INDICES, # 1,
+        CUDNN_16BIT_INDICES, # 2,
+        CUDNN_8BIT_INDICES,  # 3,
+    cudnnDataType,
+    handle
+
+
+@testset "cudnn/reduce" begin
+
+    @test cudnnReduceTensorDescriptor(C_NULL) isa cudnnReduceTensorDescriptor
+    @test Base.unsafe_convert(Ptr, cudnnReduceTensorDescriptor(C_NULL)) isa Ptr
+    @test cudnnReduceTensorDescriptor(CUDNN_REDUCE_TENSOR_ADD,cudnnDataType(Float32),CUDNN_NOT_PROPAGATE_NAN,CUDNN_REDUCE_TENSOR_NO_INDICES,CUDNN_32BIT_INDICES) isa cudnnReduceTensorDescriptor
+
+    (ax,ay) = randn(Float32,10,10), randn(Float32,10,1)
+    (cx,cy) = CuArray.((ax,ay))
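+    # cuDNN infers the reduction dimensions from the output shape: ay is 10x1,
+    # so every op below reduces ax over its second dimension.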
+
+    function reducetensortest(
+        ; op::cudnnReduceTensorOp_t = CUDNN_REDUCE_TENSOR_ADD,
+        compType::DataType = (eltype(ax) <: Float64 ? Float64 : Float32),
+        nanOpt::cudnnNanPropagation_t = CUDNN_NOT_PROPAGATE_NAN,
+        indices::Union{Vector{<:Unsigned},Nothing} = nothing,
+        d::cudnnReduceTensorDescriptor = cudnnReduceTensorDescriptor(op, cudnnDataType(compType), nanOpt, cudnnReduceTensorIndices(op, indices), cudnnIndicesType(indices)),
+        alpha::Real = 1,
+        beta::Real = 0,
+        )
+        f0 = (op === CUDNN_REDUCE_TENSOR_ADD ? sum(ax, dims=2) :
+              op === CUDNN_REDUCE_TENSOR_MUL ? prod(ax, dims=2) :
+              op === CUDNN_REDUCE_TENSOR_MIN ? minimum(ax, dims=2) :
+              op === CUDNN_REDUCE_TENSOR_MAX ? maximum(ax, dims=2) :
+              op === CUDNN_REDUCE_TENSOR_AMAX ? maximum(abs, ax, dims=2) :
+              op === CUDNN_REDUCE_TENSOR_AVG ? mean(ax, dims=2) :
+              op === CUDNN_REDUCE_TENSOR_NORM1 ? sum(abs, ax, dims=2) :
+              op === CUDNN_REDUCE_TENSOR_NORM2 ? sqrt.(sum(abs2, ax, dims=2)) :
+              op === CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS ? (ax1=copy(ax);ax1[ax.==0].=1;prod(ax1,dims=2)) :
+              error("Unknown reducetensor"))
+        f1 = alpha * f0
+        f2 = f1 + beta * ay
+        dims = size(ay)
+        ((f1 ≈ cudnnReduceTensor(cx; dims, op, compType, nanOpt, indices, alpha) |> Array) &&
+         (f1 ≈ cudnnReduceTensor(cx, d; dims, indices, alpha) |> Array) &&
+         (f2 ≈ cudnnReduceTensor!(copy(cy), cx; op, compType, nanOpt, indices, alpha, beta) |> Array) &&
+         (f2 ≈ cudnnReduceTensor!(copy(cy), cx, d; indices, alpha, beta) |> Array))
+    end
+
+    @test reducetensortest()
+    @test reducetensortest(op = CUDNN_REDUCE_TENSOR_MUL)
+    @test reducetensortest(op = CUDNN_REDUCE_TENSOR_MIN)
+    @test reducetensortest(op = CUDNN_REDUCE_TENSOR_MAX)
+    @test reducetensortest(op = CUDNN_REDUCE_TENSOR_AMAX)
+    @test reducetensortest(op = CUDNN_REDUCE_TENSOR_AVG)
+    @test reducetensortest(op = CUDNN_REDUCE_TENSOR_NORM1)
+    @test reducetensortest(op = CUDNN_REDUCE_TENSOR_NORM2)
+    @test reducetensortest(op = CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS)
+    @test reducetensortest(nanOpt = CUDNN_PROPAGATE_NAN)
+    @test reducetensortest(alpha = 2)
+    @test reducetensortest(beta = 2)
+end
diff --git a/test/cudnn/rnn.jl b/test/cudnn/rnn.jl
new file mode 100644
index 0000000000..3fa5863f8d
--- /dev/null
+++ b/test/cudnn/rnn.jl
@@ -0,0 +1,137 @@
+using Test, CUDA, Random
+
+using CUDA.CUDNN:
+    cudnnRNNForward,
+    cudnnRNNForward!,
+    cudnnRNNBackwardData_v8,
+    cudnnRNNBackwardWeights_v8,
+    cudnnRNNDescriptor,
+        cudnnRNNDescriptor_t,
+        cudnnSetRNNDescriptor_v8,
+    cudnnGetRNNWeightSpaceSize,
+    cudnnGetRNNTempSpaceSizes,
+    cudnnRNNAlgo_t,
+        CUDNN_RNN_ALGO_STANDARD,        # 0, robust performance across a wide range of network parameters
+        CUDNN_RNN_ALGO_PERSIST_STATIC,  # 1, fast when the first dimension of the input tensor is small (meaning, a small minibatch), cc>=6.0
+        CUDNN_RNN_ALGO_PERSIST_DYNAMIC, # 2, similar to static, optimize using the specific parameters of the network and active GPU, cc>=6.0
+        CUDNN_RNN_ALGO_COUNT,           # 3
+    cudnnRNNMode_t,
+        CUDNN_RNN_RELU, # 0, /* basic RNN cell type with ReLu activation */
+        CUDNN_RNN_TANH, # 1, /* basic RNN cell type with tanh activation */
+        CUDNN_LSTM,     # 2, /* LSTM with optional recurrent projection and clipping */
+        CUDNN_GRU,      # 3, /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */
+    cudnnRNNBiasMode_t,
+        CUDNN_RNN_NO_BIAS,         # 0, /* rnn cell formulas do not use biases */
+        CUDNN_RNN_SINGLE_INP_BIAS, # 1, /* rnn cell formulas use one input bias in input GEMM */
+        CUDNN_RNN_DOUBLE_BIAS,     # 2, /* default, rnn cell formulas use two bias vectors */
+        CUDNN_RNN_SINGLE_REC_BIAS, # 3  /* rnn cell formulas use one recurrent bias in recurrent GEMM */
+    cudnnDirectionMode_t,
+        CUDNN_UNIDIRECTIONAL, # 0, /* single direction network */
+        CUDNN_BIDIRECTIONAL,  # 1, /* output concatination at each layer */
+    cudnnRNNInputMode_t,
+        CUDNN_LINEAR_INPUT, # 0, /* adjustable weight matrix in first layer input GEMM */
+        CUDNN_SKIP_INPUT,   # 1, /* fixed identity matrix in the first layer input GEMM */
+    cudnnMathType_t,
+        CUDNN_DEFAULT_MATH,                    # 0,
+        CUDNN_TENSOR_OP_MATH,                  # 1,
+        CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION, # 2,
+        CUDNN_FMA_MATH,                        # 3,
+    #/* For auxFlags in cudnnSetRNNDescriptor_v8() and cudnnSetRNNPaddingMode() */
+    CUDNN_RNN_PADDED_IO_DISABLED, # 0
+    CUDNN_RNN_PADDED_IO_ENABLED,  # (1U << 0)
+    cudnnForwardMode_t,
+        CUDNN_FWD_MODE_INFERENCE, # 0
+        CUDNN_FWD_MODE_TRAINING,  # 1
+    cudnnRNNDataDescriptor_t,
+    cudnnSetRNNDataDescriptor,
+    cudnnRNNDataLayout_t,
+        CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED,   # 0, /* padded, outer stride from one time-step to the next */
+        CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED,     # 1, /* sequence length sorted and packed as in basic RNN api */
+        CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED, # 2, /* padded, outer stride from one batch to the next */
+    cudnnWgradMode_t,
+        CUDNN_WGRAD_MODE_ADD, # 0, /* add partial gradients to wgrad output buffers */
+        CUDNN_WGRAD_MODE_SET, # 1, /* write partial gradients to wgrad output buffers */
+    cudnnTensorDescriptor,
+    cudnnDropoutDescriptor,
+    cudnnDataType,
+    math_mode,
+    handle
+
+
+@testset "cudnn/rnn" begin
+
+    X,H,B,T = 8,8,4,2
+    w = CUDA.randn(10000)
+    x = CUDA.randn(X,B,T)
+    hx1 = CUDA.randn(H,B,1)
+    cx1 = CUDA.randn(H,B,1)
+
+    function rnntest(
+        ;hx = nothing,
+        cx = nothing,
+        hy = nothing,
+        cy = nothing,
+        layout::cudnnRNNDataLayout_t = CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED,
+        seqLengthArray::Union{Nothing,Vector{Cint}} = nothing,
+        fwdMode::cudnnForwardMode_t = CUDNN_FWD_MODE_INFERENCE,
+        # descriptor keywords
+        hiddenSize::Integer = H,
+        algo::cudnnRNNAlgo_t = CUDNN_RNN_ALGO_STANDARD,
+        cellMode::cudnnRNNMode_t = CUDNN_LSTM,
+        biasMode::cudnnRNNBiasMode_t = CUDNN_RNN_DOUBLE_BIAS,
+        dirMode::cudnnDirectionMode_t = CUDNN_UNIDIRECTIONAL,
+        inputMode::cudnnRNNInputMode_t = CUDNN_LINEAR_INPUT,
+        mathPrec::DataType = eltype(x),
+        mathType::cudnnMathType_t = math_mode(),
+        inputSize::Integer = size(x,1),
+        projSize::Integer = hiddenSize,
+        numLayers::Integer = 1,
+        dropout::Real = 0,
+        auxFlags::Integer = CUDNN_RNN_PADDED_IO_ENABLED,
+        )
+        d = cudnnRNNDescriptor(algo, cellMode, biasMode, dirMode, inputMode, cudnnDataType(eltype(x)), cudnnDataType(mathPrec), mathType, Int32(inputSize), Int32(hiddenSize), Int32(projSize), Int32(numLayers), cudnnDropoutDescriptor(Cfloat(dropout)), UInt32(auxFlags))
+        y = cudnnRNNForward(w, x; hx, cx, hy, cy, layout, seqLengthArray, fwdMode, hiddenSize, algo, cellMode, biasMode, dirMode, inputMode, mathPrec, mathType, inputSize, projSize, numLayers, dropout, auxFlags)
+        _y = copy(y)
+        _hy = (hy === nothing ? hy : copy(hy[]))
+        _cy = (cy === nothing ? cy : copy(cy[]))
+        (_y ≈ cudnnRNNForward!(y, w, x; hx, cx, hy, cy, layout, seqLengthArray, fwdMode, hiddenSize, algo, cellMode, biasMode, dirMode, inputMode, mathPrec, mathType, inputSize, projSize, numLayers, dropout, auxFlags) &&
+         (_hy === hy === nothing || _hy ≈ hy[]) &&
+         (_cy === cy === nothing || _cy ≈ cy[]) &&
+         _y ≈ cudnnRNNForward(w, x, d; hx, cx, hy, cy, layout, seqLengthArray, fwdMode) &&
+         (_hy === hy === nothing || _hy ≈ hy[]) &&
+         (_cy === cy === nothing || _cy ≈ cy[]) &&
+         _y ≈ cudnnRNNForward!(y, w, x, d; hx, cx, hy, cy, layout, seqLengthArray, fwdMode) &&
+         (_hy === hy === nothing || _hy ≈ hy[]) &&
+         (_cy === cy === nothing || _cy ≈ cy[]))
+    end
+
+    @test rnntest()
+    @test rnntest(hx=hx1)
+    @test rnntest(cx=cx1)
+    @test rnntest(hy=Ref{Any}())
+    @test rnntest(cy=Ref{Any}())
+    @test rnntest(layout=CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED)
+    @test rnntest(layout=CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED)
+    @test rnntest(seqLengthArray=Cint[1,2,1,2])
+    @test rnntest(fwdMode=CUDNN_FWD_MODE_TRAINING)
+    @test rnntest(hiddenSize=16)
+    @test rnntest(algo=CUDNN_RNN_ALGO_PERSIST_STATIC)
+    #@test rnntest(algo=CUDNN_RNN_ALGO_PERSIST_DYNAMIC) # causes segfault
+    @test rnntest(cellMode=CUDNN_RNN_RELU)
+    @test rnntest(cellMode=CUDNN_RNN_TANH)
+    @test rnntest(cellMode=CUDNN_GRU)
+    @test rnntest(biasMode=CUDNN_RNN_NO_BIAS)
+    @test rnntest(biasMode=CUDNN_RNN_SINGLE_INP_BIAS)
+    @test rnntest(biasMode=CUDNN_RNN_SINGLE_REC_BIAS)
+    @test rnntest(dirMode=CUDNN_BIDIRECTIONAL)
+    @test rnntest(inputMode=CUDNN_SKIP_INPUT)
+    @test rnntest(mathPrec=Float32) # only possible option for F32 input
+    @test rnntest(mathType=CUDNN_DEFAULT_MATH)
+    @test rnntest(mathType=CUDNN_TENSOR_OP_MATH)
+    @test rnntest(mathType=CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION)
+    @test rnntest(projSize=4)
+    @test rnntest(numLayers=2)
+    @test rnntest(dropout=0.5)
+    @test rnntest(auxFlags=CUDNN_RNN_PADDED_IO_DISABLED)
+    @test rnntest(auxFlags=CUDNN_RNN_PADDED_IO_ENABLED)
+end
diff --git a/test/cudnn/softmax.jl b/test/cudnn/softmax.jl
new file mode 100644
index 0000000000..265788ff6d
--- /dev/null
+++ b/test/cudnn/softmax.jl
@@ -0,0 +1,50 @@
+using Test, CUDA
+using CUDA.CUDNN:
+    cudnnSoftmaxForward,
+    cudnnSoftmaxForward!,
+    cudnnSoftmaxBackward,
+    cudnnSoftmaxAlgorithm_t,
+        CUDNN_SOFTMAX_FAST,     # 0, /* straightforward implementation */
+        CUDNN_SOFTMAX_ACCURATE, # 1, /* subtract max from every point to avoid overflow */
+        CUDNN_SOFTMAX_LOG,      # 2
+    cudnnSoftmaxMode_t,
+        CUDNN_SOFTMAX_MODE_INSTANCE, # 0, /* compute the softmax over all C, H, W for each N */
+        CUDNN_SOFTMAX_MODE_CHANNEL,  # 1  /* compute the softmax over all C for each H, W, N */
+    handle
+
+
+@testset "cudnn/softmax" begin
+    ax,ay = randn(Float32,10,10),randn(Float32,10,10)
+    cx,cy = CuArray.((ax,ay))
+
+    function softmaxtest(
+        ; alpha=1,
+        beta=0,
+        mode=CUDNN_SOFTMAX_MODE_INSTANCE,
+        algo=CUDNN_SOFTMAX_FAST
+        )
+        d = mode === CUDNN_SOFTMAX_MODE_INSTANCE ? 1 : 2
+        x = ax .- maximum(ax, dims=d)
+        y = x .- log.(sum(exp.(x), dims=d))
+        if algo !== CUDNN_SOFTMAX_LOG; y = exp.(y); end
+        add1(x)=reshape(x, (size(x)..., 1))
+        if mode === CUDNN_SOFTMAX_MODE_CHANNEL
+            y,cx1,cy1 = add1.((y,cx,cy))
+        else
+            cx1,cy1 = cx,cy
+        end
+        y0 = alpha * y
+        y1 = y0 .+ beta * ay
+        ((y0 ≈ cudnnSoftmaxForward(cx1; algo, mode, alpha) |> Array) &&
+         (y1 ≈ cudnnSoftmaxForward!(copy(cy1), cx1; algo, mode, alpha, beta) |> Array))
+    end
+
+    @test softmaxtest()
+    @test softmaxtest(alpha=2)
+    @test softmaxtest(beta=2)
+    @test softmaxtest(mode=CUDNN_SOFTMAX_MODE_INSTANCE)
+    @test softmaxtest(mode=CUDNN_SOFTMAX_MODE_CHANNEL)
+    @test softmaxtest(algo=CUDNN_SOFTMAX_FAST)
+    @test softmaxtest(algo=CUDNN_SOFTMAX_ACCURATE)
+    @test softmaxtest(algo=CUDNN_SOFTMAX_LOG)
+end
diff --git a/test/cudnn/tensor.jl b/test/cudnn/tensor.jl
new file mode 100644
index 0000000000..a09c56a367
--- /dev/null
+++ b/test/cudnn/tensor.jl
@@ -0,0 +1,39 @@
+using Test, CUDA
+using Base: unsafe_convert
+using CUDA.CUDNN:
+    cudnnTensorDescriptor,
+    cudnnCreateTensorDescriptor,
+    cudnnFilterDescriptor,
+    cudnnDataType,
+    cudnnDataType_t,
+    CUDNN_TENSOR_NCHW,
+    CUDNN_STATUS_SUCCESS,
+    @retry_reclaim,
+    handle
+
+
+@testset "cudnn/tensor" begin
+
+    x = CUDA.rand(1,1,1,2)
+
+    TD = cudnnTensorDescriptor
+    FD = cudnnFilterDescriptor
+    DT = cudnnDataType
+
+    @test TD(x) isa TD
+    @test TD(CUDNN_TENSOR_NCHW, DT(eltype(x)), Cint(ndims(x)), Cint[reverse(size(x))...]) isa TD
+    td = TD(x)
+    @test TD(td.ptr) isa TD
+    @test unsafe_convert(Ptr, TD(td.ptr)) isa Ptr
+
+    @test FD(x) isa FD
+    @test FD(DT(eltype(x)),CUDNN_TENSOR_NCHW,Cint(ndims(x)),Cint[reverse(size(x))...]) isa FD
+    fd = FD(x)
+    @test FD(fd.ptr) isa FD
+    @test unsafe_convert(Ptr, FD(fd.ptr)) isa Ptr
+
+    @test DT(Float32) isa cudnnDataType_t
+
+    @test (@retry_reclaim(x->(x!==CUDNN_STATUS_SUCCESS),cudnnCreateTensorDescriptor(Ref{Ptr{Cvoid}}(C_NULL)))) isa Nothing
+
+end
diff --git a/test/forwarddiff.jl b/test/forwarddiff.jl
index 0f566664dc..12e9cb1d2f 100644
--- a/test/forwarddiff.jl
+++ b/test/forwarddiff.jl
@@ -12,7 +12,7 @@ function test_derivative(f, x::T) where T
     return CUDA.@allowscalar buf[]
 end
 
-testf(cuf, f, x) = test_derivative(cuf, x) ≈ ForwardDiff.derivative(f, x)
+testdiff(cuf, f, x) = test_derivative(cuf, x) ≈ ForwardDiff.derivative(f, x)
 
 
 @testset "UNARY" begin
@@ -35,12 +35,12 @@ testf(cuf, f, x) = test_derivative(cuf, x) ≈ ForwardDiff.derivative(f, x)
            x64 += 1
        end
 
-       @test testf(cuf, f, x32)
-       @test testf(cuf, f, x64)
+       @test testdiff(cuf, f, x32)
+       @test testdiff(cuf, f, x64)
 
        if fn ∉ nonneg
-           @test testf(cuf, f, nx32)
-           @test testf(cuf, f, nx64)
+           @test testdiff(cuf, f, nx32)
+           @test testdiff(cuf, f, nx64)
        end
    end
 end
@@ -52,16 +52,16 @@ end
    y64 = rand(Float64)
    y = Int32(7)
 
-   @test testf(x->CUDA.pow(x, Int32(7)), x->x^y, x32)
-   @test testf(x->CUDA.pow(x, y), x->x^y, x64)
-   @test testf(x->CUDA.pow(x, y32), x->x^y32, x32)
-   @test testf(x->CUDA.pow(x, y64), x->x^y64, x64)
+   @test testdiff(x->CUDA.pow(x, Int32(7)), x->x^y, x32)
+   @test testdiff(x->CUDA.pow(x, y), x->x^y, x64)
+   @test testdiff(x->CUDA.pow(x, y32), x->x^y32, x32)
+   @test testdiff(x->CUDA.pow(x, y64), x->x^y64, x64)
 
-   @test testf(y->CUDA.pow(x32, y), y->x32^y, y32)
-   @test testf(y->CUDA.pow(x64, y), y->x64^y, y64)
+   @test testdiff(y->CUDA.pow(x32, y), y->x32^y, y32)
+   @test testdiff(y->CUDA.pow(x64, y), y->x64^y, y64)
 
-   @test testf(x->CUDA.pow(x, x), x->x^x, x32)
-   @test testf(x->CUDA.pow(x, x), x->x^x, x64)
+   @test testdiff(x->CUDA.pow(x, x), x->x^x, x32)
+   @test testdiff(x->CUDA.pow(x, x), x->x^x, x64)
 end
 
 @testset "LITERAL_POW" begin