Gradient definitions for cpu & gpu #1704

Merged (8 commits, Sep 6, 2021). Changes from 5 commits.
40 changes: 36 additions & 4 deletions src/functor.jl
@@ -86,7 +86,20 @@ julia> typeof(m_cpu.W)
Matrix{Float32}
```
"""
cpu(m) = fmap(x -> adapt(Array, x), m)
cpu(x) = fmap(_cpu_array, x; exclude = _isbitsarray)

_cpu_array(x::AbstractArray) = adapt(Array, x)

function Zygote.ChainRules.rrule(::typeof(_cpu_array), x::AbstractArray)
y = _cpu_array(x)
if x === y

Member:

This wouldn't be necessary if we didn't override CUDA's behavior. If we want a way to work with structured matrices, the official route is to make a case for them not working and address it in Adapt.

Member Author:

No idea what that means. What CUDA behaviour is being overridden?

Flux.cpu should be a no-op on CPU arrays, including in the gradient.
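
A minimal REPL sketch of that expectation (illustrative only; assumes the definitions in this diff, with Zygote and FillArrays loaded):

```julia-repl
julia> using Flux, Zygote, FillArrays

julia> x = rand(Float32, 3);

julia> cpu(x) === x                     # forward pass: no copy, no move
true

julia> gradient(v -> sum(cpu(v)), x)[1] isa Fill   # reverse pass: gradient stays on the CPU
true
```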

Member:

I mean, Adapt gets us this check for cpu and gpu cases for free.

Member Author:

I mean, if you have another approach, please write it up somewhere. Make a better PR. Snarky comments don't actually move anything forwards.

The tests here ought to pass (well, all but the last pair, possibly). I'm not at all attached to the mechanism used to do so.

Contributor:

Just a question: is the assumption that x !== y implies y is a GPU array too strong here?

# Trivial use: cpu(x::Array) shouldn't push its gradient to GPU
return y, dy -> (Zygote.ChainRules.NoTangent(), dy)

Contributor (@johnnychen94, Aug 31, 2021):

Another question: what's the advantage of this if-branch over

return y, dy -> (Zygote.ChainRules.NoTangent(), adapt(basetype(x), dy))

with basetype defined as in JuliaLang/julia#35543

basetype(T::Type) = Base.typename(T).wrapper
basetype(x::T) where T = basetype(T)

Although there is some overhead in basetype, I still feel it's more generic than the x === y if-branch here. In other words, I feel we need more information from adapt, not just the output data.

Member Author:

This would catch Array <-> CuArray, but miss a few other cases this PR wants to handle.

First, it will be confused by wrappers like Adjoint, which adapt knows to look inside of:

julia> x = rand(3)';

julia> basetype(x)
Adjoint

julia> adapt(Array, x) === x
true

Second, adapt doesn't move structured arrays produced (for efficiency) by some Zygote gradients. The gradient of sum (on the CPU) is a Fill, but if the forward pass went (x::CuArray) |> cpu |> sum, then the reverse pass needs to treat that Fill as a CPU array and make a CuArray from it. But adapt doesn't do that:

julia> adapt(CuArray, Fill(1,2,3))
2×3 Fill{Int64}, with entries equal to 1

which is why there needs to be another function involved, to which we can attach this gradient behaviour, without piracy.

There might be a cleverer way to allow for trivial use like cpu(::Array) than checking x === y, but I haven't seen it yet. I believe the branch should be cheap, especially when the types do differ, which is the case we really care about.
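
Consolidated from the hunks above and below, the mechanism under discussion is roughly this (a sketch restating this diff's own code, not additional behaviour):

```julia
# A thin function Flux owns, so the gradient rule can live here without pirating `adapt`:
_cpu_array(x::AbstractArray) = adapt(Array, x)

function Zygote.ChainRules.rrule(::typeof(_cpu_array), x::AbstractArray)
  y = _cpu_array(x)
  if x === y
    # cpu(::Array) was a no-op, so its gradient must not be pushed to the GPU
    return y, dy -> (Zygote.ChainRules.NoTangent(), dy)
  else
    # cpu(::CuArray), or a wrapper such as an Adjoint of one: send the gradient back to the GPU
    return y, dy -> (Zygote.ChainRules.NoTangent(), _gpu_array(dy))
  end
end
```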

Member:

Structured arrays and wrappers are the domain of Adapt. Also, the ones we are interested in for Zygote work with CUDA.

julia> CUDA.zeros(3) + Fill(3,3)
3-element CuArray{Float32, 1}:
 3.0
 3.0
 3.0

julia> gradient((x,y) -> sum(cpu(x) + y), CUDA.zeros(3), Fill(3,3))
(3-element Fill{Float32}: entries equal to 1.0, 3-element Fill{Float32}: entries equal to 1.0)

julia> gradient((x,y) -> sum(cpu(x) + y), zeros(3), Fill(3,3))
(3-element Fill{Float64}: entries equal to 1.0, 3-element Fill{Float64}: entries equal to 1.0)

julia> gradient((x,y) -> sum(gpu(x) + y), zeros(3), Fill(3,3))
(Float32[1.0, 1.0, 1.0], Float32[1.0, 1.0, 1.0])

julia> gradient((x,y) -> sum(gpu(x) + y), CUDA.zeros(3), Fill(3,3))
(Float32[1.0, 1.0, 1.0], Float32[1.0, 1.0, 1.0])

so if something does in fact fail, it seems like the issue is elsewhere, and adapt needs to be given more information, or perhaps an adapt rule is missing somewhere.

Member (@DhairyaLGandhi, Aug 31, 2021):

The issue may be that adapt(Array, ::Fill) allocates, which mixes types if it interacts with GPU arrays.

julia> CUDA.Adapt.adapt(Array, Fill(3,3))
3-element Vector{Int64}:
 3
 3
 3

This would need to be addressed in Adapt.

Member Author:

It's true that some operations between CuArrays and FillArrays do work, as they are overloaded. But many don't, like *, or worse, conv. That's why Zygote at present does not make a Fill for the gradient of sum(::CuArray); this PR copies that precedent.

Member Author (@mcabbott, Aug 31, 2021):

Oh, it's bizarre that adapt(Array, Fill(3,3)) makes a Vector, but adapt(CuArray, Fill(3,3)) does nothing.

Note that at present, this is only triggered if you call cpu(::Fill) directly; what should be a pass-through isn't quite one. This is true of the tagged version of Flux, too; I'd call that a bug:

julia> cpu(Fill(3,3))
3-element Vector{Int64}:
 3
 3
 3

It isn't triggered in the gradient of something like x::Array |> cpu |> sum, which is a case currently tested. Easy to fix, though.

Member (@DhairyaLGandhi, Aug 31, 2021):

Maybe adapt should make its Fill behaviour with Cu/Arrays more explicit.

Regarding "like *,": this should be made to work in FillArrays / Adapt.

Member Author:

There is a PR for * in FillArrays, but it gets pretty hairy. And there are many other operations, like I said.

You can try to persuade Adapt to depend on FillArrays, or the reverse, but I think you will have a hard time.

Anyway I think not changing sum(::CuArray) is the conservative position here. This PR tries to make sure that cpu follows that.

Member:

We aren't changing the sum method here.

else
# Allows both cpu(x::CuArray) and cpu(x::Adjoint{T,CuArray}):
return y, dy -> (Zygote.ChainRules.NoTangent(), _gpu_array(dy))
end
end

_isbitsarray(::AbstractArray{<:Number}) = true
_isbitsarray(::AbstractArray{T}) where T = isbitstype(T)
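
A quick illustration of what this `exclude` predicate selects (a sketch; only the two methods shown above are assumed):

```julia-repl
julia> _isbitsarray(rand(Float32, 2, 2))   # numeric arrays count as leaves to move
true

julia> _isbitsarray(fill("a", 2))          # String is not a bits type, so this is not selected
false
```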
@@ -99,8 +112,7 @@ Moves `m` to the current GPU device, if available. It is a no-op otherwise.
See the [CUDA.jl docs](https://juliagpu.github.io/CUDA.jl/stable/usage/multigpu/)
to help identify the current device.

This works for functions and
any struct with [`@functor`](@ref) defined.
This works for functions, and any struct marked with [`@functor`](@ref).

```julia-repl
julia> m = Dense(1,2)
@@ -116,7 +128,27 @@ julia> typeof(m_gpu.W) # notice the type of the array changed to a CuArray
CuArray{Float32, 2}
```
"""
gpu(x) = use_cuda[] ? fmap(CUDA.cu, x; exclude = _isbitsarray) : x
gpu(x) = use_cuda[] ? fmap(_gpu_array, x; exclude = _isbitsarray) : x

_gpu_array(x::AbstractArray) = CUDA.cu(x)

# While `cu` moves Arrays to the GPU, we also want to move some structured arrays
# https://github.com/FluxML/Zygote.jl/issues/1005
_gpu_array(x::Zygote.FillArrays.AbstractFill) = CUDA.fill(first(x), size(x)) # gradient of sum

Member:

This is changing the behaviour of CUDA.cu and will eagerly materialise.

Member Author:

No, CUDA.cu is deliberately untouched.

Member:

gpu doesn't mirror cu anymore

Member Author:

Correct, that's why "changing the behaviour of CUDA.cu" is incorrect.

Member:

The expected behavior is that it should.

Member Author:

So we want new behaviour, and not to pirate CUDA, and the same behaviour? I hope it's clear that you cannot have all three. This PR makes a choice. (Not the choice you claimed, though.)

Member:

The point is that there is no new behaviour. There is a bug that needs to be identified with the cu conversions. @maleadt would be able to say.

Collaborator:

I'm not. The only bug reports that may have been related were inconclusive (the Flux issue just showing a Zygote error) or caused by other issues (RecursiveArrayTools missing an essential constructor).
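
For concreteness, here is what the `AbstractFill` method above does with the gradient of `sum` (a sketch; assumes CUDA.jl with a working device, plus FillArrays.jl):

```julia-repl
julia> using CUDA, FillArrays

julia> g = Fill(1f0, 2, 3);                 # the CPU-side gradient Zygote's sum rule produces

julia> CUDA.fill(first(g), size(g)) isa CuArray{Float32}   # materialised densely on the GPU
true
```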

function _gpu_array(x::Zygote.OneElement) # gradient of getindex
y = CUDA.zeros(eltype(x), size(x))
CUDA.@allowscalar y[x.ind...] = x.val

Member:

Seems like it may be too easy to trigger scalar operations silently now. Reporting that they are happening is better for performance and proper use of CUDA.

Member Author:

No, this is precisely to solve the issue linked.

Member:

No, events of scalar indexing on GPU are reported as errors precisely because they don't work with the way a GPU is expected to work. This is bypassing that assumption.

Member Author:

This is doing roughly the same thing as https://github.com/FluxML/Zygote.jl/pull/880/files#diff-1040a6fce812f91964b258de2a02fb69296c75a9701c3326df2671ab9ea7e5f0R284 . I don't see loud objections that that should have been left an error because that's how "GPU is expected to work".

Member:

I knew OneElement was of limited benefit over generalised accum, but now I'm wondering whether it can silently avoid reporting scalar indexing.

Member Author:

Regarding "if it can silently not report scalar indexing": yes, that's what @allowscalar does. Just like vcat does.

y
end
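
And the `OneElement` path made concrete (a sketch; `Zygote.OneElement` is an internal type, constructed here the same way as in the tests below):

```julia-repl
julia> using Zygote, CUDA

julia> g = Zygote.OneElement(1f0, (2, 3), axes(ones(4, 5)));   # gradient of x[2, 3] for a 4×5 matrix x

julia> y = CUDA.zeros(eltype(g), size(g));

julia> CUDA.@allowscalar y[g.ind...] = g.val;   # one scalar write, mirroring _gpu_array above

julia> Array(y)[2, 3]
1.0f0
```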

function Zygote.ChainRules.rrule(::typeof(_gpu_array), x::AbstractArray)
y = _gpu_array(x)
if x === y # trivial case, e.g. gpu(x::Adjoint{T,CuArray})
return y, dy -> (Zygote.ChainRules.NoTangent(), dy)
else
return y, dy -> (Zygote.ChainRules.NoTangent(), _cpu_array(dy))
end
end

# Precision

41 changes: 41 additions & 0 deletions test/cuda/cuda.jl
@@ -84,3 +84,44 @@ end
@test gpu((;a=[SimpleBits(1)])).a isa CuVector{SimpleBits}
end
end

@testset "gpu(cpu(x)) inside gradient" begin
a = randn(Float32, 4, 4)
ca = cu(a)

# Trivial functions
@test gradient(x -> sum(abs, gpu(x)), a)[1] isa Matrix
@test gradient(x -> sum(gpu(x)), a)[1] isa Matrix
@test gradient(x -> sum(gpu(x)), a')[1] isa Matrix
@test gradient(x -> sum(abs, cpu(x)), ca)[1] isa CuArray
@test gradient(x -> sum(cpu(x)), ca)[1] isa CuArray # This involves FillArray
@test gradient(x -> sum(cpu(x)), ca')[1] isa CuArray

# Even more trivial: no movement
@test gradient(x -> sum(abs, cpu(x)), a)[1] isa Matrix
@test gradient(x -> sum(abs, cpu(x)), a')[1] isa Matrix
@test gradient(x -> sum(cpu(x)), a)[1] isa FillArrays.Fill
@test gradient(x -> sum(abs, gpu(x)), ca)[1] isa CuArray
@test_skip gradient(x -> sum(abs, gpu(x)), ca')[1] isa CuArray

# More complicated, Array * CuArray is an error
g0 = gradient(x -> sum(abs, (a * (a * x))), a)[1]
@test g0 ≈ gradient(x -> sum(abs, cpu(ca * gpu(a * x))), a)[1]
@test cu(g0) ≈ gradient(x -> sum(abs, gpu(a * cpu(ca * x))), ca)[1]

# Scalar indexing of an array, needs OneElement to transfer to GPU
# https://github.com/FluxML/Zygote.jl/issues/1005
@test gradient(x -> cpu(2 .* gpu(x))[1], Float32[1,2,3]) == ([2,0,0],)
@test gradient(x -> cpu(gpu(x) * gpu(x))[1,2], Float32[1 2 3; 4 5 6; 7 8 9]) == ([2 6 8; 0 2 0; 0 3 0],)

# Explicit pieces. It's not entirely clear that it's desirable to move these if they appear alone,
# but it's necessary to move them if they appear in gradient of cpu(::CuArray), as in the
# examples above. Those must not break, but a re-design could perhaps change these:
g1 = Zygote.OneElement(1, (2,3), axes(ones(4,5)))
@test gpu(g1) isa CuArray
@test gpu(g1) ≈ cu(Matrix(g1))

g2 = Fill(1f0,2)
@test gpu(g2) isa CuArray
@test gpu(g2) ≈ cu(Vector(g2))
end