FluxML · bors · Jun 21, 2021 · Jun 20, 2021 · Jun 20, 2021 · Jun 20, 2021
diff --git a/src/lib/broadcast.jl b/src/lib/broadcast.jl
@@ -254,7 +254,14 @@ end
     placeholder = similar(xs)
     sum(xs, dims = dims), Δ -> (placeholder .= Δ,)
   end
-
+
+  # Make sure sum(f, ::CuArray) uses broadcast through the forward-mode rule defined above,
+  # not the ChainRules.rrule, which would use the Zygote.Context and thus not be GPU compatible.
+  @adjoint function sum(f, xs::CuArray; kws...)
+    @assert !haskey(kws, :init) # TODO add init support (julia 1.6)
+    return pullback(__context__, (g, ys) -> sum(g.(ys); kws...), f, xs) # broadcast, then reduce
+  end
+
   @adjoint function Base.convert(::Type{T}, xs::Array)  where {T<:CUDA.CuArray}
     return Base.convert(T, xs), dy -> (nothing, Base.convert(Array, dy)) # no gradient for T
   end

diff --git a/test/cuda.jl b/test/cuda.jl
@@ -26,6 +26,17 @@ end
   @test g_gpu |> collect ≈ g
 end
 
+@testset "sum(f, x)" begin
+  a = Float32[-1.5, -9.0, 2.4, -1.3, 0.01] # no zeros: abs is not differentiable at 0
+  a_gpu = a |> cu
+  # The CPU gradient is the reference; the GPU gradient must match it and stay a CuArray.
+  f(x) = sum(abs, x)
+  g = gradient(f, a)[1]
+  g_gpu = gradient(f, a_gpu)[1]
+  @test g_gpu isa CuArray
+  @test g_gpu |> collect ≈ g
+end
+
 @testset "jacobian" begin
   v1 = cu(collect(1:3f0))