Skip to content

Commit

Permalink
more verbose and accurate testing
Browse files Browse the repository at this point in the history
  • Loading branch information
simeonschaub committed Dec 21, 2020
1 parent fc6c8e9 commit 5df14d2
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 59 deletions.
3 changes: 2 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,11 @@ ZygoteRules = "0.2"
julia = "1.3"

[extras]
FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[targets]
test = ["FiniteDifferences", "Random", "Statistics", "Test", "Zygote"]
105 changes: 47 additions & 58 deletions test/zygote.jl
Original file line number Diff line number Diff line change
@@ -1,111 +1,100 @@
using Zygote, NNlib
using Random
using NNlib: conv, ∇conv_data, depthwiseconv, batched_mul
using FiniteDifferences: grad, central_fdm

# Numerical gradient of `f` with respect to each array in `xs`, computed
# elementwise by central differences.
#
# Each element is perturbed by ±δ/2, `f` is re-evaluated, and the centered
# difference quotient is stored. The inputs are mutated in place during the
# sweep but every element is restored before moving on, so `xs` are unchanged
# on return. `f` must return a scalar.
#
# Returns `zero.(xs)`-shaped gradient arrays, one per input.
function ngradient(f, xs::AbstractArray...)
    grads = zero.(xs)
    for (x, Δ) in zip(xs, grads), i in eachindex(x)
        # Step size scaled to the element type's precision; for Float64 inputs
        # this equals the previous hard-coded sqrt(eps()).
        δ = sqrt(eps(float(eltype(x))))
        tmp = x[i]
        x[i] = tmp - δ/2
        y1 = f(xs...)
        x[i] = tmp + δ/2
        y2 = f(xs...)
        x[i] = tmp  # restore the original element
        Δ[i] = (y2 - y1) / δ
    end
    return grads
end

# Compare Zygote's gradient of `f` at `xs` against a 5th-order central finite
# difference from FiniteDifferences.jl, `@test`-ing elementwise approximate
# equality per input gradient. Returns `nothing`; failures are reported through
# the enclosing test set.
function gradcheck(f, xs...; rtol = 1e-6, atol = 1e-6)
    grad_zygote = gradient(f, xs...)
    grad_finite_difference = grad(central_fdm(5, 1), f, xs...)
    for (g_zygote, g_fd) in zip(grad_zygote, grad_finite_difference)
        @test isapprox(g_zygote, g_fd; rtol = rtol, atol = atol)
    end
end

# Gradient-check `f` through the scalar-valued wrapper `sum(sin.(f(xs...)))`,
# which turns any array-valued `f` into a scalar loss with nonzero sensitivity
# to each output element. Keyword args are forwarded to `gradcheck`.
gradtest(f, xs::AbstractArray...; kw...) = gradcheck((xs...) -> sum(sin.(f(xs...))), xs...; kw...)
# Convenience method: build random Float64 arrays of the given sizes as inputs.
gradtest(f, dims...; kw...) = gradtest(f, rand.(Float64, dims)...; kw...)

# Activation functions composed with an affine layer. `gradtest`/`gradcheck`
# run `@test` internally and return `nothing`, so their calls are not wrapped.
gradtest((x, W, b) -> relu.(W*x .+ b), 5, (2,5), 2)
gradtest((x, W, b) -> relu.(W*x .+ b), (5,3), (2,5), 2)
gradtest((x, W, b) -> selu.(W*x .+ b), 5, (2,5), 2)
gradtest((x, W, b) -> selu.(W*x .+ b), (5,3), (2,5), 2)
gradtest((x, W, b) -> elu.(W*x .+ b, 2), 5, (2,5), 2)
gradtest((x, W, b) -> elu.(W*x .+ b, 2), (5,3), (2,5), 2)

# tests for https://github.com/FluxML/Zygote.jl/issues/758
# These exact-equality checks evaluate to a Bool, so they must stay inside
# `@test` — a bare `gradient(...) == (...)` at top level is a no-op.
@test gradient(xs -> sum(selu.(xs)), [1_000, 10_000]) == ([1.0507009873554805, 1.0507009873554805],)
@test gradient(x -> selu(x), 1_000) == (1.0507009873554805,)
@test gradient(xs -> sum(elu.(xs, 2)), [1_000, 10_000]) == ([1., 1.],)
@test gradient(x -> elu(x, 2), 1_000) == (1.,)
@test gradient(x -> elu(x, 2), -1) == (2*exp(-1),)
gradcheck(x -> sum(selu.(x)), [100., 1_000.])
gradcheck(x -> sum(elu.(x, 3.5)), [100., 1_000.])
gradcheck(x -> sum(elu.(x, 3.5)), [1_000., 10_000.]) # for elu the tests are passing but for selu not, interesting
# numerical instability even for the linear part of such function, see:
# julia> ngradient(x->sum(selu.(x)),[1_000., 10_000.])
# ([1.0506591796875, 1.0506591796875],)
# julia> gradient(x->sum(selu.(x)),[1_000., 10_000.])
# ([1.0507009873554805, 1.0507009873554805],)
@test_broken gradcheck(x->sum(selu.(x)),[1_000., 10_000.])

# Sigmoid / log-sigmoid through an affine layer.
gradtest((x, W, b) -> σ.(W*x .+ b), 5, (2,5), 2)
gradtest((x, W, b) -> σ.(W*x .+ b), (5,3), (2,5), 2)
gradtest((x, W, b) -> logσ.(W*x .+ b), 5, (2,5), 2)
gradtest((x, W, b) -> logσ.(W*x .+ b), (5,3), (2,5), 2)

# softmax / logsoftmax, scaled elementwise by 1:3 so each row of the output
# contributes differently to the scalar loss.
gradtest(x -> softmax(x).*(1:3), 3)
gradtest(x -> softmax(x).*(1:3), (3,5))
gradtest(x -> softmax(x, dims=2).*(1:3), (3,5))
gradtest(x -> logsoftmax(x).*(1:3), 3)
gradtest(x -> logsoftmax(x).*(1:3), (3,5))
gradtest(x -> logsoftmax(x, dims=2).*(1:3), (3,5))

@testset "conv: spatial_rank=$spatial_rank" for spatial_rank in (1, 2, 3)
    x = rand(repeat([5], spatial_rank)..., 3, 2)
    w = rand(repeat([3], spatial_rank)..., 3, 3)
    cdims = DenseConvDims(x, w)
    gradtest((x, w) -> conv(x, w, cdims), x, w)
    gradtest((x, w) -> sum(conv(x, w, cdims)), x, w) # https://github.com/FluxML/Flux.jl/issues/1055

    y = conv(x, w, cdims)
    gradtest((y, w) -> ∇conv_data(y, w, cdims), y, w)
    if spatial_rank == 3
        # known failure for 3-D data gradients — keep tracked, not asserted
        @test_broken gradtest((y, w) -> sum(∇conv_data(y, w, cdims)), y, w)
    else
        gradtest((y, w) -> sum(∇conv_data(y, w, cdims)), y, w)
    end

    dcdims = DepthwiseConvDims(x, w)
    gradtest((x, w) -> depthwiseconv(x, w, dcdims), x, w)

    y = depthwiseconv(x, w, dcdims)
    gradtest((y, w) -> ∇depthwiseconv_data(y, w, dcdims), y, w)
    if spatial_rank == 3
        # same known 3-D failure as the dense case above
        @test_broken gradtest((y, w) -> sum(∇depthwiseconv_data(y, w, dcdims)), y, w)
    else
        gradtest((y, w) -> sum(∇depthwiseconv_data(y, w, dcdims)), y, w)
    end
end

@testset "pooling: spatial_rank=$spatial_rank" for spatial_rank in (1, 2)
    x = rand(repeat([10], spatial_rank)..., 3, 2)
    pdims = PoolDims(x, 2)
    gradtest(x -> maxpool(x, pdims), x)
    gradtest(x -> meanpool(x, pdims), x)
    gradtest(x -> sum(maxpool(x, pdims)), x)
    gradtest(x -> sum(meanpool(x, pdims)), x)

    #https://github.com/FluxML/NNlib.jl/issues/188
    k = ntuple(_ -> 2, spatial_rank) # Kernel size of pool in ntuple format
    gradtest(x -> maxpool(x, k), x)
    gradtest(x -> meanpool(x, k), x)
    gradtest(x -> sum(maxpool(x, k)), x)
    gradtest(x -> sum(meanpool(x, k)), x)
end

@testset "batched matrix multiplication" begin
    # Fixed seed keeps the random batched inputs reproducible across runs.
    rng, M, P, Q = MersenneTwister(123456), 13, 7, 11
    B = 3
    gradtest(batched_mul, randn(rng, M, P, B), randn(rng, P, Q, B))
end

0 comments on commit 5df14d2

Please sign in to comment.