add step! #1833

Open · wants to merge 12 commits into base: master
2 changes: 2 additions & 0 deletions NEWS.md

@@ -3,6 +3,8 @@
## v0.12.9
* Fixed incorrect output and added GPU compatibility for [AlphaDropout](https://github.com/FluxML/Flux.jl/pull/1781).
* Add trilinear [Upsample layer](https://github.com/FluxML/Flux.jl/pull/1792).
* Add `step!` as a single training step of `train!` to allow for more exotic optimisers (#666)

## v0.12.8
* Optimized inference and gradient calculation of OneHotMatrix [pr](https://github.com/FluxML/Flux.jl/pull/1756)
18 changes: 15 additions & 3 deletions docs/src/training/training.md

@@ -17,15 +17,27 @@ for d in datapoints
  # `d` should produce a collection of arguments
  # to the loss function

  # Calculate the gradients of the parameters
  # with respect to the loss function
  grads = Flux.gradient(parameters) do
  # Update the parameters based on the chosen
> **Review comment (Author):** This is right at the beginning instead of in the Custom Training Loop Section. It seems to me like the custom training loop section might either be redundant or demonstrate how to have a custom gradient calculation now.

  # optimiser (opt)
  val, grads = optimstep!(params, opt) do
    loss(d...)
  end
end
```

`optimstep!` is the optimiser implementation and thus dispatches on the
optimiser type. As an example, the default `optimstep!` for optimisers that
use the gradient to update the parameters (e.g. gradient descent, momentum, ADAM, etc.) looks like this:

```julia
function optimstep!(loss, params, opt)
  # Calculate the gradients of the parameters
  # with respect to the loss function
  val, grads = Flux.withgradient(loss, params)
  # Update the parameters based on the chosen
  # optimiser (opt)
  Flux.Optimise.update!(opt, params, grads)
  return val, grads
end
```
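
Because `optimstep!` dispatches on the optimiser type, an optimisation routine that uses no gradients at all can supply its own method. The following is a minimal sketch, not part of this PR: `RandomSearch` and its field `sigma` are hypothetical, and it assumes `optimstep!` lives in `Flux.Optimise` as proposed here.

```julia
# Hypothetical gradient-free optimiser: try a random perturbation of the
# parameters and keep it only if the loss does not get worse.
struct RandomSearch
  sigma::Float64  # scale of the random perturbation
end

function Flux.Optimise.optimstep!(loss, params, opt::RandomSearch)
  old = loss()                                      # loss at the current parameters
  steps = [opt.sigma .* randn(size(p)...) for p in params]
  for (p, dp) in zip(params, steps)
    p .+= dp                                        # take the random step
  end
  new = loss()
  if new > old                                      # worse than before: undo the step
    for (p, dp) in zip(params, steps)
      p .-= dp
    end
    new = old
  end
  return new, nothing                               # no gradients to report
end
```

`train!` would then work unchanged with `opt = RandomSearch(0.01)`, because it only ever calls `optimstep!`.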

2 changes: 1 addition & 1 deletion src/optimise/Optimise.jl

@@ -3,7 +3,7 @@ module Optimise
using LinearAlgebra
import ArrayInterface

export train!, update!,
export train!, step!, update!,
Descent, ADAM, Momentum, Nesterov, RMSProp,
ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW,RADAM, OADAM, AdaBelief,
InvDecay, ExpDecay, WeightDecay, stop, skip, Optimiser,
34 changes: 31 additions & 3 deletions src/optimise/train.jl

@@ -1,5 +1,5 @@
using Juno
import Zygote: Params, gradient
import Zygote: Params, withgradient

"""
update!(x, x̄)
@@ -80,6 +80,35 @@ end
batchmemaybe(x) = tuple(x)
batchmemaybe(x::Tuple) = x

"""
optimstep!(loss, params, opt)
> **Review comment (Author):** I suggest `optimstep!` instead of `trainstep!` to indicate that this is the optimiser interface and keep the ML jargon to a minimum.

> **Reply (@mcabbott, Member, Mar 20, 2022):** One vote for something evoking `train!` to stress that they are closely related.
>
> If the longer-term plan is to use Optimisers.jl, this may not fit with `train!` at all -- some recent discussion here: #1902 (comment). In which case there will be an implicit-style `train!` & `Params` story, and an explicit-style `gradient` and `Optimisers.update!`. With such a divide, this function wants to be clearly on the `train!` & `Params` side.
>
> Maybe it should just be 3-arg `train!`? Without a data iterator, there is no iteration, that's all:
>
>     train!(loss, ::Params, data, ::AbstractOptimiser)  # calls loss(d...) for d in data
>     train!(loss, ::Params, ::AbstractOptimiser)        # calls loss() since there is no data

`optimstep!` uses a `loss` function (taking no inputs) to improve the [Model parameters](@ref) (`params`)
based on a pluggable optimiser (`opt`, see [Optimisers](@ref)). It represents a single step of
the training loop `train!`.

The default implementation of `optimstep!` takes the gradient of `loss`
and calls `Flux.Optimise.update!` to adjust the parameters, but you can overload
`optimstep!` for specific types of `opt`. This can be useful if your optimisation routine
does not follow the standard gradient descent procedure (e.g. gradient-free optimisers).

Unlike `train!`, the loss function given to `optimstep!` takes no arguments.
Instead, `train!` cycles through the data in a loop and calls `optimstep!`:
```julia
for d in data
  optimstep!(ps, opt) do
    loss(d...)
  end
end
```
If you are writing [Custom Training loops](@ref), then you should follow this pattern.
"""
function optimstep!(loss, params, opt)
  val, gs = withgradient(loss, params)
  update!(opt, params, gs)
  return val, gs
end
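
As a usage illustration of the pattern described in the docstring above, here is a minimal, self-contained sketch of a custom training loop built on `optimstep!`. It is not part of this diff; the toy model, data, and hyperparameters are placeholders.

```julia
using Flux

model = Dense(10, 1)                                 # toy model
ps = Flux.params(model)                              # implicit-style parameters
opt = Descent(0.1)                                   # any optimiser with an optimstep! method
data = [(randn(Float32, 10), randn(Float32, 1)) for _ in 1:100]

for (x, y) in data
  val, _ = optimstep!(ps, opt) do
    Flux.Losses.mse(model(x), y)                     # loss for the current datapoint
  end
  @info "training step" loss = val
end
```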

"""
train!(loss, params, data, opt; cb)

@@ -106,10 +135,9 @@ function train!(loss, ps, data, opt; cb = () -> ())
  cb = runall(cb)
  @progress for d in data
    try
      gs = gradient(ps) do
      optimstep!(ps, opt) do
        loss(batchmemaybe(d)...)
      end
      update!(opt, ps, gs)
      cb()
    catch ex
      if ex isa StopException