Don't use state anywhere for the whole state tree #136

Merged · 1 commit · Apr 12, 2023

docs/src/index.md (6 changes: 3 additions & 3 deletions)
@@ -45,13 +45,13 @@
image = rand(Float32, 224, 224, 3, 1) |> gpu; # dummy data
@show sum(model(image)); # dummy loss function

rule = Optimisers.Adam() # use the Adam optimiser with its default settings
-state = Optimisers.setup(rule, model); # initialise this optimiser's momentum etc.
+state_tree = Optimisers.setup(rule, model); # initialise this optimiser's momentum etc.

∇model, _ = gradient(model, image) do m, x # calculate the gradients
sum(m(x))
end;

-state, model = Optimisers.update(state, model, ∇model);
+state_tree, model = Optimisers.update(state_tree, model, ∇model);
@show sum(model(image)); # reduced

```
@@ -60,7 +60,7 @@
Notice that a completely new instance of the model is returned. Internally, this
is handled by [Functors.jl](https://fluxml.ai/Functors.jl), where we do a walk over the
tree formed by the model and update the parameters using the gradients.

-There is also [`Optimisers.update!`](@ref) which similarly returns a new model and new state,
+There is also [`Optimisers.update!`](@ref) which similarly returns a new model,
but is free to mutate arrays within the old one for efficiency.
(The method of `apply!` above is likewise free to mutate arrays within its state;
they are defensively copied when this rule is used with `update`.)
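For readers skimming this diff, here is a minimal sketch of the non-mutating `update` versus mutating `update!` paths described in the docs above. The toy model, the `Descent` rule, and the loss function are invented for illustration and are not part of this PR:

```julia
using Optimisers, Zygote

model = (w = rand(Float32, 3), b = zeros(Float32, 3))          # toy two-array "model"
state_tree = Optimisers.setup(Optimisers.Descent(0.1), model)  # one Leaf per trainable array

∇model = gradient(m -> sum(abs2, m.w .+ m.b), model)[1]        # structural gradient (a NamedTuple)

# Non-mutating: returns a fresh state tree and model; the originals are left untouched.
state_tree2, model2 = Optimisers.update(state_tree, model, ∇model)

# Mutating variant: may overwrite arrays inside the old model and state tree for speed,
# so only the returned values should be used afterwards.
state_tree3, model3 = Optimisers.update!(state_tree, model, ∇model)
```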
src/Optimisers.jl (10 changes: 6 additions & 4 deletions)
@@ -66,7 +66,7 @@
init
###

"""
-    Optimisers.setup(rule, model) -> tree
+    Optimisers.setup(rule, model) -> state_tree

Initialises the given optimiser for every trainable parameter within the model.
Returns a tree of the relevant states, which must be passed to [`update`](@ref)
@@ -141,6 +141,7 @@
This is used in exactly the same manner as [`update`](@ref), but because it may mutate
arrays within the old model (and the old state), it will be faster for models of ordinary
`Array`s or `CuArray`s. However, you should not rely on the old model being fully updated
but rather use the returned model.
+(The original state tree is always mutated, as each `Leaf` is mutable.)

# Example

@@ -149,9 +150,10 @@
julia> using StaticArrays, Zygote, Optimisers

julia> m = (x = [1f0, 2f0], y = SA[4f0, 5f0]); # partly mutable model

-julia> t = Optimisers.setup(Momentum(1/30, 0.9), m);
+julia> t = Optimisers.setup(Momentum(1/30, 0.9), m) # tree of states
+(x = Leaf(Momentum{Float64}(0.0333333, 0.9), Float32[0.0, 0.0]), y = Leaf(Momentum{Float64}(0.0333333, 0.9), Float32[0.0, 0.0]))

-julia> g = gradient(m -> sum(abs2.(m.x .+ m.y)), m)[1]
+julia> g = gradient(m -> sum(abs2.(m.x .+ m.y)), m)[1] # structural gradient
(x = Float32[10.0, 14.0], y = Float32[10.0, 14.0])

julia> t2, m2 = Optimisers.update!(t, m, g);
@@ -165,7 +167,7 @@
true
julia> m # original should be discarded, may be mutated but no guarantee
(x = Float32[0.6666666, 1.5333333], y = Float32[4.0, 5.0])

-julia> t == t2 # original state is in fact guaranteed to be mutated
+julia> t == t2 # original state tree is guaranteed to be mutated
true
```
"""
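As a rough sketch of what the renamed `state_tree` contains, `setup` returns a tree mirroring the model, with one mutable `Leaf` (rule plus any momentum buffers) at each parameter array. The nested toy model below is invented for illustration and is not taken from the PR:

```julia
using Optimisers

model = (layer1 = (W = rand(Float32, 2, 2), b = zeros(Float32, 2)),
         layer2 = (W = rand(Float32, 2, 2),))

state_tree = Optimisers.setup(Optimisers.Momentum(1/100, 0.9), model)

# The state tree has the same nested shape as the model,
# with a Leaf at each parameter array.
state_tree.layer1.W isa Optimisers.Leaf   # true
state_tree.layer2.W isa Optimisers.Leaf   # true
```

Because each `Leaf` is mutable, `update!` can write new momenta into this tree in place, which is what the docstring addition above now states explicitly.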