JuliaReinforcementLearning · baedan · Jul 19, 2022 · Jul 19, 2022 · Jul 19, 2022 · Jul 19, 2022
diff --git a/.cspell/cspell.json b/.cspell/cspell.json
@@ -3,6 +3,7 @@
     "language": "en",
     "words": [
         "leduc",
+        "unnormalized",
         "Approximator",
         "glorot",
         "imresize",
@@ -196,4 +197,4 @@
         "\\{%.*%\\}", // liquid syntax
         "/^\\s*```[\\s\\S]*?^\\s*```/gm" // Another attempt at markdown code blocks. https://github.com/streetsidesoftware/vscode-spell-checker/issues/202#issuecomment-377477473
     ]
-}
+}
diff --git a/src/ReinforcementLearningZoo/src/algorithms/policy_gradient/policy_gradient.jl b/src/ReinforcementLearningZoo/src/algorithms/policy_gradient/policy_gradient.jl
@@ -1,4 +1,5 @@
 # include("run.jl")
+include("util.jl")
 include("vpg.jl")
 # include("A2C.jl")
 # include("ppo.jl")
@@ -8,4 +9,4 @@ include("vpg.jl")
 # include("td3.jl")
 # include("sac.jl")
 # include("maddpg.jl")
-# include("vmpo.jl")
+# include("vmpo.jl")
diff --git a/src/ReinforcementLearningZoo/src/algorithms/policy_gradient/util.jl b/src/ReinforcementLearningZoo/src/algorithms/policy_gradient/util.jl
@@ -0,0 +1,63 @@
+using Distributions: DiscreteDistribution, ContinuousDistribution
+using Flux: softmax
+
+export action_distribution
+
+"""
+    action_distribution(dist, model_output)
+
+Build the action distribution(s) for the distribution type `dist` from `model_output`; this untyped fallback throws an `ArgumentError`.
+"""
+action_distribution(dist, model_output) = throw(ArgumentError(
+    "dist ($dist) is not a ContinuousDistribution or DiscreteDistribution, not implemented"))
+
+"""
+    action_distribution(dist::Type{<:DiscreteDistribution}, model_output)
+
+For a discrete `dist`, treat each column of `model_output` as unnormalized log probabilities: apply `softmax`, then build one `dist` per column with `check_args=false`.
+# Examples
+```jldoctest
+julia> model_output = reshape(1:10, 5, 2)
+5×2 reshape(::UnitRange{Int64}, 5, 2) with eltype Int64:
+ 1   6
+ 2   7
+ 3   8
+ 4   9
+ 5  10
+julia> action_distribution(Categorical, model_output)
+2-element Vector{Categorical{Float64, Vector{Float64}}}:
+ Categorical{Float64, Vector{Float64}}(
+support: Base.OneTo(5)
+p: [0.011656230956039605, 0.03168492079612427, 0.0861285444362687, 0.23412165725273662, 0.6364086465588308]
+)
+
+ Categorical{Float64, Vector{Float64}}(
+support: Base.OneTo(5)
+p: [0.011656230956039605, 0.03168492079612427, 0.0861285444362687, 0.23412165725273662, 0.6364086465588308]
+)
+```
+"""
+action_distribution(dist::Type{<:DiscreteDistribution}, model_output) =
+    map(probs -> dist(probs; check_args=false), eachcol(softmax(model_output)))
+
+"""
+    action_distribution(dist::Type{<:ContinuousDistribution}, model_output)
+
+For a continuous `dist`, splat each column of `model_output` into the `dist` constructor as its parameters, giving one distribution per column.
+# Examples
+```jldoctest
+julia> model_output = reshape(1:10, 2, 5)
+2×5 reshape(::UnitRange{Int64}, 2, 5) with eltype Int64:
+ 1  3  5  7   9
+ 2  4  6  8  10
+julia> action_distribution(Normal, model_output)
+5-element Vector{Normal{Float64}}:
+ Normal{Float64}(μ=1.0, σ=2.0)
+ Normal{Float64}(μ=3.0, σ=4.0)
+ Normal{Float64}(μ=5.0, σ=6.0)
+ Normal{Float64}(μ=7.0, σ=8.0)
+ Normal{Float64}(μ=9.0, σ=10.0)
+```
+"""
+action_distribution(dist::Type{<:ContinuousDistribution}, model_output) =
+    map(args -> dist(args...), eachcol(model_output))
diff --git a/src/ReinforcementLearningZoo/src/algorithms/policy_gradient/vpg.jl b/src/ReinforcementLearningZoo/src/algorithms/policy_gradient/vpg.jl
@@ -1,7 +1,7 @@
 export VPG
 
 using Random: GLOBAL_RNG, shuffle
-using Distributions: ContinuousDistribution, DiscreteDistribution
+using Distributions: ContinuousDistribution, DiscreteDistribution, logpdf
 using Functors: @functor
 using Flux: params, softmax, gradient, logsoftmax
 using StatsBase: mean
@@ -26,13 +26,7 @@ end
 
 function (π::VPG)(env::AbstractEnv)
     res = env |> state |> send_to_device(π) |> π.approximator |> send_to_host
-    if π.dist <: ContinuousDistribution
-        rand.(π.rng, π.dist.(res...))
-    elseif π.dist <: DiscreteDistribution
-        rand(π.rng, res |> softmax |> π.dist)
-    else
-        @error "unknown distribution"
-    end
+    rand(π.rng, action_distribution(π.dist, res)[1])
 end
 
 function (p::Agent{<:VPG})(::PostEpisodeStage, env::AbstractEnv)
@@ -70,13 +64,7 @@ function RLBase.optimise!(p::VPG, batch::NamedTuple{(:state, :action, :gain)})
     end
 
     gs = gradient(params(A)) do
-        if p.dist <: DiscreteDistribution
-            log_prob = s |> A |> logsoftmax
-            log_probₐ = log_prob[CartesianIndex.(a, 1:length(a))]
-        elseif p.dist <: ContinuousDistribution
-            dist = p.dist.(A(s)...) # TODO: this part does not work on GPU. See: https://github.com/JuliaStats/Distributions.jl/issues/1183 .
-            log_probₐ = logpdf.(dist, A)
-        end
+        log_probₐ = logpdf.(action_distribution(p.dist, A(s)), a)
         loss = -mean(log_probₐ .* δ)
         ignore_derivatives() do
             # @info "VPG" loss = loss