Implementation of NFSP and NFSP_KuhnPoker experiment #402

Merged: 18 commits, Jul 31, 2021
141 changes: 141 additions & 0 deletions docs/experiments/experiments/NFSP/JuliaRL_NFSP_KuhnPoker.jl
@@ -0,0 +1,141 @@
# ---
# title: JuliaRL\_NFSP\_KuhnPoker
# cover: assets/logo.svg
# description: NFSP applied to KuhnPokerEnv
# date: 2021-07-26
# author: "[Peter Chen](https://github.com/peterchen96)"
# ---

#+ tangle=true
using ReinforcementLearning
using StableRNGs
using Flux
using Flux.Losses

mutable struct ResultNEpisode <: AbstractHook
episode_counter::Int
eval_every::Int
episode::Vector{Int}
results::Vector{Float64}
end

function (hook::ResultNEpisode)(::PostEpisodeStage, policy, env)
hook.episode_counter += 1
if hook.episode_counter % hook.eval_every == 0
push!(hook.episode, hook.episode_counter)
push!(hook.results, RLZoo.nash_conv(policy, env))
end
end

function RL.Experiment(
::Val{:JuliaRL},
::Val{:NFSP},
::Val{:KuhnPoker},
::Nothing;
seed = 123,
)
rng = StableRNG(seed)

## Encode the KuhnPokerEnv's states for training.
env = KuhnPokerEnv()
wrapped_env = StateTransformedEnv(
env;
state_mapping = s -> [findfirst(==(s), state_space(env)) / length(state_space(env))], # for normalization
state_space_mapping = ss -> [[findfirst(==(s), state_space(env)) / length(state_space(env))] for s in state_space(env)]
)
player = 1 # or 2
ns, na = length(state(wrapped_env, player)), length(action_space(wrapped_env, player))

    ## Construct the rl_agent (using `DQN`) and the sl_agent (using `BehaviorCloningPolicy`)
rl_agent = Agent(
policy = QBasedPolicy(
learner = DQNLearner(
approximator = NeuralNetworkApproximator(
model = Chain(
Dense(ns, 128, relu; init = glorot_normal(rng)),
Dense(128, na; init = glorot_normal(rng))
) |> cpu,
optimizer = Descent(0.01),
),
target_approximator = NeuralNetworkApproximator(
model = Chain(
Dense(ns, 128, relu; init = glorot_normal(rng)),
Dense(128, na; init = glorot_normal(rng))
) |> cpu,
),
γ = 1.0f0,
loss_func = huber_loss,
batch_size = 128,
update_freq = 128,
update_horizon = 0,
min_replay_history = 1000,
target_update_freq = 1000,
rng = rng,
),
explorer = EpsilonGreedyExplorer(
kind = :linear,
ϵ_init = 0.06,
ϵ_stable = 0.001,
decay_steps = 3_000_000,
rng = rng,
),
),
trajectory = CircularArraySARTTrajectory(
capacity = 200_000,
state = Vector{Float64} => (ns, )
),
)

sl_agent = Agent(
policy = BehaviorCloningPolicy(;
approximator = NeuralNetworkApproximator(
model = Chain(
Dense(ns, 128, relu; init = glorot_normal(rng)),
Dense(128, na; init = glorot_normal(rng))
) |> cpu,
optimizer = Descent(0.01),
),
explorer = WeightedSoftmaxExplorer(),
batch_size = 128,
min_reservoir_history = 1000,
rng = rng,
),
trajectory = ReservoirTrajectory(
            2_000_000;  # reservoir capacity
rng = rng,
:state => Vector{Float64},
:action_probs => Vector{Float64},
),
)

    ## Set the parameters and initialize the NFSPAgentManager
η = 0.1 # anticipatory parameter
nfsp = NFSPAgentManager(
Dict(
(player, NFSPAgent(
deepcopy(rl_agent),
deepcopy(sl_agent),
η,
rng,
128, # update_freq for NFSPAgent
0, # initial step_counter
true, # initial NFSPAgent's learn mode
)) for player in players(wrapped_env) if player != chance_player(wrapped_env)
)
)

stop_condition = StopAfterEpisode(3_000_000, is_show_progress=!haskey(ENV, "CI"))
hook = ResultNEpisode(0, 10_000, [], [])

Experiment(nfsp, wrapped_env, stop_condition, hook, "# run NFSP on KuhnPokerEnv")
end

#+ tangle=false
using Plots
ex = E`JuliaRL_NFSP_KuhnPoker`
run(ex)
plot(ex.hook.episode, ex.hook.results, xaxis=:log, xlabel="episode", ylabel="nash_conv")

savefig("assets/JuliaRL_NFSP_KuhnPoker.png")#hide

# ![](assets/JuliaRL_NFSP_KuhnPoker.png)
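
# A quick way to sanity-check the result after the run (a minimal sketch; it assumes `ex`
# has been run as above and that the returned `Experiment` exposes `policy` and `env` fields):

RLZoo.nash_conv(ex.policy, ex.env)  # NashConv of the trained NFSP policy on the wrapped env
ex.hook.results[end]                # last value recorded by ResultNEpisode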
6 changes: 6 additions & 0 deletions docs/experiments/experiments/NFSP/config.json
@@ -0,0 +1,6 @@
{
"description": "Neural Fictitious Self-play(NFSP) related experiments.",
"order": [
"JuliaRL_NFSP_KuhnPoker.jl"
]
}
3 changes: 2 additions & 1 deletion docs/experiments/experiments/config.json
@@ -4,7 +4,8 @@
"Policy Gradient",
"Offline",
"Search",
"CFR"
"CFR",
"NFSP"
]
}

1 change: 1 addition & 0 deletions src/ReinforcementLearningZoo/src/algorithms/algorithms.jl
@@ -4,3 +4,4 @@ include("policy_gradient/policy_gradient.jl")
include("searching/searching.jl")
include("cfr/cfr.jl")
include("offline_rl/offline_rl.jl")
include("nfsp/abstract_nfsp.jl")
45 changes: 45 additions & 0 deletions src/ReinforcementLearningZoo/src/algorithms/nfsp/abstract_nfsp.jl
@@ -0,0 +1,45 @@
include("nfsp.jl")
include("nfsp_manager.jl")


function Base.run(
policy::NFSPAgentManager,
env::AbstractEnv,
stop_condition = StopAfterEpisode(1),
hook = EmptyHook(),
)
@assert NumAgentStyle(env) isa MultiAgent
@assert DefaultStateStyle(env) isa InformationSet

is_stop = false

while !is_stop
RLBase.reset!(env)
hook(PRE_EPISODE_STAGE, policy, env)

# set train mode
for player in players(env)
if player != chance_player(env)
agent = policy.agents[player]
agent.mode = rand(agent.rng) < agent.η
end
end

while !is_terminated(env) # one episode
RLBase.update!(policy, env)
hook(POST_ACT_STAGE, policy, env)

if stop_condition(policy, env)
is_stop = true
break
end
end # end of an episode

if is_terminated(env)
policy(POST_EPISODE_STAGE, env)
hook(POST_EPISODE_STAGE, policy, env)
end
end
hook(POST_EXPERIMENT_STAGE, policy, env)
hook
end
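
# Example usage (a sketch mirroring the KuhnPoker experiment, where `nfsp` is an
# NFSPAgentManager and `wrapped_env` a multi-agent env with `InformationSet` states):
#     run(nfsp, wrapped_env, StopAfterEpisode(10_000), EmptyHook())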
127 changes: 127 additions & 0 deletions src/ReinforcementLearningZoo/src/algorithms/nfsp/nfsp.jl
@@ -0,0 +1,127 @@
export NFSPAgent


"""
NFSPAgent(; rl_agent::Agent, sl_agent::Agent, args...)

Neural Fictitious Self-Play (NFSP) agent implemented in Julia.
See the paper https://arxiv.org/abs/1603.01121 for more details.

# Keyword arguments

- `rl_agent::Agent`, Reinforcement Learning(RL) agent(use `DQN` for example), which works to search the best response from the self-play process.
- `sl_agent::Agent`, Supervisor Learning(SL) agent(use `BehaviorCloningPolicy` for example), which works to learn the best response from the rl_agent's policy.
- `η`, anticipatory parameter, the probability to use `ϵ-greedy(Q)` policy when training the agent.
- `rng=Random.GLOBAL_RNG`.
- `update_freq::Int`: the frequency of updating the agents' `approximator`.
- `step_counter::Int`, count the step.
- `mode::Bool`, used when learning, true as BestResponse(rl_agent's output), false as AveragePolicy(sl_agent's output).
"""
mutable struct NFSPAgent <: AbstractPolicy
rl_agent::Agent
sl_agent::Agent
η
rng
update_freq::Int
step_counter::Int
mode::Bool
end
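
# Example construction (a sketch with hypothetical values; the fields are positional,
# as in the KuhnPoker experiment):
#     NFSPAgent(deepcopy(rl_agent), deepcopy(sl_agent), 0.1, StableRNG(123), 128, 0, true)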

function RLBase.update!(π::NFSPAgent, env::AbstractEnv)
action = π.mode ? π.rl_agent(env) : π.sl_agent(env)
π(PRE_ACT_STAGE, env, action)
env(action)
π(POST_ACT_STAGE, env)
end

RLBase.prob(π::NFSPAgent, env::AbstractEnv, args...) = prob(π.sl_agent.policy, env, args...)

function (π::NFSPAgent)(stage::PreActStage, env::AbstractEnv, action)
rl = π.rl_agent
sl = π.sl_agent

# update trajectory
if π.mode
action_probs = prob(rl.policy, env)
if typeof(action_probs) == Categorical{Float64, Vector{Float64}}
action_probs = probs(action_probs)
end

RLBase.update!(sl.trajectory, sl.policy, env, stage, action_probs)
rl(PRE_ACT_STAGE, env, action) # also update rl_agent's network
else
RLBase.update!(rl.trajectory, rl.policy, env, stage, action)
end

# update agent's approximator
π.step_counter += 1
if π.step_counter % π.update_freq == 0
RLBase.update!(sl.policy, sl.trajectory)
if !π.mode
rl_learn(π.rl_agent)
end
end
end

(π::NFSPAgent)(stage::PostActStage, env::AbstractEnv) = π.rl_agent(stage, env)

function (π::NFSPAgent)(stage::PostEpisodeStage, env::AbstractEnv)
rl = π.rl_agent
sl = π.sl_agent
RLBase.update!(rl.trajectory, rl.policy, env, stage)

# train the agent
π.step_counter += 1
if π.step_counter % π.update_freq == 0
RLBase.update!(sl.policy, sl.trajectory)
if !π.mode
rl_learn(π.rl_agent)
end
end
end

# The following are supplementary functions.
# If the implementation works well, they may be moved to their corresponding files.
function rl_learn(rl_agent::Agent{<:QBasedPolicy, <:AbstractTrajectory})
    # Only update the learner's approximator here; the target_approximator is not updated.
learner, t = rl_agent.policy.learner, rl_agent.trajectory
length(t[:terminal]) - learner.sampler.n <= learner.min_replay_history && return

    inds, batch = sample(learner.rng, t, learner.sampler)

if t isa PrioritizedTrajectory
priorities = update!(learner, batch)
t[:priority][inds] .= priorities
else
update!(learner, batch)
end
end

function RLBase.update!(p::BehaviorCloningPolicy, batch::NamedTuple{(:state, :action_probs)})
s, probs = batch.state, batch.action_probs
m = p.approximator
gs = gradient(params(m)) do
ŷ = m(s)
y = probs
Flux.Losses.logitcrossentropy(ŷ, y)
end
update!(m, gs)
end

function RLBase.update!(
trajectory::ReservoirTrajectory,
policy::AbstractPolicy,
env::AbstractEnv,
::PreActStage,
action_probs::Vector{Float64},
)
s = policy isa NamedPolicy ? state(env, nameof(policy)) : state(env)
if haskey(trajectory.buffer, :legal_actions_mask)
lasm =
policy isa NamedPolicy ? legal_action_space_mask(env, nameof(policy)) :
legal_action_space_mask(env)
push!(trajectory; :state => s, :action_probs => action_probs, :legal_actions_mask => lasm)
else
push!(trajectory; :state => s, :action_probs => action_probs)
end
end
39 changes: 39 additions & 0 deletions src/ReinforcementLearningZoo/src/algorithms/nfsp/nfsp_manager.jl
@@ -0,0 +1,39 @@
export NFSPAgentManager


"""
NFSPAgentManager(; agents::Dict{Any, NFSPAgent})

A special MultiAgentManager in which all agents use NFSP policy to play the game.
"""
mutable struct NFSPAgentManager <: AbstractPolicy
agents::Dict{Any, NFSPAgent}
end
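
# Example construction (a sketch; one NFSPAgent per non-chance player, as in the KuhnPoker
# experiment; `make_nfsp_agent` is a hypothetical helper that returns an NFSPAgent):
#     NFSPAgentManager(Dict(p => make_nfsp_agent(p) for p in players(env) if p != chance_player(env)))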

function (π::NFSPAgentManager)(env::AbstractEnv)
player = current_player(env)
if player == chance_player(env)
env |> legal_action_space |> rand |> env
else
env |> π.agents[player] |> env
end
end

function (π::NFSPAgentManager)(stage::PostEpisodeStage, env::AbstractEnv)
for player in players(env)
if player != chance_player(env)
π.agents[player](stage, env)
end
end
end

RLBase.prob(π::NFSPAgentManager, env::AbstractEnv, args...) = prob(π.agents[current_player(env)], env, args...)

function RLBase.update!(π::NFSPAgentManager, env::AbstractEnv)
player = current_player(env)
if player == chance_player(env)
env |> legal_action_space |> rand |> env
else
RLBase.update!(π.agents[player], env)
end
end
@@ -62,8 +62,9 @@ end

function RLBase.prob(p::BehaviorCloningPolicy, env::AbstractEnv)
    s = state(env)
-   s_batch = Flux.unsqueeze(s, ndims(s) + 1)
-   values = p.approximator(s_batch) |> vec |> send_to_host
+   m = p.approximator
+   s_batch = send_to_device(device(m), Flux.unsqueeze(s, ndims(s) + 1))
+   values = m(s_batch) |> vec |> send_to_host
    typeof(ActionStyle(env)) == MinimalActionSet ? prob(p.explorer, values) : prob(p.explorer, values, legal_action_space_mask(env))
end
