Refactor to use tyro #424

Merged: 44 commits, Nov 28, 2023
Commits (44)
cd4851e
Refactor to use tyro
vwxyzjn Oct 16, 2023
b97d54f
push
vwxyzjn Oct 16, 2023
b87a015
psuh
vwxyzjn Oct 16, 2023
896f346
refactor
vwxyzjn Oct 16, 2023
6220645
fix pre-commit
vwxyzjn Oct 16, 2023
adbf836
fix pre-commit
vwxyzjn Oct 16, 2023
8af1e13
fix commend
vwxyzjn Oct 16, 2023
0b61550
Merge branch 'master' into refactor-tyro
sdpkjc Oct 16, 2023
96a56b8
refactor
vwxyzjn Oct 16, 2023
a8795a9
Merge branch 'refactor-tyro' of https://github.com/vwxyzjn/cleanrl in…
vwxyzjn Oct 16, 2023
cb6b47a
update poetry
vwxyzjn Oct 16, 2023
cfeedb0
fix test case
vwxyzjn Oct 16, 2023
9c0959c
quick fix
vwxyzjn Oct 16, 2023
5f3f716
fix
vwxyzjn Oct 17, 2023
08f4392
update optuna
vwxyzjn Oct 17, 2023
de6c829
quick change
vwxyzjn Oct 17, 2023
b09e088
fix ppg
vwxyzjn Oct 17, 2023
e92cf57
quick fix
vwxyzjn Oct 17, 2023
57b05fb
fix optuna
vwxyzjn Oct 17, 2023
17f49db
quick change
vwxyzjn Oct 17, 2023
cbbdc8b
fix
vwxyzjn Oct 17, 2023
e69b317
quick change
vwxyzjn Oct 17, 2023
f83a218
quick change
vwxyzjn Oct 17, 2023
86e6275
fix bug in multi-gpu
vwxyzjn Nov 8, 2023
bf5368a
refactor benchmark, support slurm
vwxyzjn Nov 8, 2023
aec360b
remove mujoco_py stuff
vwxyzjn Nov 9, 2023
46efc25
add slurm template
vwxyzjn Nov 9, 2023
072eafb
pre-commit
vwxyzjn Nov 9, 2023
b2542e0
update ddpg docs
vwxyzjn Nov 9, 2023
33a5609
update td3 docs
vwxyzjn Nov 9, 2023
4d8c3da
update sac
vwxyzjn Nov 9, 2023
70702cf
bug fix
vwxyzjn Nov 13, 2023
4c09502
Merge branch 'refactor-tyro' of https://github.com/vwxyzjn/cleanrl in…
vwxyzjn Nov 13, 2023
60b71f7
update docs
vwxyzjn Nov 27, 2023
7a96de2
update ppo docs
vwxyzjn Nov 27, 2023
89846df
bump version
vwxyzjn Nov 27, 2023
4f0dc48
bump version
vwxyzjn Nov 27, 2023
d821748
bump test cases
vwxyzjn Nov 27, 2023
7880155
add benchmark utility docs
vwxyzjn Nov 27, 2023
50ec155
bump test
vwxyzjn Nov 27, 2023
940595a
fix #418
vwxyzjn Nov 27, 2023
b0caf45
update requirements.txt
vwxyzjn Nov 27, 2023
aaf7dd0
test
vwxyzjn Nov 27, 2023
2fb4814
add numpy
vwxyzjn Nov 28, 2023
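
For context before the per-commit diff below: the refactor replaces each script's argparse-based parse_args() with a typed Args dataclass parsed by tyro.cli, with the string literal under each field picked up as its help text. A minimal sketch of the pattern (illustrative only; the field set here is an example, not copied from any single script in this PR):

import os
from dataclasses import dataclass

import tyro


@dataclass
class Args:
    exp_name: str = os.path.basename(__file__)[: -len(".py")]
    """the name of this experiment"""
    seed: int = 1
    """seed of the experiment"""
    num_envs: int = 128
    """the number of parallel game environments"""
    num_steps: int = 128
    """the number of steps to run in each environment per policy rollout"""

    # to be filled in at runtime, after parsing
    batch_size: int = 0
    """the batch size (computed in runtime)"""


if __name__ == "__main__":
    args = tyro.cli(Args)  # replaces the old argparse-based parse_args()
    args.batch_size = int(args.num_envs * args.num_steps)
    print(f"running {args.exp_name} with batch size {args.batch_size}")
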
Changes from 1 commit: 896f34624b92f207a82e3f5bb0cb2511b968433a ("refactor")
vwxyzjn committed Oct 16, 2023
145 changes: 75 additions & 70 deletions cleanrl/ppo_rnd_envpool.py
@@ -13,84 +13,86 @@
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tyro
from gym.wrappers.normalize import RunningMeanStd
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter


def parse_args():
# fmt: off
parser = argparse.ArgumentParser()
parser.add_argument("--exp-name", type=str, default=os.path.basename(__file__).rstrip(".py"),
help="the name of this experiment")
parser.add_argument("--seed", type=int, default=1,
help="seed of the experiment")
parser.add_argument("--torch-deterministic", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="if toggled, `torch.backends.cudnn.deterministic=False`")
parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="if toggled, cuda will be enabled by default")
parser.add_argument("--track", type=lambda x: bool(strtobool(x)), default=False, nargs="?", const=True,
help="if toggled, this experiment will be tracked with Weights and Biases")
parser.add_argument("--wandb-project-name", type=str, default="cleanRL",
help="the wandb's project name")
parser.add_argument("--wandb-entity", type=str, default=None,
help="the entity (team) of wandb's project")
@dataclass
class Args:
exp_name: str = os.path.basename(__file__)[: -len(".py")]
"""the name of this experiment"""
seed: int = 1
"""seed of the experiment"""
torch_deterministic: bool = True
"""if toggled, `torch.backends.cudnn.deterministic=False`"""
cuda: bool = True
"""if toggled, cuda will be enabled by default"""
track: bool = False
"""if toggled, this experiment will be tracked with Weights and Biases"""
wandb_project_name: str = "cleanRL"
"""the wandb's project name"""
wandb_entity: str = None
"""the entity (team) of wandb's project"""
capture_video: bool = False
"""whether to capture videos of the agent performances (check out `videos` folder)"""

# Algorithm specific arguments
parser.add_argument("--env-id", type=str, default="MontezumaRevenge-v5",
help="the id of the environment")
parser.add_argument("--total-timesteps", type=int, default=2000000000,
help="total timesteps of the experiments")
parser.add_argument("--learning-rate", type=float, default=1e-4,
help="the learning rate of the optimizer")
parser.add_argument("--num-envs", type=int, default=128,
help="the number of parallel game environments")
parser.add_argument("--num-steps", type=int, default=128,
help="the number of steps to run in each environment per policy rollout")
parser.add_argument("--anneal-lr", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="Toggle learning rate annealing for policy and value networks")
parser.add_argument("--gamma", type=float, default=0.999,
help="the discount factor gamma")
parser.add_argument("--gae-lambda", type=float, default=0.95,
help="the lambda for the general advantage estimation")
parser.add_argument("--num-minibatches", type=int, default=4,
help="the number of mini-batches")
parser.add_argument("--update-epochs", type=int, default=4,
help="the K epochs to update the policy")
parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="Toggles advantages normalization")
parser.add_argument("--clip-coef", type=float, default=0.1,
help="the surrogate clipping coefficient")
parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
parser.add_argument("--ent-coef", type=float, default=0.001,
help="coefficient of the entropy")
parser.add_argument("--vf-coef", type=float, default=0.5,
help="coefficient of the value function")
parser.add_argument("--max-grad-norm", type=float, default=0.5,
help="the maximum norm for the gradient clipping")
parser.add_argument("--target-kl", type=float, default=None,
help="the target KL divergence threshold")
parser.add_argument("--sticky-action", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="if toggled, sticky action will be used")
env_id: str = "MontezumaRevenge-v5"
"""the id of the environment"""
total_timesteps: int = 2000000000
"""total timesteps of the experiments"""
learning_rate: float = 1e-4
"""the learning rate of the optimizer"""
num_envs: int = 128
"""the number of parallel game environments"""
num_steps: int = 128
"""the number of steps to run in each environment per policy rollout"""
anneal_lr: bool = True
"""Toggle learning rate annealing for policy and value networks"""
gamma: float = 0.999
"""the discount factor gamma"""
gae_lambda: float = 0.95
"""the lambda for the general advantage estimation"""
num_minibatches: int = 4
"""the number of mini-batches"""
update_epochs: int = 4
"""the K epochs to update the policy"""
norm_adv: bool = True
"""Toggles advantages normalization"""
clip_coef: float = 0.1
"""the surrogate clipping coefficient"""
clip_vloss: bool = True
"""Toggles whether or not to use a clipped loss for the value function, as per the paper."""
ent_coef: float = 0.001
"""coefficient of the entropy"""
vf_coef: float = 0.5
"""coefficient of the value function"""
max_grad_norm: float = 0.5
"""the maximum norm for the gradient clipping"""
target_kl: float = None
"""the target KL divergence threshold"""

# RND arguments
parser.add_argument("--update-proportion", type=float, default=0.25,
help="proportion of exp used for predictor update")
parser.add_argument("--int-coef", type=float, default=1.0,
help="coefficient of extrinsic reward")
parser.add_argument("--ext-coef", type=float, default=2.0,
help="coefficient of intrinsic reward")
parser.add_argument("--int-gamma", type=float, default=0.99,
help="Intrinsic reward discount rate")
parser.add_argument("--num-iterations-obs-norm-init", type=int, default=50,
help="number of iterations to initialize the observations normalization parameters")

args = parser.parse_args()
args.batch_size = int(args.num_envs * args.num_steps)
args.minibatch_size = int(args.batch_size // args.num_minibatches)
# fmt: on
return args
update_proportion: float = 0.25
"""proportion of exp used for predictor update"""
int_coef: float = 1.0
"""coefficient of intrinsic reward"""
ext_coef: float = 2.0
"""coefficient of extrinsic reward"""
int_gamma: float = 0.99
"""Intrinsic reward discount rate"""
num_iterations_obs_norm_init: int = 50
"""number of iterations to initialize the observations normalization parameters"""

# to be filled in runtime
batch_size: int = 0
"""the batch size (computed in runtime)"""
minibatch_size: int = 0
"""the mini-batch size (computed in runtime)"""
num_iterations: int = 0
"""the number of iterations (computed in runtime)"""


class RecordEpisodeStatistics(gym.Wrapper):
@@ -242,7 +244,10 @@ def update(self, rews):


if __name__ == "__main__":
args = parse_args()
args = tyro.cli(Args)
args.batch_size = int(args.num_envs * args.num_steps)
args.minibatch_size = int(args.batch_size // args.num_minibatches)
args.num_iterations = args.total_timesteps // args.batch_size
Review comment (Collaborator):

args.num_iterations does not appear to replace the previous num_updates; it is defined but never used.
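
For readers following the comment above: in the other refactored scripts, the runtime-computed num_iterations takes over the role of the old num_updates and drives the outer rollout loop and learning-rate annealing. A rough sketch of that usage (illustrative, not part of the diff shown here; args and optimizer are assumed to be the parsed Args instance and the Adam optimizer defined elsewhere in the script):

# sketch of how args.num_iterations is typically consumed in the refactored scripts
for iteration in range(1, args.num_iterations + 1):
    if args.anneal_lr:
        # linearly anneal the learning rate over the course of training
        frac = 1.0 - (iteration - 1.0) / args.num_iterations
        optimizer.param_groups[0]["lr"] = frac * args.learning_rate
    # ... collect args.num_steps of rollout data per env, then run the PPO/RND update ...
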

run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
if args.track:
import wandb
139 changes: 67 additions & 72 deletions cleanrl/qdagger_dqn_atari_impalacnn.py
@@ -12,6 +12,7 @@
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tyro
from huggingface_hub import hf_hub_download
from rich.progress import track
from stable_baselines3.common.atari_wrappers import (
@@ -28,81 +29,72 @@
from cleanrl_utils.evals.dqn_eval import evaluate


def parse_args():
# fmt: off
parser = argparse.ArgumentParser()
parser.add_argument("--exp-name", type=str, default=os.path.basename(__file__).rstrip(".py"),
help="the name of this experiment")
parser.add_argument("--seed", type=int, default=1,
help="seed of the experiment")
parser.add_argument("--torch-deterministic", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="if toggled, `torch.backends.cudnn.deterministic=False`")
parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="if toggled, cuda will be enabled by default")
parser.add_argument("--track", type=lambda x: bool(strtobool(x)), default=False, nargs="?", const=True,
help="if toggled, this experiment will be tracked with Weights and Biases")
parser.add_argument("--wandb-project-name", type=str, default="cleanRL",
help="the wandb's project name")
parser.add_argument("--wandb-entity", type=str, default=None,
help="the entity (team) of wandb's project")
parser.add_argument("--capture-video", type=lambda x: bool(strtobool(x)), default=False, nargs="?", const=True,
help="whether to capture videos of the agent performances (check out `videos` folder)")
parser.add_argument("--save-model", type=lambda x: bool(strtobool(x)), default=False, nargs="?", const=True,
help="whether to save model into the `runs/{run_name}` folder")
parser.add_argument("--upload-model", type=lambda x: bool(strtobool(x)), default=False, nargs="?", const=True,
help="whether to upload the saved model to huggingface")
parser.add_argument("--hf-entity", type=str, default="",
help="the user or org name of the model repository from the Hugging Face Hub")
@dataclass
class Args:
exp_name: str = os.path.basename(__file__)[: -len(".py")]
"""the name of this experiment"""
seed: int = 1
"""seed of the experiment"""
torch_deterministic: bool = True
"""if toggled, `torch.backends.cudnn.deterministic=False`"""
cuda: bool = True
"""if toggled, cuda will be enabled by default"""
track: bool = False
"""if toggled, this experiment will be tracked with Weights and Biases"""
wandb_project_name: str = "cleanRL"
"""the wandb's project name"""
wandb_entity: str = None
"""the entity (team) of wandb's project"""
capture_video: bool = False
"""whether to capture videos of the agent performances (check out `videos` folder)"""
save_model: bool = False
"""whether to save model into the `runs/{run_name}` folder"""
upload_model: bool = False
"""whether to upload the saved model to huggingface"""
hf_entity: str = ""
"""the user or org name of the model repository from the Hugging Face Hub"""

# Algorithm specific arguments
parser.add_argument("--env-id", type=str, default="BreakoutNoFrameskip-v4",
help="the id of the environment")
parser.add_argument("--total-timesteps", type=int, default=10000000,
help="total timesteps of the experiments")
parser.add_argument("--learning-rate", type=float, default=1e-4,
help="the learning rate of the optimizer")
parser.add_argument("--num-envs", type=int, default=1,
help="the number of parallel game environments")
parser.add_argument("--buffer-size", type=int, default=1000000,
help="the replay memory buffer size")
parser.add_argument("--gamma", type=float, default=0.99,
help="the discount factor gamma")
parser.add_argument("--tau", type=float, default=1.,
help="the target network update rate")
parser.add_argument("--target-network-frequency", type=int, default=1000,
help="the timesteps it takes to update the target network")
parser.add_argument("--batch-size", type=int, default=32,
help="the batch size of sample from the reply memory")
parser.add_argument("--start-e", type=float, default=1,
help="the starting epsilon for exploration")
parser.add_argument("--end-e", type=float, default=0.01,
help="the ending epsilon for exploration")
parser.add_argument("--exploration-fraction", type=float, default=0.10,
help="the fraction of `total-timesteps` it takes from start-e to go end-e")
parser.add_argument("--learning-starts", type=int, default=80000,
help="timestep to start learning")
parser.add_argument("--train-frequency", type=int, default=4,
help="the frequency of training")
env_id: str = "BreakoutNoFrameskip-v4"
"""the id of the environment"""
total_timesteps: int = 10000000
"""total timesteps of the experiments"""
learning_rate: float = 1e-4
"""the learning rate of the optimizer"""
num_envs: int = 1
"""the number of parallel game environments"""
buffer_size: int = 1000000
"""the replay memory buffer size"""
gamma: float = 0.99
"""the discount factor gamma"""
tau: float = 1.0
"""the target network update rate"""
target_network_frequency: int = 1000
"""the timesteps it takes to update the target network"""
batch_size: int = 32
"""the batch size of sample from the reply memory"""
start_e: float = 1.0
"""the starting epsilon for exploration"""
end_e: float = 0.01
"""the ending epsilon for exploration"""
exploration_fraction: float = 0.10
"""the fraction of `total-timesteps` it takes from start-e to go end-e"""
learning_starts: int = 80000
"""timestep to start learning"""
train_frequency: int = 4
"""the frequency of training"""

# QDagger specific arguments
parser.add_argument("--teacher-policy-hf-repo", type=str, default=None,
help="the huggingface repo of the teacher policy")
parser.add_argument("--teacher-eval-episodes", type=int, default=10,
help="the number of episodes to run the teacher policy evaluate")
parser.add_argument("--teacher-steps", type=int, default=500000,
help="the number of steps to run the teacher policy to generate the replay buffer")
parser.add_argument("--offline-steps", type=int, default=500000,
help="the number of steps to run the student policy with the teacher's replay buffer")
parser.add_argument("--temperature", type=float, default=1.0,
help="the temperature parameter for qdagger")
args = parser.parse_args()
# fmt: on
assert args.num_envs == 1, "vectorized envs are not supported at the moment"

if args.teacher_policy_hf_repo is None:
args.teacher_policy_hf_repo = f"cleanrl/{args.env_id}-dqn_atari-seed1"

return args
teacher_policy_hf_repo: str = None
"""the huggingface repo of the teacher policy"""
teacher_eval_episodes: int = 10
"""the number of episodes to run the teacher policy evaluate"""
teacher_steps: int = 500000
"""the number of steps to run the teacher policy to generate the replay buffer"""
offline_steps: int = 500000
"""the number of steps to run the student policy with the teacher's replay buffer"""
temperature: float = 1.0
"""the temperature parameter for qdagger"""


def make_env(env_id, seed, idx, capture_video, run_name):
@@ -212,7 +204,10 @@ def kl_divergence_with_logits(target_logits, prediction_logits):
poetry run pip install "stable_baselines3==2.0.0a1" "gymnasium[atari,accept-rom-license]==0.28.1" "ale-py==0.8.1"
"""
)
args = parse_args()
args = tyro.cli(Args)
assert args.num_envs == 1, "vectorized envs are not supported at the moment"
if args.teacher_policy_hf_repo is None:
args.teacher_policy_hf_repo = f"cleanrl/{args.env_id}-dqn_atari-seed1"
run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
if args.track:
import wandb
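
Beyond the diff itself, the practical effect of the switch is on the command line: tyro derives flags from the dataclass, so underscored field names are exposed as hyphenated options and the field docstrings become the --help text. A small sketch of parsing an explicit argument list, assuming tyro.cli's args= override parameter and the Args dataclass and tyro import shown above (handy in tests; not part of this PR's test changes):

# Sketch only: parse a supplied argument list instead of sys.argv.
args = tyro.cli(Args, args=["--env-id", "BreakoutNoFrameskip-v4", "--total-timesteps", "1000000"])
assert args.env_id == "BreakoutNoFrameskip-v4"
assert args.total_timesteps == 1_000_000
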