Refactor to use tyro #424

Merged: 44 commits, Nov 28, 2023
Commits (44)
cd4851e
Refactor to use tyro
vwxyzjn Oct 16, 2023
b97d54f
push
vwxyzjn Oct 16, 2023
b87a015
psuh
vwxyzjn Oct 16, 2023
896f346
refactor
vwxyzjn Oct 16, 2023
6220645
fix pre-commit
vwxyzjn Oct 16, 2023
adbf836
fix pre-commit
vwxyzjn Oct 16, 2023
8af1e13
fix commend
vwxyzjn Oct 16, 2023
0b61550
Merge branch 'master' into refactor-tyro
sdpkjc Oct 16, 2023
96a56b8
refactor
vwxyzjn Oct 16, 2023
a8795a9
Merge branch 'refactor-tyro' of https://github.com/vwxyzjn/cleanrl in…
vwxyzjn Oct 16, 2023
cb6b47a
update poetry
vwxyzjn Oct 16, 2023
cfeedb0
fix test case
vwxyzjn Oct 16, 2023
9c0959c
quick fix
vwxyzjn Oct 16, 2023
5f3f716
fix
vwxyzjn Oct 17, 2023
08f4392
update optuna
vwxyzjn Oct 17, 2023
de6c829
quick change
vwxyzjn Oct 17, 2023
b09e088
fix ppg
vwxyzjn Oct 17, 2023
e92cf57
quick fix
vwxyzjn Oct 17, 2023
57b05fb
fix optuna
vwxyzjn Oct 17, 2023
17f49db
quick change
vwxyzjn Oct 17, 2023
cbbdc8b
fix
vwxyzjn Oct 17, 2023
e69b317
quick change
vwxyzjn Oct 17, 2023
f83a218
quick change
vwxyzjn Oct 17, 2023
86e6275
fix bug in multi-gpu
vwxyzjn Nov 8, 2023
bf5368a
refactor benchmark, support slurm
vwxyzjn Nov 8, 2023
aec360b
remove mujoco_py stuff
vwxyzjn Nov 9, 2023
46efc25
add slurm template
vwxyzjn Nov 9, 2023
072eafb
pre-commit
vwxyzjn Nov 9, 2023
b2542e0
update ddpg docs
vwxyzjn Nov 9, 2023
33a5609
update td3 docs
vwxyzjn Nov 9, 2023
4d8c3da
update sac
vwxyzjn Nov 9, 2023
70702cf
bug fix
vwxyzjn Nov 13, 2023
4c09502
Merge branch 'refactor-tyro' of https://github.com/vwxyzjn/cleanrl in…
vwxyzjn Nov 13, 2023
60b71f7
update docs
vwxyzjn Nov 27, 2023
7a96de2
update ppo docs
vwxyzjn Nov 27, 2023
89846df
bump version
vwxyzjn Nov 27, 2023
4f0dc48
bump version
vwxyzjn Nov 27, 2023
d821748
bump test cases
vwxyzjn Nov 27, 2023
7880155
add benchmark utility docs
vwxyzjn Nov 27, 2023
50ec155
bump test
vwxyzjn Nov 27, 2023
940595a
fix #418
vwxyzjn Nov 27, 2023
b0caf45
update requirements.txt
vwxyzjn Nov 27, 2023
aaf7dd0
test
vwxyzjn Nov 27, 2023
2fb4814
add numpy
vwxyzjn Nov 28, 2023
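
For context before the per-commit diff below: the refactor replaces each script's argparse-based parse_args() with a typed Args dataclass parsed by tyro.cli, with the string literal under each field picked up as its help text. A minimal sketch of the pattern (illustrative only; the field set here is an example, not copied from any single script in this PR):

import os
from dataclasses import dataclass

import tyro


@dataclass
class Args:
    exp_name: str = os.path.basename(__file__)[: -len(".py")]
    """the name of this experiment"""
    seed: int = 1
    """seed of the experiment"""
    num_envs: int = 128
    """the number of parallel game environments"""
    num_steps: int = 128
    """the number of steps to run in each environment per policy rollout"""

    # to be filled in at runtime, after parsing
    batch_size: int = 0
    """the batch size (computed in runtime)"""


if __name__ == "__main__":
    args = tyro.cli(Args)  # replaces the old argparse-based parse_args()
    args.batch_size = int(args.num_envs * args.num_steps)
    print(f"running {args.exp_name} with batch size {args.batch_size}")
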
Changes from 1 commit: 896f34624b92f207a82e3f5bb0cb2511b968433a ("refactor")
vwxyzjn committed Oct 16, 2023
145 changes: 75 additions & 70 deletions cleanrl/ppo_rnd_envpool.py
@@ -13,84 +13,86 @@
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tyro
from gym.wrappers.normalize import RunningMeanStd
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter


def parse_args():
# fmt: off
parser = argparse.ArgumentParser()
parser.add_argument("--exp-name", type=str, default=os.path.basename(__file__).rstrip(".py"),
help="the name of this experiment")
parser.add_argument("--seed", type=int, default=1,
help="seed of the experiment")
parser.add_argument("--torch-deterministic", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="if toggled, `torch.backends.cudnn.deterministic=False`")
parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="if toggled, cuda will be enabled by default")
parser.add_argument("--track", type=lambda x: bool(strtobool(x)), default=False, nargs="?", const=True,
help="if toggled, this experiment will be tracked with Weights and Biases")
parser.add_argument("--wandb-project-name", type=str, default="cleanRL",
help="the wandb's project name")
parser.add_argument("--wandb-entity", type=str, default=None,
help="the entity (team) of wandb's project")
@dataclass
class Args:
exp_name: str = os.path.basename(__file__)[: -len(".py")]
"""the name of this experiment"""
seed: int = 1
"""seed of the experiment"""
torch_deterministic: bool = True
"""if toggled, `torch.backends.cudnn.deterministic=False`"""
cuda: bool = True
"""if toggled, cuda will be enabled by default"""
track: bool = False
"""if toggled, this experiment will be tracked with Weights and Biases"""
wandb_project_name: str = "cleanRL"
"""the wandb's project name"""
wandb_entity: str = None
"""the entity (team) of wandb's project"""
capture_video: bool = False
"""whether to capture videos of the agent performances (check out `videos` folder)"""

# Algorithm specific arguments
parser.add_argument("--env-id", type=str, default="MontezumaRevenge-v5",
help="the id of the environment")
parser.add_argument("--total-timesteps", type=int, default=2000000000,
help="total timesteps of the experiments")
parser.add_argument("--learning-rate", type=float, default=1e-4,
help="the learning rate of the optimizer")
parser.add_argument("--num-envs", type=int, default=128,
help="the number of parallel game environments")
parser.add_argument("--num-steps", type=int, default=128,
help="the number of steps to run in each environment per policy rollout")
parser.add_argument("--anneal-lr", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="Toggle learning rate annealing for policy and value networks")
parser.add_argument("--gamma", type=float, default=0.999,
help="the discount factor gamma")
parser.add_argument("--gae-lambda", type=float, default=0.95,
help="the lambda for the general advantage estimation")
parser.add_argument("--num-minibatches", type=int, default=4,
help="the number of mini-batches")
parser.add_argument("--update-epochs", type=int, default=4,
help="the K epochs to update the policy")
parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="Toggles advantages normalization")
parser.add_argument("--clip-coef", type=float, default=0.1,
help="the surrogate clipping coefficient")
parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
parser.add_argument("--ent-coef", type=float, default=0.001,
help="coefficient of the entropy")
parser.add_argument("--vf-coef", type=float, default=0.5,
help="coefficient of the value function")
parser.add_argument("--max-grad-norm", type=float, default=0.5,
help="the maximum norm for the gradient clipping")
parser.add_argument("--target-kl", type=float, default=None,
help="the target KL divergence threshold")
parser.add_argument("--sticky-action", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="if toggled, sticky action will be used")
env_id: str = "MontezumaRevenge-v5"
"""the id of the environment"""
total_timesteps: int = 2000000000
"""total timesteps of the experiments"""
learning_rate: float = 1e-4
"""the learning rate of the optimizer"""
num_envs: int = 128
"""the number of parallel game environments"""
num_steps: int = 128
"""the number of steps to run in each environment per policy rollout"""
anneal_lr: bool = True
"""Toggle learning rate annealing for policy and value networks"""
gamma: float = 0.999
"""the discount factor gamma"""
gae_lambda: float = 0.95
"""the lambda for the general advantage estimation"""
num_minibatches: int = 4
"""the number of mini-batches"""
update_epochs: int = 4
"""the K epochs to update the policy"""
norm_adv: bool = True
"""Toggles advantages normalization"""
clip_coef: float = 0.1
"""the surrogate clipping coefficient"""
clip_vloss: bool = True
"""Toggles whether or not to use a clipped loss for the value function, as per the paper."""
ent_coef: float = 0.001
"""coefficient of the entropy"""
vf_coef: float = 0.5
"""coefficient of the value function"""
max_grad_norm: float = 0.5
"""the maximum norm for the gradient clipping"""
target_kl: float = None
"""the target KL divergence threshold"""

# RND arguments
parser.add_argument("--update-proportion", type=float, default=0.25,
help="proportion of exp used for predictor update")
parser.add_argument("--int-coef", type=float, default=1.0,
help="coefficient of extrinsic reward")
parser.add_argument("--ext-coef", type=float, default=2.0,
help="coefficient of intrinsic reward")
parser.add_argument("--int-gamma", type=float, default=0.99,
help="Intrinsic reward discount rate")
parser.add_argument("--num-iterations-obs-norm-init", type=int, default=50,
help="number of iterations to initialize the observations normalization parameters")

args = parser.parse_args()
args.batch_size = int(args.num_envs * args.num_steps)
args.minibatch_size = int(args.batch_size // args.num_minibatches)
# fmt: on
return args
update_proportion: float = 0.25
"""proportion of exp used for predictor update"""
int_coef: float = 1.0
"""coefficient of intrinsic reward"""
ext_coef: float = 2.0
"""coefficient of extrinsic reward"""
int_gamma: float = 0.99
"""Intrinsic reward discount rate"""
num_iterations_obs_norm_init: int = 50
"""number of iterations to initialize the observations normalization parameters"""

# to be filled in runtime
batch_size: int = 0
"""the batch size (computed in runtime)"""
minibatch_size: int = 0
"""the mini-batch size (computed in runtime)"""
num_iterations: int = 0
"""the number of iterations (computed in runtime)"""


class RecordEpisodeStatistics(gym.Wrapper):
@@ -242,7 +244,10 @@ def update(self, rews):


if __name__ == "__main__":
args = parse_args()
args = tyro.cli(Args)
args.batch_size = int(args.num_envs * args.num_steps)
args.minibatch_size = int(args.batch_size // args.num_minibatches)
args.num_iterations = args.total_timesteps // args.batch_size
Review comment (Collaborator):

args.num_iterations does not appear to replace the previous num_updates; it is defined but never used.
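
For readers following the comment above: in the other refactored scripts, the runtime-computed num_iterations takes over the role of the old num_updates and drives the outer rollout loop and learning-rate annealing. A rough sketch of that usage (illustrative, not part of the diff shown here; args and optimizer are assumed to be the parsed Args instance and the Adam optimizer defined elsewhere in the script):

# sketch of how args.num_iterations is typically consumed in the refactored scripts
for iteration in range(1, args.num_iterations + 1):
    if args.anneal_lr:
        # linearly anneal the learning rate over the course of training
        frac = 1.0 - (iteration - 1.0) / args.num_iterations
        optimizer.param_groups[0]["lr"] = frac * args.learning_rate
    # ... collect args.num_steps of rollout data per env, then run the PPO/RND update ...
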

run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
if args.track:
import wandb
139 changes: 67 additions & 72 deletions cleanrl/qdagger_dqn_atari_impalacnn.py
@@ -12,6 +12,7 @@
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tyro
from huggingface_hub import hf_hub_download
from rich.progress import track
from stable_baselines3.common.atari_wrappers import (
@@ -28,81 +29,72 @@
from cleanrl_utils.evals.dqn_eval import evaluate


def parse_args():
# fmt: off
parser = argparse.ArgumentParser()
parser.add_argument("--exp-name", type=str, default=os.path.basename(__file__).rstrip(".py"),
help="the name of this experiment")
parser.add_argument("--seed", type=int, default=1,
help="seed of the experiment")
parser.add_argument("--torch-deterministic", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="if toggled, `torch.backends.cudnn.deterministic=False`")
parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="if toggled, cuda will be enabled by default")
parser.add_argument("--track", type=lambda x: bool(strtobool(x)), default=False, nargs="?", const=True,
help="if toggled, this experiment will be tracked with Weights and Biases")
parser.add_argument("--wandb-project-name", type=str, default="cleanRL",
help="the wandb's project name")
parser.add_argument("--wandb-entity", type=str, default=None,
help="the entity (team) of wandb's project")
parser.add_argument("--capture-video", type=lambda x: bool(strtobool(x)), default=False, nargs="?", const=True,
help="whether to capture videos of the agent performances (check out `videos` folder)")
parser.add_argument("--save-model", type=lambda x: bool(strtobool(x)), default=False, nargs="?", const=True,
help="whether to save model into the `runs/{run_name}` folder")
parser.add_argument("--upload-model", type=lambda x: bool(strtobool(x)), default=False, nargs="?", const=True,
help="whether to upload the saved model to huggingface")
parser.add_argument("--hf-entity", type=str, default="",
help="the user or org name of the model repository from the Hugging Face Hub")
@dataclass
class Args:
exp_name: str = os.path.basename(__file__)[: -len(".py")]
"""the name of this experiment"""
seed: int = 1
"""seed of the experiment"""
torch_deterministic: bool = True
"""if toggled, `torch.backends.cudnn.deterministic=False`"""
cuda: bool = True
"""if toggled, cuda will be enabled by default"""
track: bool = False
"""if toggled, this experiment will be tracked with Weights and Biases"""
wandb_project_name: str = "cleanRL"
"""the wandb's project name"""
wandb_entity: str = None
"""the entity (team) of wandb's project"""
capture_video: bool = False
"""whether to capture videos of the agent performances (check out `videos` folder)"""
save_model: bool = False
"""whether to save model into the `runs/{run_name}` folder"""
upload_model: bool = False
"""whether to upload the saved model to huggingface"""
hf_entity: str = ""
"""the user or org name of the model repository from the Hugging Face Hub"""

# Algorithm specific arguments
parser.add_argument("--env-id", type=str, default="BreakoutNoFrameskip-v4",
help="the id of the environment")
parser.add_argument("--total-timesteps", type=int, default=10000000,
help="total timesteps of the experiments")
parser.add_argument("--learning-rate", type=float, default=1e-4,
help="the learning rate of the optimizer")
parser.add_argument("--num-envs", type=int, default=1,
help="the number of parallel game environments")
parser.add_argument("--buffer-size", type=int, default=1000000,
help="the replay memory buffer size")
parser.add_argument("--gamma", type=float, default=0.99,
help="the discount factor gamma")
parser.add_argument("--tau", type=float, default=1.,
help="the target network update rate")
parser.add_argument("--target-network-frequency", type=int, default=1000,
help="the timesteps it takes to update the target network")
parser.add_argument("--batch-size", type=int, default=32,
help="the batch size of sample from the reply memory")
parser.add_argument("--start-e", type=float, default=1,
help="the starting epsilon for exploration")
parser.add_argument("--end-e", type=float, default=0.01,
help="the ending epsilon for exploration")
parser.add_argument("--exploration-fraction", type=float, default=0.10,
help="the fraction of `total-timesteps` it takes from start-e to go end-e")
parser.add_argument("--learning-starts", type=int, default=80000,
help="timestep to start learning")
parser.add_argument("--train-frequency", type=int, default=4,
help="the frequency of training")
env_id: str = "BreakoutNoFrameskip-v4"
"""the id of the environment"""
total_timesteps: int = 10000000
"""total timesteps of the experiments"""
learning_rate: float = 1e-4
"""the learning rate of the optimizer"""
num_envs: int = 1
"""the number of parallel game environments"""
buffer_size: int = 1000000
"""the replay memory buffer size"""
gamma: float = 0.99
"""the discount factor gamma"""
tau: float = 1.0
"""the target network update rate"""
target_network_frequency: int = 1000
"""the timesteps it takes to update the target network"""
batch_size: int = 32
"""the batch size of sample from the reply memory"""
start_e: float = 1.0
"""the starting epsilon for exploration"""
end_e: float = 0.01
"""the ending epsilon for exploration"""
exploration_fraction: float = 0.10
"""the fraction of `total-timesteps` it takes from start-e to go end-e"""
learning_starts: int = 80000
"""timestep to start learning"""
train_frequency: int = 4
"""the frequency of training"""

# QDagger specific arguments
parser.add_argument("--teacher-policy-hf-repo", type=str, default=None,
help="the huggingface repo of the teacher policy")
parser.add_argument("--teacher-eval-episodes", type=int, default=10,
help="the number of episodes to run the teacher policy evaluate")
parser.add_argument("--teacher-steps", type=int, default=500000,
help="the number of steps to run the teacher policy to generate the replay buffer")
parser.add_argument("--offline-steps", type=int, default=500000,
help="the number of steps to run the student policy with the teacher's replay buffer")
parser.add_argument("--temperature", type=float, default=1.0,
help="the temperature parameter for qdagger")
args = parser.parse_args()
# fmt: on
assert args.num_envs == 1, "vectorized envs are not supported at the moment"

if args.teacher_policy_hf_repo is None:
args.teacher_policy_hf_repo = f"cleanrl/{args.env_id}-dqn_atari-seed1"

return args
teacher_policy_hf_repo: str = None
"""the huggingface repo of the teacher policy"""
teacher_eval_episodes: int = 10
"""the number of episodes to run the teacher policy evaluate"""
teacher_steps: int = 500000
"""the number of steps to run the teacher policy to generate the replay buffer"""
offline_steps: int = 500000
"""the number of steps to run the student policy with the teacher's replay buffer"""
temperature: float = 1.0
"""the temperature parameter for qdagger"""


def make_env(env_id, seed, idx, capture_video, run_name):
@@ -212,7 +204,10 @@ def kl_divergence_with_logits(target_logits, prediction_logits):
poetry run pip install "stable_baselines3==2.0.0a1" "gymnasium[atari,accept-rom-license]==0.28.1" "ale-py==0.8.1"
"""
)
args = parse_args()
args = tyro.cli(Args)
assert args.num_envs == 1, "vectorized envs are not supported at the moment"
if args.teacher_policy_hf_repo is None:
args.teacher_policy_hf_repo = f"cleanrl/{args.env_id}-dqn_atari-seed1"
run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
if args.track:
import wandb
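
Beyond the diff itself, the practical effect of the switch is on the command line: tyro derives flags from the dataclass, so underscored field names are exposed as hyphenated options and the field docstrings become the --help text. A small sketch of parsing an explicit argument list, assuming tyro.cli's args= override parameter and the Args dataclass and tyro import shown above (handy in tests; not part of this PR's test changes):

# Sketch only: parse a supplied argument list instead of sys.argv.
args = tyro.cli(Args, args=["--env-id", "BreakoutNoFrameskip-v4", "--total-timesteps", "1000000"])
assert args.env_id == "BreakoutNoFrameskip-v4"
assert args.total_timesteps == 1_000_000
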