- Create a new conda environment:
conda create -n lm_stable_baselines python=3.11
conda activate lm_stable_baselines
- Install lm_stable_baselines:
pip install -e .
- Install the rest of the requirements:
pip install -r pip_requirements.txt
You can download the data here (it's the MATH dataset in a special format, plus GSM8K): https://drive.google.com/file/d/1kRv4X3ZDlKj9-4Rf5E5MqzqQWJooX340/view?usp=sharing
Once you've downloaded it, unzip it, create a data folder, and put the contents in there (a short shell sketch follows the tree below). Your data folder should sit here:
PauseToken
|
|-> data/
|    |-> MATH_json/
|    |    |-> train.json
|    |    |-> test.json
|    |-> gsm8k_jsonl/
|    |    |-> train.json
|    |    |-> test.json
|-> src/
|    |-> model/
|    |-> ...
|-> ...
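For reference, here is a minimal shell sketch of that data setup. It is only a sketch: it assumes gdown is installed for the Google Drive download, and data.zip is a placeholder for whatever name the downloaded archive actually has.
# Data setup sketch (assumes gdown; "data.zip" is a placeholder name)
pip install gdown
gdown 1kRv4X3ZDlKj9-4Rf5E5MqzqQWJooX340 -O data.zip   # file ID taken from the Drive link above
mkdir -p data
unzip data.zip -d data/   # adjust if needed so you end up with data/MATH_json/ and data/gsm8k_jsonl/ exactly as in the tree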
Here's a template for running SFT:
# OPTIONS
## <MODEL-NAME>: mistral, llama1B, llama3B, qwen1B, qwen7B
## <DATA>: gsm8k, math, pros_qa
## <REWARD>: gsm8k, math, pros_qa
## <METRIC>: gsm8k, math, pros_qa
## <NUM_VAL_SAMPLES>: 748 (for gsm8k), 750 (for math), 300 (for pros_qa)
python src/train.py experiment=train/sft/<MODEL-NAME> data=<DATA> metrics=<METRIC> rl_algorithm/reward=<REWARD> trainer.num_val_samples=<NUM_VAL_SAMPLES> trainer.n_outer_loops=1 run_name=<YOUR-RUN-NAME-HERE>
So, for example, to run SFT for Mistral on GSM8K:
python src/train.py experiment=train/sft/mistral data=gsm8k metrics=gsm8k rl_algorithm/reward=gsm8k trainer.num_val_samples=748 trainer.n_outer_loops=1 run_name=sft_mistral_gsm8k
For the sake of being fully clear (and because there are a few exceptions), here are all the warm-up runs we did, written out explicitly (a small wrapper sketch follows this list):
# Mistral on gsm8k
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-MODEL> experiment=train/sft/mistral data=gsm8k metrics=gsm8k rl_algorithm/reward=gsm8k trainer.num_val_samples=748 trainer.n_outer_loops=1 run_name=warmup_mistral_gsm8k
# Mistral on Math
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-MODEL> experiment=train/sft/mistral data=math metrics=math rl_algorithm/reward=math trainer.num_val_samples=750 trainer.n_outer_loops=1 run_name=warmup_mistral_math rl_algorithm.policy.max_output_generation_length=1800 rl_algorithm.n_envs=16
# llama3B on gsm8k
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-MODEL> experiment=train/sft/llama3B data=gsm8k metrics=gsm8k rl_algorithm/reward=gsm8k trainer.num_val_samples=748 trainer.n_outer_loops=1 run_name=warmup_llama3B_gsm8k
# llama3B on Math
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-MODEL> experiment=train/sft/llama3B data=math metrics=math rl_algorithm/reward=math trainer.num_val_samples=750 trainer.n_outer_loops=1 run_name=warmup_llama3B_math rl_algorithm.policy.max_output_generation_length=2048 rl_algorithm.n_envs=16
# qwen1B (Qwen2.5-Math-1.5B) on gsm8k
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=Qwen/Qwen2.5-Math-1.5B experiment=train/sft/qwen1B data=gsm8k metrics=gsm8k rl_algorithm/reward=gsm8k trainer.num_val_samples=748 trainer.n_outer_loops=1 run_name=warmup_qwen1B_gsm8k
# qwen1B (Qwen2.5-Math-1.5B) on Math
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=Qwen/Qwen2.5-Math-1.5B experiment=train/sft/qwen1B data=math metrics=math rl_algorithm/reward=math trainer.num_val_samples=750 trainer.n_outer_loops=1 run_name=warmup_qwen1B_math rl_algorithm.policy.max_output_generation_length=2048 rl_algorithm.n_envs=16
# qwen7B (Qwen2.5-Math-7B) on gsm8k
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=Qwen/Qwen2.5-Math-7B experiment=train/sft/qwen7B data=gsm8k metrics=gsm8k rl_algorithm/reward=gsm8k trainer.num_val_samples=748 trainer.n_outer_loops=1 run_name=warmup_qwen7B_gsm8k rl_algorithm.n_envs=16
# qwen7B (Qwen2.5-Math-7B) on Math
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=Qwen/Qwen2.5-Math-7B experiment=train/sft/qwen7B data=math metrics=math rl_algorithm/reward=math trainer.num_val_samples=750 trainer.n_outer_loops=1 run_name=warmup_qwen7B_math rl_algorithm.policy.max_output_generation_length=2048 rl_algorithm.n_envs=16
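If you prefer to launch several warm-up runs from one script, a wrapper along the following lines works. This is only a sketch that mirrors the commands above; the run_warmup helper and the VAL_SAMPLES table are ours, not part of the repo, and any trailing arguments are passed through as extra Hydra overrides.
#!/usr/bin/env bash
# Sketch only: replays the warm-up commands above for a given (config, dataset, model) triple.
set -e
declare -A VAL_SAMPLES=( [gsm8k]=748 [math]=750 )
run_warmup () {
  local cfg=$1 data=$2 model=$3; shift 3
  python src/train.py \
    rl_algorithm.policy.model.language_model.pretrained_model_name_or_path="$model" \
    experiment=train/sft/"$cfg" data="$data" metrics="$data" rl_algorithm/reward="$data" \
    trainer.num_val_samples="${VAL_SAMPLES[$data]}" trainer.n_outer_loops=1 \
    run_name="warmup_${cfg}_${data}" "$@"   # trailing args = per-run overrides
}
# Examples mirroring the list above (replace the placeholder path with a real one):
run_warmup mistral gsm8k "<PATH-TO-MODEL>"
run_warmup mistral math "<PATH-TO-MODEL>" rl_algorithm.policy.max_output_generation_length=1800 rl_algorithm.n_envs=16
run_warmup qwen1B gsm8k Qwen/Qwen2.5-Math-1.5B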
Here's a template for running Curriculum and its baseline (a short variable-based sketch follows the templates):
## <MODEL-NAME>: mistral, llama1B, llama3B, qwen1B, qwen7B
## <DATA>: gsm8k, math, pros_qa
## <REWARD>: gsm8k, math, pros_qa
## <METRIC>: gsm8k, math, pros_qa
## <NUM_VAL_SAMPLES>: 748 (for gsm8k), 750 (for math), 300 (for pros_qa)
# Baseline
python src/train.py experiment=train/ppo/<MODEL-NAME>/baseline_sft trainer.n_outer_loops=50 rl_algorithm.policy.ft_on_action_only=true run_name=<RUN-NAME> rl_algorithm.ent_coef=0.009 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 data=<DATA> rl_algorithm/reward=<REWARD> metrics=<METRIC> trainer.num_val_samples=<NUM_VAL_SAMPLES> rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY>
# Curriculum with Beta
python src/train.py experiment=train/ppo/<MODEL-NAME>/curr_beta rl_algorithm.policy.ft_on_action_only=true run_name=<RUN-NAME> rl_algorithm.ent_coef=0.01 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 trainer.n_outer_loops=50 trainer.callbacks.portion_annealers.init_alpha=5.0 trainer.callbacks.portion_annealers.final_alpha=0.05 trainer.callbacks.portion_annealers.init_beta=5.0 trainer.callbacks.portion_annealers.final_beta=10.0 trainer.callbacks.portion_annealers.warmup_timesteps=0 trainer.callbacks.portion_annealers.total_timesteps=10 data=<DATA> rl_algorithm/reward=<REWARD> metrics=<METRIC> trainer.num_val_samples=<NUM_VAL_SAMPLES> rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY>
# Curriculum with Uniform Distribution
python src/train.py experiment=train/ppo/<MODEL-NAME>/curr_uniform rl_algorithm.policy.ft_on_action_only=true run_name=<RUN-NAME> rl_algorithm.ent_coef=0.01 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 trainer.n_outer_loops=50 trainer.callbacks.portion_annealers.lower_bound_init_portion=0.0 trainer.callbacks.portion_annealers.lower_bound_final_portion=0.0 trainer.callbacks.portion_annealers.upper_bound_init_portion=1.0 trainer.callbacks.portion_annealers.upper_bound_final_portion=0.0 trainer.callbacks.portion_annealers.warmup_timesteps=0 trainer.callbacks.portion_annealers.total_timesteps=50 data=<DATA> rl_algorithm/reward=<REWARD> metrics=<METRIC> trainer.num_val_samples=<NUM_VAL_SAMPLES> rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY>
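As a quick illustration of how the placeholders get filled in practice, here is a sketch that sets them once as shell variables and launches the PPO baseline. The variable names and the mistral/gsm8k values are just an illustration; the fully written-out commands for every model follow below.
# Sketch: fill the template placeholders with shell variables (values are illustrative)
MODEL=mistral; DATA=gsm8k; NVAL=748; POLICY="<PATH-TO-THE-WARMED-UP-POLICY>"   # replace POLICY with a real checkpoint path
python src/train.py experiment=train/ppo/$MODEL/baseline_sft trainer.n_outer_loops=50 rl_algorithm.policy.ft_on_action_only=true run_name=${MODEL}_baseline_$DATA rl_algorithm.ent_coef=0.009 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 data=$DATA rl_algorithm/reward=$DATA metrics=$DATA trainer.num_val_samples=$NVAL rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=$POLICY
The same variables slot directly into the curr_beta and curr_uniform templates above.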
To be perfectly clear, here is how to run all of these experiments, model by model (Mistral, then Llama3B, then Qwen1B, then Qwen7B):
### Mistral: PPO Baselines ###
# gsm8k
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/mistral/baseline_sft trainer.n_outer_loops=50 rl_algorithm.policy.ft_on_action_only=true name=mistral-on-gsm8k run_name=mistral_rl_baseline_ppo rl_algorithm.ent_coef=0.009 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 data=gsm8k rl_algorithm/reward=gsm8k metrics=gsm8k trainer.num_val_samples=748
# math
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/mistral/baseline_sft trainer.n_outer_loops=50 rl_algorithm.policy.ft_on_action_only=true name=mistral-on-math run_name=mistral_rl_baseline_ppo rl_algorithm.ent_coef=0.009 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 data=math rl_algorithm/reward=math metrics=math trainer.num_val_samples=750 rl_algorithm.policy.max_output_generation_length=1800 rl_algorithm.n_envs=16 rl_algorithm.n_steps=2 rl_algorithm.n_grad_accumulation_steps=16 rl_algorithm.policy.model.value_head.transformer_config.config.n_positions=1800
### Mistral: Curriculum with Beta ###
# gsm8k
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/mistral/curr_beta rl_algorithm.policy.ft_on_action_only=true name=mistral-on-gsm8k run_name=mistral_beta_ppo rl_algorithm.ent_coef=0.01 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 trainer.n_outer_loops=50 trainer.callbacks.portion_annealers.init_alpha=5.0 trainer.callbacks.portion_annealers.final_alpha=0.05 trainer.callbacks.portion_annealers.init_beta=5.0 trainer.callbacks.portion_annealers.final_beta=10.0 trainer.callbacks.portion_annealers.warmup_timesteps=0 trainer.callbacks.portion_annealers.total_timesteps=50 data=gsm8k rl_algorithm/reward=gsm8k metrics=gsm8k trainer.num_val_samples=748
# math
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/mistral/curr_beta rl_algorithm.policy.ft_on_action_only=true name=mistral-on-math run_name=mistral_beta_ppo rl_algorithm.ent_coef=0.01 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 trainer.n_outer_loops=50 trainer.callbacks.portion_annealers.init_alpha=5.0 trainer.callbacks.portion_annealers.final_alpha=0.05 trainer.callbacks.portion_annealers.init_beta=5.0 trainer.callbacks.portion_annealers.final_beta=10.0 trainer.callbacks.portion_annealers.warmup_timesteps=0 trainer.callbacks.portion_annealers.total_timesteps=50 data=math rl_algorithm/reward=math metrics=math trainer.num_val_samples=750 rl_algorithm.n_envs=16 rl_algorithm.n_steps=2 rl_algorithm.batch_size=2 rl_algorithm.n_grad_accumulation_steps=16 rl_algorithm.policy.max_output_generation_length=1800 rl_algorithm.policy.model.value_head.transformer_config.config.n_positions=1800
### Mistral: Curriculum with Uniform ###
# gsm8k
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/mistral/curr_uniform rl_algorithm.policy.ft_on_action_only=true name=mistral-on-gsm8k run_name=mistral_unif_curr_ppo rl_algorithm.ent_coef=0.01 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 trainer.n_outer_loops=50 trainer.callbacks.portion_annealers.lower_bound_init_portion=0.0 trainer.callbacks.portion_annealers.lower_bound_final_portion=0.0 trainer.callbacks.portion_annealers.upper_bound_init_portion=1.0 trainer.callbacks.portion_annealers.upper_bound_final_portion=0.0 trainer.callbacks.portion_annealers.warmup_timesteps=0 trainer.callbacks.portion_annealers.total_timesteps=50 data=gsm8k rl_algorithm/reward=gsm8k metrics=gsm8k trainer.num_val_samples=748
# math
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/mistral/curr_uniform rl_algorithm.policy.ft_on_action_only=true name=mistral-on-math run_name=mistral_unif_curr_ppo rl_algorithm.ent_coef=0.01 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 trainer.n_outer_loops=50 trainer.callbacks.portion_annealers.lower_bound_init_portion=0.0 trainer.callbacks.portion_annealers.lower_bound_final_portion=0.0 trainer.callbacks.portion_annealers.upper_bound_init_portion=1.0 trainer.callbacks.portion_annealers.upper_bound_final_portion=0.0 trainer.callbacks.portion_annealers.warmup_timesteps=0 trainer.callbacks.portion_annealers.total_timesteps=50 data=math rl_algorithm/reward=math metrics=math trainer.num_val_samples=750 rl_algorithm.n_envs=16 rl_algorithm.n_steps=2 rl_algorithm.batch_size=2 rl_algorithm.n_grad_accumulation_steps=16 rl_algorithm.policy.max_output_generation_length=1800 rl_algorithm.policy.model.value_head.transformer_config.config.n_positions=1800
### Mistral: SFT as Baseline ###
# gsm8k
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/sft/mistral data=gsm8k metrics=gsm8k rl_algorithm/reward=gsm8k trainer.num_val_samples=748 trainer.n_outer_loops=50 run_name=sft_mistral_gsm8k
# Math
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/sft/mistral data=math metrics=math rl_algorithm/reward=math trainer.num_val_samples=750 trainer.n_outer_loops=50 run_name=sft_mistral_math rl_algorithm.policy.max_output_generation_length=1800 rl_algorithm.n_envs=16
### Llama3B: PPO Baselines ###
# gsm8k
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/llama3B/baseline_sft trainer.n_outer_loops=50 rl_algorithm.policy.ft_on_action_only=true name=llama3B-on-gsm8k run_name=llama3B_rl_baseline_ppo rl_algorithm.ent_coef=0.009 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 data=gsm8k rl_algorithm/reward=gsm8k metrics=gsm8k trainer.num_val_samples=748
# math
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/llama3B/baseline_sft trainer.n_outer_loops=50 rl_algorithm.policy.ft_on_action_only=true name=llama3B-on-math run_name=llama3B_rl_baseline_ppo rl_algorithm.ent_coef=0.009 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 data=math rl_algorithm/reward=math metrics=math trainer.num_val_samples=750 rl_algorithm.policy.max_output_generation_length=2048 rl_algorithm.n_envs=16 rl_algorithm.n_steps=2 rl_algorithm.n_grad_accumulation_steps=16 rl_algorithm.policy.model.value_head.transformer_config.config.n_positions=2048
### Llama3B: Curriculum with Beta ###
# gsm8k
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/llama3B/curr_beta rl_algorithm.policy.ft_on_action_only=true name=llama3B-on-gsm8k run_name=llama3B_beta_ppo rl_algorithm.ent_coef=0.01 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 trainer.n_outer_loops=50 trainer.callbacks.portion_annealers.init_alpha=5.0 trainer.callbacks.portion_annealers.final_alpha=0.05 trainer.callbacks.portion_annealers.init_beta=5.0 trainer.callbacks.portion_annealers.final_beta=10.0 trainer.callbacks.portion_annealers.warmup_timesteps=0 trainer.callbacks.portion_annealers.total_timesteps=50 data=gsm8k rl_algorithm/reward=gsm8k metrics=gsm8k trainer.num_val_samples=748
# math
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/llama3B/curr_beta rl_algorithm.policy.ft_on_action_only=true name=llama3B-on-math run_name=llama3B_beta_ppo rl_algorithm.ent_coef=0.01 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 trainer.n_outer_loops=50 trainer.callbacks.portion_annealers.init_alpha=5.0 trainer.callbacks.portion_annealers.final_alpha=0.05 trainer.callbacks.portion_annealers.init_beta=5.0 trainer.callbacks.portion_annealers.final_beta=10.0 trainer.callbacks.portion_annealers.warmup_timesteps=0 trainer.callbacks.portion_annealers.total_timesteps=50 data=math rl_algorithm/reward=math metrics=math trainer.num_val_samples=750 rl_algorithm.n_envs=16 rl_algorithm.n_steps=2 rl_algorithm.batch_size=2 rl_algorithm.n_grad_accumulation_steps=16 rl_algorithm.policy.max_output_generation_length=2048 rl_algorithm.policy.model.value_head.transformer_config.config.n_positions=2048
### Llama3B: Curriculum with Uniform ###
# gsm8k
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/llama3B/curr_uniform rl_algorithm.policy.ft_on_action_only=true name=llama3B-on-gsm8k run_name=llama3B_unif_curr_ppo rl_algorithm.ent_coef=0.01 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 trainer.n_outer_loops=50 trainer.callbacks.portion_annealers.lower_bound_init_portion=0.0 trainer.callbacks.portion_annealers.lower_bound_final_portion=0.0 trainer.callbacks.portion_annealers.upper_bound_init_portion=1.0 trainer.callbacks.portion_annealers.upper_bound_final_portion=0.0 trainer.callbacks.portion_annealers.warmup_timesteps=0 trainer.callbacks.portion_annealers.total_timesteps=50 data=gsm8k rl_algorithm/reward=gsm8k metrics=gsm8k trainer.num_val_samples=748
# math
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/llama3B/curr_uniform rl_algorithm.policy.ft_on_action_only=true name=llama3B-on-math run_name=llama3B_unif_curr_ppo rl_algorithm.ent_coef=0.01 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 trainer.n_outer_loops=50 trainer.callbacks.portion_annealers.lower_bound_init_portion=0.0 trainer.callbacks.portion_annealers.lower_bound_final_portion=0.0 trainer.callbacks.portion_annealers.upper_bound_init_portion=1.0 trainer.callbacks.portion_annealers.upper_bound_final_portion=0.0 trainer.callbacks.portion_annealers.warmup_timesteps=0 trainer.callbacks.portion_annealers.total_timesteps=50 data=math rl_algorithm/reward=math metrics=math trainer.num_val_samples=750 rl_algorithm.n_envs=16 rl_algorithm.n_steps=2 rl_algorithm.batch_size=2 rl_algorithm.n_grad_accumulation_steps=16 rl_algorithm.policy.max_output_generation_length=2048 rl_algorithm.policy.model.value_head.transformer_config.config.n_positions=2048
### Llama3B: SFT as Baseline ###
# gsm8k
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/sft/llama3B data=gsm8k metrics=gsm8k rl_algorithm/reward=gsm8k trainer.num_val_samples=748 trainer.n_outer_loops=50 run_name=sft_llama3B_gsm8k
# Math
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/sft/llama3B data=math metrics=math rl_algorithm/reward=math trainer.num_val_samples=750 trainer.n_outer_loops=50 run_name=sft_llama3B_math rl_algorithm.policy.max_output_generation_length=2048 rl_algorithm.n_envs=16
### Qwen1B: PPO Baselines ###
# gsm8k
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/qwen1B/baseline_sft trainer.n_outer_loops=50 rl_algorithm.policy.ft_on_action_only=true name=qwen1B-on-gsm8k run_name=qwen1B_rl_baseline_ppo rl_algorithm.ent_coef=0.009 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 data=gsm8k rl_algorithm/reward=gsm8k metrics=gsm8k trainer.num_val_samples=748
# math
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/qwen1B/baseline_sft trainer.n_outer_loops=50 rl_algorithm.policy.ft_on_action_only=true name=qwen1B-on-math run_name=qwen1B_rl_baseline_ppo rl_algorithm.ent_coef=0.009 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 data=math rl_algorithm/reward=math metrics=math trainer.num_val_samples=750 rl_algorithm.policy.max_output_generation_length=2048 rl_algorithm.n_envs=16 rl_algorithm.n_steps=2 rl_algorithm.n_grad_accumulation_steps=16 rl_algorithm.policy.model.value_head.transformer_config.config.n_positions=2048
### Qwen1B: Curriculum with Beta ###
# gsm8k
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/qwen1B/curr_beta rl_algorithm.policy.ft_on_action_only=true name=qwen1B-on-gsm8k run_name=qwen1B_beta_ppo rl_algorithm.ent_coef=0.01 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 trainer.n_outer_loops=50 trainer.callbacks.portion_annealers.init_alpha=5.0 trainer.callbacks.portion_annealers.final_alpha=0.05 trainer.callbacks.portion_annealers.init_beta=5.0 trainer.callbacks.portion_annealers.final_beta=10.0 trainer.callbacks.portion_annealers.warmup_timesteps=0 trainer.callbacks.portion_annealers.total_timesteps=50 data=gsm8k rl_algorithm/reward=gsm8k metrics=gsm8k trainer.num_val_samples=748
# math
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/qwen1B/curr_beta rl_algorithm.policy.ft_on_action_only=true name=qwen1B-on-math run_name=qwen1B_beta_ppo rl_algorithm.ent_coef=0.01 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 trainer.n_outer_loops=50 trainer.callbacks.portion_annealers.init_alpha=5.0 trainer.callbacks.portion_annealers.final_alpha=0.05 trainer.callbacks.portion_annealers.init_beta=5.0 trainer.callbacks.portion_annealers.final_beta=10.0 trainer.callbacks.portion_annealers.warmup_timesteps=0 trainer.callbacks.portion_annealers.total_timesteps=50 data=math rl_algorithm/reward=math metrics=math trainer.num_val_samples=750 rl_algorithm.n_envs=16 rl_algorithm.n_steps=2 rl_algorithm.batch_size=2 rl_algorithm.n_grad_accumulation_steps=16 rl_algorithm.policy.max_output_generation_length=2048 rl_algorithm.policy.model.value_head.transformer_config.config.n_positions=2048
### Qwen1B: Curriculum with Uniform ###
# gsm8k
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/qwen1B/curr_uniform rl_algorithm.policy.ft_on_action_only=true name=qwen1B-on-gsm8k run_name=qwen1B_unif_curr_ppo rl_algorithm.ent_coef=0.01 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 trainer.n_outer_loops=50 trainer.callbacks.portion_annealers.lower_bound_init_portion=0.0 trainer.callbacks.portion_annealers.lower_bound_final_portion=0.0 trainer.callbacks.portion_annealers.upper_bound_init_portion=1.0 trainer.callbacks.portion_annealers.upper_bound_final_portion=0.0 trainer.callbacks.portion_annealers.warmup_timesteps=0 trainer.callbacks.portion_annealers.total_timesteps=50 data=gsm8k rl_algorithm/reward=gsm8k metrics=gsm8k trainer.num_val_samples=748
# math
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/qwen1B/curr_uniform rl_algorithm.policy.ft_on_action_only=true name=qwen1B-on-math run_name=qwen1B_unif_curr_ppo rl_algorithm.ent_coef=0.01 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 trainer.n_outer_loops=50 trainer.callbacks.portion_annealers.lower_bound_init_portion=0.0 trainer.callbacks.portion_annealers.lower_bound_final_portion=0.0 trainer.callbacks.portion_annealers.upper_bound_init_portion=1.0 trainer.callbacks.portion_annealers.upper_bound_final_portion=0.0 trainer.callbacks.portion_annealers.warmup_timesteps=0 trainer.callbacks.portion_annealers.total_timesteps=50 data=math rl_algorithm/reward=math metrics=math trainer.num_val_samples=750 rl_algorithm.n_envs=16 rl_algorithm.n_steps=2 rl_algorithm.batch_size=2 rl_algorithm.n_grad_accumulation_steps=16 rl_algorithm.policy.max_output_generation_length=2048 rl_algorithm.policy.model.value_head.transformer_config.config.n_positions=2048
### Qwen1B: SFT as Baseline ###
# gsm8k
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/sft/qwen1B data=gsm8k metrics=gsm8k rl_algorithm/reward=gsm8k trainer.num_val_samples=748 trainer.n_outer_loops=50 run_name=sft_qwen1B_gsm8k
# Math
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/sft/qwen1B data=math metrics=math rl_algorithm/reward=math trainer.num_val_samples=750 trainer.n_outer_loops=50 run_name=sft_qwen1B_math rl_algorithm.policy.max_output_generation_length=2048 rl_algorithm.n_envs=16
### Qwen7B: PPO Baselines ###
# gsm8k
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/qwen7B/baseline_sft trainer.n_outer_loops=50 rl_algorithm.policy.ft_on_action_only=true name=qwen7B-on-gsm8k run_name=qwen7B_rl_baseline_ppo rl_algorithm.ent_coef=0.009 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 data=gsm8k rl_algorithm/reward=gsm8k metrics=gsm8k trainer.num_val_samples=748
# math
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/qwen7B/baseline_sft trainer.n_outer_loops=50 rl_algorithm.policy.ft_on_action_only=true name=qwen7B-on-math run_name=qwen7B_rl_baseline_ppo rl_algorithm.ent_coef=0.009 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 data=math rl_algorithm/reward=math metrics=math trainer.num_val_samples=750 rl_algorithm.policy.max_output_generation_length=2048 rl_algorithm.n_envs=16 rl_algorithm.n_steps=2 rl_algorithm.n_grad_accumulation_steps=16 rl_algorithm.policy.model.value_head.transformer_config.config.n_positions=2048
### Qwen7B: Curriculum with Beta ###
# gsm8k
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/qwen7B/curr_beta rl_algorithm.policy.ft_on_action_only=true name=qwen7B-on-gsm8k run_name=qwen7B_beta_ppo rl_algorithm.ent_coef=0.01 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 trainer.n_outer_loops=50 trainer.callbacks.portion_annealers.init_alpha=5.0 trainer.callbacks.portion_annealers.final_alpha=0.05 trainer.callbacks.portion_annealers.init_beta=5.0 trainer.callbacks.portion_annealers.final_beta=10.0 trainer.callbacks.portion_annealers.warmup_timesteps=0 trainer.callbacks.portion_annealers.total_timesteps=50 data=gsm8k rl_algorithm/reward=gsm8k metrics=gsm8k trainer.num_val_samples=748
# math
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/qwen7B/curr_beta rl_algorithm.policy.ft_on_action_only=true name=qwen7B-on-math run_name=qwen7B_beta_ppo rl_algorithm.ent_coef=0.01 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 trainer.n_outer_loops=50 trainer.callbacks.portion_annealers.init_alpha=5.0 trainer.callbacks.portion_annealers.final_alpha=0.05 trainer.callbacks.portion_annealers.init_beta=5.0 trainer.callbacks.portion_annealers.final_beta=10.0 trainer.callbacks.portion_annealers.warmup_timesteps=0 trainer.callbacks.portion_annealers.total_timesteps=50 data=math rl_algorithm/reward=math metrics=math trainer.num_val_samples=750 rl_algorithm.n_envs=16 rl_algorithm.n_steps=2 rl_algorithm.batch_size=2 rl_algorithm.n_grad_accumulation_steps=16 rl_algorithm.policy.max_output_generation_length=2048 rl_algorithm.policy.model.value_head.transformer_config.config.n_positions=2048
### Qwen7B: Curriculum with Uniform ###
# gsm8k
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/qwen7B/curr_uniform rl_algorithm.policy.ft_on_action_only=true name=qwen7B-on-gsm8k run_name=qwen7B_unif_curr_ppo rl_algorithm.ent_coef=0.01 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 trainer.n_outer_loops=50 trainer.callbacks.portion_annealers.lower_bound_init_portion=0.0 trainer.callbacks.portion_annealers.lower_bound_final_portion=0.0 trainer.callbacks.portion_annealers.upper_bound_init_portion=1.0 trainer.callbacks.portion_annealers.upper_bound_final_portion=0.0 trainer.callbacks.portion_annealers.warmup_timesteps=0 trainer.callbacks.portion_annealers.total_timesteps=50 data=gsm8k rl_algorithm/reward=gsm8k metrics=gsm8k trainer.num_val_samples=748
# math
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/ppo/qwen7B/curr_uniform rl_algorithm.policy.ft_on_action_only=true name=qwen7B-on-math run_name=qwen7B_unif_curr_ppo rl_algorithm.ent_coef=0.01 rl_algorithm.vf_coef=0.01 rl_algorithm.base_kl_coef=0.01 trainer.n_outer_loops=50 trainer.callbacks.portion_annealers.lower_bound_init_portion=0.0 trainer.callbacks.portion_annealers.lower_bound_final_portion=0.0 trainer.callbacks.portion_annealers.upper_bound_init_portion=1.0 trainer.callbacks.portion_annealers.upper_bound_final_portion=0.0 trainer.callbacks.portion_annealers.warmup_timesteps=0 trainer.callbacks.portion_annealers.total_timesteps=50 data=math rl_algorithm/reward=math metrics=math trainer.num_val_samples=750 rl_algorithm.n_envs=16 rl_algorithm.n_steps=2 rl_algorithm.batch_size=2 rl_algorithm.n_grad_accumulation_steps=16 rl_algorithm.policy.max_output_generation_length=2048 rl_algorithm.policy.model.value_head.transformer_config.config.n_positions=2048
### Qwen7B: SFT as Baseline ###
# gsm8k
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/sft/qwen7B data=gsm8k metrics=gsm8k rl_algorithm/reward=gsm8k trainer.num_val_samples=748 trainer.n_outer_loops=50 run_name=sft_qwen7B_gsm8k
# Math
python src/train.py rl_algorithm.policy.model.language_model.pretrained_model_name_or_path=<PATH-TO-THE-WARMED-UP-POLICY> experiment=train/sft/qwen7B data=math metrics=math rl_algorithm/reward=math trainer.num_val_samples=750 trainer.n_outer_loops=50 run_name=sft_qwen7B_math rl_algorithm.policy.max_output_generation_length=2048 rl_algorithm.n_envs=16