@@ -21,6 +21,7 @@ eval_datasets: "ppo-kk/5ppl/test.jsonl" # Path to the evaluation dataset
21
21
prompt_key : "src" # Key for the prompt in the dataset
22
22
response_key : "tgt" # Key for the response in the dataset
23
23
dataloader_drop_last : true # Whether to drop the last incomplete batch in the DataLoader
24
+ dataloader_shuffle : false # Whether to shuffle the training dataset
24
25
balance_batch : true # Whether to balance batch size across dataset_world_size
25
26
use_remove_padding : true # Whether to remove padding tokens in the input
26
27
@@ -46,7 +47,7 @@ rollout_quant_type: "" # Quantization type, e.g., "weight_only_int8"
46
47
# training args
47
48
do_train : true # Whether to perform training
48
49
seed : 42 # Random seed for reproducibility
49
- global_batch_size : 4 # Global batch size for training
50
+ global_batch_size : 8 # Global batch size for training
50
51
global_gen_batch_size : -1 # Global generation batch size for dynamic sampling
51
52
global_mini_batch_size : -1 # Mini-batch size for training
52
53
rollout_n : 8 # Number of rollouts
@@ -65,7 +66,7 @@ adam_beta1: 0.9 # AdamW optimizer beta1
65
66
adam_beta2 : 0.999 # AdamW optimizer beta2
66
67
adam_epsilon : 1e-8 # AdamW optimizer epsilon
67
68
max_grad_norm : 1.0 # Maximum gradient norm for clipping
68
- max_steps : -1 # Maximum number of training steps
69
+ max_steps : 3600 # Maximum number of training steps
69
70
save_steps : 300 # Number of steps between model saves
70
71
save_strategy : "steps" # Strategy for saving models
71
72
ignore_save_lr_and_optim : true # Whether to skip saving the learning rate and optimizer state (leave empty if not specified)
@@ -98,7 +99,7 @@ eval_steps: 20 # Number of steps between evaluations
98
99
99
100
# device memory optimization args
100
101
use_flash_attention : true # Whether to use fused attention operations
101
- use_fused_rms_norm : true # Whether to use fused RMS norm operations; requires installing fused_ln from slm/model_zoo/gpt-3/external_ops
102
+ use_fused_rms_norm : false # Whether to use fused RMS norm operations; requires installing fused_ln from slm/model_zoo/gpt-3/external_ops
102
103
use_fused_rope : false # Whether to use fused rope operations
103
104
use_fused_head_and_loss_fn : true # Whether to use fused head and loss function
104
105
use_fused_linear : true # Whether to use fused linear operations
@@ -115,4 +116,4 @@ release_grads: true # Whether to release gradients
115
116
offload_optim : false # Whether to offload optimizer to pinned memory
116
117
117
118
# benchmark args
118
- skip_profile_timer : false # Whether to skip profiling timer
119
+ skip_profile_timer : false # Whether to skip profiling timer
0 commit comments