Add config for single node runs #3604

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account


Merged: 4 commits, Jun 11, 2025

7 changes: 4 additions & 3 deletions sdk/python/jobs/grpo/aml_setup.py
@@ -33,6 +33,7 @@
AML_SUBSCRIPTION = "<SUBSCRIPTION_ID>"
AML_RESOURCE_GROUP = "<RESOURCE_GROUP_NAME>"
AML_WORKSPACE_NAME = "<WORKSPACE_NAME>"
+N_NODES = 2 # Number of nodes to use for training, options: [1, 2]

# Initialize the MLClient to connect to your Azure ML workspace
ml_client = MLClient(
@@ -97,7 +98,7 @@ def setup_compute():
"""
# Compute Cluster Setup: Select or Create GPU Compute for Training

-# Specify the desired Azure VM size (default: 8 x H100 GPUs). This job requires falsh attention and needs A100 or H100 GPUs.
+# Specify the desired Azure VM size (default: 8 x H100 GPUs). This job requires flash attention and needs A100 or H100 GPUs.
compute_cluster_size = "STANDARD_ND96ISR_H100_V5"

# Name of the compute cluster to use (change if you have a different cluster)
@@ -117,8 +118,8 @@
name=compute_cluster,
size=compute_cluster_size,
tier="Dedicated",
-max_instances=2, # Increase for multi-node training
-min_instances=2,
+max_instances=N_NODES,
+min_instances=N_NODES,
)
ml_client.compute.begin_create_or_update(compute).wait()
print("✅ Compute cluster created successfully.")
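The `aml_setup.py` change above replaces the hard-coded two-node cluster with the new `N_NODES` switch, so a single-node run provisions exactly one `STANDARD_ND96ISR_H100_V5` instance. A minimal sketch of how the parameterized provisioning reads once the diff is applied, assuming the authenticated `ml_client` created earlier in `aml_setup.py` and an illustrative cluster name:

```python
from azure.ai.ml.entities import AmlCompute

N_NODES = 1  # 1 for a single-node run, 2 for the original multi-node setup

compute = AmlCompute(
    name="grpo-h100-cluster",          # illustrative name; the script defines its own cluster name
    size="STANDARD_ND96ISR_H100_V5",   # 8 x H100 per node; flash attention needs A100 or H100
    tier="Dedicated",
    min_instances=N_NODES,             # both bounds now follow N_NODES, as in the diff
    max_instances=N_NODES,
)
ml_client.compute.begin_create_or_update(compute).wait()  # ml_client assumed from aml_setup.py
```

The notebook hunks below wire the same `N_NODES` value into the training job submission.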
@@ -354,20 +354,20 @@
"from azure.ai.ml import command, Input, Output\n",
"from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment, Model\n",
"from azure.ai.ml.constants import AssetTypes\n",
"from aml_setup import N_NODES\n",
"\n",
"# Below is a command job that takes grpo config, deepspeed config, the dataset and the model parameters as inputs.\n",
"# This kicks off a distributed job on a gpu cluster with 2 nodes (8XH100 on each).\n",
"command_str = \"\"\"python BldDemo_Reasoning_Train.py \\\n",
" --config grpo_trainer_config.yaml \\\n",
"command_str = f\"\"\"python BldDemo_Reasoning_Train.py \\\n",
" --config {\"grpo_trainer_config.yaml\" if N_NODES==2 else \"grpo_trainer_config_single_node.yaml\"} \\\n",
" --model_name_or_path ${{inputs.model_dir}} \\\n",
" --dataset_name ${{inputs.dataset}} \\\n",
" --output_dir ${{outputs.checkpoint_folder}} \\\n",
" --final_model_save_path ${{outputs.mlflow_model_folder}} \\\n",
" --deepspeed deepspeed_stage3_zero_config.json \\\n",
" --mlflow_task_type \"chat-completion\" \"\"\"\n",
"\n",
"# Base model name is passed as an argument to the script.\n",
"command_str += f'--base_model_name \"{model.name}\"'\n",
" --deepspeed {\"deepspeed_stage3_zero_config.json\" if N_NODES==2 else \"deepspeed_stage3_zero_config_single_node.json\"} \\\n",
" --mlflow_task_type \"chat-completion\" \\\n",
" --base_model_name \"{model.name}\"\n",
"\"\"\"\n",
"\n",
"# Model directory and dataset as job inputs.\n",
"job_input = {\n",
@@ -400,15 +400,15 @@
" command=command_str,\n",
" environment=environment,\n",
" compute=compute.name,\n",
" instance_count=2,\n",
" instance_count=N_NODES,\n",
" outputs=job_output,\n",
" distribution={\n",
" \"type\": \"PyTorch\",\n",
" # set process count to the number of gpus per node\n",
" \"process_count_per_instance\": 8,\n",
" },\n",
" experiment_name=\"build-demo-reasoning-training-jobs\",\n",
" display_name=\"build-demo-reasoning-train-batchsize-16\",\n",
" display_name=f\"build-demo-reasoning-train-batchsize-{N_NODES*8}\",\n",
" properties={\"_azureml.LogTrainingMetricsToAzMon\": \"true\"},\n",
" # Environment variables to enable profiling\n",
" environment_variables={\n",
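Taken together, the notebook edits derive the GRPO config, the DeepSpeed config, the instance count, and the display name from `N_NODES` instead of hard-coding two nodes. Below is a condensed sketch of that flow, not the notebook's exact cell: it assumes `ml_client`, `model`, `compute`, `environment`, `job_input`, and `job_output` are defined as in the surrounding notebook, and that the training sources sit in a local `src` folder.

```python
from azure.ai.ml import command
from aml_setup import N_NODES

# Pick the multi-node configs or the new single-node configs added in this PR.
grpo_cfg = "grpo_trainer_config.yaml" if N_NODES == 2 else "grpo_trainer_config_single_node.yaml"
ds_cfg = ("deepspeed_stage3_zero_config.json" if N_NODES == 2
          else "deepspeed_stage3_zero_config_single_node.json")

# Keep the AzureML ${{...}} placeholders literal; only the N_NODES- and model-dependent parts are interpolated.
command_str = (
    "python BldDemo_Reasoning_Train.py "
    f"--config {grpo_cfg} "
    "--model_name_or_path ${{inputs.model_dir}} "
    "--dataset_name ${{inputs.dataset}} "
    "--output_dir ${{outputs.checkpoint_folder}} "
    "--final_model_save_path ${{outputs.mlflow_model_folder}} "
    f"--deepspeed {ds_cfg} "
    '--mlflow_task_type "chat-completion" '
    f'--base_model_name "{model.name}"'
)

job = command(
    code="./src",                       # assumption: scripts and configs live under ./src
    command=command_str,
    inputs=job_input,
    outputs=job_output,
    environment=environment,
    compute=compute.name,
    instance_count=N_NODES,             # 1 or 2 nodes
    distribution={"type": "PyTorch", "process_count_per_instance": 8},  # 8 GPUs per node
    display_name=f"build-demo-reasoning-train-batchsize-{N_NODES * 8}",
)
ml_client.jobs.create_or_update(job)
```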
@@ -0,0 +1,44 @@
{
"deepspeed_multinode_launcher": "standard",
"offload_optimizer_device": "cpu",
"offload_param_device": "cpu",
"zero3_init_flag": true,
"zero3_save_16bit_model": true,
"zero_stage": 3,
"overlap_comm": true,
"sub_group_size": 1000000,
"stage3_max_live_parameters": 1000,
"stage3_max_reuse_distance": 1000000,
"stage3_gather_16bit_weights_on_model_save": true,
"train_batch_size": 8,
"train_micro_batch_size_per_gpu": 1,
"gradient_accumulation_steps" : 1,
"reduce_bucket_size": 5000000,
"stage3_prefetch_bucket_size": 5000000,
"stage3_param_persistence_threshold": 5000000,
"memory_efficient_linear": true,
"contiguous_gradients": true,
"zero_optimization": {
"stage": 3,
"cpu_offload": true,
"contiguous_gradients": true,
"sub_group_size": 1000000,
"stage3_prefetch_bucket_size": 5000000,
"stage3_param_persistence_threshold": 5000000,
"stage3_max_live_parameters": 1000,
"stage3_max_reuse_distance": 1000000,
"stage3_gather_16bit_weights_on_model_save": true
},
"optimizer": {
"type": "Adam",
"params": {
"lr": 3e-06,
"betas": [
0.9,
0.999
],
"eps": 1e-8,
"weight_decay": 0.0
}
}
}
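The new DeepSpeed file above (presumably the `deepspeed_stage3_zero_config_single_node.json` that the notebook's `--deepspeed` argument selects when `N_NODES` is 1; this view does not show its path) sets `train_batch_size` to 8, which is what one 8-GPU node yields: micro batch 1 x gradient accumulation 1 x 8 GPUs = 8. A small sanity check of that arithmetic, under those assumptions:

```python
import json

# Assumed file name, taken from the notebook's --deepspeed argument above.
with open("deepspeed_stage3_zero_config_single_node.json") as f:
    cfg = json.load(f)

gpus_per_node = 8  # a single STANDARD_ND96ISR_H100_V5 node exposes 8 H100s
effective = (cfg["train_micro_batch_size_per_gpu"]
             * cfg["gradient_accumulation_steps"]
             * gpus_per_node)
assert cfg["train_batch_size"] == effective  # 1 * 1 * 8 == 8
print(f"single-node effective batch size: {effective}")
```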
76 changes: 76 additions & 0 deletions sdk/python/jobs/grpo/src/grpo_trainer_config_single_node.yaml
@@ -0,0 +1,76 @@
# There are two modes to run vllm for generating samples: server and colocate.
# We use the colocate mode here, which runs the sampler and trainer on the same GPU.
use_vllm: True
vllm_mode: "colocate"
# This ratio controls memory distribution between sampler and trainer.
vllm_gpu_memory_utilization: 0.25
vllm_tensor_parallel_size: 4

# Evaluation settings
do_eval: true
eval_strategy: steps
eval_steps: 5
eval_on_start: true
per_device_eval_batch_size: 1

# Flash Attention 2 distributes computations more granularly across GPU threads, reducing bottlenecks and improving throughput
attn_implementation: flash_attention_2

# Sampler settings
max_prompt_length: 1500
max_completion_length: 5000
# Number of samples to generate for each prompt
num_generations: 2
log_completions: true
num_completions_to_print: 2

# Enable logging metrics to Azure ML
report_to:
- azure_ml

# Logging settings
log_level: info
logging_first_step: true
logging_steps: 5
logging_strategy: steps

# Checkpoint settings
save_strategy: "steps"
save_steps: 100
save_total_limit: 20

# Reward settings for scoring generated samples.
reward_funcs:
- accuracy
- format
# reward = 0.8*accuracy + 0.2*format
# These weights help us define the importance of each reward function.
reward_weights:
- 0.8
- 0.2

# Training settings
max_steps: 100
num_train_epochs: 3
per_device_train_batch_size: 1
learning_rate: 3.0e-06
lr_scheduler_type: cosine
warmup_ratio: 0.1

# Specifies how many batches to process before performing a backward/update step.
# A higher value improves GPU utilization, but it comes with the risk of running out of memory.
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
run_name: rlm8
seed: 42
overwrite_output_dir: false
push_to_hub: false
hub_model_id: NA
hub_strategy: checkpoint
hub_private_repo: true
model_revision: main
torch_dtype: bfloat16
bf16: true
dataset_config: default
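
The reward section in this new single-node trainer config combines two reward functions with fixed weights. A tiny conceptual sketch of what that weighting means (not the GRPO trainer's actual implementation):

```python
# Conceptual sketch of the weighted reward described in the YAML comments above.
reward_weights = [0.8, 0.2]  # accuracy, format

def combined_reward(accuracy_reward: float, format_reward: float) -> float:
    # reward = 0.8 * accuracy + 0.2 * format
    return reward_weights[0] * accuracy_reward + reward_weights[1] * format_reward

print(combined_reward(1.0, 1.0))  # correct and well-formatted sample: 1.0
print(combined_reward(1.0, 0.0))  # correct but badly formatted: 0.8
print(combined_reward(0.0, 1.0))  # well-formatted but wrong: 0.2
```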