Add config for single node runs #3604

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account


Merged: 4 commits, Jun 11, 2025

7 changes: 4 additions & 3 deletions sdk/python/jobs/grpo/aml_setup.py
@@ -33,6 +33,7 @@
AML_SUBSCRIPTION = "<SUBSCRIPTION_ID>"
AML_RESOURCE_GROUP = "<RESOURCE_GROUP_NAME>"
AML_WORKSPACE_NAME = "<WORKSPACE_NAME>"
+N_NODES = 2 # Number of nodes to use for training, options: [1, 2]

# Initialize the MLClient to connect to your Azure ML workspace
ml_client = MLClient(
@@ -97,7 +98,7 @@ def setup_compute():
"""
# Compute Cluster Setup: Select or Create GPU Compute for Training

-# Specify the desired Azure VM size (default: 8 x H100 GPUs). This job requires falsh attention and needs A100 or H100 GPUs.
+# Specify the desired Azure VM size (default: 8 x H100 GPUs). This job requires flash attention and needs A100 or H100 GPUs.
compute_cluster_size = "STANDARD_ND96ISR_H100_V5"

# Name of the compute cluster to use (change if you have a different cluster)
@@ -117,8 +118,8 @@
name=compute_cluster,
size=compute_cluster_size,
tier="Dedicated",
-max_instances=2, # Increase for multi-node training
-min_instances=2,
+max_instances=N_NODES,
+min_instances=N_NODES,
)
ml_client.compute.begin_create_or_update(compute).wait()
print("✅ Compute cluster created successfully.")
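The `aml_setup.py` change above replaces the hard-coded two-node cluster with the new `N_NODES` switch, so a single-node run provisions exactly one `STANDARD_ND96ISR_H100_V5` instance. A minimal sketch of how the parameterized provisioning reads once the diff is applied, assuming the authenticated `ml_client` created earlier in `aml_setup.py` and an illustrative cluster name:

```python
from azure.ai.ml.entities import AmlCompute

N_NODES = 1  # 1 for a single-node run, 2 for the original multi-node setup

compute = AmlCompute(
    name="grpo-h100-cluster",          # illustrative name; the script defines its own cluster name
    size="STANDARD_ND96ISR_H100_V5",   # 8 x H100 per node; flash attention needs A100 or H100
    tier="Dedicated",
    min_instances=N_NODES,             # both bounds now follow N_NODES, as in the diff
    max_instances=N_NODES,
)
ml_client.compute.begin_create_or_update(compute).wait()  # ml_client assumed from aml_setup.py
```

The notebook hunks below wire the same `N_NODES` value into the training job submission.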
@@ -354,20 +354,20 @@
"from azure.ai.ml import command, Input, Output\n",
"from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment, Model\n",
"from azure.ai.ml.constants import AssetTypes\n",
"from aml_setup import N_NODES\n",
"\n",
"# Below is a command job that takes grpo config, deepspeed config, the dataset and the model parameters as inputs.\n",
"# This kicks off a distributed job on a gpu cluster with 2 nodes (8XH100 on each).\n",
"command_str = \"\"\"python BldDemo_Reasoning_Train.py \\\n",
" --config grpo_trainer_config.yaml \\\n",
"command_str = f\"\"\"python BldDemo_Reasoning_Train.py \\\n",
" --config {\"grpo_trainer_config.yaml\" if N_NODES==2 else \"grpo_trainer_config_single_node.yaml\"} \\\n",
" --model_name_or_path ${{inputs.model_dir}} \\\n",
" --dataset_name ${{inputs.dataset}} \\\n",
" --output_dir ${{outputs.checkpoint_folder}} \\\n",
" --final_model_save_path ${{outputs.mlflow_model_folder}} \\\n",
" --deepspeed deepspeed_stage3_zero_config.json \\\n",
" --mlflow_task_type \"chat-completion\" \"\"\"\n",
"\n",
"# Base model name is passed as an argument to the script.\n",
"command_str += f'--base_model_name \"{model.name}\"'\n",
" --deepspeed {\"deepspeed_stage3_zero_config.json\" if N_NODES==2 else \"deepspeed_stage3_zero_config_single_node.json\"} \\\n",
" --mlflow_task_type \"chat-completion\" \\\n",
" --base_model_name \"{model.name}\"\n",
"\"\"\"\n",
"\n",
"# Model directory and dataset as job inputs.\n",
"job_input = {\n",
@@ -400,15 +400,15 @@
" command=command_str,\n",
" environment=environment,\n",
" compute=compute.name,\n",
" instance_count=2,\n",
" instance_count=N_NODES,\n",
" outputs=job_output,\n",
" distribution={\n",
" \"type\": \"PyTorch\",\n",
" # set process count to the number of gpus per node\n",
" \"process_count_per_instance\": 8,\n",
" },\n",
" experiment_name=\"build-demo-reasoning-training-jobs\",\n",
" display_name=\"build-demo-reasoning-train-batchsize-16\",\n",
" display_name=f\"build-demo-reasoning-train-batchsize-{N_NODES*8}\",\n",
" properties={\"_azureml.LogTrainingMetricsToAzMon\": \"true\"},\n",
" # Environment variables to enable profiling\n",
" environment_variables={\n",
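Taken together, the notebook edits derive the GRPO config, the DeepSpeed config, the instance count, and the display name from `N_NODES` instead of hard-coding two nodes. Below is a condensed sketch of that flow, not the notebook's exact cell: it assumes `ml_client`, `model`, `compute`, `environment`, `job_input`, and `job_output` are defined as in the surrounding notebook, and that the training sources sit in a local `src` folder.

```python
from azure.ai.ml import command
from aml_setup import N_NODES

# Pick the multi-node configs or the new single-node configs added in this PR.
grpo_cfg = "grpo_trainer_config.yaml" if N_NODES == 2 else "grpo_trainer_config_single_node.yaml"
ds_cfg = ("deepspeed_stage3_zero_config.json" if N_NODES == 2
          else "deepspeed_stage3_zero_config_single_node.json")

# Keep the AzureML ${{...}} placeholders literal; only the N_NODES- and model-dependent parts are interpolated.
command_str = (
    "python BldDemo_Reasoning_Train.py "
    f"--config {grpo_cfg} "
    "--model_name_or_path ${{inputs.model_dir}} "
    "--dataset_name ${{inputs.dataset}} "
    "--output_dir ${{outputs.checkpoint_folder}} "
    "--final_model_save_path ${{outputs.mlflow_model_folder}} "
    f"--deepspeed {ds_cfg} "
    '--mlflow_task_type "chat-completion" '
    f'--base_model_name "{model.name}"'
)

job = command(
    code="./src",                       # assumption: scripts and configs live under ./src
    command=command_str,
    inputs=job_input,
    outputs=job_output,
    environment=environment,
    compute=compute.name,
    instance_count=N_NODES,             # 1 or 2 nodes
    distribution={"type": "PyTorch", "process_count_per_instance": 8},  # 8 GPUs per node
    display_name=f"build-demo-reasoning-train-batchsize-{N_NODES * 8}",
)
ml_client.jobs.create_or_update(job)
```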
@@ -0,0 +1,44 @@
{
"deepspeed_multinode_launcher": "standard",
"offload_optimizer_device": "cpu",
"offload_param_device": "cpu",
"zero3_init_flag": true,
"zero3_save_16bit_model": true,
"zero_stage": 3,
"overlap_comm": true,
"sub_group_size": 1000000,
"stage3_max_live_parameters": 1000,
"stage3_max_reuse_distance": 1000000,
"stage3_gather_16bit_weights_on_model_save": true,
"train_batch_size": 8,
"train_micro_batch_size_per_gpu": 1,
"gradient_accumulation_steps" : 1,
"reduce_bucket_size": 5000000,
"stage3_prefetch_bucket_size": 5000000,
"stage3_param_persistence_threshold": 5000000,
"memory_efficient_linear": true,
"contiguous_gradients": true,
"zero_optimization": {
"stage": 3,
"cpu_offload": true,
"contiguous_gradients": true,
"sub_group_size": 1000000,
"stage3_prefetch_bucket_size": 5000000,
"stage3_param_persistence_threshold": 5000000,
"stage3_max_live_parameters": 1000,
"stage3_max_reuse_distance": 1000000,
"stage3_gather_16bit_weights_on_model_save": true
},
"optimizer": {
"type": "Adam",
"params": {
"lr": 3e-06,
"betas": [
0.9,
0.999
],
"eps": 1e-8,
"weight_decay": 0.0
}
}
}
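The new DeepSpeed file above (presumably the `deepspeed_stage3_zero_config_single_node.json` that the notebook's `--deepspeed` argument selects when `N_NODES` is 1; this view does not show its path) sets `train_batch_size` to 8, which is what one 8-GPU node yields: micro batch 1 x gradient accumulation 1 x 8 GPUs = 8. A small sanity check of that arithmetic, under those assumptions:

```python
import json

# Assumed file name, taken from the notebook's --deepspeed argument above.
with open("deepspeed_stage3_zero_config_single_node.json") as f:
    cfg = json.load(f)

gpus_per_node = 8  # a single STANDARD_ND96ISR_H100_V5 node exposes 8 H100s
effective = (cfg["train_micro_batch_size_per_gpu"]
             * cfg["gradient_accumulation_steps"]
             * gpus_per_node)
assert cfg["train_batch_size"] == effective  # 1 * 1 * 8 == 8
print(f"single-node effective batch size: {effective}")
```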
76 changes: 76 additions & 0 deletions sdk/python/jobs/grpo/src/grpo_trainer_config_single_node.yaml
@@ -0,0 +1,76 @@
# There are two modes to run vllm for generating samples: server and colocate.
# We use the colocate mode here, which runs the sampler and trainer on the same GPU.
use_vllm: True
vllm_mode: "colocate"
# This ratio controls memory distribution between sampler and trainer.
vllm_gpu_memory_utilization: 0.25
vllm_tensor_parallel_size: 4

# Evaluation settings
do_eval: true
eval_strategy: steps
eval_steps: 5
eval_on_start: true
per_device_eval_batch_size: 1

# Flash Attention 2 distributes computations more granularly across GPU threads, reducing bottlenecks and improving throughput
attn_implementation: flash_attention_2

# Sampler settings
max_prompt_length: 1500
max_completion_length: 5000
# Number of samples to generate for each prompt
num_generations: 2
log_completions: true
num_completions_to_print: 2

# Enable logging metrics to Azure ML
report_to:
- azure_ml

# Logging settings
log_level: info
logging_first_step: true
logging_steps: 5
logging_strategy: steps

# Checkpoint settings
save_strategy: "steps"
save_steps: 100
save_total_limit: 20

# Reward settings for scoring generated samples.
reward_funcs:
- accuracy
- format
# reward = 0.8*accuracy + 0.2*format
# These weights help us define the importance of each reward function.
reward_weights:
- 0.8
- 0.2

# Training settings
max_steps: 100
num_train_epochs: 3
per_device_train_batch_size: 1
learning_rate: 3.0e-06
lr_scheduler_type: cosine
warmup_ratio: 0.1

# Specifies how many batches to process before performing a backward/update step.
# A higher value improves GPU utilization, but it comes with the risk of running out of memory.
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
run_name: rlm8
seed: 42
overwrite_output_dir: false
push_to_hub: false
hub_model_id: NA
hub_strategy: checkpoint
hub_private_repo: true
model_revision: main
torch_dtype: bfloat16
bf16: true
dataset_config: default
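
The reward section in this new single-node trainer config combines two reward functions with fixed weights. A tiny conceptual sketch of what that weighting means (not the GRPO trainer's actual implementation):

```python
# Conceptual sketch of the weighted reward described in the YAML comments above.
reward_weights = [0.8, 0.2]  # accuracy, format

def combined_reward(accuracy_reward: float, format_reward: float) -> float:
    # reward = 0.8 * accuracy + 0.2 * format
    return reward_weights[0] * accuracy_reward + reward_weights[1] * format_reward

print(combined_reward(1.0, 1.0))  # correct and well-formatted sample: 1.0
print(combined_reward(1.0, 0.0))  # correct but badly formatted: 0.8
print(combined_reward(0.0, 1.0))  # well-formatted but wrong: 0.2
```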