16 changes: 11 additions & 5 deletions examples/megatron-qwen3/training/config.py
@@ -2,21 +2,27 @@
from truss.base import truss_config

BASE_IMAGE = "baseten/megatron:0.0.1"
PROJECT_NAME = "Megatron-qwen3-30b-a3b 2nodes"
PROJECT_NAME = "Megatron Qwen3 H200"
NODE_COUNT = 2
MICRO_BATCH_SIZE = 1
NUM_GPUS = 8

training_runtime = definitions.Runtime(
start_commands=["/bin/sh -c 'chmod +x ./run.sh && ./run.sh'"],
environment_variables={
"HF_TOKEN": definitions.SecretReference(
name="hf_access_token"
name="bt_hf_access_token"
), # The name of the HF Access Token secret in your B10 account
"HF_HUB_ENABLE_HF_TRANSFER": "true",
"WANDB_API_KEY": definitions.SecretReference(
name="wandb_api_key"
), # comment this out if you don't want to use wandb
"GLOBAL_BATCH_SIZE": str(MICRO_BATCH_SIZE * NUM_GPUS * NODE_COUNT),
"MICRO_BATCH_SIZE": str(MICRO_BATCH_SIZE),
"SHOULD_CLEAR_CACHE": "false",
},
cache_config=definitions.CacheConfig(
enabled=False,
enabled=True,
),
checkpointing_config=definitions.CheckpointingConfig(
enabled=True,
@@ -25,10 +31,10 @@

training_compute = definitions.Compute(
accelerator=truss_config.AcceleratorSpec(
accelerator=truss_config.Accelerator.H100,
accelerator=truss_config.Accelerator.H200,
count=8,
),
node_count=2,
node_count=NODE_COUNT,
)

training_job = definitions.TrainingJob(
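For reference, the GLOBAL_BATCH_SIZE environment variable above is derived from the other constants in this file. A quick sanity check of the arithmetic, using the values from this diff:

    # Batch-size arithmetic from config.py, spelled out with this diff's values
    MICRO_BATCH_SIZE=1
    NUM_GPUS=8
    NODE_COUNT=2
    echo $(( MICRO_BATCH_SIZE * NUM_GPUS * NODE_COUNT ))   # 16, passed to the job as GLOBAL_BATCH_SIZE and then to megatron sft as --global_batch_size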
154 changes: 114 additions & 40 deletions examples/megatron-qwen3/training/run.sh
@@ -13,6 +13,28 @@ git clone https://github.com/NVIDIA/Megatron-LM.git Megatron-LM --branch core_r0
cd /root/
export DATASET="zai-org/LongAlign-10k"
export MODEL_ID="Qwen/Qwen3-30B-A3B-Instruct-2507"
export CKPT_DIR=${BT_RW_CACHE_DIR}/${BT_TRAINING_JOB_ID}
Comment from the PR author:
Here, the checkpoint directory lives in the cache and is namespaced to the training job ID, so the next job doesn't overwrite the data written here.
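A minimal sketch of how that namespacing resolves, using hypothetical values for the two platform-provided variables:

    # Hypothetical values, for illustration only
    BT_RW_CACHE_DIR=/mnt/cache
    BT_TRAINING_JOB_ID=job-20250925-abc123
    CKPT_DIR=${BT_RW_CACHE_DIR}/${BT_TRAINING_JOB_ID}
    echo "$CKPT_DIR"   # /mnt/cache/job-20250925-abc123; the next job writes under its own job ID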

mkdir -p $CKPT_DIR

# Begin setup of rsync
if ! command -v rsync &> /dev/null; then
echo "Installing rsync..."
apt-get update && apt-get install -y rsync
fi

# Set up rsync in the background to sync checkpoints to the checkpointing directory
if [[ "${BT_NODE_RANK}" == "0" ]]; then
echo "Setting up continuous rsync from shared file system to checkpointing directory"
# Start a background loop that continuously syncs
(
while true; do
rsync -avz --delete $CKPT_DIR/ $BT_CHECKPOINT_DIR/
sleep 30 # Sync every 30 seconds
done
) &
RSYNC_PID=$!
echo "Continuous rsync started with PID: $RSYNC_PID"
fi
Comment from the PR author on lines +26 to +37:
We set up rsync in the background to move data from the cache to the checkpointing volume.
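For reference, the flags the loop relies on, shown here as an optional one-shot dry run rather than anything the script itself performs (-n is the only addition):

    # -a archive mode, -v verbose, -z compress in transit,
    # --delete removes files from the destination that no longer exist in the source,
    # -n (dry run) only lists what would be transferred
    rsync -avzn --delete "$CKPT_DIR/" "$BT_CHECKPOINT_DIR/"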


export MCORE_MODEL_DIR="Converted/Qwen3-30B-A3B-Instruct-2507-mcore"
swift export \
Expand All @@ -24,43 +46,95 @@ swift export \

echo "Done converting ckpt"

PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True NPROC_PER_NODE=$BT_NUM_GPUS NNODES=$BT_GROUP_SIZE NODE_RANK=$BT_NODE_RANK MASTER_ADDR=$BT_LEADER_ADDR megatron sft \
--load $MCORE_MODEL_DIR \
--dataset $DATASET \
--no_initialization false \
--split_dataset_ratio 0.01 \
--tensor_model_parallel_size 2 \
--pipeline_model_parallel_size 2 \
--expert_model_parallel_size 2 \
--moe_permute_fusion true \
--moe_grouped_gemm true \
--moe_shared_expert_overlap true \
--moe_aux_loss_coeff 1e-3 \
--micro_batch_size 1 \
--global_batch_size 8 \
--packing true \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 4 \
--train_iters 200 \
--eval_iters 40 \
--finetune true \
--cross_entropy_loss_fusion true \
--lr 1e-5 \
--lr_warmup_fraction 0.05 \
--min_lr 1e-6 \
--save $BT_CHECKPOINT_DIR \
--eval_interval 40 \
--save_interval 40 \
--max_length 32000 \
--num_workers 8 \
--dataset_num_proc 8 \
--no_save_optim true \
--no_save_rng true \
--sequence_parallel true \
--attention_backend flash \
--optimizer_cpu_offload true \
--use_precision_aware_optimizer true \
--use_hf 1 \
--wandb_project qwen3_moe_megatron \
--wandb_exp_name all_training_b10f
run_megatron_training() {
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True NPROC_PER_NODE=$BT_NUM_GPUS NNODES=$BT_GROUP_SIZE NODE_RANK=$BT_NODE_RANK MASTER_ADDR=$BT_LEADER_ADDR megatron sft \
--load $MCORE_MODEL_DIR \
--dataset $DATASET \
--no_initialization false \
--split_dataset_ratio 0.01 \
--tensor_model_parallel_size 2 \
--pipeline_model_parallel_size 2 \
--expert_model_parallel_size 2 \
--moe_permute_fusion true \
--moe_grouped_gemm true \
--moe_shared_expert_overlap true \
--moe_aux_loss_coeff 1e-3 \
--micro_batch_size $MICRO_BATCH_SIZE \
--global_batch_size $GLOBAL_BATCH_SIZE \
--packing true \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 4 \
--train_iters 5 \
--eval_iters 5 \
--finetune true \
--cross_entropy_loss_fusion true \
--lr 1e-5 \
--lr_warmup_fraction 0.05 \
--min_lr 1e-6 \
--save $CKPT_DIR \
--eval_interval 5 \
--save_interval 5 \
--max_length 32000 \
--num_workers 8 \
--dataset_num_proc 8 \
--no_save_optim true \
--no_save_rng true \
--sequence_parallel true \
--attention_backend flash \
--optimizer_cpu_offload true \
--use_precision_aware_optimizer true \
--use_hf 1 \
--wandb_project qwen3_moe_megatron \
Comment from the PR author:
  • Wandb expects the latest_checkpoint_iter.txt file to be accessible from all nodes
  • The Swift docs recommend checkpointing to "shared storage"
  • So we need to checkpoint to the training cache, which is on the shared file system

--wandb_exp_name $BT_TRAINING_JOB_NAME
}

set +e
run_megatron_training 2>&1 | tee training.log
EXIT_CODE=${PIPESTATUS[0]}  # exit status of run_megatron_training itself, not of tee
set -e # Re-enable exit on error
Comment from the PR author on lines +92 to +95:
Megatron, in highly distributed workloads, might hang at the end of training; some searching suggests this isn't uncommon (#1541, #735, #1207).

So what we're doing here is allowing a non-zero exit code, capturing it, and then re-enabling error detection.

Piping to training.log was something Claude suggested; I don't know the utility.
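For what it's worth, tee writes a copy of the combined stdout/stderr to training.log while still streaming it to the console, so the full log survives for post-mortem inspection. One subtlety with the pattern: because the command runs through a pipe, $? reports the exit status of tee rather than of the training function, so (assuming run.sh runs under bash, as its use of [[ ]] suggests) the left-hand status has to be read from PIPESTATUS. A minimal sketch, with long_running_command as a placeholder:

    set +e
    long_running_command 2>&1 | tee training.log
    EXIT_CODE=${PIPESTATUS[0]}   # exit status of long_running_command, not of tee
    set -e
    echo "exited with $EXIT_CODE"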


echo "Training completed with exit code: $EXIT_CODE"


if [[ "${BT_NODE_RANK}" == "0" ]]; then
echo "Stopping continuous rsync and performing final synchronization..."

# Kill the continuous rsync process
if [[ -n "$RSYNC_PID" ]]; then
echo "Killing continuous rsync process (PID: $RSYNC_PID)"
kill $RSYNC_PID 2>/dev/null || true
# Wait a moment for the process to terminate
sleep 2
fi

# Perform final synchronization to ensure everything is synced
echo "Performing final rsync..."
rsync -avz --delete $CKPT_DIR/ $BT_CHECKPOINT_DIR/
Comment from the PR author on lines +111 to +113:
Make sure everything is synced by doing a final, blocking rsync of the directories.


echo "Uploading checkpoints to hub..."
pushd $CKPT_DIR
ls -la
V0_DIR=$(echo v0-*)
popd
echo "V0_DIR: $V0_DIR"
Comment from the PR author on lines +116 to +120:
The save path includes an intermediate directory carrying a datestamp for the training run. One checkpoint lands at:

training_dir/v0-20250925-123459/iter_000040/....

and another checkpoint from the same run at:

training_dir/v0-20250925-123459/iter_000080/....

This is us figuring out "v0-20250925-123459".
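One caveat about that glob, noted here as a sketch rather than anything in the diff: bash passes an unmatched pattern through literally and expands a multiply-matched one to every match, so the snippet assumes exactly one v0-* directory exists under $CKPT_DIR.

    V0_DIR=$(echo v0-*)   # exactly one match -> "v0-20250925-123459"
                          # no match          -> the literal string "v0-*"
                          # several matches   -> all of them, space-separated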

swift export \
--mcore_model "${CKPT_DIR}/${V0_DIR}" \
--to_hf true \
--torch_dtype bfloat16 \
--output_dir megatron_output/hf_converted \
--push_to_hub true \
--hub_token $HF_TOKEN \
--hub_model_id rayraycano/megatron-qwen3-30b-a3b

echo "Final synchronization complete!"
Comment from the PR author on lines +121 to +130:
We convert the weights back to HF format here, but let's avoid uploading to the HF repository.
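If the upload should be skipped, a minimal sketch of the same export kept local; it simply drops the hub-related flags used above, so the converted weights only land in --output_dir:

    swift export \
      --mcore_model "${CKPT_DIR}/${V0_DIR}" \
      --to_hf true \
      --torch_dtype bfloat16 \
      --output_dir megatron_output/hf_converted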

# Optionally clear out cache. Set this in your config.py
if [[ "${SHOULD_CLEAR_CACHE}" == "true" ]]; then
echo "Clearing out cache..."
rm -rf $CKPT_DIR
fi
else
echo "Worker waiting for leader to rsync..."
sleep infinity
fi