11#!/usr/bin/env python
22
33import json
4+ import os
5+ import socket
6+ import traceback
47from functools import partial
58from pathlib import Path
6- from typing import Optional
9+ from typing import Any , Optional
710
811import click
912import click_pathlib
2932from modalities .models .huggingface_adapters .hf_adapter import HFModelAdapter
3033from modalities .running_env .cuda_env import CudaEnv
3134from modalities .util import print_rank_0
35+ from modalities .utils .benchmarking .benchmarking_utils import SweepSets , get_updated_sweep_status
36+ from modalities .utils .benchmarking .sweep_utils import SweepGenerator
3237from modalities .utils .communication_test import run_communication_test
3338
3439
@@ -50,21 +55,71 @@ def main() -> None:
5055 default = False ,
5156 help = "If set, run a communication test before training." ,
5257)
@click.option(
    "--experiment_id",
    type=str,
    default=None,
    help="Optional experiment ID to use for this run. If not provided, it will be derived from the config file path.",
)
@click.option(
    "--error_log_folder",
    type=click_pathlib.Path(),
    default=None,
    help="Optional path to a folder where error logs will be written.",
)
def CMD_entry_point_run_modalities(
    config_file_path: Path,
    test_comm: bool = False,
    experiment_id: Optional[str] = None,
    error_log_folder: Optional[Path] = None,
):
    """Entrypoint to run the model training.

    Args:
        config_file_path (Path): Path to the YAML training config file.
        test_comm (bool): If set, run a communication test before training.
        experiment_id (Optional[str]): Optional experiment ID to use for this run.
            It is forwarded unchanged to Main. Default is None.
        error_log_folder (Optional[Path]): Optional base path for error logs. On failure a file named
            "<stem>_<hostname>_<local_rank>.log" is written next to this path (in its parent directory).
            If None, no error log is written.

    Raises:
        RuntimeError: If any exception occurs during the communication test or the training run.
            The original exception is chained as the cause.
    """

    def _format_exception_as_json(e: Exception, environment: dict[str, Any]) -> str:
        # Serialize the error message, exception type name and formatted stack trace together
        # with the process environment into an indented JSON string.
        error = {
            "error": str(e),
            "type": type(e).__name__,
            "stacktrace": traceback.format_exception(type(e), e, e.__traceback__),
        }

        return json.dumps({"environment": environment, "error": error}, indent=2)

    try:
        with CudaEnv(process_group_backend=ProcessGroupBackendType.nccl):
            if test_comm:
                print_rank_0("Running communication test...")
                run_communication_test()
                print_rank_0("Communication test succeeded.")

            main_obj = Main(config_file_path, experiment_id=experiment_id)
            components = main_obj.build_components(components_model_type=TrainingComponentsInstantiationModel)
            main_obj.run(components)
    except Exception as e:
        if error_log_folder is not None:
            # -1 marks values that are not set in the process environment.
            environment = {
                "rank": int(os.environ.get("RANK", -1)),
                "local_rank": int(os.environ.get("LOCAL_RANK", -1)),
                "world_size": int(os.environ.get("WORLD_SIZE", -1)),
                "hostname": socket.gethostname(),
            }
            # One log file per (hostname, local_rank), placed next to the given path.
            error_log_path = (
                error_log_folder.parent
                / f"{error_log_folder.stem}_{environment['hostname']}_{environment['local_rank']}.log"
            )
            try:
                error_log_path.parent.mkdir(parents=True, exist_ok=True)
                with open(error_log_path, "w", encoding="utf-8") as f:
                    f.write(_format_exception_as_json(e, environment))
            except OSError as log_error:
                # Writing the log is best-effort: a filesystem problem must not replace the
                # training error that is re-raised below.
                click.echo(f"Failed to write error log to {error_log_path}: {log_error}", err=True)

        raise RuntimeError(f"An error occurred while running the training: {e}. ") from e
68123
69124
70125@main .command (name = "warmstart" )
@@ -523,5 +578,90 @@ def CMD_shuffle_jsonl_data(
523578 )
524579
525580
@main.group(name="benchmark")
def benchmark():
    """Collection of utilities to prepare and run benchmarks."""
    # Intentionally empty: this function only serves as the click group that the
    # benchmark sub-commands attach to.
587+
588+
@benchmark.command(name="prepare_sweep_configs")
@click.option(
    "--sweep_config_path",
    type=click.Path(exists=True, path_type=Path),
    required=True,
    help="Path to the sweep configuration YAML file.",
)
@click.option(
    "--output_dir",
    type=click.Path(file_okay=False, writable=True, path_type=Path),
    required=True,
    help="Directory to save the generated sweep configurations.",
)
@click.option(
    "--world_sizes",
    type=str,
    default="2",
    help="Comma-separated list of world sizes (must not have spaces), e.g. --world_sizes '2,4,8'",
)
def prepare_sweep_configs(sweep_config_path: Path, output_dir: Path, world_sizes: str):
    """Utility for preparing sweep configurations."""
    # Convert the comma-separated CLI string into integers; an entry that int() rejects
    # aborts the command with a ValueError that chains the parsing error.
    try:
        parsed_world_sizes: list[int] = [int(entry) for entry in world_sizes.split(",")]
    except ValueError as parse_error:
        raise ValueError(
            "Invalid world_sizes format. Please provide a comma-separated list of integers."
        ) from parse_error

    # Hand the parsed sizes to the sweep generator together with the input/output paths.
    SweepGenerator.generate_sweep_configs(sweep_config_path, output_dir, parsed_world_sizes)
617+
618+
@benchmark.command(name="list_remaining_runs")
@click.option(
    "--exp_root",
    type=click.Path(exists=True, file_okay=False, path_type=Path),
    required=True,
    help="Path to the root directory of the experiment containing config files.",
)
@click.option(
    "--file_list_path",
    type=click.Path(path_type=Path),
    required=True,
    help="Output file to store paths of configs to run.",
)
@click.option(
    "--expected_steps",
    type=int,
    required=True,
    help="Expected number of steps in evaluation_results.jsonl",
)
@click.option(
    "--skip_exception_types",
    type=str,
    default="",
    help="Exception types to skip when checking for successful runs. "
    "Typically, we would add 'OutOfMemoryError', as rerunning the experiment would result in the same error. "
    " List of exceptions is comma-separated.",
)
def CMD_entry_point_list_remaining_runs(
    exp_root: Path,
    file_list_path: Path,
    expected_steps: int,
    skip_exception_types: str = "",
):
    """
    Prepare a file list of remaining runs from a grid search experiment directory.
    """
    # Strip whitespace around each name and drop empty entries, so that inputs such as
    # "OutOfMemoryError, RuntimeError" or a trailing comma do not produce names that can
    # never match. An empty option value yields an empty list.
    skip_exception_types_list = [
        exception_name.strip() for exception_name in skip_exception_types.split(",") if exception_name.strip()
    ]
    file_list_dict = get_updated_sweep_status(
        exp_root=exp_root,
        expected_steps=expected_steps,
        skip_exception_types=skip_exception_types_list,
    )
    # Write one config entry per line, taken from the UPDATED_CONFIGS set of the sweep status.
    with file_list_path.open("w", encoding="utf-8") as f:
        f.writelines(f"{cfg}\n" for cfg in file_list_dict[SweepSets.UPDATED_CONFIGS.value])
665+
526666if __name__ == "__main__" :
527667 main ()
0 commit comments