Modalities
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎CHANGELOG_DEV.md‎
Lines changed: 16 additions & 0 deletions b/‎CHANGELOG_DEV.md‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 6 additions & 0 deletions b/‎README.md‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎src/modalities/__main__.py‎
Lines changed: 150 additions & 10 deletions b/‎src/modalities/__main__.py‎
Lines changed: 150 additions & 10 deletions
diff --git a/‎src/modalities/config/config.py‎
Lines changed: 5 additions & 3 deletions b/‎src/modalities/config/config.py‎
Lines changed: 5 additions & 3 deletions
diff --git a/‎src/modalities/config/instantiation_models.py‎
Lines changed: 9 additions & 1 deletion b/‎src/modalities/config/instantiation_models.py‎
Lines changed: 9 additions & 1 deletion
diff --git a/‎src/modalities/conversion/gpt2/conversion_model.py‎
Lines changed: 3 additions & 1 deletion b/‎src/modalities/conversion/gpt2/conversion_model.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎src/modalities/logging_broker/subscriber_impl/results_subscriber.py‎
Lines changed: 22 additions & 15 deletions b/‎src/modalities/logging_broker/subscriber_impl/results_subscriber.py‎
Lines changed: 22 additions & 15 deletions
diff --git a/‎src/modalities/logging_broker/subscriber_impl/subscriber_factory.py‎
Lines changed: 2 additions & 6 deletions b/‎src/modalities/logging_broker/subscriber_impl/subscriber_factory.py‎
Lines changed: 2 additions & 6 deletions
@@ -164,8 +164,11 @@ tests/tmp/*
 *wandb_storage*
 .coverage/*
 *.pbin
+tutorials/scaling_up2/experiments
 tutorials/scaling_up/experiments
 tutorials/profiling/experiments
 tutorials/instruction_tuning/prepared_data
 config_files/instruction_tuning
 data/lorem_ipsum_instruct.jsonl
+tutorials/scaling_up/logs*
+tutorials/scaling_up/experiments_old/*
@@ -186,3 +186,19 @@ There are now three AC variants:
 * adds support for Tensor Parallelism (including Sequence Parallelism). 
 * adds a debugging toolkit to track the input and output tensors during a forward pass, gradients during the backward pass and weight tensors.
 Tensors can be either normal Tensors or DTensors.  
+
+
+## PR #389 Benchmark Tooling 
+* adds benchmarking tooling to modalities and allows for scaling benchmarks across a varying number of nodes and the cartesian product of configurable hyperparameters.
+
+**Breaking Changes**
+* Renaming: EvaluationResultToDiscSubscriberConfig.output_path -> EvaluationResultToDiscSubscriberConfig.output_file_path
+
+
+
+## PR #410 MFU incorporates dp_degree now instead of world_size
+
+This PR fixes the MFU and throughput calculations by taking the data parallel (dp) degree into account instead of the world size. When parallelization strategies are used on top of FSDP, the world size differs from the data parallel degree. This needs to be reflected in the throughput and MFU metric calculations, as done by this PR.
+
+**Breaking Changes**
+* Existing configs need to be adapted to correctly use dp degree rather than world size. 
@@ -198,6 +198,12 @@ Even though Modalities significantly simplifies LLM training, there is still som
 - [Library Usage](tutorials/library_usage/README.md)</br>
   How to use Modalities as a library and register custom components with Modalities.
 
+- [Instruction Tuning](tutorials/instruction_tuning/README.md)</br>
+  Teaches you how to apply instruction tuning to a pre-trained model.
+
+- [Scaling Up](tutorials/scaling_up/README.md)</br>
+  When scaling up your training to hundreds or thousands of GPUs, you want to maintain linear scalability. 
+  This tutorial teaches you how to find the optimal throughput setting for various hyperparameter settings at different scales. 
 
 
 
 
@@ -1,9 +1,12 @@
 #!/usr/bin/env python
 
 import json
+import os
+import socket
+import traceback
 from functools import partial
 from pathlib import Path
-from typing import Optional
+from typing import Any, Optional
 
 import click
 import click_pathlib
@@ -29,6 +32,8 @@
 from modalities.models.huggingface_adapters.hf_adapter import HFModelAdapter
 from modalities.running_env.cuda_env import CudaEnv
 from modalities.util import print_rank_0
+from modalities.utils.benchmarking.benchmarking_utils import SweepSets, get_updated_sweep_status
+from modalities.utils.benchmarking.sweep_utils import SweepGenerator
 from modalities.utils.communication_test import run_communication_test
 
 
@@ -50,21 +55,71 @@ def main() -> None:
     default=False,
     help="If set, run a communication test before training.",
 )
-def CMD_entry_point_run_modalities(config_file_path: Path, test_comm: bool = False):
+@click.option(
+    "--experiment_id",
+    type=str,
+    default=None,
+    help="Optional experiment ID to use for this run. If not provided, it will be derived from the config file path.",
+)
+@click.option(
+    "--error_log_folder",
+    type=click_pathlib.Path(),
+    default=None,
+    help="Optional path to a folder where error logs will be written.",
+)
+def CMD_entry_point_run_modalities(
+    config_file_path: Path,
+    test_comm: bool = False,
+    experiment_id: Optional[str] = None,
+    error_log_folder: Optional[Path] = None,
+):
     """Entrypoint to run the model training.
 
     Args:
         config_file_path (Path): Path to the YAML training config file.
+        test_comm (bool): If set, run a communication test before training.
+        experiment_id (Optional[str]): Optional experiment ID to use for this run.
+            If not provided it will be generated. Default is None.
+        error_log_folder (Optional[Path]): Optional path to a folder where error logs will be written.
     """
-    with CudaEnv(process_group_backend=ProcessGroupBackendType.nccl):
-        if test_comm:
-            print_rank_0("Running communication test...")
-            run_communication_test()
-            print_rank_0("Communication test succeeded.")
 
-        main_obj = Main(config_file_path)
-        components = main_obj.build_components(components_model_type=TrainingComponentsInstantiationModel)
-        main_obj.run(components)
+    def _format_exception_as_json(e: Exception, environment: dict[str, Any]) -> str:
+        # Format an exception into a structured JSON string with error message, type, and stack trace.
+        error = {
+            "error": str(e),
+            "type": type(e).__name__,
+            "stacktrace": traceback.format_exception(type(e), e, e.__traceback__),
+        }
+
+        return json.dumps({"environment": environment, "error": error}, indent=2)
+
+    try:
+        with CudaEnv(process_group_backend=ProcessGroupBackendType.nccl):
+            if test_comm:
+                print_rank_0("Running communication test...")
+                run_communication_test()
+                print_rank_0("Communication test succeeded.")
+
+            main_obj = Main(config_file_path, experiment_id=experiment_id)
+            components = main_obj.build_components(components_model_type=TrainingComponentsInstantiationModel)
+            main_obj.run(components)
+    except Exception as e:
+        if error_log_folder is not None:
+            environment = {
+                "rank": int(os.environ["RANK"] if "RANK" in os.environ else -1),
+                "local_rank": int(os.environ["LOCAL_RANK"] if "LOCAL_RANK" in os.environ else -1),
+                "world_size": int(os.environ["WORLD_SIZE"] if "WORLD_SIZE" in os.environ else -1),
+                "hostname": socket.gethostname(),
+            }
+            error_log_folder = (
+                error_log_folder.parent
+                / f"{error_log_folder.stem}_{environment['hostname']}_{environment['local_rank']}.log"
+            )
+            error_log_folder.parent.mkdir(parents=True, exist_ok=True)
+            with open(error_log_folder, "w", encoding="utf-8") as f:
+                f.write(_format_exception_as_json(e, environment))
+
+        raise RuntimeError(f"An error occurred while running the training: {e}. ") from e
 
 
 @main.command(name="warmstart")
@@ -523,5 +578,90 @@ def CMD_shuffle_jsonl_data(
     )
 
 
+@main.group(name="benchmark")
+def benchmark():
+    """
+    Collection of utilities to prepare and run benchmarks.
+    """
+    pass
+
+
+@benchmark.command(name="prepare_sweep_configs")
+@click.option(
+    "--sweep_config_path",
+    type=click.Path(exists=True, path_type=Path),
+    required=True,
+    help="Path to the sweep configuration YAML file.",
+)
+@click.option(
+    "--output_dir",
+    type=click.Path(file_okay=False, writable=True, path_type=Path),
+    required=True,
+    help="Directory to save the generated sweep configurations.",
+)
+@click.option(
+    "--world_sizes",
+    type=str,
+    default="2",
+    help="Comma-separated list of world sizes (must not have spaces), e.g. --world_sizes '2,4,8'",
+)
+def prepare_sweep_configs(sweep_config_path: Path, output_dir: Path, world_sizes: str):
+    """
+    Utility for preparing sweep configurations.
+    """
+    try:
+        world_sizes_list: list[int] = list(map(int, world_sizes.split(",")))
+    except ValueError as e:
+        raise ValueError("Invalid world_sizes format. Please provide a comma-separated list of integers.") from e
+    SweepGenerator.generate_sweep_configs(sweep_config_path, output_dir, world_sizes_list)
+
+
+@benchmark.command(name="list_remaining_runs")
+@click.option(
+    "--exp_root",
+    type=click.Path(exists=True, file_okay=False, path_type=Path),
+    required=True,
+    help="Path to the root directory of the experiment containing config files.",
+)
+@click.option(
+    "--file_list_path",
+    type=click.Path(path_type=Path),
+    required=True,
+    help="Output file to store paths of configs to run.",
+)
+@click.option(
+    "--expected_steps",
+    type=int,
+    required=True,
+    help="Expected number of steps in evaluation_results.jsonl",
+)
+@click.option(
+    "--skip_exception_types",
+    type=str,
+    default="",
+    help="Exception types to skip when checking for successful runs. "
+    "Typically, we would add 'OutOfMemoryError', as rerunning the experiment would result in the same error. "
+    " List of exceptions is comma-separated.",
+)
+def CMD_entry_point_list_remaining_runs(
+    exp_root: Path,
+    file_list_path: Path,
+    expected_steps: int,
+    skip_exception_types: str = "",
+):
+    """
+    Prepare a file list of remaining runs from a grid search experiment directory.
+    """
+    skip_exception_types_list = skip_exception_types.split(",") if skip_exception_types != "" else []
+    file_list_dict = get_updated_sweep_status(
+        exp_root=exp_root,
+        expected_steps=expected_steps,
+        skip_exception_types=skip_exception_types_list,
+    )
+    with file_list_path.open("w", encoding="utf-8") as f:
+        for cfg in file_list_dict[SweepSets.UPDATED_CONFIGS.value]:
+            f.write(f"{cfg}\n")
+
+
 if __name__ == "__main__":
     main()
@@ -458,8 +458,7 @@ class DummyResultSubscriberConfig(BaseModel):
 
 
 class EvaluationResultToDiscSubscriberConfig(BaseModel):
-    output_folder_path: Path
-    experiment_id: str
+    # Path of the file the evaluation results are written to. It replaces the former
+    # output_folder_path / experiment_id pair.
+    output_file_path: Path
 
 
 class WandBEvaluationResultSubscriberConfig(BaseModel):
@@ -517,7 +516,10 @@ def node_env_resolver_fun(var_name: str) -> int:
             return os.cpu_count()
 
     OmegaConf.register_new_resolver("cuda_env", cuda_env_resolver_fun, replace=True)
-    modalities_env_kwargs = {"config_file_path": config_file_path}
+    modalities_env_kwargs: dict[str, Any] = {
+        "config_file_path": config_file_path,
+        "config_folder_path": config_file_path.parent,
+    }
     if experiment_id is not None:
         modalities_env_kwargs["experiment_id"] = experiment_id
     OmegaConf.register_new_resolver(
 
@@ -35,6 +35,13 @@ class StepProfile(BaseModel):
     sequence_length: Annotated[int, Field(strict=True, ge=1)]
 
 
+class MeshDefinition(BaseModel):
+    # Parallelization degrees. Every degree is a strictly typed int that must be greater than 0.
+    # Only dp_degree has no default and must be given explicitly; the others default to 1.
+    # NOTE(review): dp/tp/pp/cp presumably stand for data, tensor, pipeline and context parallelism - confirm.
+    dp_degree: Annotated[int, Field(strict=True, gt=0)]
+    tp_degree: Annotated[int, Field(strict=True, gt=0)] = 1
+    pp_degree: Annotated[int, Field(strict=True, gt=0)] = 1
+    cp_degree: Annotated[int, Field(strict=True, gt=0)] = 1
+
+
 class ConsistencyEnforcement(BaseModel):
     enforce_tokens_per_step_consistency: bool = True
     enforce_last_step_logged: bool = True
@@ -92,6 +99,7 @@ class DCPWarmstartCheckpointPaths(BaseModel):
         intervals: Intervals
         consistency_enforcement: ConsistencyEnforcement
         step_profile: StepProfile
+        mesh_definition: MeshDefinition
         training_target: TrainingTarget
         training_progress: TrainingProgress
         warmstart_checkpoint_paths: Optional[WarmstartCheckpointPaths | DCPWarmstartCheckpointPaths] = None
@@ -106,7 +114,7 @@ def _check_tokens_per_step_conistency(self) -> "TrainingComponentsInstantiationM
                 self.step_profile.local_train_micro_batch_size
                 * self.step_profile.sequence_length
                 * self.step_profile.gradient_accumulation_steps
-                * self.cuda_env.world_size
+                * self.mesh_definition.dp_degree
             )
             if required_num_tokens_per_step != step_profile_num_tokens_per_step:
                 warning_message = (
 
@@ -51,7 +51,9 @@ def convert_model_config(modalities_config: dict) -> GPT2Config:
         num_hidden_layers=config["n_layer"],
         num_key_value_heads=config["n_head_kv"],
         num_attention_heads=config["n_head_q"],
-        intermediate_size=SwiGLU._get_hidden_dim(ffn_hidden=config["ffn_hidden"]),
+        intermediate_size=SwiGLU._get_hidden_dim(
+            ffn_hidden=config["ffn_hidden"], enforce_swiglu_hidden_dim_multiple_of=256
+        ),
         attention_bias=config["bias"],
         mlp_bias=config["bias"],
         hidden_act="silu",
 
@@ -1,5 +1,5 @@
 import json
-from dataclasses import asdict, is_dataclass
+from dataclasses import fields, is_dataclass
 from pathlib import Path
 from typing import Any
 
@@ -10,7 +10,7 @@
 from rich.console import Group
 from rich.panel import Panel
 
-from modalities.batch import EvaluationResultBatch
+from modalities.batch import EvaluationResultBatch, ResultItem
 from modalities.config.config import WandbMode
 from modalities.logging_broker.messages import Message
 from modalities.logging_broker.subscriber import MessageSubscriberIF
@@ -115,34 +115,41 @@ class EvaluationResultToDiscSubscriber(MessageSubscriberIF[EvaluationResultBatch
     def __init__(self, output_file_path: Path) -> None:
         super().__init__()
         self.output_file_path = output_file_path
+        # Create the parent directory (and any missing ancestors) at construction time;
+        # an already existing directory is not an error.
+        self.output_file_path.parent.mkdir(parents=True, exist_ok=True)
 
     def consume_dict(self, message_dict: dict[str, Any]):
         """Optional: log config data if needed (here: no-op)."""
         pass
 
     @staticmethod
-    def _convert_evaluation_result_batch(eval_result_batch: EvaluationResultBatch) -> dict[str, Any]:
+    def _convert_evaluation_result_batch(obj: EvaluationResultBatch) -> dict[str, Any]:
         """
         Recursively convert EvaluationResultBatch structure to JSON-serializable format.
         Handles dataclasses and torch.Tensor.
         """
-        if is_dataclass(eval_result_batch):
+
+        def shallow_asdict(obj: Any) -> dict[str, Any]:
+            # Converts a dataclass to a dictionary without deep recursion.
+            # dataclasses.asdict would recurse into nested dataclasses, so a nested ResultItem
+            # would reach this function as a plain dict and bypass the ResultItem branch below
+            # (assuming ResultItem is a dataclass - TODO confirm).
+            if not is_dataclass(obj):
+                raise TypeError("shallow_asdict() should be called on dataclass instances")
+            return {f.name: getattr(obj, f.name) for f in fields(obj)}
+
+        # A ResultItem is reduced to its bare value: .item() for 0-dim values, .tolist() otherwise
+        # (obj.value is presumably a torch.Tensor - TODO confirm). This check must come before the
+        # generic dataclass branch.
+        if isinstance(obj, ResultItem):
+            return obj.value.item() if obj.value.ndim == 0 else obj.value.tolist()
+        elif is_dataclass(obj):
             result_dict = {}
-            for k, v in asdict(eval_result_batch).items():
+            # Convert field by field so that each value is dispatched on its original type.
+            for k, v in shallow_asdict(obj).items():
                 result_dict[k] = EvaluationResultToDiscSubscriber._convert_evaluation_result_batch(v)
             return result_dict
 
-        elif isinstance(eval_result_batch, dict):
-            return {
-                k: EvaluationResultToDiscSubscriber._convert_evaluation_result_batch(v)
-                for k, v in eval_result_batch.items()
-            }
-        elif isinstance(eval_result_batch, list):
-            return [EvaluationResultToDiscSubscriber._convert_evaluation_result_batch(v) for v in eval_result_batch]
-        elif isinstance(eval_result_batch, torch.Tensor):
-            return eval_result_batch.item() if eval_result_batch.ndim == 0 else eval_result_batch.tolist()
+        # Containers are converted element-wise; dict keys are kept unchanged.
+        elif isinstance(obj, dict):
+            return {k: EvaluationResultToDiscSubscriber._convert_evaluation_result_batch(v) for k, v in obj.items()}
+        elif isinstance(obj, list):
+            return [EvaluationResultToDiscSubscriber._convert_evaluation_result_batch(v) for v in obj]
+        elif isinstance(obj, torch.Tensor):
+            return obj.item() if obj.ndim == 0 else obj.tolist()
         else:
-            return eval_result_batch
+            # Any other value is returned unchanged.
+            return obj
 
     def consume_message(self, message: Message[EvaluationResultBatch]):
         """Writes the evaluation result to the JSONL file if rank 0."""
 
@@ -57,12 +57,8 @@ def get_dummy_result_subscriber() -> DummyResultSubscriber:
         return DummyResultSubscriber()
 
     @staticmethod
-    def get_evaluation_result_to_disc_subscriber(
-        output_folder_path: Path, experiment_id: str
-    ) -> EvaluationResultToDiscSubscriber:
-        return EvaluationResultToDiscSubscriber(
-            output_file_path=output_folder_path / experiment_id / "evaluation_results.jsonl"
-        )
+    def get_evaluation_result_to_disc_subscriber(output_file_path: Path) -> EvaluationResultToDiscSubscriber:
+        """Creates an EvaluationResultToDiscSubscriber for the given output file.
+
+        Args:
+            output_file_path (Path): Full path of the results file; it is passed through unchanged
+                (no experiment ID or file name is appended here anymore).
+
+        Returns:
+            EvaluationResultToDiscSubscriber: The subscriber instance.
+        """
+        return EvaluationResultToDiscSubscriber(output_file_path=output_file_path)
 
     @staticmethod
     def get_wandb_result_subscriber(