
Commit b6816a8

NathanHB and Copilot authored
Adds template for custom path saving results (#755)
## Pull Request Overview

This PR adds support for using a custom template to determine where evaluation results are saved. It adds a new `results_path_template` parameter across multiple main modules and updates the `EvaluationTracker` to honor this template when saving results; the associated tests and documentation have been updated accordingly.

- Added a new test for the custom results template.
- Extended CLI options in several main modules to accept `results_path_template`.
- Updated `EvaluationTracker` logic and documentation to reflect the new functionality.

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent ce1dbb5 · commit b6816a8

10 files changed (+108 −4 lines)

docs/source/saving-and-reading-results.mdx

Lines changed: 5 additions & 0 deletions
```diff
@@ -13,6 +13,11 @@ To save the details of the evaluation, you can use the `--save-details`
 option. The details will be saved in a parquet file
 `{output_dir}/details/{model_name}/{timestamp}/details_{task}_{timestamp}.parquet`.
 
+If you want results to be saved in a custom path, you can set the `results-path-template` option.
+This allows you to set a string template for the path. The template needs to contain the following
+variables: `output_dir`, `org`, and `model`. For example:
+`{output_dir}/{org}_{model}`. The template will be used to create the path for the results file.
+
 ## Pushing results to the HuggingFace hub
 
 You can push the results and evaluation details to the HuggingFace hub. To do
```
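For reference, the programmatic equivalent of the new option is to pass the template straight to the tracker. A minimal sketch, assuming the import path follows the `src/` layout and leaving all other constructor arguments at their defaults:

```python
# Hedged sketch: supplying the template directly to EvaluationTracker,
# mirroring the constructor arguments added in this commit.
from lighteval.logging.evaluation_tracker import EvaluationTracker  # import path assumed

tracker = EvaluationTracker(
    output_dir="results",
    results_path_template="{output_dir}/{org}_{model}",  # new optional parameter
    save_details=True,
)
```

With a model name of the form `org/model`, results would then be written under `results/org_model` rather than the default `{output_dir}/results/{model_name}` layout.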

src/lighteval/config/lighteval_config.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -60,6 +60,7 @@ class LightEvalLoggingArgs:
     """Arguments related to logging for LightEval"""
 
     output_dir: str
+    results_path_template: str | None = None
     save_details: bool = True
     push_to_hub: bool = False
     push_to_tensorboard: bool = False
```
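For the nanotron path, the same template travels through the logging config. A minimal sketch, assuming `LightEvalLoggingArgs` can be constructed directly with keyword arguments and using only the fields visible above:

```python
# Hedged sketch: setting the new field on the logging config that
# main_nanotron.py forwards to EvaluationTracker (see the diff further down).
from lighteval.config.lighteval_config import LightEvalLoggingArgs  # import path assumed

logging_args = LightEvalLoggingArgs(
    output_dir="results",
    results_path_template="{output_dir}/{org}_{model}",  # optional, defaults to None
    save_details=True,
)
```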

src/lighteval/logging/evaluation_tracker.py

Lines changed: 13 additions & 1 deletion
```diff
@@ -97,6 +97,9 @@ class EvaluationTracker:
 
     Args:
         output_dir (`str`): Local folder path where you want results to be saved.
+        results_path_template (`str`, *optional*): template to use for the results output directory. for example,
+            `"{output_dir}/results_this_time_it_will_work/{org}_{model}"` will create a folder named `results` in the output directory
+            with the model name and the organization name.
         save_details (`bool`, defaults to True): If True, details are saved to the `output_dir`.
         push_to_hub (`bool`, defaults to False): If True, details are pushed to the hub.
             Results are pushed to `{hub_results_org}/details__{sanitized model_name}` for the model `model_name`, a public dataset,
@@ -119,6 +122,7 @@ class EvaluationTracker:
     def __init__(
         self,
         output_dir: str,
+        results_path_template: str | None = None,
         save_details: bool = True,
         push_to_hub: bool = False,
         push_to_tensorboard: bool = False,
@@ -152,6 +156,7 @@ def __init__(
         self.tensorboard_repo = f"{hub_results_org}/tensorboard_logs"
         self.tensorboard_metric_prefix = tensorboard_metric_prefix
         self.nanotron_run_info = nanotron_run_info
+        self.results_path_template = results_path_template
 
         self.public = public
 
@@ -259,7 +264,14 @@ def push_to_wandb(self, results_dict: dict, details_datasets: dict) -> None:
             self.wandb_run.finish()
 
     def save_results(self, date_id: str, results_dict: dict):
-        output_dir_results = Path(self.output_dir) / "results" / self.general_config_logger.model_name
+        if self.results_path_template is not None:
+            org_model_parts = self.general_config_logger.model_name.split("/")
+            org = org_model_parts[0] if len(org_model_parts) >= 2 else ""
+            model = org_model_parts[1] if len(org_model_parts) >= 2 else org_model_parts[0]
+            output_dir = self.output_dir
+            output_dir_results = Path(self.results_path_template.format(output_dir=output_dir, org=org, model=model))
+        else:
+            output_dir_results = Path(self.output_dir) / "results" / self.general_config_logger.model_name
         self.fs.mkdirs(output_dir_results, exist_ok=True)
         output_results_file = output_dir_results / f"results_{date_id}.json"
         logger.info(f"Saving results to {output_results_file}")
```

src/lighteval/main_accelerate.py

Lines changed: 8 additions & 3 deletions
```diff
@@ -60,9 +60,6 @@ def accelerate( # noqa C901
     custom_tasks: Annotated[
         Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1)
     ] = None,
-    cache_dir: Annotated[
-        Optional[str], Option(help="Cache directory for datasets and models.", rich_help_panel=HELP_PANEL_NAME_1)
-    ] = None,
     num_fewshot_seeds: Annotated[
         int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1)
     ] = 1,
@@ -73,6 +70,13 @@ def accelerate( # noqa C901
     output_dir: Annotated[
         str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = "results",
+    results_path_template: Annotated[
+        str | None,
+        Option(
+            help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`",
+            rich_help_panel=HELP_PANEL_NAME_2,
+        ),
+    ] = None,
     push_to_hub: Annotated[
         bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = False,
@@ -118,6 +122,7 @@ def accelerate( # noqa C901
 
     evaluation_tracker = EvaluationTracker(
         output_dir=output_dir,
+        results_path_template=results_path_template,
         save_details=save_details,
         push_to_hub=push_to_hub,
         push_to_tensorboard=push_to_tensorboard,
```
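The option's help text names the three available variables; a quick sketch (with made-up values) of what its example template expands to:

```python
# Hedged sketch: expanding the example template quoted in the help text above.
template = "{output_dir}/1234/{org}+{model}"
print(template.format(output_dir="results", org="my-org", model="my-model"))
# -> results/1234/my-org+my-model
```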

src/lighteval/main_custom.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -70,6 +70,13 @@ def custom(
     output_dir: Annotated[
         str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANNEL_NAME_2)
     ] = "results",
+    results_path_template: Annotated[
+        str | None,
+        Option(
+            help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`",
+            rich_help_panel=HELP_PANNEL_NAME_2,
+        ),
+    ] = None,
     push_to_hub: Annotated[
         bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANNEL_NAME_2)
     ] = False,
@@ -101,6 +108,7 @@ def custom(
 
     evaluation_tracker = EvaluationTracker(
         output_dir=output_dir,
+        results_path_template=results_path_template,
         save_details=save_details,
         push_to_hub=push_to_hub,
         push_to_tensorboard=push_to_tensorboard,
```

src/lighteval/main_endpoint.py

Lines changed: 32 additions & 0 deletions
```diff
@@ -72,6 +72,13 @@ def inference_endpoint(
     output_dir: Annotated[
         str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = "results",
+    results_path_template: Annotated[
+        str | None,
+        Option(
+            help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`",
+            rich_help_panel=HELP_PANEL_NAME_2,
+        ),
+    ] = None,
     push_to_hub: Annotated[
         bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = False,
@@ -111,6 +118,7 @@ def inference_endpoint(
 
     evaluation_tracker = EvaluationTracker(
         output_dir=output_dir,
+        results_path_template=results_path_template,
         save_details=save_details,
         push_to_hub=push_to_hub,
         push_to_tensorboard=push_to_tensorboard,
@@ -185,6 +193,13 @@ def tgi(
     output_dir: Annotated[
         str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = "results",
+    results_path_template: Annotated[
+        str | None,
+        Option(
+            help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`",
+            rich_help_panel=HELP_PANEL_NAME_2,
+        ),
+    ] = None,
     push_to_hub: Annotated[
         bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = False,
@@ -227,6 +242,7 @@ def tgi(
 
     evaluation_tracker = EvaluationTracker(
         output_dir=output_dir,
+        results_path_template=results_path_template,
         save_details=save_details,
         push_to_hub=push_to_hub,
         push_to_tensorboard=push_to_tensorboard,
@@ -302,6 +318,13 @@ def litellm(
     output_dir: Annotated[
         str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = "results",
+    results_path_template: Annotated[
+        str | None,
+        Option(
+            help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`",
+            rich_help_panel=HELP_PANEL_NAME_2,
+        ),
+    ] = None,
     push_to_hub: Annotated[
         bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = False,
@@ -344,6 +367,7 @@ def litellm(
 
     evaluation_tracker = EvaluationTracker(
         output_dir=output_dir,
+        results_path_template=results_path_template,
         save_details=save_details,
         push_to_hub=push_to_hub,
         push_to_tensorboard=push_to_tensorboard,
@@ -420,6 +444,13 @@ def inference_providers(
     output_dir: Annotated[
         str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = "results",
+    results_path_template: Annotated[
+        str | None,
+        Option(
+            help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`",
+            rich_help_panel=HELP_PANEL_NAME_2,
+        ),
+    ] = None,
     push_to_hub: Annotated[
         bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = False,
@@ -462,6 +493,7 @@ def inference_providers(
 
     evaluation_tracker = EvaluationTracker(
         output_dir=output_dir,
+        results_path_template=results_path_template,
         save_details=save_details,
         push_to_hub=push_to_hub,
         push_to_tensorboard=push_to_tensorboard,
```

src/lighteval/main_nanotron.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -81,6 +81,7 @@ def nanotron(
 
     evaluation_tracker = EvaluationTracker(
         output_dir=lighteval_config.logging.output_dir,
+        results_path_template=lighteval_config.logging.results_path_template,
         hub_results_org=lighteval_config.logging.results_org,
         public=lighteval_config.logging.public_run,
         push_to_hub=lighteval_config.logging.push_to_hub,
```

src/lighteval/main_sglang.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -63,6 +63,13 @@ def sglang(
     output_dir: Annotated[
         str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = "results",
+    results_path_template: Annotated[
+        str | None,
+        Option(
+            help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`",
+            rich_help_panel=HELP_PANEL_NAME_2,
+        ),
+    ] = None,
     push_to_hub: Annotated[
         bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = False,
@@ -104,6 +111,7 @@ def sglang(
 
     evaluation_tracker = EvaluationTracker(
         output_dir=output_dir,
+        results_path_template=results_path_template,
         save_details=save_details,
         push_to_hub=push_to_hub,
         push_to_tensorboard=push_to_tensorboard,
```

src/lighteval/main_vllm.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -66,6 +66,13 @@ def vllm(
     output_dir: Annotated[
         str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = "results",
+    results_path_template: Annotated[
+        str | None,
+        Option(
+            help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`",
+            rich_help_panel=HELP_PANEL_NAME_2,
+        ),
+    ] = None,
     push_to_hub: Annotated[
         bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = False,
@@ -107,6 +114,7 @@ def vllm(
 
     evaluation_tracker = EvaluationTracker(
         output_dir=output_dir,
+        results_path_template=results_path_template,
         save_details=save_details,
         push_to_hub=push_to_hub,
         push_to_tensorboard=push_to_tensorboard,
```

tests/logging/test_evaluation_tracker.py

Lines changed: 24 additions & 0 deletions
```diff
@@ -96,6 +96,30 @@ def test_results_logging(mock_evaluation_tracker: EvaluationTracker):
     assert saved_results["config_general"]["model_name"] == "test_model"
 
 
+def test_results_logging_template(mock_evaluation_tracker: EvaluationTracker):
+    task_metrics = {
+        "task1": {"accuracy": 0.8, "f1": 0.75},
+        "task2": {"precision": 0.9, "recall": 0.85},
+    }
+    mock_evaluation_tracker.metrics_logger.metric_aggregated = task_metrics
+    mock_evaluation_tracker.results_path_template = "{output_dir}/{org}_{model}"
+
+    mock_evaluation_tracker.save()
+
+    results_dir = Path(mock_evaluation_tracker.output_dir) / "_test_model"
+    assert results_dir.exists()
+
+    result_files = list(results_dir.glob("results_*.json"))
+    assert len(result_files) == 1
+
+    with open(result_files[0], "r") as f:
+        saved_results = json.load(f)
+
+    assert "results" in saved_results
+    assert saved_results["results"] == task_metrics
+    assert saved_results["config_general"]["model_name"] == "test_model"
+
+
 @pytest.mark.evaluation_tracker(save_details=True)
 def test_details_logging(mock_evaluation_tracker, mock_datetime):
     task_details = {
```
