Skip to content

run evals in background #352

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 23 commits into from
Apr 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions src/nanotron/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,13 @@ def __post_init__(self):

if self.s3_upload is not None:
self.s3_upload.__post_init__()
if self.lighteval is not None:
if self.lighteval.eval_interval is None:
self.lighteval.eval_interval = self.checkpoints.checkpoint_interval
else:
assert (
self.lighteval.eval_interval % self.checkpoints.checkpoint_interval == 0
), f"eval_interval={self.lighteval.eval_interval} must be a multiple of checkpoint_interval={self.checkpoints.checkpoint_interval}"

# Some final sanity checks across separate arguments sections:
if self.profiler is not None and self.profiler.profiler_export_path is not None:
Expand Down Expand Up @@ -543,14 +550,15 @@ def global_batch_size(self):
def global_batch_size_in_tokens(self):
return self.global_batch_size * self.tokens.sequence_length

def save_as_yaml(self, file_path: str):
def save_as_yaml(self, file_path: str, sanity_checks: bool = True):
    """Serialize this config to a YAML file at ``file_path``.

    Args:
        file_path: Destination path; coerced to ``str`` so ``Path`` objects
            are accepted as well.
        sanity_checks: When True (default), reload the freshly written file
            through ``get_config_from_file`` to verify that the config
            round-trips cleanly. Pass False to skip the reload (useful when
            re-validation via ``__post_init__`` must be bypassed).
    """
    config_dict = serialize(self)
    file_path = str(file_path)
    with open(file_path, "w") as f:
        yaml.dump(config_dict, f)

    # Sanity test config can be reloaded
    if sanity_checks:
        _ = get_config_from_file(file_path, config_class=self.__class__)

def get_yaml(self):
config_dict = serialize(self)
Expand Down
48 changes: 44 additions & 4 deletions src/nanotron/config/lighteval_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,22 @@ def __post_init__(self):
assert self.wandb_project != "", "Please specify a wandb_project"


@dataclass
class LightEvalSlurm:
    """Arguments related to SLURM configuration for LightEval"""

    # Per-job resource requests passed through to SLURM.
    gpus_per_node: int = 8
    partition: str = "hopper-prod"
    # Hugging Face cache location; "~" is resolved in __post_init__.
    hf_cache: str = "~/.cache/huggingface"
    cpus_per_task: int = 88
    qos: str = "low"
    time: str = "24:00:00"
    reservation: Optional[str] = "smollm"

    def __post_init__(self) -> None:
        # Normalize the cache path so downstream scripts always receive an
        # absolute path string rather than a "~"-prefixed one.
        expanded_cache = Path(self.hf_cache).expanduser()
        self.hf_cache = str(expanded_cache)


@dataclass
class LightEvalConfig:
    """Arguments related to running LightEval on checkpoints.

    All is optional because you can also use this class to provide overrides of
    the saved config when running LightEval after training.
    """

    # NOTE(review): this block was recovered from a diff view without +/- markers;
    # fields that appear twice below (`slurm_script_dir`, `eval_config_override`)
    # are the old and new sides of the same diff — confirm against the merged file.
    slurm_template: Optional[str] = None
    slurm_script_dir: Optional[str] = None

    checkpoints_path: Optional[str] = None
    # NOTE(review): re-declares `slurm_script_dir` above (diff artifact); the later
    # declaration's type/default wins at class-body execution time.
    slurm_script_dir: Optional[Path] = Path("eval_results/launch-config")
    logs_path: Optional[Path] = Path("eval_results/logs")
    local_checkpoint_dir: Path = Path(
        "/scratch"
    )  # Base directory for temporary checkpoint storage, will store under {local_checkpoint_dir}/{run_name}/{step}
    parallelism: Optional[ParallelismArgs] = None
    batch_size: Optional[int] = None
    generation: Optional[Union[GenerationArgs, Dict[str, GenerationArgs]]] = None
    tasks: Optional[LightEvalTasksArgs] = None
    logging: Optional[LightEvalLoggingArgs] = None
    wandb: Optional[LightEvalWandbLoggerConfig] = None
    slurm: Optional[LightEvalSlurm] = None  # Defaults to LightEvalSlurm() in __post_init__
    s3_save_path: Optional[str] = None  # should not be dependent of the run_name
    output_dir: Optional[str] = None  # we should sanity check that it's the same as the one in the eval_config_override
    nanotron_path: Optional[str] = "./"
    eval_config_override: str = None
    # NOTE(review): duplicate of the line above (diff artifact); the `Path` variant wins.
    eval_config_override: Path = None  # Previously hardcoded in run_slurm_one_job
    eval_interval: Optional[
        int
    ] = None  # Must be multiple of checkpoint_interval. If None, eval will be done after each checkpoint upload to s3
    eval_interval_file: Optional[
        Path
    ] = None  # If specified, eval_interval will be read from this file upon the next evaluation.

    def __post_init__(self):
        # Fill in defaults that need object construction (mutable defaults are
        # not allowed directly on dataclass fields).
        if self.parallelism is None:
            self.parallelism = ParallelismArgs(dp=1, pp=1, tp=1, tp_linear_async_communication=True)
        if self.slurm is None:
            self.slurm = LightEvalSlurm()
        # Resolve "~" and coerce to str for downstream path formatting.
        self.local_checkpoint_dir = str(Path(self.local_checkpoint_dir).expanduser())
        # Warn early: an existing interval file silently overrides `eval_interval`.
        if self.eval_interval_file is not None and Path(self.eval_interval_file).exists():
            logger.warning(
                f"Eval interval file {self.eval_interval_file} exists. `eval_interval` will be replaced by the value in the file upon the next evaluation. You should probably delete this file if that's not what you want."
            )
8 changes: 4 additions & 4 deletions src/nanotron/config/models_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,10 +146,10 @@ class Qwen2Config:
no_rope_layer: Optional[
int
] = None # Skip rope every no_rope_layer layers (see https://arxiv.org/abs/2501.18795 https://arxiv.org/abs/2305.19466 and Llama4)
_fused_rotary_emb: bool = True
_fused_rms_norm: bool = True
_use_qkv_packed: bool = True
_use_doc_masking: bool = True
_fused_rotary_emb: bool = False
_fused_rms_norm: bool = False
_use_qkv_packed: bool = False
_use_doc_masking: bool = False

# MoE configuration
moe_config: Optional[MoEConfig] = None
Expand Down
13 changes: 13 additions & 0 deletions src/nanotron/eval/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Nanotron Evaluation

This directory contains code for evaluating models trained with Nanotron.

## Installation

To use the evaluation functionality, you need to install the `lighteval` package:

```bash
uv pip install "lighteval[dev]"
```

## Usage
3 changes: 3 additions & 0 deletions src/nanotron/eval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# flake8: noqa: F401

from .one_job_runner import LightEvalRunner
Loading
Loading