Skip to content

run evals in background #352

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 23 commits into from
Apr 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions src/nanotron/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,13 @@ def __post_init__(self):

if self.s3_upload is not None:
self.s3_upload.__post_init__()
if self.lighteval is not None:
if self.lighteval.eval_interval is None:
self.lighteval.eval_interval = self.checkpoints.checkpoint_interval
else:
assert (
self.lighteval.eval_interval % self.checkpoints.checkpoint_interval == 0
), f"eval_interval={self.lighteval.eval_interval} must be a multiple of checkpoint_interval={self.checkpoints.checkpoint_interval}"

# Some final sanity checks across separate arguments sections:
if self.profiler is not None and self.profiler.profiler_export_path is not None:
Expand Down Expand Up @@ -543,14 +550,15 @@ def global_batch_size(self):
def global_batch_size_in_tokens(self):
return self.global_batch_size * self.tokens.sequence_length

def save_as_yaml(self, file_path: str):
def save_as_yaml(self, file_path: str, sanity_checks: bool = True):
    """Serialize this config to a YAML file at ``file_path``.

    Args:
        file_path: Destination path; coerced to ``str`` so ``Path`` objects
            are accepted as well.
        sanity_checks: When True (default), reload the freshly written file
            through ``get_config_from_file`` to verify that the config
            round-trips cleanly. Pass False to skip the reload (useful when
            re-validation via ``__post_init__`` must be bypassed).
    """
    config_dict = serialize(self)
    file_path = str(file_path)
    with open(file_path, "w") as f:
        yaml.dump(config_dict, f)

    # Sanity test config can be reloaded
    if sanity_checks:
        _ = get_config_from_file(file_path, config_class=self.__class__)

def get_yaml(self):
config_dict = serialize(self)
Expand Down
48 changes: 44 additions & 4 deletions src/nanotron/config/lighteval_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,22 @@ def __post_init__(self):
assert self.wandb_project != "", "Please specify a wandb_project"


@dataclass
class LightEvalSlurm:
    """Arguments related to SLURM configuration for LightEval"""

    # Per-job resource requests passed through to SLURM.
    gpus_per_node: int = 8
    partition: str = "hopper-prod"
    # Hugging Face cache location; "~" is resolved in __post_init__.
    hf_cache: str = "~/.cache/huggingface"
    cpus_per_task: int = 88
    qos: str = "low"
    time: str = "24:00:00"
    reservation: Optional[str] = "smollm"

    def __post_init__(self) -> None:
        # Normalize the cache path so downstream scripts always receive an
        # absolute path string rather than a "~"-prefixed one.
        expanded_cache = Path(self.hf_cache).expanduser()
        self.hf_cache = str(expanded_cache)


@dataclass
class LightEvalConfig:
    """Arguments related to running LightEval on checkpoints.

    All is optional because you can also use this class to provide overrides of
    the saved config when running LightEval after training.
    """

    # NOTE(review): this block was recovered from a diff view without +/- markers;
    # fields that appear twice below (`slurm_script_dir`, `eval_config_override`)
    # are the old and new sides of the same diff — confirm against the merged file.
    slurm_template: Optional[str] = None
    slurm_script_dir: Optional[str] = None

    checkpoints_path: Optional[str] = None
    # NOTE(review): re-declares `slurm_script_dir` above (diff artifact); the later
    # declaration's type/default wins at class-body execution time.
    slurm_script_dir: Optional[Path] = Path("eval_results/launch-config")
    logs_path: Optional[Path] = Path("eval_results/logs")
    local_checkpoint_dir: Path = Path(
        "/scratch"
    )  # Base directory for temporary checkpoint storage, will store under {local_checkpoint_dir}/{run_name}/{step}
    parallelism: Optional[ParallelismArgs] = None
    batch_size: Optional[int] = None
    generation: Optional[Union[GenerationArgs, Dict[str, GenerationArgs]]] = None
    tasks: Optional[LightEvalTasksArgs] = None
    logging: Optional[LightEvalLoggingArgs] = None
    wandb: Optional[LightEvalWandbLoggerConfig] = None
    slurm: Optional[LightEvalSlurm] = None  # Defaults to LightEvalSlurm() in __post_init__
    s3_save_path: Optional[str] = None  # should not be dependent of the run_name
    output_dir: Optional[str] = None  # we should sanity check that it's the same as the one in the eval_config_override
    nanotron_path: Optional[str] = "./"
    eval_config_override: str = None
    # NOTE(review): duplicate of the line above (diff artifact); the `Path` variant wins.
    eval_config_override: Path = None  # Previously hardcoded in run_slurm_one_job
    eval_interval: Optional[
        int
    ] = None  # Must be multiple of checkpoint_interval. If None, eval will be done after each checkpoint upload to s3
    eval_interval_file: Optional[
        Path
    ] = None  # If specified, eval_interval will be read from this file upon the next evaluation.

    def __post_init__(self):
        # Fill in defaults that need object construction (mutable defaults are
        # not allowed directly on dataclass fields).
        if self.parallelism is None:
            self.parallelism = ParallelismArgs(dp=1, pp=1, tp=1, tp_linear_async_communication=True)
        if self.slurm is None:
            self.slurm = LightEvalSlurm()
        # Resolve "~" and coerce to str for downstream path formatting.
        self.local_checkpoint_dir = str(Path(self.local_checkpoint_dir).expanduser())
        # Warn early: an existing interval file silently overrides `eval_interval`.
        if self.eval_interval_file is not None and Path(self.eval_interval_file).exists():
            logger.warning(
                f"Eval interval file {self.eval_interval_file} exists. `eval_interval` will be replaced by the value in the file upon the next evaluation. You should probably delete this file if that's not what you want."
            )
8 changes: 4 additions & 4 deletions src/nanotron/config/models_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,10 +146,10 @@ class Qwen2Config:
no_rope_layer: Optional[
int
] = None # Skip rope every no_rope_layer layers (see https://arxiv.org/abs/2501.18795 https://arxiv.org/abs/2305.19466 and Llama4)
_fused_rotary_emb: bool = True
_fused_rms_norm: bool = True
_use_qkv_packed: bool = True
_use_doc_masking: bool = True
_fused_rotary_emb: bool = False
_fused_rms_norm: bool = False
_use_qkv_packed: bool = False
_use_doc_masking: bool = False

# MoE configuration
moe_config: Optional[MoEConfig] = None
Expand Down
13 changes: 13 additions & 0 deletions src/nanotron/eval/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Nanotron Evaluation

This directory contains code for evaluating models trained with Nanotron.

## Installation

To use the evaluation functionality, you need to install the `lighteval` package:

```bash
uv pip install "lighteval[dev]"
```

## Usage
3 changes: 3 additions & 0 deletions src/nanotron/eval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# flake8: noqa: F401

from .one_job_runner import LightEvalRunner
Loading
Loading