Commit 283895a

Merge branch 'main' into jgreer013/async_inference_writes
2 parents: 341d687 + ea83b07

File tree: 13 files changed, +169 -137 lines


Makefile (+5 -3)

@@ -1,3 +1,5 @@
+SHELL := /bin/bash
+
 # General makefile
 # Conda environment name
 CONDA_ENV := oumi
@@ -47,10 +49,10 @@ setup:
 	else \
 		conda create -n $(CONDA_ENV) python=3.11 -y; \
 		if [ -f ~/.zshrc ]; then \
-			source ~/.zshrc \
+			source ~/.zshrc; \
 		elif [ -f ~/.bashrc ]; then \
-			source ~/.bashrc \
-		fi \
+			source ~/.bashrc; \
+		fi; \
 		conda activate $(CONDA_ENV); \
 		pip install -e ".[all]"; \
 		pre-commit install; \
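
The semicolons are not cosmetic: a backslash-continued recipe reaches the shell as a single command line, so each statement inside the if/elif/fi block needs its own terminator, and `source` is a bash builtin, which is why `SHELL := /bin/bash` is set. A minimal sketch of the pattern (the `demo` target is illustrative, not part of the repo):

```make
SHELL := /bin/bash

# Illustrative target: the whole recipe below reaches the shell as one
# command line, so every statement must end with an explicit semicolon.
demo:
	if [ -f ~/.bashrc ]; then \
		source ~/.bashrc; \
	fi; \
	echo "environment ready"
```
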
configs/oumi/jobs/gcp/llama70b_eval.yaml (new file, +56)

# Config to eval Llama 3.1 70B Instruct on GCP.
# Example command:
# oumi-launch -p configs/oumi/jobs/gcp/llama70b_eval.yaml -c llama70b-eval
name: llama70b-eval

resources:
  cloud: gcp
  accelerators: "A100:4"
  use_spot: true
  disk_size: 400 # Disk size in GBs

# Upload working directory to remote ~/sky_workdir.
working_dir: .

# Mount local files.
file_mounts:
  ~/.netrc: ~/.netrc # WandB credentials
  # Mount HF token, which is needed to download locked-down models from HF Hub.
  # This is created on the local machine by running `huggingface-cli login`.
  ~/.cache/huggingface/token: ~/.cache/huggingface/token

envs:
  # NOTE: For SFT, update this to point to your model checkpoint.
  MODEL_CHECKPOINT_DIR: meta-llama/Meta-Llama-3.1-70B-Instruct
  # NOTE: For LoRA, update this to point to your LoRA adapter.
  LORA_ADAPTER_DIR: ""

setup: |
  set -e
  pip install '.[train,gpu]'
  # Install model from HF Hub. This tool increases download speed compared to
  # downloading the model during eval.
  pip install hf_transfer
  HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download meta-llama/Meta-Llama-3.1-70B-Instruct

run: |
  set -e # Exit if any command failed.
  source ./configs/skypilot/sky_init.sh

  if test ${OUMI_NUM_NODES} -ne 1; then
    echo "LM Harness supports max 1 node. Actual: ${OUMI_NUM_NODES} nodes."
    exit 1
  fi

  echo "Starting evaluation for ${MODEL_CHECKPOINT_DIR} ..."
  if test -n "$LORA_ADAPTER_DIR"; then
    echo "Using LoRA adapter: ${LORA_ADAPTER_DIR}"
  fi

  set -x # Enable command tracing.
  python -m oumi.evaluate \
    -c configs/oumi/llama70b.eval.yaml \
    "model.model_name=${MODEL_CHECKPOINT_DIR}" \
    "model.adapter_model=${LORA_ADAPTER_DIR}"

  echo "Node ${SKYPILOT_NODE_RANK} is all done!"

configs/oumi/jobs/gcp/llama8b_eval.yaml (+17 -42)

@@ -3,7 +3,6 @@
 # oumi-launch -p configs/oumi/jobs/gcp/llama8b_eval.yaml -c llama8b-eval
 name: llama8b-eval
 
-num_nodes: 1
 resources:
   cloud: gcp
   accelerators: "A100:4"
@@ -19,18 +18,11 @@ file_mounts:
   # This is created on the local machine by running `huggingface-cli login`.
   ~/.cache/huggingface/token: ~/.cache/huggingface/token
 
-storage_mounts:
-  # See https://github.com/oumi-ai/oumi/wiki/Clouds-Setup#mounting-gcs-buckets
-  # for documentation on using GCS buckets.
-  /output_dir_gcs:
-    source: gs://oumi-dev-us-central1
-    store: gcs
-
 envs:
-  WANDB_PROJECT: oumi-eval
-  # HF datasets require trusting remote code to be enabled.
-  HF_DATASETS_TRUST_REMOTE_CODE: 1
-  OUMI_EVALUATION_FRAMEWORK: lm_harness # Valid values: "lm_harness", "oumi"
+  # NOTE: For SFT, update this to point to your model checkpoint.
+  MODEL_CHECKPOINT_DIR: meta-llama/Meta-Llama-3.1-8B-Instruct
+  # NOTE: For LoRA, update this to point to your LoRA adapter.
+  LORA_ADAPTER_DIR: ""
 
 setup: |
   set -e
@@ -44,38 +36,21 @@ run: |
   set -e # Exit if any command failed.
   source ./configs/skypilot/sky_init.sh
 
-  # NOTE: For SFT, update this to point to your model checkpoint.
-  MODEL_CHECKPOINT_DIR="meta-llama/Meta-Llama-3.1-8B-Instruct"
-  # NOTE: For LoRA, update this to point to your LoRA adapter.
-  LORA_ADAPTER_DIR=""
+  if test ${OUMI_NUM_NODES} -ne 1; then
+    echo "LM Harness supports max 1 node. Actual: ${OUMI_NUM_NODES} nodes."
+    exit 1
+  fi
 
-  echo "Starting evaluation for ${EVAL_CHECKPOINT_DIR} ..."
+  echo "Starting evaluation for ${MODEL_CHECKPOINT_DIR} ..."
+  if test -n "$LORA_ADAPTER_DIR"; then
+    echo "Using LoRA adapter: ${LORA_ADAPTER_DIR}"
+  fi
 
   set -x # Enable command tracing.
-  TOTAL_NUM_GPUS=$((${OUMI_NUM_NODES} * ${SKYPILOT_NUM_GPUS_PER_NODE}))
-
-  if [ "$OUMI_EVALUATION_FRAMEWORK" == "lm_harness" ]; then
-    accelerate launch \
-      --num_processes=${TOTAL_NUM_GPUS} \
-      --num_machines=${OUMI_NUM_NODES} \
-      --machine_rank=${SKYPILOT_NODE_RANK} \
-      --main_process_ip ${OUMI_MASTER_ADDR} \
-      --main_process_port 8007 \
-      -m oumi.evaluate \
-      -c configs/oumi/llama8b.eval.yaml \
-      "model.adapter_model=${EVAL_CHECKPOINT_DIR}"
-  elif [ "$OUMI_EVALUATION_FRAMEWORK" == "oumi" ]; then
-    echo "The custom eval framework is deprecated. Use LM_HARNESS instead."
-    if test ${OUMI_NUM_NODES} -ne 1; then
-      echo "Legacy evaluation can only run on 1 node. Actual: ${OUMI_NUM_NODES} nodes."
-      exit 1
-    fi
-    python -m oumi.evaluate \
-      -c configs/oumi/llama8b.eval.legacy.yaml \
-      "model.adapter_model=${EVAL_CHECKPOINT_DIR}"
-  else
-    echo "Unknown evaluation framework: ${OUMI_EVALUATION_FRAMEWORK}"
-    exit 1
-  fi
+  accelerate launch \
+    -m oumi.evaluate \
+    -c configs/oumi/llama8b.eval.yaml \
+    "model.model_name=${MODEL_CHECKPOINT_DIR}" \
+    "model.adapter_model=${LORA_ADAPTER_DIR}"
 
   echo "Node ${SKYPILOT_NODE_RANK} is all done!"
configs/oumi/jobs/polaris/llama70b_eval.yaml (+20 -13)

@@ -1,4 +1,4 @@
-# Config to eval Llama 3.1 70B Instruct.
+# Config to eval Llama 3.1 70B Instruct on Polaris.
 # Example command:
 # oumi-launch -p configs/oumi/jobs/polaris/llama70b_eval.yaml -c preemptable.$ALCF_USER user=$ALCF_USER
 name: llama70b-eval
@@ -11,6 +11,12 @@ resources:
 # Upload working directory to /home/$USER/oumi_launcher/llama70b_eval.
 working_dir: .
 
+envs:
+  # NOTE: For SFT, update this to point to your model checkpoint.
+  MODEL_CHECKPOINT_DIR: meta-llama/Meta-Llama-3.1-70B-Instruct
+  # NOTE: For LoRA, update this to point to your LoRA adapter.
+  LORA_ADAPTER_DIR: ""
+
 # `setup` will always be executed before `run`. It's strongly suggested to set any PBS
 # directives in the `setup` section. Additional commands can also be run here after the
 # PBS directives.
@@ -23,22 +29,23 @@ setup: |
   #PBS -e /eagle/community_ai/jobs/logs/
 
 run: |
-  set -e
-
-  # Various setup for running on Polaris.
+  set -e # Exit if any command failed.
   source ${PBS_O_WORKDIR}/scripts/polaris/polaris_init.sh
 
-  # NOTE: For SFT, update this to point to your model checkpoint.
-  MODEL_CHECKPOINT_DIR="meta-llama/Meta-Llama-3.1-70B-Instruct"
-  # NOTE: For LoRA, update this to point to your LoRA adapter.
-  LORA_ADAPTER_DIR=""
+  if test ${OUMI_NUM_NODES} -ne 1; then
+    echo "LM Harness supports max 1 node. Actual: ${OUMI_NUM_NODES} nodes."
+    exit 1
+  fi
 
-  echo "Starting evaluation for ${EVAL_CHECKPOINT_DIR} ..."
+  echo "Starting evaluation for ${MODEL_CHECKPOINT_DIR} ..."
+  if test -n "$LORA_ADAPTER_DIR"; then
+    echo "Using LoRA adapter: ${LORA_ADAPTER_DIR}"
+  fi
 
   set -x # Enable command tracing.
   python -m oumi.evaluate \
-      -c configs/oumi/llama70b.eval.yaml \
-      "model.model_name=${MODEL_CHECKPOINT_DIR}" \
-      "model.adapter_model=${LORA_ADAPTER_DIR}"
+    -c configs/oumi/llama70b.eval.yaml \
+    "model.model_name=${MODEL_CHECKPOINT_DIR}" \
+    "model.adapter_model=${LORA_ADAPTER_DIR}"
 
-  echo "Polaris job is all done!"
+  echo -e "Finished eval on node:\n$(cat $PBS_NODEFILE)"

configs/oumi/jobs/polaris/llama8b_eval.yaml (+19 -34)

@@ -1,4 +1,4 @@
-# Config to eval Llama 3.1 8B Instruct.
+# Config to eval Llama 3.1 8B Instruct on Polaris.
 # Example command:
 # oumi-launch -p configs/oumi/jobs/polaris/llama8b_eval.yaml -c debug.$ALCF_USER user=$ALCF_USER
 name: llama8b-eval
@@ -11,6 +11,12 @@ resources:
 # Upload working directory to /home/$USER/oumi_launcher/llama8b_eval.
 working_dir: .
 
+envs:
+  # NOTE: For SFT, update this to point to your model checkpoint.
+  MODEL_CHECKPOINT_DIR: meta-llama/Meta-Llama-3.1-8B-Instruct
+  # NOTE: For LoRA, update this to point to your LoRA adapter.
+  LORA_ADAPTER_DIR: ""
+
 # `setup` will always be executed before `run`. It's strongly suggested to set any PBS
 # directives in the `setup` section. Additional commands can also be run here after the
 # PBS directives.
@@ -23,45 +29,24 @@ setup: |
   #PBS -e /eagle/community_ai/jobs/logs/
 
 run: |
-  set -e
-
-  # Various setup for running on Polaris.
+  set -e # Exit if any command failed.
   source ${PBS_O_WORKDIR}/scripts/polaris/polaris_init.sh
 
-  # NOTE: For SFT, update this to point to your model checkpoint.
-  MODEL_CHECKPOINT_DIR="meta-llama/Meta-Llama-3.1-8B-Instruct"
-  # NOTE: For LoRA, update this to point to your LoRA adapter.
-  LORA_ADAPTER_DIR=""
-
   if test ${OUMI_NUM_NODES} -ne 1; then
-    echo "Evaluation can only run on 1 Polaris node. Actual: ${OUMI_NUM_NODES} nodes."
+    echo "LM Harness supports max 1 node. Actual: ${OUMI_NUM_NODES} nodes."
     exit 1
   fi
 
-  EVALUATION_FRAMEWORK="lm_harness" # Valid values: "lm_harness", "oumi"
-
-  echo "Starting evaluation for ${EVAL_CHECKPOINT_DIR} ..."
+  echo "Starting evaluation for ${MODEL_CHECKPOINT_DIR} ..."
+  if test -n "$LORA_ADAPTER_DIR"; then
+    echo "Using LoRA adapter: ${LORA_ADAPTER_DIR}"
+  fi
 
   set -x # Enable command tracing.
+  accelerate launch \
+    -m oumi.evaluate \
+    -c configs/oumi/llama8b.eval.yaml \
+    "model.model_name=${MODEL_CHECKPOINT_DIR}" \
+    "model.adapter_model=${LORA_ADAPTER_DIR}"
 
-  TOTAL_NUM_GPUS=$((${OUMI_NUM_NODES} * 4))
-
-  if [ "$EVALUATION_FRAMEWORK" == "lm_harness" ]; then
-    accelerate launch \
-      --num_processes=${TOTAL_NUM_GPUS} \
-      --num_machines=${OUMI_NUM_NODES} \
-      -m oumi.evaluate \
-      -c configs/oumi/llama8b.eval.yaml \
-      "model.adapter_model=${EVAL_CHECKPOINT_DIR}"
-  elif [ "$EVALUATION_FRAMEWORK" == "oumi" ]; then
-    echo "The custom eval framework is deprecated. Use LM_HARNESS instead."
-    python -m oumi.evaluate \
-      -c configs/oumi/llama8b.eval.legacy.yaml \
-      "model.adapter_model=${EVAL_CHECKPOINT_DIR}"
-  else
-    echo "Unknown evaluation framework: ${EVALUATION_FRAMEWORK}"
-    exit 1
-  fi
-
-  echo -e "Finished eval on ${OUMI_NUM_NODES} node(s):\n$(cat $PBS_NODEFILE)"
-  echo "Polaris job is all done!"
+  echo -e "Finished eval on node:\n$(cat $PBS_NODEFILE)"

configs/oumi/llama8b.eval.legacy.yaml (-27)

This file was deleted.

docs/DEV_SETUP.md (+2 -1)

@@ -59,9 +59,10 @@
 make setup
 ```
 
-If you'd like to only run the pre-commits before a push, you can run:
+If you'd like to only run the pre-commits before a push, instead of every commit, you can run:
 
 ```shell
+pre-commit uninstall
 pre-commit install --install-hooks --hook-type pre-push
 ```
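
To check that the hooks now fire at the push stage, they can also be invoked manually; note the stage name is `pre-push` in recent pre-commit releases (older versions called this stage `push`):

```shell
pre-commit run --hook-stage pre-push --all-files
```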

src/oumi/builders/models.py (+2 -1)

@@ -7,8 +7,9 @@
 from transformers import BitsAndBytesConfig
 
 from oumi.core.configs import ModelParams, PeftParams
-from oumi.core.distributed import get_device_rank_info, is_using_accelerate_fsdp
+from oumi.core.distributed import get_device_rank_info
 from oumi.core.registry import REGISTRY, RegistryType
+from oumi.utils.distributed_utils import is_using_accelerate_fsdp
 from oumi.utils.io_utils import get_oumi_root_directory, load_file
 from oumi.utils.logging import logger
 from oumi.utils.torch_naming_heuristics import disable_dropout

src/oumi/core/configs/params/model_params.py (+11)

@@ -7,6 +7,7 @@
 
 from oumi.core.configs.params.base_params import BaseParams
 from oumi.core.types.exceptions import HardwareException
+from oumi.utils.distributed_utils import is_using_accelerate
 
 
 @dataclass
@@ -137,6 +138,10 @@ class ModelParams(BaseParams):
 
     This is needed for large models that do not fit on a single GPU.
     It is used as the value for the `parallelize` argument in LM Harness.
+
+    If this is enabled, the eval job must be kicked off with `python` as opposed to
+    `accelerate launch`, as described here:
+    https://github.com/EleutherAI/lm-evaluation-harness?tab=readme-ov-file#multi-gpu-evaluation-with-hugging-face-accelerate
     """
 
     freeze_layers: List[str] = field(default_factory=list)
@@ -185,3 +190,9 @@ def __validate__(self):
                 "supported. Confirm that your hardware is compatible and then "
                 "consider installing it: pip install -U flash-attn --no-build-isolation"
             )
+
+        if self.shard_for_eval and is_using_accelerate():
+            raise ValueError(
+                "Sharded-model evaluations with LM Harness should be invoked with "
+                "`python`, not `accelerate launch`."
+            )
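
The helper `is_using_accelerate` is imported but not shown in this diff. A plausible sketch, assuming it detects the configuration variables that `accelerate launch` exports into its child processes (the detection heuristic is an assumption, not the actual oumi implementation):

```python
import os


def is_using_accelerate() -> bool:
    # Assumed heuristic: `accelerate launch` exports ACCELERATE_*-prefixed
    # configuration variables into every process it spawns, so their
    # presence indicates the process was started by the accelerate launcher.
    return any(name.startswith("ACCELERATE_") for name in os.environ)
```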

src/oumi/core/datasets/base_dataset.py (+7 -1)

@@ -1,3 +1,4 @@
+import gc
 import os
 from abc import ABC, abstractmethod
 from typing import Literal, Optional, Union, cast
@@ -136,6 +137,9 @@ def _load_data(self) -> pd.DataFrame:
         else:
             result = self._load_hf_hub_dataset(self.dataset_name_or_path)
 
+        # Reclaim memory after data loading.
+        gc.collect()
+
         logger.info(
             f"Loaded DataFrame with shape: {result.shape}. Columns:\n"
             f"{result.dtypes}"
@@ -188,7 +192,9 @@ def _load_hf_hub_dataset(self, path: str) -> pd.DataFrame:
             )
         )
 
-        return cast(pd.DataFrame, dataset.to_pandas())
+        result = dataset.to_pandas()
+        del dataset
+        return cast(pd.DataFrame, result)
 
     def _load_jsonl_dataset(self, path: str) -> pd.DataFrame:
         return pd.read_json(path, lines=True)
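
The idea in `_load_hf_hub_dataset` is to drop the Arrow-backed `datasets` object as soon as its contents are materialized in pandas, then force a collection so the large intermediate buffers are reclaimed promptly rather than lingering until the next automatic GC cycle. A standalone sketch of the same pattern (the dataset name is only an example):

```python
import gc

from datasets import load_dataset

# Load a HF Hub dataset and materialize it as a pandas DataFrame.
dataset = load_dataset("imdb", split="train")
df = dataset.to_pandas()

# Drop the Arrow-backed object and reclaim its memory right away.
del dataset
gc.collect()

print(df.shape)
```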
