# oumi-launch -p configs/oumi/jobs/gcp/llama8b_eval.yaml -c llama8b-eval
name: llama8b-eval

-num_nodes: 1
resources:
  cloud: gcp
  accelerators: "A100:4"
@@ -19,18 +18,11 @@ file_mounts:
  # This is created on the local machine by running `huggingface-cli login`.
  ~/.cache/huggingface/token: ~/.cache/huggingface/token

-storage_mounts:
-  # See https://github.com/oumi-ai/oumi/wiki/Clouds-Setup#mounting-gcs-buckets
-  # for documentation on using GCS buckets.
-  /output_dir_gcs:
-    source: gs://oumi-dev-us-central1
-    store: gcs
-
envs:
-  WANDB_PROJECT: oumi-eval
-  # HF datasets require trusting remote code to be enabled.
-  HF_DATASETS_TRUST_REMOTE_CODE: 1
-  OUMI_EVALUATION_FRAMEWORK: lm_harness # Valid values: "lm_harness", "oumi"
+  # NOTE: For SFT, update this to point to your model checkpoint.
+  MODEL_CHECKPOINT_DIR: meta-llama/Meta-Llama-3.1-8B-Instruct
+  # NOTE: For LoRA, update this to point to your LoRA adapter.
+  LORA_ADAPTER_DIR: ""

setup: |
  set -e
@@ -44,38 +36,21 @@ run: |
  set -e # Exit if any command failed.
  source ./configs/skypilot/sky_init.sh

-  # NOTE: For SFT, update this to point to your model checkpoint.
-  MODEL_CHECKPOINT_DIR="meta-llama/Meta-Llama-3.1-8B-Instruct"
-  # NOTE: For LoRA, update this to point to your LoRA adapter.
-  LORA_ADAPTER_DIR=""
+  if test ${OUMI_NUM_NODES} -ne 1; then
+    echo "LM Harness supports max 1 node. Actual: ${OUMI_NUM_NODES} nodes."
+    exit 1
+  fi

-  echo "Starting evaluation for ${EVAL_CHECKPOINT_DIR} ..."
+  echo "Starting evaluation for ${MODEL_CHECKPOINT_DIR} ..."
+  if test -n "$LORA_ADAPTER_DIR"; then
+    echo "Using LoRA adapter: ${LORA_ADAPTER_DIR}"
+  fi

  set -x # Enable command tracing.
-  TOTAL_NUM_GPUS=$((${OUMI_NUM_NODES} * ${SKYPILOT_NUM_GPUS_PER_NODE}))
-
-  if [ "$OUMI_EVALUATION_FRAMEWORK" == "lm_harness" ]; then
-    accelerate launch \
-      --num_processes=${TOTAL_NUM_GPUS} \
-      --num_machines=${OUMI_NUM_NODES} \
-      --machine_rank=${SKYPILOT_NODE_RANK} \
-      --main_process_ip ${OUMI_MASTER_ADDR} \
-      --main_process_port 8007 \
-      -m oumi.evaluate \
-      -c configs/oumi/llama8b.eval.yaml \
-      "model.adapter_model=${EVAL_CHECKPOINT_DIR}"
-  elif [ "$OUMI_EVALUATION_FRAMEWORK" == "oumi" ]; then
-    echo "The custom eval framework is deprecated. Use LM_HARNESS instead."
-    if test ${OUMI_NUM_NODES} -ne 1; then
-      echo "Legacy evaluation can only run on 1 node. Actual: ${OUMI_NUM_NODES} nodes."
-      exit 1
-    fi
-    python -m oumi.evaluate \
-      -c configs/oumi/llama8b.eval.legacy.yaml \
-      "model.adapter_model=${EVAL_CHECKPOINT_DIR}"
-  else
-    echo "Unknown evaluation framework: ${OUMI_EVALUATION_FRAMEWORK}"
-    exit 1
-  fi
+  accelerate launch \
+    -m oumi.evaluate \
+    -c configs/oumi/llama8b.eval.yaml \
+    "model.model_name=${MODEL_CHECKPOINT_DIR}" \
+    "model.adapter_model=${LORA_ADAPTER_DIR}"

  echo "Node ${SKYPILOT_NODE_RANK} is all done!"
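After this change, the model checkpoint and LoRA adapter are set in the config's `envs` section instead of inside the run script. One possible workflow, pieced together only from comments already present in this config (the launch invocation is the one in the file's header comment; nothing else is assumed):

  # Log in to Hugging Face once on the local machine, so that
  # ~/.cache/huggingface/token exists and can be file-mounted into the job.
  huggingface-cli login

  # Optionally edit MODEL_CHECKPOINT_DIR / LORA_ADAPTER_DIR under `envs`
  # to point at an SFT checkpoint or LoRA adapter before launching.

  # Submit the evaluation job (command taken from the config's header comment).
  oumi-launch -p configs/oumi/jobs/gcp/llama8b_eval.yaml -c llama8b-eval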