Commit 283895a

Merge branch 'main' into jgreer013/async_inference_writes
2 parents: 341d687 + ea83b07

File tree: 13 files changed, +169 -137 lines


Makefile (+5 -3)

@@ -1,3 +1,5 @@
+SHELL := /bin/bash
+
 # General makefile
 # Conda environment name
 CONDA_ENV := oumi
@@ -47,10 +49,10 @@ setup:
 	else \
 		conda create -n $(CONDA_ENV) python=3.11 -y; \
 		if [ -f ~/.zshrc ]; then \
-			source ~/.zshrc \
+			source ~/.zshrc; \
 		elif [ -f ~/.bashrc ]; then \
-			source ~/.bashrc \
-		fi \
+			source ~/.bashrc; \
+		fi; \
 		conda activate $(CONDA_ENV); \
 		pip install -e ".[all]"; \
 		pre-commit install; \
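
The semicolons are not cosmetic: a backslash-continued recipe reaches the shell as a single command line, so each statement inside the if/elif/fi block needs its own terminator, and `source` is a bash builtin, which is why `SHELL := /bin/bash` is set. A minimal sketch of the pattern (the `demo` target is illustrative, not part of the repo):

```make
SHELL := /bin/bash

# Illustrative target: the whole recipe below reaches the shell as one
# command line, so every statement must end with an explicit semicolon.
demo:
	if [ -f ~/.bashrc ]; then \
		source ~/.bashrc; \
	fi; \
	echo "environment ready"
```
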
configs/oumi/jobs/gcp/llama70b_eval.yaml (new file, +56)

# Config to eval Llama 3.1 70B Instruct on GCP.
# Example command:
# oumi-launch -p configs/oumi/jobs/gcp/llama70b_eval.yaml -c llama70b-eval
name: llama70b-eval

resources:
  cloud: gcp
  accelerators: "A100:4"
  use_spot: true
  disk_size: 400 # Disk size in GBs

# Upload working directory to remote ~/sky_workdir.
working_dir: .

# Mount local files.
file_mounts:
  ~/.netrc: ~/.netrc # WandB credentials
  # Mount HF token, which is needed to download locked-down models from HF Hub.
  # This is created on the local machine by running `huggingface-cli login`.
  ~/.cache/huggingface/token: ~/.cache/huggingface/token

envs:
  # NOTE: For SFT, update this to point to your model checkpoint.
  MODEL_CHECKPOINT_DIR: meta-llama/Meta-Llama-3.1-70B-Instruct
  # NOTE: For LoRA, update this to point to your LoRA adapter.
  LORA_ADAPTER_DIR: ""

setup: |
  set -e
  pip install '.[train,gpu]'
  # Install model from HF Hub. This tool increases download speed compared to
  # downloading the model during eval.
  pip install hf_transfer
  HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download meta-llama/Meta-Llama-3.1-70B-Instruct

run: |
  set -e # Exit if any command failed.
  source ./configs/skypilot/sky_init.sh

  if test ${OUMI_NUM_NODES} -ne 1; then
    echo "LM Harness supports max 1 node. Actual: ${OUMI_NUM_NODES} nodes."
    exit 1
  fi

  echo "Starting evaluation for ${MODEL_CHECKPOINT_DIR} ..."
  if test -n "$LORA_ADAPTER_DIR"; then
    echo "Using LoRA adapter: ${LORA_ADAPTER_DIR}"
  fi

  set -x # Enable command tracing.
  python -m oumi.evaluate \
    -c configs/oumi/llama70b.eval.yaml \
    "model.model_name=${MODEL_CHECKPOINT_DIR}" \
    "model.adapter_model=${LORA_ADAPTER_DIR}"

  echo "Node ${SKYPILOT_NODE_RANK} is all done!"

configs/oumi/jobs/gcp/llama8b_eval.yaml (+17 -42)

@@ -3,7 +3,6 @@
 # oumi-launch -p configs/oumi/jobs/gcp/llama8b_eval.yaml -c llama8b-eval
 name: llama8b-eval
 
-num_nodes: 1
 resources:
   cloud: gcp
   accelerators: "A100:4"
@@ -19,18 +18,11 @@ file_mounts:
   # This is created on the local machine by running `huggingface-cli login`.
   ~/.cache/huggingface/token: ~/.cache/huggingface/token
 
-storage_mounts:
-  # See https://github.com/oumi-ai/oumi/wiki/Clouds-Setup#mounting-gcs-buckets
-  # for documentation on using GCS buckets.
-  /output_dir_gcs:
-    source: gs://oumi-dev-us-central1
-    store: gcs
-
 envs:
-  WANDB_PROJECT: oumi-eval
-  # HF datasets require trusting remote code to be enabled.
-  HF_DATASETS_TRUST_REMOTE_CODE: 1
-  OUMI_EVALUATION_FRAMEWORK: lm_harness # Valid values: "lm_harness", "oumi"
+  # NOTE: For SFT, update this to point to your model checkpoint.
+  MODEL_CHECKPOINT_DIR: meta-llama/Meta-Llama-3.1-8B-Instruct
+  # NOTE: For LoRA, update this to point to your LoRA adapter.
+  LORA_ADAPTER_DIR: ""
 
 setup: |
   set -e
@@ -44,38 +36,21 @@ run: |
   set -e # Exit if any command failed.
   source ./configs/skypilot/sky_init.sh
 
-  # NOTE: For SFT, update this to point to your model checkpoint.
-  MODEL_CHECKPOINT_DIR="meta-llama/Meta-Llama-3.1-8B-Instruct"
-  # NOTE: For LoRA, update this to point to your LoRA adapter.
-  LORA_ADAPTER_DIR=""
+  if test ${OUMI_NUM_NODES} -ne 1; then
+    echo "LM Harness supports max 1 node. Actual: ${OUMI_NUM_NODES} nodes."
+    exit 1
+  fi
 
-  echo "Starting evaluation for ${EVAL_CHECKPOINT_DIR} ..."
+  echo "Starting evaluation for ${MODEL_CHECKPOINT_DIR} ..."
+  if test -n "$LORA_ADAPTER_DIR"; then
+    echo "Using LoRA adapter: ${LORA_ADAPTER_DIR}"
+  fi
 
   set -x # Enable command tracing.
-  TOTAL_NUM_GPUS=$((${OUMI_NUM_NODES} * ${SKYPILOT_NUM_GPUS_PER_NODE}))
-
-  if [ "$OUMI_EVALUATION_FRAMEWORK" == "lm_harness" ]; then
-    accelerate launch \
-      --num_processes=${TOTAL_NUM_GPUS} \
-      --num_machines=${OUMI_NUM_NODES} \
-      --machine_rank=${SKYPILOT_NODE_RANK} \
-      --main_process_ip ${OUMI_MASTER_ADDR} \
-      --main_process_port 8007 \
-      -m oumi.evaluate \
-      -c configs/oumi/llama8b.eval.yaml \
-      "model.adapter_model=${EVAL_CHECKPOINT_DIR}"
-  elif [ "$OUMI_EVALUATION_FRAMEWORK" == "oumi" ]; then
-    echo "The custom eval framework is deprecated. Use LM_HARNESS instead."
-    if test ${OUMI_NUM_NODES} -ne 1; then
-      echo "Legacy evaluation can only run on 1 node. Actual: ${OUMI_NUM_NODES} nodes."
-      exit 1
-    fi
-    python -m oumi.evaluate \
-      -c configs/oumi/llama8b.eval.legacy.yaml \
-      "model.adapter_model=${EVAL_CHECKPOINT_DIR}"
-  else
-    echo "Unknown evaluation framework: ${OUMI_EVALUATION_FRAMEWORK}"
-    exit 1
-  fi
+  accelerate launch \
+    -m oumi.evaluate \
+    -c configs/oumi/llama8b.eval.yaml \
+    "model.model_name=${MODEL_CHECKPOINT_DIR}" \
+    "model.adapter_model=${LORA_ADAPTER_DIR}"
 
   echo "Node ${SKYPILOT_NODE_RANK} is all done!"
configs/oumi/jobs/polaris/llama70b_eval.yaml (+20 -13)

@@ -1,4 +1,4 @@
-# Config to eval Llama 3.1 70B Instruct.
+# Config to eval Llama 3.1 70B Instruct on Polaris.
 # Example command:
 # oumi-launch -p configs/oumi/jobs/polaris/llama70b_eval.yaml -c preemptable.$ALCF_USER user=$ALCF_USER
 name: llama70b-eval
@@ -11,6 +11,12 @@ resources:
 # Upload working directory to /home/$USER/oumi_launcher/llama70b_eval.
 working_dir: .
 
+envs:
+  # NOTE: For SFT, update this to point to your model checkpoint.
+  MODEL_CHECKPOINT_DIR: meta-llama/Meta-Llama-3.1-70B-Instruct
+  # NOTE: For LoRA, update this to point to your LoRA adapter.
+  LORA_ADAPTER_DIR: ""
+
 # `setup` will always be executed before `run`. It's strongly suggested to set any PBS
 # directives in the `setup` section. Additional commands can also be run here after the
 # PBS directives.
@@ -23,22 +29,23 @@ setup: |
   #PBS -e /eagle/community_ai/jobs/logs/
 
 run: |
-  set -e
-
-  # Various setup for running on Polaris.
+  set -e # Exit if any command failed.
   source ${PBS_O_WORKDIR}/scripts/polaris/polaris_init.sh
 
-  # NOTE: For SFT, update this to point to your model checkpoint.
-  MODEL_CHECKPOINT_DIR="meta-llama/Meta-Llama-3.1-70B-Instruct"
-  # NOTE: For LoRA, update this to point to your LoRA adapter.
-  LORA_ADAPTER_DIR=""
+  if test ${OUMI_NUM_NODES} -ne 1; then
+    echo "LM Harness supports max 1 node. Actual: ${OUMI_NUM_NODES} nodes."
+    exit 1
+  fi
 
-  echo "Starting evaluation for ${EVAL_CHECKPOINT_DIR} ..."
+  echo "Starting evaluation for ${MODEL_CHECKPOINT_DIR} ..."
+  if test -n "$LORA_ADAPTER_DIR"; then
+    echo "Using LoRA adapter: ${LORA_ADAPTER_DIR}"
+  fi
 
   set -x # Enable command tracing.
   python -m oumi.evaluate \
-      -c configs/oumi/llama70b.eval.yaml \
-      "model.model_name=${MODEL_CHECKPOINT_DIR}" \
-      "model.adapter_model=${LORA_ADAPTER_DIR}"
+    -c configs/oumi/llama70b.eval.yaml \
+    "model.model_name=${MODEL_CHECKPOINT_DIR}" \
+    "model.adapter_model=${LORA_ADAPTER_DIR}"
 
-  echo "Polaris job is all done!"
+  echo -e "Finished eval on node:\n$(cat $PBS_NODEFILE)"

configs/oumi/jobs/polaris/llama8b_eval.yaml (+19 -34)

@@ -1,4 +1,4 @@
-# Config to eval Llama 3.1 8B Instruct.
+# Config to eval Llama 3.1 8B Instruct on Polaris.
 # Example command:
 # oumi-launch -p configs/oumi/jobs/polaris/llama8b_eval.yaml -c debug.$ALCF_USER user=$ALCF_USER
 name: llama8b-eval
@@ -11,6 +11,12 @@ resources:
 # Upload working directory to /home/$USER/oumi_launcher/llama8b_eval.
 working_dir: .
 
+envs:
+  # NOTE: For SFT, update this to point to your model checkpoint.
+  MODEL_CHECKPOINT_DIR: meta-llama/Meta-Llama-3.1-8B-Instruct
+  # NOTE: For LoRA, update this to point to your LoRA adapter.
+  LORA_ADAPTER_DIR: ""
+
 # `setup` will always be executed before `run`. It's strongly suggested to set any PBS
 # directives in the `setup` section. Additional commands can also be run here after the
 # PBS directives.
@@ -23,45 +29,24 @@ setup: |
   #PBS -e /eagle/community_ai/jobs/logs/
 
 run: |
-  set -e
-
-  # Various setup for running on Polaris.
+  set -e # Exit if any command failed.
   source ${PBS_O_WORKDIR}/scripts/polaris/polaris_init.sh
 
-  # NOTE: For SFT, update this to point to your model checkpoint.
-  MODEL_CHECKPOINT_DIR="meta-llama/Meta-Llama-3.1-8B-Instruct"
-  # NOTE: For LoRA, update this to point to your LoRA adapter.
-  LORA_ADAPTER_DIR=""
-
   if test ${OUMI_NUM_NODES} -ne 1; then
-    echo "Evaluation can only run on 1 Polaris node. Actual: ${OUMI_NUM_NODES} nodes."
+    echo "LM Harness supports max 1 node. Actual: ${OUMI_NUM_NODES} nodes."
     exit 1
   fi
 
-  EVALUATION_FRAMEWORK="lm_harness" # Valid values: "lm_harness", "oumi"
-
-  echo "Starting evaluation for ${EVAL_CHECKPOINT_DIR} ..."
+  echo "Starting evaluation for ${MODEL_CHECKPOINT_DIR} ..."
+  if test -n "$LORA_ADAPTER_DIR"; then
+    echo "Using LoRA adapter: ${LORA_ADAPTER_DIR}"
+  fi
 
   set -x # Enable command tracing.
+  accelerate launch \
+    -m oumi.evaluate \
+    -c configs/oumi/llama8b.eval.yaml \
+    "model.model_name=${MODEL_CHECKPOINT_DIR}" \
+    "model.adapter_model=${LORA_ADAPTER_DIR}"
 
-  TOTAL_NUM_GPUS=$((${OUMI_NUM_NODES} * 4))
-
-  if [ "$EVALUATION_FRAMEWORK" == "lm_harness" ]; then
-    accelerate launch \
-      --num_processes=${TOTAL_NUM_GPUS} \
-      --num_machines=${OUMI_NUM_NODES} \
-      -m oumi.evaluate \
-      -c configs/oumi/llama8b.eval.yaml \
-      "model.adapter_model=${EVAL_CHECKPOINT_DIR}"
-  elif [ "$EVALUATION_FRAMEWORK" == "oumi" ]; then
-    echo "The custom eval framework is deprecated. Use LM_HARNESS instead."
-    python -m oumi.evaluate \
-      -c configs/oumi/llama8b.eval.legacy.yaml \
-      "model.adapter_model=${EVAL_CHECKPOINT_DIR}"
-  else
-    echo "Unknown evaluation framework: ${EVALUATION_FRAMEWORK}"
-    exit 1
-  fi
-
-  echo -e "Finished eval on ${OUMI_NUM_NODES} node(s):\n$(cat $PBS_NODEFILE)"
-  echo "Polaris job is all done!"
+  echo -e "Finished eval on node:\n$(cat $PBS_NODEFILE)"

configs/oumi/llama8b.eval.legacy.yaml (-27)

This file was deleted.

docs/DEV_SETUP.md (+2 -1)

@@ -59,9 +59,10 @@
 make setup
 ```
 
-If you'd like to only run the pre-commits before a push, you can run:
+If you'd like to only run the pre-commits before a push, instead of every commit, you can run:
 
 ```shell
+pre-commit uninstall
 pre-commit install --install-hooks --hook-type pre-push
 ```
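
To check that the hooks now fire at the push stage, they can also be invoked manually; note the stage name is `pre-push` in recent pre-commit releases (older versions called this stage `push`):

```shell
pre-commit run --hook-stage pre-push --all-files
```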

src/oumi/builders/models.py (+2 -1)

@@ -7,8 +7,9 @@
 from transformers import BitsAndBytesConfig
 
 from oumi.core.configs import ModelParams, PeftParams
-from oumi.core.distributed import get_device_rank_info, is_using_accelerate_fsdp
+from oumi.core.distributed import get_device_rank_info
 from oumi.core.registry import REGISTRY, RegistryType
+from oumi.utils.distributed_utils import is_using_accelerate_fsdp
 from oumi.utils.io_utils import get_oumi_root_directory, load_file
 from oumi.utils.logging import logger
 from oumi.utils.torch_naming_heuristics import disable_dropout

src/oumi/core/configs/params/model_params.py (+11)

@@ -7,6 +7,7 @@
 
 from oumi.core.configs.params.base_params import BaseParams
 from oumi.core.types.exceptions import HardwareException
+from oumi.utils.distributed_utils import is_using_accelerate
 
 
 @dataclass
@@ -137,6 +138,10 @@ class ModelParams(BaseParams):
 
     This is needed for large models that do not fit on a single GPU.
     It is used as the value for the `parallelize` argument in LM Harness.
+
+    If this is enabled, the eval job must be kicked off with `python` as opposed to
+    `accelerate launch`, as described here:
+    https://github.com/EleutherAI/lm-evaluation-harness?tab=readme-ov-file#multi-gpu-evaluation-with-hugging-face-accelerate
     """
 
     freeze_layers: List[str] = field(default_factory=list)
@@ -185,3 +190,9 @@ def __validate__(self):
                 "supported. Confirm that your hardware is compatible and then "
                 "consider installing it: pip install -U flash-attn --no-build-isolation"
             )
+
+        if self.shard_for_eval and is_using_accelerate():
+            raise ValueError(
+                "Sharded-model evaluations with LM Harness should be invoked with "
+                "`python`, not `accelerate launch`."
+            )
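
The helper `is_using_accelerate` is imported but not shown in this diff. A plausible sketch, assuming it detects the configuration variables that `accelerate launch` exports into its child processes (the detection heuristic is an assumption, not the actual oumi implementation):

```python
import os


def is_using_accelerate() -> bool:
    # Assumed heuristic: `accelerate launch` exports ACCELERATE_*-prefixed
    # configuration variables into every process it spawns, so their
    # presence indicates the process was started by the accelerate launcher.
    return any(name.startswith("ACCELERATE_") for name in os.environ)
```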

src/oumi/core/datasets/base_dataset.py (+7 -1)

@@ -1,3 +1,4 @@
+import gc
 import os
 from abc import ABC, abstractmethod
 from typing import Literal, Optional, Union, cast
@@ -136,6 +137,9 @@ def _load_data(self) -> pd.DataFrame:
         else:
             result = self._load_hf_hub_dataset(self.dataset_name_or_path)
 
+        # Reclaim memory after data loading.
+        gc.collect()
+
         logger.info(
             f"Loaded DataFrame with shape: {result.shape}. Columns:\n"
             f"{result.dtypes}"
@@ -188,7 +192,9 @@ def _load_hf_hub_dataset(self, path: str) -> pd.DataFrame:
             )
         )
 
-        return cast(pd.DataFrame, dataset.to_pandas())
+        result = dataset.to_pandas()
+        del dataset
+        return cast(pd.DataFrame, result)
 
     def _load_jsonl_dataset(self, path: str) -> pd.DataFrame:
         return pd.read_json(path, lines=True)
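
The idea in `_load_hf_hub_dataset` is to drop the Arrow-backed `datasets` object as soon as its contents are materialized in pandas, then force a collection so the large intermediate buffers are reclaimed promptly rather than lingering until the next automatic GC cycle. A standalone sketch of the same pattern (the dataset name is only an example):

```python
import gc

from datasets import load_dataset

# Load a HF Hub dataset and materialize it as a pandas DataFrame.
dataset = load_dataset("imdb", split="train")
df = dataset.to_pandas()

# Drop the Arrow-backed object and reclaim its memory right away.
del dataset
gc.collect()

print(df.shape)
```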
