Commit eae46dd
[GRPO][Eval] Add letter counting eval (#1574)
1 parent 9edcbd9 commit eae46dd

File tree

11 files changed: +272 -16 lines changed

configs/examples/grpo_tldr/gcp_job.yaml

+1-1
@@ -31,7 +31,7 @@ envs:

 setup: |
   set -e
-  pip install uv && uv pip install oumi[gpu] vllm
+  pip install uv && uv pip install oumi[gpu] "vllm>=0.7.3,<0.8.0"
   pip install -U flash-attn --no-build-isolation

 run: |
configs/examples/letter_counting/evaluation/eval.yaml

+35

@@ -0,0 +1,35 @@
+# Config to eval an LLM's ability to count letters in words.
+#
+# Requirements:
+# - Run `pip install vllm`
+# - Log into HF: `huggingface-cli login`
+#
+# Usage:
+# oumi evaluate -c oumi://configs/examples/letter_counting/evaluation/eval.yaml
+#
+# See Also:
+# - Documentation: https://oumi.ai/docs/en/latest/user_guides/evaluate/evaluate.html
+# - Config class: oumi.core.configs.EvaluationConfig
+# - Config source: https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/evaluation_config.py
+# - Other eval configs: configs/**/evaluation/
+
+model:
+  model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+  model_max_length: 131072
+  torch_dtype_str: "bfloat16"
+  attn_implementation: "sdpa"
+  trust_remote_code: True
+
+generation:
+  max_new_tokens: 2048
+  # This isn't used by vLLM, but is used for the NATIVE inference engine.
+  batch_size: 4
+
+tasks:
+  - evaluation_backend: custom
+    task_name: count_letters
+    num_samples: 1000
+
+inference_engine: VLLM # Can also use NATIVE if not running on GPUs
+
+output_dir: "output/letter_counting/evaluation"
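
For a quick local sanity check, the same config can also be driven from Python. This is a minimal sketch only: `EvaluationConfig.from_yaml` and the `Evaluator` API are assumptions based on the config class and docs referenced in the comments above, so verify the exact signatures against the linked documentation.

```python
# Minimal sketch: run the letter-counting eval from Python instead of the CLI.
# ASSUMPTION: `EvaluationConfig.from_yaml` and `Evaluator().evaluate(...)`
# exist as suggested by the docs linked above; verify before relying on this.
from oumi.core.configs import EvaluationConfig
from oumi.core.evaluation.evaluator import Evaluator

config = EvaluationConfig.from_yaml(
    "configs/examples/letter_counting/evaluation/eval.yaml"
)
config.tasks[0].num_samples = 10  # Shrink the run for a smoke test.

results = Evaluator().evaluate(config)
print(results)
```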
configs/examples/letter_counting/evaluation/gcp_job.yaml

+59

@@ -0,0 +1,59 @@
+# Job config to eval an LLM's ability to count letters in words.
+#
+# Requirements:
+# - Set up SkyPilot GCP: https://oumi.ai/docs/en/latest/user_guides/launch/launch.html#setup
+# - Log into HF: `huggingface-cli login`
+#
+# Usage:
+# oumi launch up -c oumi://configs/examples/letter_counting/evaluation/gcp_job.yaml --cluster letter-counting-eval
+#
+# See Also:
+# - Documentation: https://oumi.ai/docs/en/latest/user_guides/launch/launch.html
+# - Config class: oumi.core.configs.JobConfig
+# - Config source: https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/job_config.py
+# - Other job configs: configs/**/*job.yaml
+
+name: letter-counting-eval
+
+resources:
+  cloud: gcp
+  accelerators: "A100"
+  use_spot: false
+
+working_dir: .
+
+file_mounts:
+  ~/.netrc: ~/.netrc # WandB credentials
+  ~/.cache/huggingface/token: ~/.cache/huggingface/token # HF credentials
+
+envs:
+  # NOTE: For SFT, update this to point to your model checkpoint.
+  # NOTE: For LoRA, instead update this to point to your LoRA adapter.
+  #       The base model will be inferred automatically.
+  MODEL_CHECKPOINT_DIR: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+  WANDB_PROJECT: oumi-eval
+  OUMI_RUN_NAME: letter-counting.eval
+
+setup: |
+  set -e
+  pip install uv && uv pip install oumi[gpu,evaluation] "vllm>=0.7.3,<0.8.0"
+
+run: |
+  set -e  # Exit if any command failed.
+  source ./configs/examples/misc/sky_init.sh
+
+  if test ${OUMI_NUM_NODES} -ne 1; then
+    echo "LM Harness supports max 1 node. Actual: ${OUMI_NUM_NODES} nodes."
+    exit 1
+  fi
+
+  echo "Starting evaluation for ${MODEL_CHECKPOINT_DIR} ..."
+  set -x
+
+  accelerate launch \
+    -m oumi evaluate \
+    -c oumi://configs/examples/letter_counting/evaluation/eval.yaml \
+    --run_name "${OUMI_RUN_NAME}.${SKYPILOT_TASK_ID}" \
+    --model.model_name "${MODEL_CHECKPOINT_DIR}"
+
+  echo "Node ${SKYPILOT_NODE_RANK} is all done!"

configs/examples/letter_counting/grpo/gcp_job.yaml

+2-1
@@ -3,6 +3,7 @@
 # Requirements:
 # - Set up SkyPilot GCP: https://oumi.ai/docs/en/latest/user_guides/launch/launch.html#setup
 # - Log into WandB (`wandb login`) or disable `enable_wandb`
+# - Log into HF: `huggingface-cli login`
 #
 # Usage:
 # oumi launch up -c oumi://configs/examples/letter_counting/grpo/gcp_job.yaml --cluster letter-counting-grpo
@@ -33,7 +34,7 @@ envs:
 setup: |
   set -e
   # vLLM needed for vLLM-powered generation during GRPO training.
-  pip install uv && uv pip install oumi[gpu] vllm
+  pip install uv && uv pip install oumi[gpu] "vllm>=0.7.3,<0.8.0"
   pip install -U flash-attn --no-build-isolation

 run: |

configs/examples/letter_counting/grpo/train.yaml

+1
@@ -2,6 +2,7 @@
 #
 # Requirements:
 # - Log into WandB (`wandb login`) or disable `enable_wandb`
+# - Log into HF: `huggingface-cli login`
 #
 # Usage:
 # oumi train -c oumi://configs/examples/letter_counting/grpo/train.yaml

configs/recipes/phi3/evaluation/eval.yaml

-1
@@ -16,7 +16,6 @@ model:
   model_name: "microsoft/Phi-3-mini-4k-instruct"
   trust_remote_code: True
   torch_dtype_str: "bfloat16"
-  shard_for_eval: True

 # HuggingFace Leaderboard V1
 tasks:

src/oumi/core/datasets/base_grpo_dataset.py

+37-13
@@ -12,13 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from abc import abstractmethod
 from typing import Optional, Union

 import pandas as pd
 from typing_extensions import override

 from oumi.core.datasets.base_map_dataset import BaseMapDataset
-from oumi.core.tokenizers.base_tokenizer import BaseTokenizer
+from oumi.core.types.conversation import Conversation

 _PROMPT_KEY = "prompt"
 _COMPLETION_KEY = "completion"
@@ -37,8 +38,6 @@ def __init__(
         dataset_name: Optional[str] = None,
         dataset_path: Optional[str] = None,
         split: Optional[str] = None,
-        tokenizer: Optional[BaseTokenizer] = None,
-        return_tensors: bool = False,
         **kwargs,
     ) -> None:
         """Initializes a new instance of the BaseExperimentalGrpoDataset class."""
@@ -49,14 +48,6 @@
             **kwargs,
         )

-        if return_tensors:
-            raise NotImplementedError(
-                "return_tensors=True is not implemented for this class"
-            )
-
-        self._tokenizer = tokenizer
-        self._return_tensors = return_tensors
-
         self._data = self._load_data()

     @staticmethod
@@ -65,7 +56,7 @@ def _process_text_value(s: str) -> str:
         # of text values. Let's strip them.
         return s.strip() if s else ""

-    def transform_grpo_example(self, example: Union[dict, pd.Series]) -> dict:
+    def _transform_grpo_example(self, example: Union[dict, pd.Series]) -> dict:
         """Validate and transform the GRPO sample into Python `dict`."""
         for required_key in (_PROMPT_KEY, _COMPLETION_KEY):
             if required_key not in example:
@@ -95,4 +86,37 @@ def transform_grpo_example(self, example: Union[dict, pd.Series]) -> dict:
     @override
     def transform(self, sample: pd.Series) -> dict:
         """Validate and transform the sample into Python `dict`."""
-        return self.transform_grpo_example(sample)
+        return self._transform_grpo_example(sample)
+
+    def conversation(self, idx: int) -> Conversation:
+        """Returns the conversation at the specified index.
+
+        Args:
+            idx (int): The index of the conversation to retrieve.
+
+        Returns:
+            Conversation: The conversation at the specified index.
+        """
+        sample = self.raw(idx)
+        return self.transform_conversation(sample)
+
+    def conversations(self) -> list[Conversation]:
+        """Returns a list of all conversations."""
+        indexes = range(len(self))
+        return [self.conversation(index) for index in indexes]
+
+    #
+    # Abstract Methods
+    #
+    @abstractmethod
+    def transform_conversation(self, sample: Union[dict, pd.Series]) -> Conversation:
+        """Converts the input sample to a Conversation.
+
+        Args:
+            sample (Union[dict, pd.Series]): The input example.
+
+        Returns:
+            Conversation: The resulting conversation.
+
+        """
+        raise NotImplementedError
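
The new abstract `transform_conversation` is what lets any GRPO dataset feed conversation-based evaluation. Below is a minimal sketch of a subclass satisfying the contract, assuming each raw sample already carries chat-style `messages`; the dataset name and data layout are hypothetical, and loading details are omitted.

```python
from typing import Union

import pandas as pd
from typing_extensions import override

from oumi.core.datasets.base_grpo_dataset import BaseExperimentalGrpoDataset
from oumi.core.registry import register_dataset
from oumi.core.types.conversation import Conversation


@register_dataset("my-org/my-grpo-dataset")  # Hypothetical dataset name.
class MyGrpoDataset(BaseExperimentalGrpoDataset):
    """Toy subclass illustrating the `transform_conversation` contract."""

    @override
    def transform_conversation(self, sample: Union[dict, pd.Series]) -> Conversation:
        # ASSUMPTION: each sample has a chat-style `messages` list,
        # like the letter-counting dataset below.
        sample_dict = sample if isinstance(sample, dict) else sample.to_dict()
        return Conversation.from_dict({"messages": list(sample_dict["messages"])})
```

With that in place, the base class's `conversation(idx)` and `conversations()` work unchanged.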

src/oumi/core/evaluation/evaluator.py

+2
@@ -238,6 +238,8 @@ def _get_custom_evaluation_fn(task_name: Optional[str]) -> Callable:
         "task name, which should correspond to a registered evaluation "
         "function, using the decorator `@register_evaluation_function`."
     )
+    # Import to ensure custom evaluation functions are added to REGISTRY.
+    import oumi.evaluation.registry as evaluation_registry  # noqa: F401

     if evaluation_fn := REGISTRY.get_evaluation_function(task_name):
         return evaluation_fn
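
The lazy import above ensures everything decorated with `@register_evaluation_function` inside `oumi.evaluation.registry` lands in `REGISTRY` before the lookup runs; user-defined functions just need to be imported before evaluation starts. A minimal sketch of such a function, mirroring the `count_letters` signature added in this commit (the task name is hypothetical):

```python
from typing import Any

from oumi.core.configs.params.evaluation_params import EvaluationTaskParams
from oumi.core.inference.base_inference_engine import BaseInferenceEngine
from oumi.core.registry import register_evaluation_function


@register_evaluation_function("my_custom_task")  # Hypothetical task name.
def my_custom_task(
    task_params: EvaluationTaskParams,
    inference_engine: BaseInferenceEngine,
) -> dict[str, Any]:
    """Returns a metrics dict; a config selects it via `task_name: my_custom_task`."""
    # A real implementation would build conversations, call
    # inference_engine.infer(...), and score the responses.
    return {"num_samples": task_params.num_samples}
```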

src/oumi/datasets/grpo/letter_count.py

+28
@@ -17,6 +17,13 @@

 from oumi.core.datasets.base_grpo_dataset import BaseExperimentalGrpoDataset
 from oumi.core.registry import register_dataset
+from oumi.core.types.conversation import Conversation
+
+_SYSTEM_PROMPT = (
+    "Your final answer should be written as digits and formatted as "
+    r'"\boxed{your_answer}". For example, if the answer is 42, '
+    r'make sure to output "\boxed{42}".'
+)


 @register_dataset("oumi-ai/oumi-letter-count")
@@ -47,7 +54,28 @@ class LetterCountGrpoDataset(BaseExperimentalGrpoDataset):
     @override
     def transform(self, sample: pd.Series) -> dict:
         """Validate and transform the sample into Python `dict`."""
+        # TODO: OPE-1122: Add system prompt to training.
+        # OPE-1158 seems to affect this, as the type of the input isn't consistent.
         return {
             "prompt": sample["messages"],
             "letter_count": sample["metadata"]["letter_count_integer"],
         }
+
+    @override
+    def transform_conversation(self, sample: pd.Series) -> Conversation:
+        """Converts the input sample to a Conversation.
+
+        Args:
+            sample (pd.Series): The input example.
+
+        Returns:
+            Conversation: The resulting conversation.
+
+        """
+        # Example is already in conversation format and only needs light processing.
+        sample_dict = sample.to_dict()
+        # Convert messages from np.ndarray to list.
+        sample_dict["messages"] = sample_dict["messages"].tolist()
+        # Add system prompt.
+        sample_dict["messages"].append({"content": _SYSTEM_PROMPT, "role": "system"})
+        return Conversation.from_dict(sample_dict)
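
To make the "light processing" concrete, here is what `transform_conversation` does to a single row, sketched with a fabricated `pd.Series` shaped like this dataset's rows; the prompt text and count are made up for illustration.

```python
import numpy as np
import pandas as pd

from oumi.core.types.conversation import Conversation

# Fabricated row: messages stored as an np.ndarray, plus integer metadata,
# mirroring the structure transform_conversation expects.
row = pd.Series(
    {
        "messages": np.array(
            [{"content": 'How many "r"s are in "strawberry"?', "role": "user"}]
        ),
        "metadata": {"letter_count_integer": 3},
    }
)

sample_dict = row.to_dict()
sample_dict["messages"] = sample_dict["messages"].tolist()  # ndarray -> list
sample_dict["messages"].append(
    {"content": "Your final answer should be written as digits ...", "role": "system"}
)
conversation = Conversation.from_dict(sample_dict)
```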
src/oumi/evaluation/registry/__init__.py

+21

@@ -0,0 +1,21 @@
+# Copyright 2025 - Oumi
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Evaluation registry module."""
+
+from oumi.evaluation.registry.count_letters_task import count_letters
+
+__all__ = [
+    "count_letters",
+]
src/oumi/evaluation/registry/count_letters_task.py

+86

@@ -0,0 +1,86 @@
+# Copyright 2025 - Oumi
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+from typing import Any, Optional
+
+from oumi.core.configs.params.evaluation_params import EvaluationTaskParams
+from oumi.core.inference.base_inference_engine import BaseInferenceEngine
+from oumi.core.registry import register_evaluation_function
+from oumi.datasets.grpo.letter_count import LetterCountGrpoDataset
+from oumi.utils.logging import logger
+
+
+def _extract_prediction(response: str) -> Optional[int]:
+    r"""Returns the numeric answer extracted from `\boxed{...}`, or None otherwise."""
+    regex_result = re.findall(r"\\boxed\{(\d+)\}", response)
+    if not regex_result or len(regex_result) != 1:
+        return None
+    number_str = regex_result[0]
+    # Except clause shouldn't trigger because the regex should only find ints.
+    try:
+        return int(number_str)
+    except ValueError:
+        return None
+
+
+@register_evaluation_function("count_letters")
+def count_letters(
+    task_params: EvaluationTaskParams,
+    inference_engine: BaseInferenceEngine,
+) -> dict[str, Any]:
+    """Custom evaluation function registered as `count_letters`."""
+    dataset = LetterCountGrpoDataset(split="test")
+    # TODO: OPE-1155: Add support for using Oumi dataset code to create the dataset.
+    # dataset = build_dataset("oumi-ai/oumi-letter-count", tokenizer=None, sample_count=10)  # noqa: E501
+    # dataset = build_dataset("oumi-ai/berrybench-v0.1.0", tokenizer=None, sample_count=10)  # noqa: E501
+    num_samples = task_params.num_samples
+    if num_samples is None:
+        num_samples = len(dataset)
+    input_conversations = [dataset.conversation(i) for i in range(num_samples)]
+    conversations = inference_engine.infer(input_conversations)
+    logger.info(f"Finished inference on {len(conversations)} conversations!")
+    if len(conversations) > 0:
+        logger.info(f"Sample conversation: {conversations[0]}")
+
+    count = 0  # The number of examples with correct answers extracted.
+    total = 0  # All examples.
+    valid_count = 0  # The number of examples with valid answers extracted.
+    for i, conversation in enumerate(conversations):
+        total += 1
+        # Grab the model's response.
+        response = conversation.last_message()
+        # Ignore cases where model didn't respond or it's a multimodal response.
+        # For now, we focus on text-only responses.
+        if not response or not isinstance(response.content, str):
+            continue
+        # Count the example as correct if the extracted prediction is correct.
+        prediction = _extract_prediction(response.content)
+        if prediction is None:
+            continue
+        valid_count += 1
+        if prediction == conversation.metadata["letter_count_integer"]:
+            count += 1
+
+    return {
+        # Accuracy across all examples.
+        "accuracy": count / total,
+        # Accuracy when only counting examples with properly extracted answers.
+        "properly_extracted_accuracy": count / valid_count,
+        "num_samples": num_samples,
+        # These three values sum up to num_samples.
+        "num_correct_answers": count,
+        "num_incorrect_answers": valid_count - count,
+        "num_invalid_answers": total - valid_count,
+    }
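
The extraction rule is strict: exactly one `\boxed{<digits>}` in the response, otherwise the answer counts as invalid. A few illustrative cases using the same regex, plus the bookkeeping identity noted in the returned metrics:

```python
import re
from typing import Optional


def extract_prediction(response: str) -> Optional[int]:
    r"""Same rule as `_extract_prediction`: exactly one \boxed{<digits>}."""
    matches = re.findall(r"\\boxed\{(\d+)\}", response)
    return int(matches[0]) if len(matches) == 1 else None


assert extract_prediction(r"The count is \boxed{3}.") == 3
assert extract_prediction("no boxed answer") is None  # Invalid: nothing extracted.
assert extract_prediction(r"\boxed{1} or \boxed{2}") is None  # Invalid: ambiguous.
assert extract_prediction(r"\boxed{-1}") is None  # Invalid: \d+ allows digits only.

# Metrics identity from the return dict above:
# num_correct_answers + num_incorrect_answers + num_invalid_answers == num_samples.
count, valid_count, total = 7, 9, 10
assert count + (valid_count - count) + (total - valid_count) == total
```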
