instructlab · mergify · Sep 12, 2024 · Aug 14, 2024
diff --git a/requirements.txt b/requirements.txt
@@ -9,4 +9,4 @@ transformers
 accelerate
 pandas
 pandas-stubs
-lm-eval>=0.4.3
+lm-eval>=0.4.4
diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
@@ -122,8 +122,14 @@ def __init__(
         self.batch_size = batch_size
         self.device = device
 
-    def _run_mmlu(self) -> dict:
-        model_args = f"pretrained={self.model_path},dtype={self.model_dtype}"
+    def _run_mmlu(self, server_url: str | None = None) -> dict:
+        if server_url is not None:
+            # Requires lm_eval >= 0.4.4
+            model_args = f"base_url={server_url}/completions,model={self.model_path},tokenizer_backend=huggingface"
+            model = "local-completions"
+        else:
+            model_args = f"pretrained={self.model_path},dtype={self.model_dtype}"
+            model = "hf"
         tm = None
         if self.tasks_dir is not None:
             if not os.path.exists(self.tasks_dir):
@@ -132,7 +138,7 @@ def _run_mmlu(self) -> dict:
                 raise InvalidTasksDirError(self.tasks_dir)
             tm = TaskManager(verbosity="DEBUG", include_path=self.tasks_dir)
         mmlu_output = self._simple_evaluate_with_error_handling(
-            model="hf",
+            model=model,
             model_args=model_args,
             tasks=self.tasks,
             num_fewshot=self.few_shots,
@@ -199,10 +205,13 @@ def __init__(
             model_path, None, tasks, model_dtype, few_shots, batch_size, device
         )
 
-    def run(self) -> tuple:
+    def run(self, server_url: str | None = None) -> tuple:
         """
         Runs MMLU evaluation
 
+        Attributes:
+            server_url          Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
+
         Returns:
             overall_score       MMLU score for the overall model evaluation
             individual_scores   Individual MMLU score for each task
@@ -214,7 +223,7 @@ def run(self) -> tuple:
         individual_scores: dict = {}
         agg_score: float = 0.0
 
-        results = self._run_mmlu()
+        results = self._run_mmlu(server_url)
 
         for task in self.tasks:
             mmlu_res = results[task]
@@ -243,10 +252,13 @@ class MMLUBranchEvaluator(AbstractMMLUEvaluator):
 
     name = "mmlu_branch"
 
-    def run(self) -> tuple:
+    def run(self, server_url: str | None = None) -> tuple:
         """
         Runs MMLUBranch evaluation
 
+        Attributes:
+            server_url          Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
+
         Returns:
             overall_score       Average MMLUBranch score for the task group
             individual_scores   Individual MMLUBranch scores for each task in the task group
@@ -259,7 +271,7 @@ def run(self) -> tuple:
         individual_scores: dict = {}
         agg_score: float = 0.0
 
-        results = self._run_mmlu()
+        results = self._run_mmlu(server_url)
 
         for task, result in results.items():
             if task in self.tasks: