Skip to content

Allow for external serving to be used with mmlu #99

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ transformers
accelerate
pandas
pandas-stubs
lm-eval>=0.4.3
lm-eval>=0.4.4
26 changes: 19 additions & 7 deletions src/instructlab/eval/mmlu.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,14 @@ def __init__(
self.batch_size = batch_size
self.device = device

def _run_mmlu(self) -> dict:
model_args = f"pretrained={self.model_path},dtype={self.model_dtype}"
def _run_mmlu(self, server_url: str | None = None) -> dict:
if server_url is not None:
# Requires lm_eval >= 0.4.4
model_args = f"base_url={server_url}/completions,model={self.model_path},tokenizer_backend=huggingface"
model = "local-completions"
else:
model_args = f"pretrained={self.model_path},dtype={self.model_dtype}"
model = "hf"
tm = None
if self.tasks_dir is not None:
if not os.path.exists(self.tasks_dir):
Expand All @@ -132,7 +138,7 @@ def _run_mmlu(self) -> dict:
raise InvalidTasksDirError(self.tasks_dir)
tm = TaskManager(verbosity="DEBUG", include_path=self.tasks_dir)
mmlu_output = self._simple_evaluate_with_error_handling(
model="hf",
model=model,
model_args=model_args,
tasks=self.tasks,
num_fewshot=self.few_shots,
Expand Down Expand Up @@ -199,10 +205,13 @@ def __init__(
model_path, None, tasks, model_dtype, few_shots, batch_size, device
)

def run(self) -> tuple:
def run(self, server_url: str | None = None) -> tuple:
"""
Runs MMLU evaluation

Attributes:
server_url Model server endpoint (e.g. http://localhost:8000/v1) for the model being evaluated

Returns:
overall_score MMLU score for the overall model evaluation
individual_scores Individual MMLU score for each task
Expand All @@ -214,7 +223,7 @@ def run(self) -> tuple:
individual_scores: dict = {}
agg_score: float = 0.0

results = self._run_mmlu()
results = self._run_mmlu(server_url)

for task in self.tasks:
mmlu_res = results[task]
Expand Down Expand Up @@ -243,10 +252,13 @@ class MMLUBranchEvaluator(AbstractMMLUEvaluator):

name = "mmlu_branch"

def run(self) -> tuple:
def run(self, server_url: str | None = None) -> tuple:
"""
Runs MMLUBranch evaluation

Attributes:
server_url Model server endpoint (e.g. http://localhost:8000/v1) for the model being evaluated

Returns:
overall_score Average MMLUBranch score for the task group
individual_scores Individual MMLUBranch scores for each task in the task group
Expand All @@ -259,7 +271,7 @@ def run(self) -> tuple:
individual_scores: dict = {}
agg_score: float = 0.0

results = self._run_mmlu()
results = self._run_mmlu(server_url)

for task, result in results.items():
if task in self.tasks:
Expand Down