110 changes: 80 additions & 30 deletions src/instructlab/eval/mt_bench.py
@@ -52,31 +52,49 @@ def __init__(
self.output_dir = output_dir
self.serving_gpus = serving_gpus
self.merge_system_user_message = merge_system_user_message
self.max_workers = self._calc_max_workers(max_workers, serving_gpus)

if max_workers == "auto":
try:
# Not available on all platforms
usable_cpu_count = len(os.sched_getaffinity(0)) # type: ignore[attr-defined]
except AttributeError:
usable_cpu_count = multiprocessing.cpu_count()
if serving_gpus is not None:
# Tune max_workers based on hardware configuration: min(#GPUs being used * 10, #CPU cores)
# Please see https://github.com/instructlab/instructlab/issues/2050 for detailed explanation
self.max_workers = min(max(serving_gpus, 1) * 10, usable_cpu_count)
logger.debug("Auto tuning max_workers to %s", self.max_workers)
def _calc_max_workers(
self, max_workers: int | str | None, serving_gpus: int | None
):
calculated_max_workers = None
if max_workers is not None:
if max_workers == "auto":
try:
# Not available on all platforms
usable_cpu_count = len(os.sched_getaffinity(0)) # type: ignore[attr-defined]
except AttributeError:
usable_cpu_count = multiprocessing.cpu_count()
if serving_gpus is not None:
# Tune max_workers based on hardware configuration: min(#GPUs being used * 10, #CPU cores)
# Please see https://github.com/instructlab/instructlab/issues/2050 for detailed explanation
calculated_max_workers = min(
max(serving_gpus, 1) * 10, usable_cpu_count
)
logger.debug(
"Auto tuning max_workers to %s", calculated_max_workers
)
else:
# Don't be too aggressive when serving_gpus isn't specified. Use half the cpu count.
calculated_max_workers = usable_cpu_count // 2
logger.debug(
"max_workers set to auto but serving_gpus is not specified. Defaulting to (cpu count / 2): %s",
calculated_max_workers,
)
else:
# Don't be too aggressive when serving_gpus isn't specified. Use half the cpu count.
self.max_workers = usable_cpu_count // 2
logger.debug(
"max_workers set to auto but serving_gpus is not specified. Defaulting to (cpu count / 2): %s",
self.max_workers,
)
if isinstance(max_workers, int) and max_workers > 0:
logger.debug("max_workers specified as: %s", max_workers)
calculated_max_workers = max_workers
else:
raise InvalidMaxWorkersError(max_workers)
return calculated_max_workers
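
Reviewer note: the "auto" heuristic above can be tried in isolation. A minimal standalone sketch (illustrative names, not library code) mirroring min(#GPUs * 10, #CPUs) with the half-CPU fallback:

import multiprocessing
import os

def auto_max_workers(serving_gpus=None):
    # 10 workers per serving GPU, capped at the usable CPU count;
    # half the CPUs when the GPU count is unknown.
    try:
        usable_cpu_count = len(os.sched_getaffinity(0))  # not on all platforms
    except AttributeError:
        usable_cpu_count = multiprocessing.cpu_count()
    if serving_gpus is not None:
        return min(max(serving_gpus, 1) * 10, usable_cpu_count)
    return usable_cpu_count // 2

# e.g. 4 GPUs with 32 usable CPUs: min(4 * 10, 32) == 32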

def _get_effective_max_workers(self, max_workers, serving_gpus):
if max_workers is not None:
effective_max_workers = self._calc_max_workers(max_workers, serving_gpus)
else:
if isinstance(max_workers, int) and max_workers > 0:
logger.debug("max_workers specified as: %s", max_workers)
self.max_workers = max_workers
else:
raise InvalidMaxWorkersError(max_workers)
effective_max_workers = self.max_workers
return effective_max_workers
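
The helper above is plain argument-over-constructor precedence (plus re-validation through _calc_max_workers when a per-call value is given). The precedence rule alone, as a sketch:

def effective_value(per_call, constructor_default):
    # A per-call override wins; None falls back to what __init__ resolved.
    return per_call if per_call is not None else constructor_default

assert effective_value(16, 40) == 16
assert effective_value(None, 40) == 40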


class MTBenchEvaluator(AbstractMTBenchEvaluator):
@@ -94,30 +112,46 @@ class MTBenchEvaluator(AbstractMTBenchEvaluator):

name = "mt_bench"

def gen_answers(self, server_url, api_key: str | None = None) -> None:
def gen_answers(
self,
server_url,
api_key: str | None = None,
max_workers: int | str | None = None,
serving_gpus: int | None = None,
) -> None:
"""
Asks questions to the model

Attributes
server_url Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
api_key API token for authenticating with model server
max_workers Max parallel workers to run the evaluation with (int or "auto"). None indicates to use the value specified in the constructor.
serving_gpus Number of GPUs allocated for serving. Used to tune max_workers when set to "auto". None indicates to use the value specified in the constructor.
"""
logger.debug(locals())
mt_bench_answers.generate_answers(
self.model_name,
server_url,
api_key=api_key,
output_dir=self.output_dir,
max_workers=self.max_workers,
max_workers=self._get_effective_max_workers(max_workers, serving_gpus),
)
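
With this change, worker settings become per-call overrides instead of constructor-only values. A hedged usage sketch (model and judge names and the server URL are placeholders; constructor arguments assumed to follow the repository README):

from instructlab.eval.mt_bench import MTBenchEvaluator

evaluator = MTBenchEvaluator("my-model", "my-judge-model")
evaluator.gen_answers(
    "http://localhost:8000/v1",
    max_workers="auto",  # new per-call override
    serving_gpus=4,      # lets "auto" resolve to min(4 * 10, cpu_count)
)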

def judge_answers(self, server_url, api_key: str | None = None) -> tuple:
def judge_answers(
self,
server_url,
api_key: str | None = None,
max_workers: int | str | None = None,
serving_gpus: int | None = None,
) -> tuple:
"""
Runs MT-Bench judgment

Attributes
server_url Model server endpoint (Ex: http://localhost:8000/v1) for the judge model
api_key API token for authenticating with model server
max_workers Max parallel workers to run the evaluation with (int or "auto"). None indicates to use the value specified in the constructor.
serving_gpus Number of GPUs allocated for serving. Used to tune max_workers when set to "auto". None indicates to use the value specified in the constructor.

Returns:
overall_score MT-Bench score for the overall model evaluation
@@ -130,7 +164,7 @@ def judge_answers(self, server_url, api_key: str | None = None) -> tuple:
self.judge_model_name,
server_url,
api_key=api_key,
max_workers=self.max_workers,
max_workers=self._get_effective_max_workers(max_workers, serving_gpus),
output_dir=self.output_dir,
merge_system_user_message=self.merge_system_user_message,
)
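
Judgment accepts the same overrides. A sketch continuing the example above (the tuple layout follows the docstring; fields past the overall score are elided here):

result = evaluator.judge_answers(
    "http://localhost:8000/v1",
    max_workers=16,  # explicit worker count instead of "auto"
)
overall_score = result[0]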
@@ -175,13 +209,21 @@ def __init__(
self.taxonomy_git_repo_path = taxonomy_git_repo_path
self.branch = branch

def gen_answers(self, server_url, api_key: str | None = None) -> None:
def gen_answers(
self,
server_url,
api_key: str | None = None,
max_workers: int | str | None = None,
serving_gpus: int | None = None,
) -> None:
"""
Asks questions to the model

Attributes
server_url Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
api_key API token for authenticating with model server
max_workers Max parallel workers to run the evaluation with (int or "auto"). None indicates to use the value specified in the constructor.
serving_gpus Number of GPUs allocated for serving. Used to tune max_workers when set to "auto". None indicates to use the value specified in the constructor.
"""
logger.debug(locals())
mt_bench_branch_generator.generate(
@@ -197,17 +239,25 @@ def gen_answers(self, server_url, api_key: str | None = None) -> None:
branch=self.branch,
output_dir=self.output_dir,
data_dir=self.output_dir,
max_workers=self.max_workers,
max_workers=self._get_effective_max_workers(max_workers, serving_gpus),
bench_name="mt_bench_branch",
)
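
The branch evaluator gains the same per-call knobs. A hedged sketch (taxonomy path and branch are placeholders; constructor argument order assumed from the repository README):

from instructlab.eval.mt_bench import MTBenchBranchEvaluator

evaluator = MTBenchBranchEvaluator(
    "my-model", "my-judge-model", "taxonomy", "my-branch"
)
evaluator.gen_answers(
    "http://localhost:8000/v1",
    max_workers="auto",
    serving_gpus=8,
)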

def judge_answers(self, server_url, api_key: str | None = None) -> tuple:
def judge_answers(
self,
server_url,
api_key: str | None = None,
max_workers: int | str | None = None,
serving_gpus: int | None = None,
) -> tuple:
"""
Runs MT-Bench-Branch judgment. Judgments can be compared across runs thanks to the consistent question_id -> qna file name mapping.

Attributes
server_url Model server endpoint (Ex: http://localhost:8000/v1) for the judge model
api_key API token for authenticating with model server
max_workers Max parallel workers to run the evaluation with (int or "auto"). None indicates to use the value specified in the constructor.
serving_gpus Number of GPUs allocated for serving. Used to tune max_workers when set to "auto". None indicates to use the value specified in the constructor.

Returns:
qa_pairs Question and answer pairs (with scores) from the evaluation
@@ -219,7 +269,7 @@ def judge_answers(self, server_url, api_key: str | None = None) -> tuple:
server_url,
api_key=api_key,
branch=self.branch,
max_workers=self.max_workers,
max_workers=self._get_effective_max_workers(max_workers, serving_gpus),
output_dir=self.output_dir,
data_dir=self.output_dir,
bench_name="mt_bench_branch",