Skip to content

Commit

Permalink
feat: use custom http_client
Browse files Browse the repository at this point in the history
This commit adds the ability to pass a custom HTTP client to the
MT-Bench evaluator. This is handy when custom certificates are needed
to interact with the judge model serving endpoint.

Signed-off-by: Sébastien Han <seb@redhat.com>
  • Loading branch information
leseb committed Oct 29, 2024
1 parent cd0487e commit d11a178
Show file tree
Hide file tree
Showing 8 changed files with 49 additions and 4 deletions.
1 change: 1 addition & 0 deletions .spellcheck-en-custom.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ dr
eval
gpt
hoc
http
instructlab
jsonl
justfile
Expand Down
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
## v0.4

* Added ability to specify a custom http client to MT-Bench

## v0.2
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ accelerate
pandas
pandas-stubs
lm-eval>=0.4.4
httpx
14 changes: 14 additions & 0 deletions src/instructlab/eval/mt_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import multiprocessing
import os

import httpx

# First Party
from instructlab.eval import (
mt_bench_answers,
Expand Down Expand Up @@ -110,6 +112,7 @@ def gen_answers(
api_key: str | None = None,
max_workers: int | str | None = None,
serving_gpus: int | None = None,
http_client: httpx.Client | None = None,
) -> None:
"""
Asks questions to model
Expand All @@ -119,6 +122,7 @@ def gen_answers(
api_key API token for authenticating with model server
max_workers Max parallel workers to run the evaluation with (int or "auto"). None indicates to use value specified in constructor.
serving_gpus Number of gpus allocated for serving. Used to tune with max_workers=auto. None indicates to use value specified in constructor.
http_client Custom http client to use for requests
"""
logger.debug(locals())
mt_bench_answers.generate_answers(
Expand All @@ -127,6 +131,7 @@ def gen_answers(
api_key=api_key,
output_dir=self.output_dir,
max_workers=self._get_effective_max_workers(max_workers, serving_gpus),
http_client=http_client,
)

def judge_answers(
Expand All @@ -135,6 +140,7 @@ def judge_answers(
api_key: str | None = None,
max_workers: int | str | None = None,
serving_gpus: int | None = None,
http_client: httpx.Client | None = None,
) -> tuple:
"""
Runs MT-Bench judgment
Expand All @@ -144,6 +150,7 @@ def judge_answers(
api_key API token for authenticating with model server
max_workers Max parallel workers to run the evaluation with (int or "auto"). None indicates to use value specified in constructor.
serving_gpus Number of gpus allocated for serving. Used to tune with max_workers=auto. None indicates to use value specified in constructor.
http_client Custom http client to use for requests
Returns:
overall_score MT-Bench score for the overall model evaluation
Expand All @@ -160,6 +167,7 @@ def judge_answers(
max_workers=self._get_effective_max_workers(max_workers, serving_gpus),
output_dir=self.output_dir,
merge_system_user_message=self.merge_system_user_message,
http_client=http_client,
)


Expand Down Expand Up @@ -202,6 +210,7 @@ def gen_answers(
api_key: str | None = None,
max_workers: int | str | None = None,
serving_gpus: int | None = None,
http_client: httpx.Client | None = None,
) -> None:
"""
Asks questions to model
Expand All @@ -211,6 +220,7 @@ def gen_answers(
api_key API token for authenticating with model server
max_workers Max parallel workers to run the evaluation with (int or "auto"). None indicates to use value specified in constructor.
serving_gpus Number of gpus allocated for serving. Used to tune with max_workers=auto. None indicates to use value specified in constructor.
http_client Custom http client to use for requests
"""
logger.debug(locals())
mt_bench_branch_generator.generate(
Expand All @@ -228,6 +238,7 @@ def gen_answers(
data_dir=self.output_dir,
max_workers=self._get_effective_max_workers(max_workers, serving_gpus),
bench_name="mt_bench_branch",
http_client=http_client,
)

def judge_answers(
Expand All @@ -236,6 +247,7 @@ def judge_answers(
api_key: str | None = None,
max_workers: int | str | None = None,
serving_gpus: int | None = None,
http_client: httpx.Client | None = None,
) -> tuple:
"""
Runs MT-Bench-Branch judgment. Judgments can be compared across runs with consistent question_id -> qna file name.
Expand All @@ -245,6 +257,7 @@ def judge_answers(
api_key API token for authenticating with model server
max_workers Max parallel workers to run the evaluation with (int or "auto"). None indicates to use value specified in constructor.
serving_gpus Number of gpus allocated for serving. Used to tune with max_workers=auto. None indicates to use value specified in constructor.
http_client Custom http client to use for requests
Returns:
overall_score Overall score from the evaluation
Expand All @@ -263,5 +276,6 @@ def judge_answers(
data_dir=self.output_dir,
bench_name="mt_bench_branch",
merge_system_user_message=self.merge_system_user_message,
http_client=http_client,
)
return overall_score, qa_pairs, error_rate
3 changes: 2 additions & 1 deletion src/instructlab/eval/mt_bench_answers.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,11 +108,12 @@ def generate_answers(
max_tokens=1024,
max_workers=1,
bench_name="mt_bench",
http_client=None,
):
"""Generate model answers to be judged"""
logger.debug(locals())

openai_client = get_openai_client(model_api_base, api_key)
openai_client = get_openai_client(model_api_base, api_key, http_client)

if data_dir is None:
data_dir = os.path.join(os.path.dirname(__file__), "data")
Expand Down
12 changes: 10 additions & 2 deletions src/instructlab/eval/mt_bench_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
import re
import time

import httpx

# Third Party
import openai

Expand Down Expand Up @@ -365,8 +367,14 @@ def get_model_list(answer_file):
return [os.path.splitext(os.path.basename(answer_file))[0]]


def get_openai_client(
    model_api_base,
    api_key,
    http_client: httpx.Client | None = None,
):
    """Build an OpenAI API client pointed at a model-serving endpoint.

    Args:
        model_api_base: Base URL of the OpenAI-compatible model server.
        api_key:        API token for authenticating with the server, or None
                        when the server does not require authentication.
        http_client:    Optional custom httpx.Client (e.g. one configured with
                        custom TLS certificates) passed through to openai.OpenAI.

    Returns:
        An openai.OpenAI client instance.
    """
    # The openai library rejects a None api_key even for servers that do not
    # enforce authentication, so substitute a placeholder.
    if api_key is None:
        api_key = "NO_API_KEY"
    return openai.OpenAI(
        base_url=model_api_base, api_key=api_key, http_client=http_client
    )
3 changes: 2 additions & 1 deletion src/instructlab/eval/mt_bench_judgment.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,11 +286,12 @@ def generate_judgment(
max_workers=1,
first_n=None,
merge_system_user_message=False,
http_client=None,
):
"""Generate judgment with scores and qa_pairs for a model"""
logger.debug(locals())

openai_client = get_openai_client(model_api_base, api_key)
openai_client = get_openai_client(model_api_base, api_key, http_client)

first_n_env = os.environ.get("INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS")
if first_n_env is not None and first_n is None:
Expand Down
15 changes: 15 additions & 0 deletions tests/test_branch_gen_answers_with_custom_http_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Third Party
import httpx

# First Party
from instructlab.eval.mt_bench import MTBenchBranchEvaluator

# Smoke script: run MT-Bench-Branch answer generation against a local
# OpenAI-compatible endpoint while supplying a custom httpx client
# (TLS verification disabled here to exercise the custom http_client path).
mt_bench_branch = MTBenchBranchEvaluator(
    "instructlab/granite-7b-lab",
    "instructlab/granite-7b-lab",
    "../taxonomy",
    "main",
)
mt_bench_branch.gen_answers(
    "http://localhost:8000/v1",
    http_client=httpx.Client(verify=False),
)

0 comments on commit d11a178

Please sign in to comment.