diff --git a/eval/final/components.py b/eval/final/components.py index b2aad8e..469d222 100644 --- a/eval/final/components.py +++ b/eval/final/components.py @@ -23,9 +23,9 @@ def run_final_eval_op( ): import json import os - import httpx import subprocess + import httpx import torch from instructlab.eval.mmlu import MMLUBranchEvaluator from instructlab.eval.mt_bench import MTBenchBranchEvaluator @@ -35,7 +35,9 @@ def run_final_eval_op( judge_model_name = os.getenv("JUDGE_NAME") judge_endpoint = os.getenv("JUDGE_ENDPOINT") judge_ca_cert_path = os.getenv("JUDGE_CA_CERT_PATH") - use_tls = os.path.exists(judge_ca_cert_path) and (os.path.getsize(judge_ca_cert_path) > 0) + use_tls = os.path.exists(judge_ca_cert_path) and ( + os.path.getsize(judge_ca_cert_path) > 0 + ) judge_http_client = httpx.Client(verify=judge_ca_cert_path) if use_tls else None print("Starting Final Eval...") diff --git a/eval/mt_bench/components.py b/eval/mt_bench/components.py index 5c2bdbb..5b1800a 100644 --- a/eval/mt_bench/components.py +++ b/eval/mt_bench/components.py @@ -1,6 +1,6 @@ # type: ignore # pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error -from typing import List, NamedTuple, Optional +from typing import NamedTuple, Optional from kfp.dsl import component @@ -20,9 +20,9 @@ def run_mt_bench_op( ) -> NamedTuple("outputs", best_model=str, best_score=float): import json import os - import httpx import subprocess + import httpx import torch from instructlab.eval.mt_bench import MTBenchEvaluator @@ -30,7 +30,9 @@ def run_mt_bench_op( judge_model_name = os.getenv("JUDGE_NAME") judge_endpoint = os.getenv("JUDGE_ENDPOINT") judge_ca_cert_path = os.getenv("JUDGE_CA_CERT_PATH") - use_tls = os.path.exists(judge_ca_cert_path) and (os.path.getsize(judge_ca_cert_path) > 0) + use_tls = os.path.exists(judge_ca_cert_path) and ( + os.path.getsize(judge_ca_cert_path) > 0 + ) judge_http_client = httpx.Client(verify=judge_ca_cert_path) if use_tls else None def launch_vllm( diff --git a/pipeline.py b/pipeline.py index 401c3c4..d52fddc 100644 --- a/pipeline.py +++ b/pipeline.py @@ -1,5 +1,6 @@ # type: ignore # pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error,no-member +import os import typing from typing import List, Literal, Optional @@ -9,13 +10,11 @@ CreatePVC, DeletePVC, mount_pvc, - set_image_pull_policy, use_config_map_as_env, use_config_map_as_volume, use_secret_as_env, use_secret_as_volume, ) -import os TEACHER_CONFIG_MAP = "teacher-server" TEACHER_SECRET = "teacher-server" @@ -104,7 +103,6 @@ def pipeline( sdg_pipeline: str = "full", # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L122 sdg_max_batch_len: int = 5000, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L334 sdg_sample_size: float = 1.0, # FIXME: Not present in default config. Not configurable upstream at this point, capability added via https://github.com/instructlab/sdg/pull/432 - # Training phase train_nproc_per_node: int = 2, # FIXME: Not present in default config. Arbitrary value chosen to demonstrate multi-node multi-gpu capabilities. Needs proper reference architecture justification. train_nnodes: int = 2, # FIXME: Not present in default config. Arbitrary value chosen to demonstrate multi-node multi-gpu capabilities. Needs proper reference architecture justification. @@ -122,13 +120,11 @@ def pipeline( # MT Bench mt_bench_max_workers: str = "auto", # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L74 mt_bench_merge_system_user_message: bool = False, # https://github.com/instructlab/instructlab/blob/v0.21.2/src/instructlab/model/evaluate.py#L474 - # Final evaluation final_eval_max_workers: str = "auto", # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L74 final_eval_few_shots: int = 5, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L56 final_eval_batch_size: str = "auto", # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L52 final_eval_merge_system_user_message: bool = False, # https://github.com/instructlab/instructlab/blob/v0.21.2/src/instructlab/model/evaluate.py#L474 - # Other options k8s_storage_class_name: str = "standard", # FIXME: https://github.com/kubeflow/pipelines/issues/11396, https://issues.redhat.com/browse/RHOAIRFE-470 ): @@ -201,8 +197,12 @@ def pipeline( sdg_task, TEACHER_CONFIG_MAP, dict(endpoint="endpoint", model="model") ) use_secret_as_env(sdg_task, TEACHER_SECRET, {"api_key": "api_key"}) - use_config_map_as_volume(sdg_task, TEACHER_CONFIG_MAP, mount_path=SDG_CA_CERT_PATH) - sdg_task.set_env_variable(SDG_CA_CERT_ENV_VAR_NAME, os.path.join(SDG_CA_CERT_PATH, SDG_CA_CERT_CM_KEY)) + use_config_map_as_volume( + sdg_task, TEACHER_CONFIG_MAP, mount_path=SDG_CA_CERT_PATH + ) + sdg_task.set_env_variable( + SDG_CA_CERT_ENV_VAR_NAME, os.path.join(SDG_CA_CERT_PATH, SDG_CA_CERT_CM_KEY) + ) sdg_task.after(git_clone_task) mount_pvc( @@ -366,8 +366,13 @@ def pipeline( ) use_secret_as_env(run_mt_bench_task, JUDGE_SECRET, {"api_key": "JUDGE_API_KEY"}) - use_config_map_as_volume(run_mt_bench_task, JUDGE_CONFIG_MAP, mount_path=JUDGE_CA_CERT_PATH) - run_mt_bench_task.set_env_variable(JUDGE_CA_CERT_ENV_VAR_NAME, os.path.join(JUDGE_CA_CERT_PATH, JUDGE_CA_CERT_CM_KEY)) + use_config_map_as_volume( + run_mt_bench_task, JUDGE_CONFIG_MAP, mount_path=JUDGE_CA_CERT_PATH + ) + run_mt_bench_task.set_env_variable( + JUDGE_CA_CERT_ENV_VAR_NAME, + os.path.join(JUDGE_CA_CERT_PATH, JUDGE_CA_CERT_CM_KEY), + ) # uncomment if updating image with same tag # set_image_pull_policy(run_mt_bench_task, "Always") @@ -411,8 +416,13 @@ def pipeline( use_secret_as_env(final_eval_task, JUDGE_SECRET, {"api_key": "JUDGE_API_KEY"}) - use_config_map_as_volume(final_eval_task, JUDGE_CONFIG_MAP, mount_path=JUDGE_CA_CERT_PATH) - final_eval_task.set_env_variable(JUDGE_CA_CERT_ENV_VAR_NAME, os.path.join(JUDGE_CA_CERT_PATH, JUDGE_CA_CERT_CM_KEY)) + use_config_map_as_volume( + final_eval_task, JUDGE_CONFIG_MAP, mount_path=JUDGE_CA_CERT_PATH + ) + final_eval_task.set_env_variable( + JUDGE_CA_CERT_ENV_VAR_NAME, + os.path.join(JUDGE_CA_CERT_PATH, JUDGE_CA_CERT_CM_KEY), + ) final_eval_task.after(run_mt_bench_task) final_eval_task.set_accelerator_type("nvidia.com/gpu") diff --git a/sdg/components.py b/sdg/components.py index 3461751..3d891a1 100644 --- a/sdg/components.py +++ b/sdg/components.py @@ -37,19 +37,21 @@ def sdg_op( sdg_path: str = "/data/sdg", sdg_sampling_size: float = 1.0, ): + import os from os import getenv, path import instructlab.sdg import openai import yaml - import os api_key = getenv("api_key") model = getenv("model") endpoint = getenv("endpoint") sdg_ca_cert_path = getenv("SDG_CA_CERT_PATH") - use_tls = os.path.exists(sdg_ca_cert_path) and (os.path.getsize(sdg_ca_cert_path) > 0) + use_tls = os.path.exists(sdg_ca_cert_path) and ( + os.path.getsize(sdg_ca_cert_path) > 0 + ) if use_tls: import httpx diff --git a/training/components.py b/training/components.py index 84143d0..20bf353 100644 --- a/training/components.py +++ b/training/components.py @@ -1,7 +1,7 @@ # type: ignore # pylint: disable=import-outside-toplevel,missing-function-docstring -from typing import NamedTuple, Optional +from typing import Optional from kfp import dsl diff --git a/training/run_main_ds.py b/training/run_main_ds.py index 1b782ee..9c4fb53 100644 --- a/training/run_main_ds.py +++ b/training/run_main_ds.py @@ -20,8 +20,8 @@ def run_main_ds(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: f"--nproc_per_node={torch_args.nproc_per_node}", f"--rdzv_id={torch_args.rdzv_id}", f"--rdzv_endpoint={torch_args.rdzv_endpoint}", - f"-m", - f"instructlab.training.main_ds", + "-m", + "instructlab.training.main_ds", f"--model_name_or_path={train_args.model_path}", f"--data_path={train_args.data_output_dir}/data.jsonl", f"--output_dir={train_args.ckpt_output_dir}", @@ -30,7 +30,7 @@ def run_main_ds(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: f"--learning_rate={train_args.learning_rate}", f"--num_warmup_steps={train_args.warmup_steps}", f"--save_samples={train_args.save_samples}", - f"--log_level=INFO", + "--log_level=INFO", f"--max_batch_len={train_args.max_batch_len}", f"--seed={train_args.random_seed}", f"--chat-tmpl-path={train_args.chat_tmpl_path}", diff --git a/utils/components.py b/utils/components.py index 8ddabcd..35f3848 100644 --- a/utils/components.py +++ b/utils/components.py @@ -3,7 +3,7 @@ from kfp import dsl -from .consts import PYTHON_IMAGE, RHELAI_IMAGE, TOOLBOX_IMAGE +from .consts import RHELAI_IMAGE, TOOLBOX_IMAGE @dsl.container_component