What happened?
When trying to create a tune job to optimize the hyperparameters of an LLM, with the values below passed to the tune API, I received an unclear error. It would be better if the error message showed what was actually wrong.
from kubeflow.katib import KatibClient
from kubeflow.storage_initializer.hugging_face import (
    HuggingFaceDatasetParams,
    HuggingFaceModelParams,
    HuggingFaceTrainerParams,
)
from transformers import AutoModelForSequenceClassification, TrainingArguments

hf_model = HuggingFaceModelParams(
    model_uri="hf://meta-llama/Llama-3.2-1B",
    transformer_type=AutoModelForSequenceClassification,
)

# Train the model on 1000 movie reviews from imdb
# https://huggingface.co/datasets/stanfordnlp/imdb
hf_dataset = HuggingFaceDatasetParams(
    repo_id="imdb",
    split="train[:1000]",
)

hf_tuning_parameters = HuggingFaceTrainerParams(
    training_parameters=TrainingArguments(
        output_dir="results",
        save_strategy="no",
        hub_strategy="all_checkpoints",
        learning_rate=1e-05,  # katib.search.double(min=1e-05, max=5e-05),
        num_train_epochs=3,
    ),
)
cl = KatibClient(namespace="kubeflow")
exp_name = "testllm"
cl.tune(
    name=exp_name,
    model_provider_parameters=hf_model,
    dataset_provider_parameters=hf_dataset,
    trainer_parameters=hf_tuning_parameters,
    objective_metric_name="train_loss",
    objective_type="minimize",
    algorithm_name="random",
    max_trial_count=10,
    parallel_trial_count=2,
    resources_per_trial={
        "gpu": "2",
        "cpu": "4",
        "memory": "10G",
    },
)
Traceback:
ValueError Traceback (most recent call last)
ValueError: '<HUB_TOKEN>' is not a valid HubStrategy
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
Cell In[19], line 3
1 # Fine-tuning for Binary Classification
2 exp_name = "testllm"
----> 3 cl.tune(
4 name = exp_name,
5 model_provider_parameters = hf_model,
6 dataset_provider_parameters = hf_dataset,
7 trainer_parameters = hf_tuning_parameters,
8 objective_metric_name = "train_loss",
9 objective_type = "minimize",
10 algorithm_name = "random",
11 max_trial_count = 10,
12 parallel_trial_count = 2,
13 resources_per_trial={
14 "gpu": "2",
15 "cpu": "4",
16 "memory": "10G",
17 },
18 )
20 cl.wait_for_experiment_condition(name=exp_name)
22 # Get the best hyperparameters.
File ~/miniconda3/envs/llm-hp-optimization-katib-nb/lib/python3.9/site-packages/kubeflow/katib/api/katib_client.py:602, in KatibClient.tune(self, name, model_provider_parameters, dataset_provider_parameters, trainer_parameters, storage_config, objective, base_image, parameters, namespace, env_per_trial, algorithm_name, algorithm_settings, objective_metric_name, additional_metric_names, objective_type, objective_goal, max_trial_count, parallel_trial_count, max_failed_trial_count, resources_per_trial, retain_trials, packages_to_install, pip_index_url, metrics_collector_config)
600 experiment_params = []
601 trial_params = []
--> 602 training_args = utils.get_trial_substitutions_from_trainer(
603 trainer_parameters.training_parameters, experiment_params, trial_params
604 )
605 lora_config = utils.get_trial_substitutions_from_trainer(
606 trainer_parameters.lora_config, experiment_params, trial_params
607 )
609 # Create the init and the primary container.
File ~/miniconda3/envs/llm-hp-optimization-katib-nb/lib/python3.9/site-packages/kubeflow/katib/utils/utils.py:207, in get_trial_substitutions_from_trainer(parameters, experiment_params, trial_params)
205 value = copy.deepcopy(p_value)
206 else:
--> 207 value = type(old_attr)(p_value)
208 setattr(parameters, p_name, value)
210 if isinstance(parameters, TrainingArguments):
File ~/miniconda3/envs/llm-hp-optimization-katib-nb/lib/python3.9/enum.py:384, in EnumMeta.__call__(cls, value, names, module, qualname, type, start)
359 """
360 Either returns an existing member, or creates a new enum class.
361
(...)
381 `type`, if set, will be mixed in as the first base class.
382 """
383 if names is None: # simple value lookup
--> 384 return cls.__new__(cls, value)
385 # otherwise, functional API: we're creating a new Enum type
386 return cls._create_(
387 value,
388 names,
(...)
392 start=start,
393 )
File ~/miniconda3/envs/llm-hp-optimization-katib-nb/lib/python3.9/enum.py:709, in Enum.__new__(cls, value)
704 exc = TypeError(
705 'error in %s._missing_: returned %r instead of None or a valid member'
706 % (cls.__name__, result)
707 )
708 exc.__context__ = ve_exc
--> 709 raise exc
710 finally:
711 # ensure all variables that could hold an exception are destroyed
712 exc = None
File ~/miniconda3/envs/llm-hp-optimization-katib-nb/lib/python3.9/enum.py:692, in Enum.__new__(cls, value)
690 try:
691 exc = None
--> 692 result = cls._missing_(value)
693 except Exception as e:
694 exc = e
File ~/miniconda3/envs/llm-hp-optimization-katib-nb/lib/python3.9/site-packages/transformers/utils/generic.py:498, in ExplicitEnum._missing_(cls, value)
496 @classmethod
497 def _missing_(cls, value):
--> 498 raise ValueError(
499 f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}"
500 )
ValueError: <HUB_TOKEN> is not a valid HubStrategy, please select one of ['end', 'every_save', 'checkpoint', 'all_checkpoints']
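For context, the final ValueError is raised by transformers' enum validation, which only knows the offending value and the enum class, not which TrainingArguments field Katib was substituting at the time. A minimal sketch that reproduces the same message outside Katib (HubStrategy is the real transformers enum from the traceback; "<HUB_TOKEN>" is just the placeholder string shown above):

from transformers.trainer_utils import HubStrategy

try:
    # Same cast Katib performs in get_trial_substitutions_from_trainer:
    # value = type(old_attr)(p_value), here with p_value == "<HUB_TOKEN>".
    HubStrategy("<HUB_TOKEN>")
except ValueError as e:
    print(e)
    # -> <HUB_TOKEN> is not a valid HubStrategy, please select one of
    #    ['end', 'every_save', 'checkpoint', 'all_checkpoints']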
What did you expect to happen?
A clear ValueError that names the field that is wrong and the values it expects. The current message only surfaces an internal "<HUB_TOKEN>" placeholder and the HubStrategy enum, neither of which corresponds to anything I set.
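As an illustration of what would help, here is a hypothetical sketch (the helper name and message wording are mine, not Katib's) of wrapping the cast in utils.get_trial_substitutions_from_trainer so that the failing parameter is named:

from transformers.trainer_utils import HubStrategy

def cast_with_context(p_name: str, p_value, old_attr):
    """Cast p_value to the type of old_attr, naming the field on failure."""
    try:
        return type(old_attr)(p_value)
    except ValueError as e:
        # Re-raise with the trainer parameter name and raw value attached.
        raise ValueError(
            f"Invalid value {p_value!r} for trainer parameter '{p_name}': {e}"
        ) from e

try:
    cast_with_context("hub_strategy", "<HUB_TOKEN>", HubStrategy.ALL_CHECKPOINTS)
except ValueError as e:
    print(e)
    # -> Invalid value '<HUB_TOKEN>' for trainer parameter 'hub_strategy':
    #    <HUB_TOKEN> is not a valid HubStrategy, please select one of ...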
Environment
Kubernetes version:
$ kubectl version
Katib controller version:
$ kubectl get pods -n kubeflow -l katib.kubeflow.org/component=controller -o jsonpath="{.items[*].spec.containers[*].image}"
Katib Python SDK version:
$ pip show kubeflow-katib
Impacted by this bug?
Give it a 👍 We prioritize the issues with the most 👍