Skip to content

[SDK] ValueError: <HUB_TOKEN> is not a valid HubStrategy, please select one of ['end', 'every_save', 'checkpoint', 'all_checkpoints'] #2495

Open
@mahdikhashan

Description

@mahdikhashan

What happened?

When trying to create a tune job to optimize the hyperparameters of LLMs, using the values below for the `tune` API, I received an unclear error. It would be better if the error showed me what was wrong.

hf_model = HuggingFaceModelParams(
    model_uri = "hf://meta-llama/Llama-3.2-1B",
    transformer_type = AutoModelForSequenceClassification,
)

# Train the model on 1000 movie reviews from imdb
# https://huggingface.co/datasets/stanfordnlp/imdb
hf_dataset = HuggingFaceDatasetParams(
    repo_id = "imdb",
    split = "train[:1000]",
)

hf_tuning_parameters = HuggingFaceTrainerParams(
    training_parameters = TrainingArguments(
        output_dir = "results",
        save_strategy = "no",
        hub_strategy="all_checkpoints",
        learning_rate = 1e-05, #katib.search.double(min=1e-05, max=5e-05),
        num_train_epochs=3,
    )
)

cl = KatibClient(namespace="kubeflow")

exp_name = "testllm"

cl.tune(
    name = exp_name,
    model_provider_parameters = hf_model,
    dataset_provider_parameters = hf_dataset,
    trainer_parameters = hf_tuning_parameters,
    objective_metric_name = "train_loss",
    objective_type = "minimize",
    algorithm_name = "random",
    max_trial_count = 10,
    parallel_trial_count = 2,
    resources_per_trial={
        "gpu": "2",
        "cpu": "4",
        "memory": "10G",
    },
)

traceback

ValueError                                Traceback (most recent call last)
ValueError: '<HUB_TOKEN>' is not a valid HubStrategy

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
Cell In[19], line 3
      1 # Fine-tuning for Binary Classification
      2 exp_name = "testllm"
----> 3 cl.tune(
      4     name = exp_name,
      5     model_provider_parameters = hf_model,
      6     dataset_provider_parameters = hf_dataset,
      7     trainer_parameters = hf_tuning_parameters,
      8     objective_metric_name = "train_loss",
      9     objective_type = "minimize",
     10     algorithm_name = "random",
     11     max_trial_count = 10,
     12     parallel_trial_count = 2,
     13     resources_per_trial={
     14         "gpu": "2",
     15         "cpu": "4",
     16         "memory": "10G",
     17     },
     18 )
     20 cl.wait_for_experiment_condition(name=exp_name)
     22 # Get the best hyperparameters.

File ~/miniconda3/envs/llm-hp-optimization-katib-nb/lib/python3.9/site-packages/kubeflow/katib/api/katib_client.py:602, in KatibClient.tune(self, name, model_provider_parameters, dataset_provider_parameters, trainer_parameters, storage_config, objective, base_image, parameters, namespace, env_per_trial, algorithm_name, algorithm_settings, objective_metric_name, additional_metric_names, objective_type, objective_goal, max_trial_count, parallel_trial_count, max_failed_trial_count, resources_per_trial, retain_trials, packages_to_install, pip_index_url, metrics_collector_config)
    600 experiment_params = []
    601 trial_params = []
--> 602 training_args = utils.get_trial_substitutions_from_trainer(
    603     trainer_parameters.training_parameters, experiment_params, trial_params
    604 )
    605 lora_config = utils.get_trial_substitutions_from_trainer(
    606     trainer_parameters.lora_config, experiment_params, trial_params
    607 )
    609 # Create the init and the primary container.

File ~/miniconda3/envs/llm-hp-optimization-katib-nb/lib/python3.9/site-packages/kubeflow/katib/utils/utils.py:207, in get_trial_substitutions_from_trainer(parameters, experiment_params, trial_params)
    205                 value = copy.deepcopy(p_value)
    206             else:
--> 207                 value = type(old_attr)(p_value)
    208         setattr(parameters, p_name, value)
    210 if isinstance(parameters, TrainingArguments):

File ~/miniconda3/envs/llm-hp-optimization-katib-nb/lib/python3.9/enum.py:384, in EnumMeta.__call__(cls, value, names, module, qualname, type, start)
    359 """
    360 Either returns an existing member, or creates a new enum class.
    361 
   (...)
    381 `type`, if set, will be mixed in as the first base class.
    382 """
    383 if names is None:  # simple value lookup
--> 384     return cls.__new__(cls, value)
    385 # otherwise, functional API: we're creating a new Enum type
    386 return cls._create_(
    387         value,
    388         names,
   (...)
    392         start=start,
    393         )

File ~/miniconda3/envs/llm-hp-optimization-katib-nb/lib/python3.9/enum.py:709, in Enum.__new__(cls, value)
    704             exc = TypeError(
    705                     'error in %s._missing_: returned %r instead of None or a valid member'
    706                     % (cls.__name__, result)
    707                     )
    708         exc.__context__ = ve_exc
--> 709         raise exc
    710 finally:
    711     # ensure all variables that could hold an exception are destroyed
    712     exc = None

File ~/miniconda3/envs/llm-hp-optimization-katib-nb/lib/python3.9/enum.py:692, in Enum.__new__(cls, value)
    690 try:
    691     exc = None
--> 692     result = cls._missing_(value)
    693 except Exception as e:
    694     exc = e

File ~/miniconda3/envs/llm-hp-optimization-katib-nb/lib/python3.9/site-packages/transformers/utils/generic.py:498, in ExplicitEnum._missing_(cls, value)
    496 @classmethod
    497 def _missing_(cls, value):
--> 498     raise ValueError(
    499         f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}"
    500     )

ValueError: <HUB_TOKEN> is not a valid HubStrategy, please select one of ['end', 'every_save', 'checkpoint', 'all_checkpoints']

What did you expect to happen?

A clear `ValueError` message that states which field is wrong and what the expected value is.

Environment

Kubernetes version:

$ kubectl version

Katib controller version:

$ kubectl get pods -n kubeflow -l katib.kubeflow.org/component=controller -o jsonpath="{.items[*].spec.containers[*].image}"

Katib Python SDK version:

$ pip show kubeflow-katib

Impacted by this bug?

Give it a 👍. We prioritize the issues with the most 👍.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions