Closed
Description
Describe the bug
When the NNI HPO process starts, it first uses `generate` to create a templated algo with the new hyper-parameters, and then uses `run_algo` with NNI to call `algo.train()`. However, because the generated algo doesn't update `bundle_root`, and the model checkpoint path is saved relative to the `bundle_root` config, the model checkpoints end up in the wrong locations.
To Reproduce
import os
import tempfile
from monai.bundle.config_parser import ConfigParser
from monai.apps import download_and_extract
from monai.apps.auto3dseg import AutoRunner
if __name__ == "__main__":
    # Reproduction script: runs AutoRunner with HPO enabled on an MSD task so
    # that the checkpoint-location problem described in this report shows up.

    # Working directory for the downloaded dataset; a temporary directory is
    # used only when `directory` is set to None.
    directory = "./build"
    root_dir = tempfile.mkdtemp() if directory is None else directory
    print(root_dir)

    msd_task = "Task04_Hippocampus"
    resource = "https://msd-for-monai.s3-us-west-2.amazonaws.com/" + msd_task + ".tar"
    compressed_file = os.path.join(root_dir, msd_task + ".tar")
    dataroot = os.path.join(root_dir, msd_task)

    # Download and extract the archive only if the dataset folder is missing.
    if not os.path.exists(dataroot):
        download_and_extract(resource, compressed_file, root_dir)

    datalist_file = os.path.join(
        "../tutorials/auto3dseg/tasks",
        "msd",
        msd_task,
        "msd_" + msd_task.lower() + "_folds.json",
    )

    input_cfg = {
        "name": msd_task,  # optional, it is only for your own record
        "task": "segmentation",  # optional, it is only for your own record
        "modality": "MRI",  # required
        "datalist": datalist_file,  # required
        "dataroot": dataroot,  # required
    }

    # Named `input_file` rather than `input` to avoid shadowing the builtin;
    # the AutoRunner keyword argument is still `input`.
    input_file = "./input.yaml"
    ConfigParser.export_config_file(input_cfg, input_file)

    runner = AutoRunner(
        work_dir="./work_dir",
        algos=("swinunetr", "segresnet"),
        input=input_file,
        analyze=False,
        hpo=True,
        ensemble=False,
    )

    # Single source for the epoch count used by the HPO overrides below.
    num_epoch = 2
    hpo_params = {
        "maxTrialNumber": 20,
        "maxExperimentDuration": "30m",
        "num_epochs_per_validation": 1,
        "num_images_per_batch": 1,
        "num_epochs": num_epoch,
        "num_warmup_epochs": 1,
        "training#num_epochs": num_epoch,
        "training#num_epochs_per_validation": 1,
        "searching#num_epochs": num_epoch,
        "searching#num_epochs_per_validation": 1,
        "searching#num_warmup_epochs": 1,
        "training#auto_scale_allowed": False,  # new
        "auto_scale_allowed": False,  # new
    }
    search_space = {"learning_rate": {"_type": "choice", "_value": [0.0001, 0.01]}}

    runner.set_num_fold(num_fold=1)
    runner.set_hpo_params(params=hpo_params)
    runner.set_nni_search_space(search_space)
    runner.run()
Expected behavior
The model checkpoints should be saved in the correct location.
Metadata
Metadata
Assignees
Labels
No labels