Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

W&B: Improve documentation of the logger & use wandb assigned run names by default #4174

Merged
merged 1 commit into from
Jul 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion utils/loggers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def start(self):
assert 'wandb' in self.include and wandb
run_id = torch.load(self.weights).get('wandb_id') if self.opt.resume else None
self.opt.hyp = self.hyp # add hyperparameters
self.wandb = WandbLogger(self.opt, s.stem, run_id, self.data_dict)
self.wandb = WandbLogger(self.opt, run_id, self.data_dict)
except:
self.wandb = None

Expand Down
145 changes: 132 additions & 13 deletions utils/loggers/wandb/wandb_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,19 @@ class WandbLogger():
https://docs.wandb.com/guides/integrations/yolov5
"""

def __init__(self, opt, name, run_id, data_dict, job_type='Training'):
def __init__(self, opt, run_id, data_dict, job_type='Training'):
'''
- Initialize WandbLogger instance
- Upload dataset if opt.upload_dataset is True
- Setup trainig processes if job_type is 'Training'
arguments:
opt (namespace) -- Commandline arguments for this run
run_id (str) -- Run ID of W&B run to be resumed
data_dict (Dict) -- Dictionary conataining info about the dataset to be used
job_type (str) -- To set the job_type for this run
'''
# Pre-training routine --
self.job_type = job_type
self.wandb, self.wandb_run = wandb, None if not wandb else wandb.run
Expand Down Expand Up @@ -129,7 +141,7 @@ def __init__(self, opt, name, run_id, data_dict, job_type='Training'):
resume="allow",
project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem,
entity=opt.entity,
name=name,
name=opt.name if opt.name != 'exp' else None,
job_type=job_type,
id=run_id,
allow_val_change=True) if not wandb.run else wandb.run
Expand All @@ -145,6 +157,15 @@ def __init__(self, opt, name, run_id, data_dict, job_type='Training'):
self.data_dict = self.check_and_upload_dataset(opt)

def check_and_upload_dataset(self, opt):
'''
Check if the dataset format is compatible and upload it as W&B artifact
arguments:
opt (namespace)-- Commandline arguments for current run
returns:
Updated dataset info dictionary where local dataset paths are replaced by WAND_ARFACT_PREFIX links.
'''
assert wandb, 'Install wandb to upload dataset'
config_path = self.log_dataset_artifact(check_file(opt.data),
opt.single_cls,
Expand All @@ -155,6 +176,19 @@ def check_and_upload_dataset(self, opt):
return wandb_data_dict

def setup_training(self, opt, data_dict):
'''
Setup the necessary processes for training YOLO models:
- Attempt to download model checkpoint and dataset artifacts if opt.resume stats with WANDB_ARTIFACT_PREFIX
- Update data_dict, to contain info of previous run if resumed and the paths of dataset artifact if downloaded
- Setup log_dict, initialize bbox_interval
arguments:
opt (namespace) -- commandline arguments for this run
data_dict (Dict) -- Dataset dictionary for this run
returns:
data_dict (Dict) -- contains the updated info about the dataset to be used for training
'''
self.log_dict, self.current_epoch = {}, 0
self.bbox_interval = opt.bbox_interval
if isinstance(opt.resume, str):
Expand Down Expand Up @@ -185,12 +219,22 @@ def setup_training(self, opt, data_dict):
self.val_table = self.val_artifact.get("val")
if self.val_table_path_map is None:
self.map_val_table_path()
wandb.log({"validation dataset": self.val_table})
if opt.bbox_interval == -1:
self.bbox_interval = opt.bbox_interval = (opt.epochs // 10) if opt.epochs > 10 else 1
return data_dict

def download_dataset_artifact(self, path, alias):
'''
download the model checkpoint artifact if the path starts with WANDB_ARTIFACT_PREFIX
arguments:
path -- path of the dataset to be used for training
alias (str)-- alias of the artifact to be download/used for training
returns:
(str, wandb.Artifact) -- path of the downladed dataset and it's corresponding artifact object if dataset
is found otherwise returns (None, None)
'''
if isinstance(path, str) and path.startswith(WANDB_ARTIFACT_PREFIX):
artifact_path = Path(remove_prefix(path, WANDB_ARTIFACT_PREFIX) + ":" + alias)
dataset_artifact = wandb.use_artifact(artifact_path.as_posix().replace("\\", "/"))
Expand All @@ -200,6 +244,12 @@ def download_dataset_artifact(self, path, alias):
return None, None

def download_model_artifact(self, opt):
'''
download the model checkpoint artifact if the resume path starts with WANDB_ARTIFACT_PREFIX
arguments:
opt (namespace) -- Commandline arguments for this run
'''
if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
model_artifact = wandb.use_artifact(remove_prefix(opt.resume, WANDB_ARTIFACT_PREFIX) + ":latest")
assert model_artifact is not None, 'Error: W&B model artifact doesn\'t exist'
Expand All @@ -212,6 +262,16 @@ def download_model_artifact(self, opt):
return None, None

def log_model(self, path, opt, epoch, fitness_score, best_model=False):
'''
Log the model checkpoint as W&B artifact
arguments:
path (Path) -- Path of directory containing the checkpoints
opt (namespace) -- Command line arguments for this run
epoch (int) -- Current epoch number
fitness_score (float) -- fitness score for current epoch
best_model (boolean) -- Boolean representing if the current checkpoint is the best yet.
'''
model_artifact = wandb.Artifact('run_' + wandb.run.id + '_model', type='model', metadata={
'original_url': str(path),
'epochs_trained': epoch + 1,
Expand All @@ -226,6 +286,19 @@ def log_model(self, path, opt, epoch, fitness_score, best_model=False):
print("Saving model artifact on epoch ", epoch + 1)

def log_dataset_artifact(self, data_file, single_cls, project, overwrite_config=False):
'''
Log the dataset as W&B artifact and return the new data file with W&B links
arguments:
data_file (str) -- the .yaml file with information about the dataset like - path, classes etc.
single_class (boolean) -- train multi-class data as single-class
project (str) -- project name. Used to construct the artifact path
overwrite_config (boolean) -- overwrites the data.yaml file if set to true otherwise creates a new
file with _wandb postfix. Eg -> data_wandb.yaml
returns:
the new .yaml file with artifact links. it can be used to start training directly from artifacts
'''
with open(data_file, encoding='ascii', errors='ignore') as f:
data = yaml.safe_load(f) # data dict
check_dataset(data)
Expand Down Expand Up @@ -257,12 +330,27 @@ def log_dataset_artifact(self, data_file, single_cls, project, overwrite_config=
return path

def map_val_table_path(self):
'''
Map the validation dataset Table like name of file -> it's id in the W&B Table.
Useful for - referencing artifacts for evaluation.
'''
self.val_table_path_map = {}
print("Mapping dataset")
for i, data in enumerate(tqdm(self.val_table.data)):
self.val_table_path_map[data[3]] = data[0]

def create_dataset_table(self, dataset, class_to_id, name='dataset'):
'''
Create and return W&B artifact containing W&B Table of the dataset.
arguments:
dataset (LoadImagesAndLabels) -- instance of LoadImagesAndLabels class used to iterate over the data to build Table
class_to_id (dict(int, str)) -- hash map that maps class ids to labels
name (str) -- name of the artifact
returns:
dataset artifact to be logged or used
'''
# TODO: Explore multiprocessing to slpit this loop parallely| This is essential for speeding up the the logging
artifact = wandb.Artifact(name=name, type="dataset")
img_files = tqdm([dataset.path]) if isinstance(dataset.path, str) and Path(dataset.path).is_dir() else None
Expand Down Expand Up @@ -294,6 +382,14 @@ def create_dataset_table(self, dataset, class_to_id, name='dataset'):
return artifact

def log_training_progress(self, predn, path, names):
'''
Build evaluation Table. Uses reference from validation dataset table.
arguments:
predn (list): list of predictions in the native space in the format - [xmin, ymin, xmax, ymax, confidence, class]
path (str): local path of the current evaluation image
names (dict(int, str)): hash map that maps class ids to labels
'''
class_set = wandb.Classes([{'id': id, 'name': name} for id, name in names.items()])
box_data = []
total_conf = 0
Expand All @@ -316,25 +412,45 @@ def log_training_progress(self, predn, path, names):
)

def val_one_image(self, pred, predn, path, names, im):
'''
Log validation data for one image. updates the result Table if validation dataset is uploaded and log bbox media panel
arguments:
pred (list): list of scaled predictions in the format - [xmin, ymin, xmax, ymax, confidence, class]
predn (list): list of predictions in the native space - [xmin, ymin, xmax, ymax, confidence, class]
path (str): local path of the current evaluation image
'''
if self.val_table and self.result_table: # Log Table if Val dataset is uploaded as artifact
self.log_training_progress(predn, path, names)
else: # Default to bbox media panelif Val artifact not found
if len(self.bbox_media_panel_images) < self.max_imgs_to_log and self.current_epoch > 0:
if self.current_epoch % self.bbox_interval == 0:
box_data = [{"position": {"minX": xyxy[0], "minY": xyxy[1], "maxX": xyxy[2], "maxY": xyxy[3]},
"class_id": int(cls),
"box_caption": "%s %.3f" % (names[cls], conf),
"scores": {"class_score": conf},
"domain": "pixel"} for *xyxy, conf, cls in pred.tolist()]
boxes = {"predictions": {"box_data": box_data, "class_labels": names}} # inference-space
self.bbox_media_panel_images.append(wandb.Image(im, boxes=boxes, caption=path.name))

if len(self.bbox_media_panel_images) < self.max_imgs_to_log and self.current_epoch > 0:
if self.current_epoch % self.bbox_interval == 0:
box_data = [{"position": {"minX": xyxy[0], "minY": xyxy[1], "maxX": xyxy[2], "maxY": xyxy[3]},
"class_id": int(cls),
"box_caption": "%s %.3f" % (names[cls], conf),
"scores": {"class_score": conf},
"domain": "pixel"} for *xyxy, conf, cls in pred.tolist()]
boxes = {"predictions": {"box_data": box_data, "class_labels": names}} # inference-space
self.bbox_media_panel_images.append(wandb.Image(im, boxes=boxes, caption=path.name))

def log(self, log_dict):
'''
save the metrics to the logging dictionary
arguments:
log_dict (Dict) -- metrics/media to be logged in current step
'''
if self.wandb_run:
for key, value in log_dict.items():
self.log_dict[key] = value

def end_epoch(self, best_result=False):
'''
commit the log_dict, model artifacts and Tables to W&B and flush the log_dict.
arguments:
best_result (boolean): Boolean representing if the result of this evaluation is best or not
'''
if self.wandb_run:
with all_logging_disabled():
if self.bbox_media_panel_images:
Expand All @@ -352,6 +468,9 @@ def end_epoch(self, best_result=False):
self.result_artifact = wandb.Artifact("run_" + wandb.run.id + "_progress", "evaluation")

def finish_run(self):
'''
Log metrics if any and finish the current W&B run
'''
if self.wandb_run:
if self.log_dict:
with all_logging_disabled():
Expand Down