From 81b8e9d5a0c2dbf9e52df298f25c9353fbe1c9ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Richard=C2=A0Wang?= <yongxinw@amazon.com>
Date: Mon, 5 Dec 2022 21:10:18 -0800
Subject: [PATCH] Use detectron2 visualizer and update quickstart (#2502)

* Adding tutorial for converting data to COCO format.

* adding python script to get all unique classes in voc format, and dump all annotation xml files

* adding docs

* adding docs

* refactor function name

* Reformat

* Reformat: single quote to double quote

* Added inference from pretrained. Added Visualization

* Adding visualization. Now `get_voc_format_classes` dumps class_names when labels.txt is not present.

* Reformat previous commit.

* Addressing issues in PR#2298.

* Addressing usage of voc2coco.py

* Reformat files to pass Lint Check

* Deleting as_pandas. Adding default saving path.

* Reformatting to pass Lint Check

* Reformatting to pass Lint Check

* Reformat imports

* Adding save results to data frame. Deprecating `get_voc_format_classes` to using `get_detection_classes`. Refactoring saving results to utils.

* Reformat for Lint

* Reformat for Lint

* Finalizing saving results; Refactored visualization into predictor.py.

* Add running instructions

* Lint check

* Removing unused imports

* Addressing issues in code review: deleting unused functions, refactoring visualization to a separate function call outside predict, removing prints, modifying doc strings, adding TODO to use mmdet visualization later,

* removing mypy

* Adding tutorials for running inference, saving results, and visualization.

* fixing save_path issue

* Added in index.rst instructions to install mmdet, add return detection as df by setting as_pandas flag, added quick start tutorial with downloading tiny motorbike, added warning in predict init for mmdet and mmcv-full, reorganized tutorials,

* Add mmdet, mmcv-full links

* Addressing issues in reviews

* Removing .input

* changing quick start coco_*.json to *_cocoformat.json. putting `mim install mmcv-full` before `pip install mmdet`

* Editing visualize_detection to take as input pd.DataFrame. Fixed a bug in save results. Update tutorials

* refactor text semantic search tutorial (#2375)

* Minor fix to infer_speed calc (#2374)

* Refactor the clip embedding tutorial (#2378)

* refactor the clip embedding tutorial

* address comments

* Auto-switch to additive attention for FTT (#2379)

* Remove build_all from platform tests (#2382)

* [timeseries] Fix use of dataframe.iteritems in timeseries (#2385)

* [timeseries] fix usage of pandas iteritems in timeseries

* fix black and isort

* [v0.6] Updated dependency versions (#2373)

* Updated dependency versions

* fix XGBoost slowdown by adding small min_delta to early stopping

* Modified default parameters for FTT (#2386)

* Modified default parameters for FTT

* Modified default parameters for FTT

* [Tutorial] Update AutoMM Detection Tutorial (#2380)

* Updated the logic to infer rois.

* remove tutorial and a minor fix

* add raw_feature for hf_text

* add onnx export

* black

* format and hp changes

* black

* fix get_column_features for other modality, need refactor

* add export_onnx, remove index vectorization

* minor fixes

* black

* add eval

* remove hard code in get_processed_batch

* remove hardcoded batch size

* fix bug and refactor

* black

* add descriptions for functions

* lint

* update example and fix comments

* remove extra input

* change back config

* fix label column infer in _predict

* update feature extraction example

* black

* add test add fix several comments

* black

* black

* lint

* add onnxruntime in setup

* fix

* remove code for test

* black

* remove print in test

* remove default batch test

* skip onnx test for version conflict

* edit setup

* minor fix

* change eval to evaluate

* fix merge

* fix merge

* change cascade_rcnn_x101 to a smaller centernet in detection test

* increase timeout from 15 to 120 for mim install

* update centernet version

* fix

* change to maskrcnn

* finished ft, testing val metric

* complete val metric integration for obj detection

* fix val bug

* fix val_metric error

* yolo voc finetune success

* refactors

* remove COLLECTION

* refactors

* refactor

* refactor

* black

* small fix

* small fix

* black

* lint

* lint

* lint

* fix pred without label

* fix ocr

* add comments

* add comments

* fix merge

* revert detection inference example change

* fix a conflict in matcher and update a dependency for mmocr

* decrease mim timeout to 60 second

* import mmcv only when use

* import mmcv only when use

* black

* black

* bug fix

* bug fix

* update detection eval example

* add todo mark

* cleaner code and black

* resolve several comments

* change infer_label_column_type_by's name

* fix typo

* add todo mark

* change constant map to mAP

* change mmcv_model constant to mmlab_models

* lint

* add todos and add if det to reset_index

* multi gpu training

* black

* add val_metric option and fix some TODOs

* small refactors

* black

* lint

* fix circular import

* black

* small fix

* fix collate and better integrate with lightning for detection

* remove unnecessary save/load

* remove train_to_val and val_to_train in litmmdet

* fix map

* temp fix for MMOCR, need to update

* enable multigpu inference

* merge

* lint

* fix single gpu inference

* add doc

* skip ocr test for now

* black

* black

* rename output_shape to num_classes

* fix

* add a TODO

* add train from scratch for detection model

* fix hardcoding for voc and coco while load coco format data

* black

* small fix and add voc2coco tool

* update voc2coco.py

* fix type

* fix voc2coco

* fix

* fix classes hardcode and add support for two stage lr

* fix single gpu eval and fix torchmetrics eval

* add support for voc format input

* fix a word

* add preset, fix eval bug for voc

* some refactors

* black and small refactor

* lint

* fix assets loading

* fix

* black

* add docs

* open PIL safely to avoid resource warning

* add fasterrcnn 2 stage lr and black

* add set_num_gpus

* add evaluation tutorial

* refine documents

* update ddetr

* fix typo

* change ref names

* merge

* add eval on voc and default two stage setting

* finish tutorial in datasets preparation

* small fix

* update tutorial and codes

* add EOF line and remove finetune script

* black

* fix typo in docs

* fix wordings and add get_detection_classes, fix voc2coco, fix predict

* fix type

* small update

* fix predict df error

* fix voc2coco

* add finetune tutorial and support vfnet, raise error for mask models

* add voc2coco tutorial

* add eof

* update doc

* minor

* support more models and skip image size if not in config

* black

* update docs

* add load predictor and change to problem type

* update fps information

* update load predictor example

* fix type

* update tutorial

* move val_metric

* add presets and fix val_metric in tuto and example

* black

* remove mypy in test_common

* fix

* change preset name

* restart ci

* index tutorial and fix save_path problem

* fix get_num_gpu

* fix save_path

* fix tuto and example typo and path error

* add a TODO and force detection not realtime in _predict

* fix voc2coco

* black

* bug bash + quick start

* merge

* merge

* fix save_path problem for DDP

* black

* update warning

* update tutorial, lint

* update tuto

* update voc to coco

* lint

* refine index

* lint

* update docs in multimodal

* update the multimodal index

* add doc depth

* fix save path init

* restart ci

* restart ci

* fix save_path in fit

* fix save_path

* fix save_path

* update quick start

* update cli in doc

* change prefix coco to postfix cocoformat for voc2coco

* fix s3 path and index error

* restart ci

* install necessary packages

* restart ci

* add build mmcv in .github

* fix mmlab installation

* fix

* fix mmcv install in tuto

* fix

* fix

* fix index

* update docs

* refine quick start

* remove unnecessary warnings

* restart ci

* replace voc2012 to 2007

* Update tutorial links (#2381)

* update to raw content

* trigger ci

* fix inconsistencies in AutoMM tutorials (#2388)

* Fix FTTransformer errors, add max_features (#2389)

* Fix FTTransformer errors, add max_features

* Hide progress bar if verbosity<=2 FTT

* Create save_path in init and remove warning in fit (#2392)

* create save_path in init and remove warning in fit

* remove unnecessary line

* fix

* [timeseries] Update tutorials and FAQ (#2355)

Co-authored-by: Caner Turkmen <turkmen.ac@gmail.com>

* [Tutorial] Fix AutoMM detection tutorial layouts and cli tool (#2391)

* minor fixes

* black

* add eval

* remove hard code in get_processed_batch

* remove hardcoded batch size

* fix bug and refactor

* black

* add descriptions for functions

* lint

* update example and fix comments

* remove extra input

* change back config

* fix label column infer in _predict

* update feature extraction example

* black

* add test add fix several comments

* black

* black

* lint

* add onnxruntime in setup

* fix

* remove code for test

* black

* remove print in test

* remove default batch test

* skip onnx test for version conflict

* edit setup

* minor fix

* change eval to evaluate

* fix merge

* fix merge

* change cascade_rcnn_x101 to a smaller centernet in detection test

* increase timeout from 15 to 120 for mim install

* update centernet version

* fix

* change to maskrcnn

* finished ft, testing val metric

* complete val metric integration for obj detection

* fix val bug

* fix val_metric error

* yolo voc finetune success

* refactors

* remove COLLECTION

* refactors

* refactor

* refactor

* black

* small fix

* small fix

* black

* lint

* lint

* lint

* fix pred without label

* fix ocr

* add comments

* add comments

* fix merge

* revert detection inference example change

* fix a conflict in matcher and update a dependency for mmocr

* decrease mim timeout to 60 second

* import mmcv only when use

* import mmcv only when use

* black

* black

* bug fix

* bug fix

* update detection eval example

* add todo mark

* cleaner code and black

* resolve several comments

* change infer_label_column_type_by's name

* fix typo

* add todo mark

* change constant map to mAP

* change mmcv_model constant to mmlab_models

* lint

* add todos and add if det to reset_index

* multi gpu training

* black

* add val_metric option and fix some TODOs

* small refactors

* black

* lint

* fix circular import

* black

* small fix

* fix collate and better integrate with lightning for detection

* remove unnecessary save/load

* remove train_to_val and val_to_train in litmmdet

* fix map

* temp fix for MMOCR, need to update

* enable multigpu inference

* merge

* lint

* fix single gpu inference

* add doc

* skip ocr test for now

* black

* black

* rename output_shape to num_classes

* fix

* add a TODO

* add train from scratch for detection model

* fix hardcoding for voc and coco while load coco format data

* black

* small fix and add voc2coco tool

* update voc2coco.py

* fix type

* fix voc2coco

* fix

* fix classes hardcode and add support for two stage lr

* fix single gpu eval and fix torchmetrics eval

* add support for voc format input

* fix a word

* add preset, fix eval bug for voc

* some refactors

* black and small refactor

* lint

* fix assets loading

* fix

* black

* add docs

* open PIL safely to avoid resource warning

* add fasterrcnn 2 stage lr and black

* add set_num_gpus

* add evaluation tutorial

* refine documents

* update ddetr

* fix typo

* change ref names

* merge

* add eval on voc and default two stage setting

* finish tutorial in datasets preparation

* small fix

* update tutorial and codes

* add EOF line and remove finetune script

* black

* fix typo in docs

* fix wordings and add get_detection_classes, fix voc2coco, fix predict

* fix type

* small update

* fix predict df error

* fix voc2coco

* add finetune tutorial and support vfnet, raise error for mask models

* add voc2coco tutorial

* add eof

* update doc

* minor

* support more models and skip image size if not in config

* black

* update docs

* add load predictor and change to problem type

* update fps information

* update load predictor example

* fix type

* update tutorial

* move val_metric

* add presets and fix val_metric in tuto and example

* black

* remove mypy in test_common

* fix

* change preset name

* restart ci

* index tutorial and fix save_path problem

* fix get_num_gpu

* fix save_path

* fix tuto and example typo and path error

* add a TODO and force detection not realtime in _predict

* fix voc2coco

* black

* bug bash + quick start

* merge

* merge

* fix save_path problem for DDP

* black

* update warning

* update tutorial, lint

* update tuto

* update voc to coco

* lint

* refine index

* lint

* update docs in multimodal

* update the multimodal index

* add doc depth

* fix save path init

* restart ci

* restart ci

* fix save_path in fit

* fix save_path

* fix save_path

* update quick start

* update cli in doc

* change prefix coco to postfix cocoformat for voc2coco

* fix s3 path and index error

* restart ci

* install necessary packages

* restart ci

* add build mmcv in .github

* fix mmlab installation

* fix

* fix mmcv install in tuto

* fix

* fix

* fix index

* update docs

* refine quick start

* remove unnecessary warnings

* restart ci

* replace voc2012 to 2007

* fix tuto and cli tool

* merge

* fix docs

* change max depth

* add all detection tuto in multimodal index

* restart ci

* fix max depth

* Remove dummy layer (#2394)

* Update constants.py

* Update predictor.py

* Update ner_text.py

* Update ner_text.py

* Update ner_text.py

* add standalone test

* fix

* Update test_ner_standalone.py

* Update test_ner.py

* Update test_ner.py

* fix

* Update process_ner.py

* Update test_ner_standalone.py

* Update process_ner.py

* Update fusion_mlp_image_text_tabular.yaml

* Update model.py

* Update process_ner.py

* Update process_ner.py

* Update ner_text.py

* start ci

Co-authored-by: Ubuntu <ubuntu@ip-172-31-53-45.us-west-2.compute.internal>

* Fix leaderboard with static features (#2398)

* [Tutorial] Knowledge Distillation tutorial (#2397)

* Create model_distillation.md

* Update automm_distillation_glue.py

* Update automm_distillation_glue.py

* update

* Update tutorial

* update

* Update model_distillation.md

* update

* Fix seed

* fix tutorial

* Update model_distillation.md

* Fix installation error for pycocotools package (#2400)

* add eval

* remove hard code in get_processed_batch

* remove hardcoded batch size

* fix bug and refactor

* black

* add descriptions for functions

* lint

* update example and fix comments

* remove extra input

* change back config

* fix label column infer in _predict

* update feature extraction example

* black

* add test add fix several comments

* black

* black

* lint

* add onnxruntime in setup

* fix

* remove code for test

* black

* remove print in test

* remove default batch test

* skip onnx test for version conflict

* edit setup

* minor fix

* change eval to evaluate

* fix merge

* fix merge

* change cascade_rcnn_x101 to a smaller centernet in detection test

* increase timeout from 15 to 120 for mim install

* update centernet version

* fix

* change to maskrcnn

* finished ft, testing val metric

* complete val metric integration for obj detection

* fix val bug

* fix val_metric error

* yolo voc finetune success

* refactors

* remove COLLECTION

* refactors

* refactor

* refactor

* black

* small fix

* small fix

* black

* lint

* lint

* lint

* fix pred without label

* fix ocr

* add comments

* add comments

* fix merge

* revert detection inference example change

* fix a conflict in matcher and update a dependency for mmocr

* decrease mim timeout to 60 second

* import mmcv only when use

* import mmcv only when use

* black

* black

* bug fix

* bug fix

* update detection eval example

* add todo mark

* cleaner code and black

* resolve several comments

* change infer_label_column_type_by's name

* fix typo

* add todo mark

* change constant map to mAP

* change mmcv_model constant to mmlab_models

* lint

* add todos and add if det to reset_index

* multi gpu training

* black

* add val_metric option and fix some TODOs

* small refactors

* black

* lint

* fix circular import

* black

* small fix

* fix collate and better integrate with lightning for detection

* remove unnecessary save/load

* remove train_to_val and val_to_train in litmmdet

* fix map

* temp fix for MMOCR, need to update

* enable multigpu inference

* merge

* lint

* fix single gpu inference

* add doc

* skip ocr test for now

* black

* black

* rename output_shape to num_classes

* fix

* add a TODO

* add train from scratch for detection model

* fix hardcoding for voc and coco while load coco format data

* black

* small fix and add voc2coco tool

* update voc2coco.py

* fix type

* fix voc2coco

* fix

* fix classes hardcode and add support for two stage lr

* fix single gpu eval and fix torchmetrics eval

* add support for voc format input

* fix a word

* add preset, fix eval bug for voc

* some refactors

* black and small refactor

* lint

* fix assets loading

* fix

* black

* add docs

* open PIL safely to avoid resource warning

* add fasterrcnn 2 stage lr and black

* add set_num_gpus

* add evaluation tutorial

* refine documents

* update ddetr

* fix typo

* change ref names

* merge

* add eval on voc and default two stage setting

* finish tutorial in datasets preparation

* small fix

* update tutorial and codes

* add EOF line and remove finetune script

* black

* fix typo in docs

* fix wordings and add get_detection_classes, fix voc2coco, fix predict

* fix type

* small update

* fix predict df error

* fix voc2coco

* add finetune tutorial and support vfnet, raise error for mask models

* add voc2coco tutorial

* add eof

* update doc

* minor

* support more models and skip image size if not in config

* black

* update docs

* add load predictor and change to problem type

* update fps information

* update load predictor example

* fix type

* update tutorial

* move val_metric

* add presets and fix val_metric in tuto and example

* black

* remove mypy in test_common

* fix

* change preset name

* restart ci

* index tutorial and fix save_path problem

* fix get_num_gpu

* fix save_path

* fix tuto and example typo and path error

* add a TODO and force detection not realtime in _predict

* fix voc2coco

* black

* bug bash + quick start

* merge

* merge

* fix save_path problem for DDP

* black

* update warning

* update tutorial, lint

* update tuto

* update voc to coco

* lint

* refine index

* lint

* update docs in multimodal

* update the multimodal index

* add doc depth

* fix save path init

* restart ci

* restart ci

* fix save_path in fit

* fix save_path

* fix save_path

* update quick start

* update cli in doc

* change prefix coco to postfix cocoformat for voc2coco

* fix s3 path and index error

* restart ci

* install necessary packages

* restart ci

* add build mmcv in .github

* fix mmlab installation

* fix

* fix mmcv install in tuto

* fix

* fix

* fix index

* update docs

* refine quick start

* remove unnecessary warnings

* restart ci

* replace voc2012 to 2007

* fix tuto and cli tool

* merge

* fix docs

* change max depth

* add all detection tuto in multimodal index

* restart ci

* fix max depth

* fix installation error for pycocotools

* remove windows

* Fix hyperparameters in matcher fit (#2404)

* support hyperparameters in matcher fit

* fix

* fix

* [docs] Apple Silicon Instructions (#2403)

* [timeseries] make core and tabular dependencies explicit (#2405)

* Turn off AutoMM prediction progress bar in tabular (#2401)

* save and load enable_progress_bar

* turn off progress bar for automm in tabular

* Remove warnings (#2402)

* replace iteritems with items

* replace iteritems with items

* remove seqeval warning

* Update metric.py

* set num_workers to 0 by default

Some huggingface checkpoints such as deberta-v3, mdeberta-v3, roberta, flan-t5-xl do not work with ddp_spawn when setting num_workers=2

* update ner presets

* add fixme

* Update presets.py

* Improve text_prediction tutorial (#2414)

* fix multimodal tutorial index (#2407)

* Remove warnings and duplicate function (#2409)

* create save_path in init and remove warning in fit

* remove unnecessary line

* fix

* remove duplicate function

* filter mmcv warning

* lint

* Fix cpu inference bug (#2413)

* fix cpu inference bug

* add test-case of cpu-only inference

* Fix NER best quality preset (#2412)

* Update presets.py

* Update presets.py

* Update presets.py

* Update Quick Start Tutorial and Add Installation Warnings (#2418)

* create save_path in init and remove warning in fit

* remove unnecessary line

* fix

* remove duplicate function

* filter mmcv warning

* lint

* add warnings on import error, update quick start example

* update quick start doc

* fix wording

* add installation info

* fix

* Remove dependency on vision.imagedataset (#2411)

* remove imagedataset

* add datasets

* move dataset download to utils

* lint

* lint

* lint

* Fixed model save and load path with uuid for multiple runs (#2415)

* FTT preset (#2410)

* Fix FTTransformer errors, add max_features

* Add FTT presets

* Added parallel bagging on CPU with FTTransformer

* remove ultra preset, add docs

* Set num_workers for image related presets and clean matcher config (#2416)

* clean matcher config

* num_workers

* Fix bad refs in detection tutorial (#2419)

* create save_path in init and remove warning in fit

* remove unnecessary line

* fix

* remove duplicate function

* filter mmcv warning

* lint

* add warnings on import error, update quick start example

* update quick start doc

* fix wording

* add installation info

* fix

* fix bad ref

* [Tutorial] Shorten multilingual tutorial + Improve tutorial website (#2417)

* improve document

* shorten tutorial

* Update index.rst

* update

* revise tutorial.

* update tutorial

* Fix typo

* Adding inference quick start tutorial

* Fixing YOLOv3 description

* Moving mmdet mmcv to top of page

* Changing {.python .input} to python .input

* [0.6 Release][Object Detection][Tutorial] Remove mmdet output format (#2393)

* Adding tutorial for converting data to COCO format.

* adding python script to get all unique classes in voc format, and dump all annotation xml files

* adding docs

* adding docs

* refactor function name

* Reformat

* Reformat: single quote to double quote

* Added inference from pretrained. Added Visualization

* Adding visualization. Now `get_voc_format_classes` dumps class_names when labels.txt is not present.

* Reformat previous commit.

* Addressing issues in PR#2298.

* Addressing usage of voc2coco.py

* Reformat files to pass Lint Check

* Deleting as_pandas. Adding default saving path.

* Reformatting to pass Lint Check

* Reformatting to pass Lint Check

* Reformat imports

* Adding save results to data frame. Deprecating `get_voc_format_classes` to using `get_detection_classes`. Refactoring saving results to utils.

* Reformat for Lint

* Reformat for Lint

* Finalizing saving results; Refactored visualization into predictor.py.

* Add running instructions

* Lint check

* Removing unused imports

* Addressing issues in code review: deleting unused functions, refactoring visualization to a separate function call outside predict, removing prints, modifying doc strings, adding TODO to use mmdet visualization later,

* removing mypy

* Adding tutorials for running inference, saving results, and visualization.

* fixing save_path issue

* Added in index.rst instructions to install mmdet, add return detection as df by setting as_pandas flag, added quick start tutorial with downloading tiny motorbike, added warning in predict init for mmdet and mmcv-full, reorganized tutorials,

* Add mmdet, mmcv-full links

* Addressing issues in reviews

* Removing .input

* changing quick start coco_*.json to *_cocoformat.json. putting `mim install mmcv-full` before `pip install mmdet`

* Editing visualize_detection to take as input pd.DataFrame. Fixed a bug in save results. Update tutorials

* editing tutorials to reflect pd.DataFrame detection output.

* Update tutorials to print pred

* Update inference quick start tutorial to single image

* Update inference quick start tutorial to single image; Changing matplotlib to PIL for displaying visualization.

* Update inference quick start tutorial to single image; Changing matplotlib to PIL for displaying visualization.

* Fixing quick start import error with numpy.

* Fixed links in inference tutorials, moved inference quick start to quick_start, fixed a typo in detection_eval_fasterrcnn_coco

* fixing kernel died issue

* Removing quick start to another PR

* Remove detection load predictor

* Changing {.python} to python

* Removing quick start in index.rst

* Update to v0.6 (#2425)

* [Release 0.6] [Multimodal] Check if folder is empty before raise (#2426)

* fix

* fix

* lint

* fix

* fix

* Update to v0.6.1

* bump evaluate to `0.3.0` (#2433)

* [Cloud] Cloud Custom Image and Cleanup (#2408)

* checkpoint

* checkpoint

* api

* cleanup old version code

* fix to local

* fix

* fix

* fix

* additional volume size

* fix

* fix

* addressing comments

* fix

Co-authored-by: Weisu Yin <weisuyin96@gmail.com>

* fix (#2437)

Co-authored-by: Weisu Yin <weisuyin96@gmail.com>

* 0.6.0 Release notes (#2383)

* Add finetune/eval tests for AutoMM detection (#2441)

* create save_path in init and remove warning in fit

* remove unnecessary line

* fix

* add fintune/eval test for detection

* lint

* fix layout (#2450)

* [CI][Cloud] Nightly Build of AG Images for Cloud Testing (#2436)

* initial

fix

fix

fix

checkpoint

checkpoint

checkpoint

api

cleanup old version code

fix to local

fix

fix

fix

additional volume size

fix

fix

tests

* fix

* fix

* enable nightly for cloud CI

* minor fix entry point

* comments

Co-authored-by: Weisu Yin <weisuyin96@gmail.com>

* Adding Joint IA3_LoRA as efficient finetuning strategy (#2451)

* adding ia3_lora peft and raft#1 preset

* fix

* dynamic max length template choice;

* fix trigger of PEFT

* add comment for 11B model in preset

Co-authored-by: Ubuntu <ubuntu@ip-172-31-44-212.eu-west-1.compute.internal>

* Fix AutoMM warnings about object detection (#2458)

* fix warnings

* ocr constant

* [CI][Fair] Enable CI for Fair Module (#2460)

* fair ci

* fix

* [Cloud][CI] Enable Cloud Lint (#2455)

* lint

* fix

* fix

* isort skip

* gox

* fix

* fix

* fix

* fix

Co-authored-by: Weisu Yin <weisuyin96@gmail.com>

* adding python script to get all unique classes in voc format, and dump all annotation xml files

* Reformat previous commit.

* Adding save results to data frame. Deprecating `get_voc_format_classes` to using `get_detection_classes`. Refactoring saving results to utils.

* Addressing issues in code review: deleting unused functions, refactoring visualization to a separate function call outside predict, removing prints, modifying doc strings, adding TODO to use mmdet visualization later,

* Added in index.rst instructions to install mmdet, add return detection as df by setting as_pandas flag, added quick start tutorial with downloading tiny motorbike, added warning in predict init for mmdet and mmcv-full, reorganized tutorials,

* Deleting inference tutorials, refactoring inference quick start to quick_start_coco.md, fixing the bug for fit->predict with dummy df converter from dict, updating detection unit test

* Deleting inference tutorials, refactoring inference quick start to quick_start_coco.md, fixing the bug for fit->predict with dummy df converter from dict, updating detection unit test

* Addressing comments in reviews.

* Re-organizing tutorial, picking a different image to display

* Deleting inference cards in the object detection index page

* Adding visualizer from detectron2

* Adding visualizer from detectron2 - Complete. Updating quick_start_coco.md

* Editing docs for colormap.py

* Deleting unused classes. Reformatting for lint.

* Reformatting for lint.

* Adding new line

* add back newline under cv2

* removing __main__ in colormap as it is not necessary

Co-authored-by: Zhiqiang Tang <zhiqiang.tang@rutgers.edu>
Co-authored-by: Nick Erickson <neerick@amazon.com>
Co-authored-by: BingzhaoZhu <39958219+BingzhaoZhu@users.noreply.github.com>
Co-authored-by: tonyhu <tonyhoo@users.noreply.github.com>
Co-authored-by: Caner Turkmen <turkmen.ac@gmail.com>
Co-authored-by: Haoyang Fang <107515844+FANGAreNotGnu@users.noreply.github.com>
Co-authored-by: Yi Zhu <yizhu59@gmail.com>
Co-authored-by: Oleksandr Shchur <shchuro@amazon.com>
Co-authored-by: Shuai Zhang <cheungdaven@gmail.com>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-53-45.us-west-2.compute.internal>
Co-authored-by: Xingjian Shi <xshiab@connect.ust.hk>
Co-authored-by: Alexander Shirkov <10080307+gradientsky@users.noreply.github.com>
Co-authored-by: Weisu Yin <weisy@amazon.com>
Co-authored-by: Alexander Shirkov <ashyrkou@amazon.com>
Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>
Co-authored-by: Weisu Yin <weisuyin96@gmail.com>
Co-authored-by: Rami <rami-aly@hotmail.de>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-44-212.eu-west-1.compute.internal>
---
 .../quick_start/quick_start_coco.md           |  17 +-
 .../autogluon/multimodal/utils/__init__.py    |   1 +
 .../autogluon/multimodal/utils/colormap.py    | 148 ++++++
 .../utils/object_detection_visualizer.py      | 493 ++++++++++++++++++
 4 files changed, 650 insertions(+), 9 deletions(-)
 create mode 100644 multimodal/src/autogluon/multimodal/utils/colormap.py
 create mode 100644 multimodal/src/autogluon/multimodal/utils/object_detection_visualizer.py

diff --git a/docs/tutorials/multimodal/object_detection/quick_start/quick_start_coco.md b/docs/tutorials/multimodal/object_detection/quick_start/quick_start_coco.md
index 06099a4c3c9..5d57c1466c4 100644
--- a/docs/tutorials/multimodal/object_detection/quick_start/quick_start_coco.md
+++ b/docs/tutorials/multimodal/object_detection/quick_start/quick_start_coco.md
@@ -204,21 +204,20 @@ To run visualizations, ensure that you have `opencv` installed. If you haven't a
 
 To visualize the detection bounding boxes, run the following:
 ```python .input
-from autogluon.multimodal.utils import visualize_detection
+from autogluon.multimodal.utils import Visualizer
 
 conf_threshold = 0.4  # Specify a confidence threshold to filter out unwanted boxes
-visualization_result_dir = "./"  # Use the pwd as result dir to save the visualized image
+image_result = pred.iloc[30]
 
-visualized = visualize_detection(
-    pred=pred[30:31],
-    detection_classes=predictor.get_predictor_classes(),
-    conf_threshold=conf_threshold,
-    visualization_result_dir=visualization_result_dir,
-)
+img_path = image_result.image  # Select an image to visualize
+
+visualizer = Visualizer(img_path)  # Initialize the Visualizer
+out = visualizer.draw_instance_predictions(image_result, conf_threshold=conf_threshold)  # Draw detections
+visualized = out.get_image()  # Get the visualized image
 
 from PIL import Image
 from IPython.display import display
-img = Image.fromarray(visualized[0][:, :, ::-1], 'RGB')
+img = Image.fromarray(visualized, 'RGB')
 display(img)
 ```
 
diff --git a/multimodal/src/autogluon/multimodal/utils/__init__.py b/multimodal/src/autogluon/multimodal/utils/__init__.py
index 8c572b0c2d1..d9127104127 100644
--- a/multimodal/src/autogluon/multimodal/utils/__init__.py
+++ b/multimodal/src/autogluon/multimodal/utils/__init__.py
@@ -57,6 +57,7 @@
     save_result_voc_format,
     visualize_detection,
 )
+from .object_detection_visualizer import Visualizer
 from .onnx import get_onnx_input
 from .pipeline import init_pretrained, init_pretrained_matcher
 from .save import process_save_path, save_pretrained_model_configs, save_text_tokenizers, setup_save_path
diff --git a/multimodal/src/autogluon/multimodal/utils/colormap.py b/multimodal/src/autogluon/multimodal/utils/colormap.py
new file mode 100644
index 00000000000..6015f1b0e1b
--- /dev/null
+++ b/multimodal/src/autogluon/multimodal/utils/colormap.py
@@ -0,0 +1,148 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+"""
+An awesome colormap for really neat visualizations.
+Copied from Detectron, and removed gray colors.
+"""
+import random
+
+import numpy as np
+
+__all__ = ["colormap", "random_color", "random_colors"]
+
+# fmt: off
+# RGB triplets in [0, 1], written as one flat list and reshaped to one color per row below:
+_COLORS = np.array(
+    [
+        0.000, 0.447, 0.741,
+        0.850, 0.325, 0.098,
+        0.929, 0.694, 0.125,
+        0.494, 0.184, 0.556,
+        0.466, 0.674, 0.188,
+        0.301, 0.745, 0.933,
+        0.635, 0.078, 0.184,
+        0.300, 0.300, 0.300,
+        0.600, 0.600, 0.600,
+        1.000, 0.000, 0.000,
+        1.000, 0.500, 0.000,
+        0.749, 0.749, 0.000,
+        0.000, 1.000, 0.000,
+        0.000, 0.000, 1.000,
+        0.667, 0.000, 1.000,
+        0.333, 0.333, 0.000,
+        0.333, 0.667, 0.000,
+        0.333, 1.000, 0.000,
+        0.667, 0.333, 0.000,
+        0.667, 0.667, 0.000,
+        0.667, 1.000, 0.000,
+        1.000, 0.333, 0.000,
+        1.000, 0.667, 0.000,
+        1.000, 1.000, 0.000,
+        0.000, 0.333, 0.500,
+        0.000, 0.667, 0.500,
+        0.000, 1.000, 0.500,
+        0.333, 0.000, 0.500,
+        0.333, 0.333, 0.500,
+        0.333, 0.667, 0.500,
+        0.333, 1.000, 0.500,
+        0.667, 0.000, 0.500,
+        0.667, 0.333, 0.500,
+        0.667, 0.667, 0.500,
+        0.667, 1.000, 0.500,
+        1.000, 0.000, 0.500,
+        1.000, 0.333, 0.500,
+        1.000, 0.667, 0.500,
+        1.000, 1.000, 0.500,
+        0.000, 0.333, 1.000,
+        0.000, 0.667, 1.000,
+        0.000, 1.000, 1.000,
+        0.333, 0.000, 1.000,
+        0.333, 0.333, 1.000,
+        0.333, 0.667, 1.000,
+        0.333, 1.000, 1.000,
+        0.667, 0.000, 1.000,
+        0.667, 0.333, 1.000,
+        0.667, 0.667, 1.000,
+        0.667, 1.000, 1.000,
+        1.000, 0.000, 1.000,
+        1.000, 0.333, 1.000,
+        1.000, 0.667, 1.000,
+        0.333, 0.000, 0.000,
+        0.500, 0.000, 0.000,
+        0.667, 0.000, 0.000,
+        0.833, 0.000, 0.000,
+        1.000, 0.000, 0.000,
+        0.000, 0.167, 0.000,
+        0.000, 0.333, 0.000,
+        0.000, 0.500, 0.000,
+        0.000, 0.667, 0.000,
+        0.000, 0.833, 0.000,
+        0.000, 1.000, 0.000,
+        0.000, 0.000, 0.167,
+        0.000, 0.000, 0.333,
+        0.000, 0.000, 0.500,
+        0.000, 0.000, 0.667,
+        0.000, 0.000, 0.833,
+        0.000, 0.000, 1.000,
+        0.000, 0.000, 0.000,
+        0.143, 0.143, 0.143,
+        0.857, 0.857, 0.857,
+        1.000, 1.000, 1.000
+    ]
+).astype(np.float32).reshape(-1, 3)
+# fmt: on
+
+
+def colormap(rgb=False, maximum=255):
+    """
+    Return the whole palette, scaled to the requested value range.
+
+    Parameters
+    ----------
+    rgb (bool): whether to return RGB colors or BGR colors.
+    maximum (int): either 255 or 1
+
+    Returns
+    -------
+    ndarray: a float32 array of Nx3 colors, in range [0, 255] or [0, 1]
+    """
+    assert maximum in [255, 1], maximum
+    palette = _COLORS * maximum
+    return palette if rgb else palette[:, ::-1]
+
+
+def random_color(rgb=False, maximum=255):
+    """
+    Pick one palette row at random, using NumPy's global random state.
+
+    Parameters
+    ----------
+    rgb (bool): whether to return RGB colors or BGR colors.
+    maximum (int): either 255 or 1
+
+    Returns
+    -------
+    ndarray: a vector of 3 numbers
+    """
+    row = np.random.randint(0, len(_COLORS))
+    picked = _COLORS[row] * maximum
+    return picked if rgb else picked[::-1]
+
+
+def random_colors(N, rgb=False, maximum=255):
+    """
+    Sample N distinct palette rows (without replacement, via `random.sample`).
+
+    Parameters
+    ----------
+    N (int): number of unique colors needed; `random.sample` rejects N larger than the palette
+    rgb (bool): whether to return RGB colors or BGR colors.
+    maximum (int): either 255 or 1
+
+    Returns
+    -------
+    list[ndarray]: N color vectors of 3 numbers each
+    """
+    rows = random.sample(range(len(_COLORS)), N)
+    chosen = [_COLORS[row] * maximum for row in rows]
+    return chosen if rgb else [color[::-1] for color in chosen]
diff --git a/multimodal/src/autogluon/multimodal/utils/object_detection_visualizer.py b/multimodal/src/autogluon/multimodal/utils/object_detection_visualizer.py
new file mode 100644
index 00000000000..b57f0ca7828
--- /dev/null
+++ b/multimodal/src/autogluon/multimodal/utils/object_detection_visualizer.py
@@ -0,0 +1,493 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Disclaimer: Special thanks to the Detectron2 developers
+# https://github.com/facebookresearch/detectron2/blob/main/detectron2/utils/visualizer.py!
+# We use part of its provided, open-source functionalities.
+
+import colorsys
+import logging
+from enum import Enum, unique
+from typing import List
+
+import matplotlib as mpl
+import matplotlib.colors as mplc
+import matplotlib.figure as mplfigure
+import numpy as np
+import pandas as pd
+from matplotlib.backends.backend_agg import FigureCanvasAgg
+
+from .colormap import random_color
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["ColorMode", "VisImage", "Visualizer"]
+
+
+_SMALL_OBJECT_AREA_THRESH = 1000  # box area under which overlay_instances moves the label aside
+_LARGE_MASK_AREA_THRESH = 120000  # not referenced anywhere else in this module
+_OFF_WHITE = (1.0, 1.0, 240.0 / 255)  # not referenced anywhere else in this module
+_BLACK = (0, 0, 0)  # not referenced anywhere else in this module
+_RED = (1.0, 0, 0)  # not referenced anywhere else in this module
+
+_KEYPOINT_THRESHOLD = 0.05  # not referenced anywhere else in this module
+
+
+@unique
+class ColorMode(Enum):
+    """
+    Enum of the color modes for instance visualizations; this module only special-cases IMAGE_BW.
+    """
+
+    IMAGE = 0
+    """
+    Picks a random color for every instance and overlay segmentations with low opacity.
+    """
+    SEGMENTATION = 1
+    """
+    Let instances of the same category have similar colors
+    (from metadata.thing_colors), and overlay them with
+    high opacity. This provides more attention on the quality of segmentation.
+    """
+    IMAGE_BW = 2
+    """
+    Same as IMAGE, but convert all areas without masks to gray-scale.
+    Only available for drawing per-instance mask predictions.
+    """
+
+
+def _create_text_labels(classes: List[str], scores: List[float]):
+    """
+    Build the text tag displayed next to each detected instance.
+
+    Parameters
+    ----------
+    classes (list[str]): class names for all the detected instances, or None
+    scores (list[float]): detection confidence scores for all the detected instances, or None
+
+    Returns
+    -------
+    labels (list[str]): "<class> <score>%" tags; percentages only when `classes` is None,
+        `classes` itself when `scores` is None (hence None when both are None)
+    """
+    if scores is None:
+        # nothing to append: hand back the class names (or None) untouched
+        return classes
+
+    if classes is None:
+        return ["{:.0f}%".format(score * 100) for score in scores]
+    # zip() stops at the shorter input, so surplus classes or scores are ignored
+    return ["{} {:.0f}%".format(name, score * 100) for name, score in zip(classes, scores)]
+
+
+class VisImage:
+    """Matplotlib (Agg) canvas that holds an image and everything drawn on top of it."""
+
+    def __init__(self, img, scale=1.0):
+        """
+        Remember the image and its size, then build the figure it is displayed on.
+
+        Parameters
+        ----------
+            img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255].
+            scale (float): scale the input image
+        """
+        self.img = img
+        self.scale = scale
+        self.height, self.width = img.shape[0], img.shape[1]
+        self._setup_figure(img)
+
+    def _setup_figure(self, img):
+        """
+        Create the frameless figure, its Agg canvas and a full-size axes, then show `img`.
+
+        Nothing is returned: the objects are stored as `self.fig`, `self.canvas`,
+        `self.ax` and `self.dpi`.
+
+        Parameters
+        ----------
+            Same as in :meth:`__init__()`.
+        """
+        figure = mplfigure.Figure(frameon=False)
+        self.dpi = figure.get_dpi()
+        # add a small 1e-2 to avoid precision loss due to matplotlib's truncation
+        # (https://github.com/matplotlib/matplotlib/issues/15363)
+        width_in = (self.width * self.scale + 1e-2) / self.dpi
+        height_in = (self.height * self.scale + 1e-2) / self.dpi
+        figure.set_size_inches(width_in, height_in)
+        self.canvas = FigureCanvasAgg(figure)
+        axes = figure.add_axes([0.0, 0.0, 1.0, 1.0])
+        axes.axis("off")
+        self.fig, self.ax = figure, axes
+        self.reset_image(img)
+
+    def reset_image(self, img):
+        """
+        Display `img`, cast to uint8, over the extent of the original image.
+
+        Parameters
+        ----------
+            img: same as in __init__
+        """
+        pixels = img.astype("uint8")
+        self.ax.imshow(pixels, extent=(0, self.width, self.height, 0), interpolation="nearest")
+
+    def save(self, filepath):
+        """
+        Write the figure to disk with `Figure.savefig`.
+
+        Parameters
+        ----------
+            filepath (str): a string that contains the absolute path, including the file name, where
+                the visualized image will be saved.
+        """
+        self.fig.savefig(filepath)
+
+    def get_image(self):
+        """
+        Render the canvas and return its pixels without the alpha channel.
+
+        Returns
+        -------
+            ndarray:
+                the visualized image of shape (H, W, 3) (RGB) in uint8 type.
+                The shape is scaled w.r.t the input image using the given `scale` argument.
+        """
+        raw, (width, height) = self.canvas.print_to_buffer()
+        flat = np.frombuffer(raw, dtype="uint8")
+        rgba = flat.reshape(height, width, 4)
+        # keep the three color channels; astype() returns a copy detached from the render buffer
+        return rgba[:, :, :3].astype("uint8")
+
+
+class Visualizer:
+    """
+    Visualizer that draws data about detection on images.
+
+    It contains methods like `draw_text` and `draw_box`
+    that draw primitive objects to images, as well as the high-level wrapper
+    `draw_instance_predictions` that draws composite data in some pre-defined style.
+
+    Note that the exact visualization style for the high-level wrappers are subject to change.
+    Style such as color, opacity, label contents, visibility of labels, or even the visibility
+    of objects themselves (e.g. when the object is too small) may change according
+    to different heuristics, as long as the results still look visually reasonable.
+
+    To obtain a consistent style, you can implement custom drawing functions with the
+    abovementioned primitive methods instead.  This class does not intend to satisfy
+    everyone's preference on drawing styles.
+
+    This visualizer focuses on high rendering quality rather than performance. It is not
+    designed to be used for real-time applications.
+    """
+
+    def __init__(self, img_path, scale=1.0, instance_mode=ColorMode.IMAGE):
+        """
+        Load the image at `img_path` and create the canvas that detections are drawn on.
+        Raises ImportError when `cv2` is unavailable and FileNotFoundError when the image cannot be read.
+
+        Parameters
+        ----------
+        img_path (str): path of the image file, read with `cv2.imread` and converted from BGR to RGB.
+        scale (float): scale factor applied to the output canvas, line widths and text sizes.
+        instance_mode (ColorMode): defines one of the pre-defined style for drawing
+            instances on an image.
+        """
+        try:
+            import cv2
+        except ImportError as e:
+            raise ImportError("No module named: cv2. Please install cv2 by 'pip install opencv-python'") from e
+
+        img_bgr = cv2.imread(img_path)
+        if img_bgr is None:  # cv2.imread returns None instead of raising on a missing/unreadable file
+            raise FileNotFoundError("Unable to read image: {}".format(img_path))
+        self.img = np.asarray(img_bgr[:, :, ::-1]).clip(0, 255).astype(np.uint8)
+        self.output = VisImage(self.img, scale=scale)
+
+        # too small texts are useless, therefore clamp the font size to at least 10 // scale
+        self._default_font_size = max(np.sqrt(self.output.height * self.output.width) // 90, 10 // scale)
+        self._instance_mode = instance_mode
+
+    @staticmethod
+    def process_predictions(predictions: pd.Series, conf_threshold: float = 0.4):
+        """
+        Process the classes, box coordinates and confidence scores of the predictions in the image,
+        keeping only the instances whose score is at least `conf_threshold`.
+
+        Parameters
+        ----------
+        predictions (pd.Series): the detection output for ONE image, i.e. a single row of the
+            prediction DataFrame such as `pred.iloc[i]` (only its "bboxes" entry is read here):
+            "image": path to the source image
+            "bboxes": list of detection results for the image, each with the following format
+                {"class": <predicted_class_name>, "bbox": [x1, y1, x2, y2], "score": <confidence_score>}
+        conf_threshold (float): detection confidence threshold to display instances
+
+        Returns
+        -------
+        boxes: XYXY format of bounding boxes shape = (N, 4)
+        scores: detection confidence scores, shape = (N, )
+        classes: detection classes, shape = (N, )
+
+        All three are None when no instance reaches the threshold.
+        """
+        boxes, scores, classes = [], [], []
+        for instance in predictions["bboxes"]:
+            score = instance["score"]
+            if score >= conf_threshold:
+                boxes.append(instance["bbox"])
+                scores.append(score)
+                classes.append(instance["class"])
+
+        if not boxes:
+            # nothing passed the threshold; overlay_instances draws nothing for boxes=None
+            return None, None, None
+
+        # the lists are filled in lockstep above, so the three arrays always share length N
+        boxes = np.array(boxes)
+        scores = np.array(scores)
+        classes = np.array(classes)
+        return boxes, scores, classes
+
+    def draw_instance_predictions(self, predictions: pd.Series, conf_threshold: float = 0.4):
+        """
+        Draw instance-level prediction results on an image.
+
+        Parameters
+        ----------
+        predictions (pd.Series): the output of object detection for that image (one row), with 2 attributes:
+            "image": containing the path to the source image
+            "bboxes": containing detection results for the image with the following format
+                {"class": <predicted_class_name>, "bbox": [x1, y1, x2, y2], "score": <confidence_score>}
+        conf_threshold (float): detection confidence threshold to display instances
+
+        Returns
+        -------
+        output (VisImage): image object with visualizations.
+        """
+        boxes, scores, classes = self.process_predictions(predictions, conf_threshold=conf_threshold)
+        labels = _create_text_labels(classes, scores)
+        colors = None
+
+        if self._instance_mode == ColorMode.IMAGE_BW:
+            # Detection rows only carry "image" and "bboxes": there are no per-instance masks
+            # (and no detectron2-style `Instances.has()` accessor on a pandas row), so no
+            # region is kept in color and the whole image is converted to gray-scale.
+            gray = self._create_grayscale_image(None)
+            self.output.reset_image(gray)
+
+        self.overlay_instances(
+            boxes=boxes,
+            labels=labels,
+            assigned_colors=colors,
+        )
+        return self.output
+
+    def overlay_instances(
+        self,
+        *,
+        boxes=None,
+        labels=None,
+        assigned_colors=None,
+    ):
+        """
+        Draw every box, largest first, together with its text label.
+
+        Parameters
+        ----------
+        boxes (ndarray): an Nx4 numpy array of XYXY_ABS format for the N objects in a single
+            image; it is sliced column-wise, so a plain list is not accepted. None draws nothing.
+        labels (list[str]): the text to be displayed for each instance.
+        assigned_colors (list[matplotlib.colors]): a list of colors, where each color
+            corresponds to each box in the image. Refer to 'matplotlib.colors'
+            for full list of formats that the colors are accepted in. Random colors are
+            picked when None.
+
+        Returns
+        -------
+            output (VisImage): image object with visualizations.
+        """
+        num_instances = 0 if boxes is None else len(boxes)
+        if labels is not None:
+            assert len(labels) == num_instances
+        # one random color per instance unless the caller supplied them
+        if assigned_colors is None:
+            assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
+        if num_instances == 0:
+            return self.output
+
+        # A non-zero instance count implies `boxes` was given, so it needs no further None checks.
+        # Display in largest to smallest order to reduce occlusion.
+        areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1)
+        order = np.argsort(-areas).tolist()
+        boxes = boxes[order]
+        if labels is not None:
+            labels = [labels[k] for k in order]
+        assigned_colors = [assigned_colors[k] for k in order]
+
+        canvas = self.output
+        image_side = np.sqrt(canvas.height * canvas.width)
+        for idx, (box, color) in enumerate(zip(boxes, assigned_colors)):
+            # box first, then its label
+            self.draw_box(box, edge_color=color)
+            if labels is None:
+                continue
+
+            x0, y0, x1, y1 = box
+            box_height = y1 - y0
+            box_area = box_height * (x1 - x0)
+
+            # Put the text on the top-left box corner, except for small objects,
+            # where it is drawn at the side to avoid occlusion.
+            is_small = box_area < _SMALL_OBJECT_AREA_THRESH * canvas.scale or box_height < 40 * canvas.scale
+            if not is_small:
+                text_pos = (x0, y0)
+            elif y1 >= canvas.height - 5:
+                # the box reaches the bottom edge: anchor the text at its top-right corner
+                text_pos = (x1, y0)
+            else:
+                # otherwise anchor the text at the bottom-left corner
+                text_pos = (x0, y1)
+
+            # taller boxes get larger text: between 1.2x and 2x of half the default font size
+            height_ratio = box_height / image_side
+            lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
+            font_scale = np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2)
+            self.draw_text(
+                labels[idx],
+                text_pos,
+                color=lighter_color,
+                horizontal_alignment="left",
+                font_size=font_scale * 0.5 * self._default_font_size,
+            )
+
+        return self.output
+
+    """
+    Primitive drawing functions:
+    """
+
+    def draw_text(
+        self,
+        text,
+        position,
+        *,
+        font_size=None,
+        color="g",
+        horizontal_alignment="center",
+        rotation=0,
+    ):
+        """
+        Write `text` on the image, on a dark semi-transparent background box.
+
+        Parameters
+        ----------
+        text (str): class label
+        position (tuple): a tuple of the x and y coordinates to place text on image.
+        font_size (int, optional): size of the text. If not provided (or 0), the default font
+            size derived from the image area is used. It is multiplied by the canvas scale.
+        color: color of the text. Refer to `matplotlib.colors` for full list
+            of formats that are accepted. Dark colors are brightened for legibility.
+        horizontal_alignment (str): see `matplotlib.text.Text`
+        rotation: rotation angle in degrees CCW
+
+        Returns
+        -------
+        output (VisImage): image object with text drawn.
+        """
+        font_size = font_size or self._default_font_size
+        # since the text background is dark, we don't want the text to be dark
+        rgb = np.maximum(list(mplc.to_rgb(color)), 0.2)
+        rgb[np.argmax(rgb)] = max(0.8, rgb.max())
+
+        x, y = position
+        self.output.ax.text(
+            x,
+            y,
+            text,
+            size=font_size * self.output.scale,
+            family="sans-serif",
+            bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"},
+            verticalalignment="top",
+            horizontalalignment=horizontal_alignment,
+            color=rgb,
+            zorder=10,
+            rotation=rotation,
+        )
+        return self.output
+
+    def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
+        """
+        Draw the unfilled outline of one axis-aligned box.
+
+        Parameters
+        ----------
+        box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0
+            are the coordinates of the box's top left corner. x1 and y1 are the
+            coordinates of the box's bottom right corner.
+        alpha (float): blending coefficient. Smaller values lead to more transparent outlines.
+        edge_color: color of the outline of the box. Refer to `matplotlib.colors`
+            for full list of formats that are accepted.
+        line_style (string): the string to use to create the outline of the boxes.
+
+        Returns
+        -------
+        output (VisImage): image object with box drawn.
+        """
+        x0, y0, x1, y1 = box_coord
+
+        # the line thickness follows the default font size, but is never below 1
+        linewidth = max(self._default_font_size / 4, 1)
+
+        outline = mpl.patches.Rectangle(
+            (x0, y0),
+            x1 - x0,
+            y1 - y0,
+            fill=False,
+            edgecolor=edge_color,
+            linewidth=linewidth * self.output.scale,
+            alpha=alpha,
+            linestyle=line_style,
+        )
+        self.output.ax.add_patch(outline)
+        return self.output
+
+    """
+    Internal methods:
+    """
+
+    def _create_grayscale_image(self, mask=None):
+        """
+        Build a float32 gray version of the image (channel mean replicated to 3 channels);
+        pixels selected by `mask`, if given, keep their original colors.
+        """
+        gray = self.img.astype("f4").mean(axis=2)
+        gray_rgb = np.stack((gray, gray, gray), axis=2)
+        if mask is not None:
+            gray_rgb[mask] = self.img[mask]
+        return gray_rgb
+
+    def _change_color_brightness(self, color, brightness_factor):
+        """
+        Depending on the brightness_factor, gives a lighter or darker color i.e. a color with
+        more or less HLS lightness than the original color (hue and saturation are kept).
+
+        Parameters
+        ----------
+        color: color to modify. Refer to `matplotlib.colors` for a full list of
+            formats that are accepted.
+        brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of
+            0 will correspond to no change, a factor in [-1.0, 0) range will result in
+            a darker color and a factor in (0, 1.0] range will result in a lighter color.
+
+        Returns
+        -------
+        modified_color (tuple[double]): a tuple containing the RGB values of the
+            modified color. Each value in the tuple is in the [0.0, 1.0] range.
+        """
+        assert -1.0 <= brightness_factor <= 1.0
+        hue, lightness, saturation = colorsys.rgb_to_hls(*mplc.to_rgb(color))
+        lightness = lightness + (brightness_factor * lightness)
+        if lightness < 0.0:
+            lightness = 0.0
+        if lightness > 1.0:
+            lightness = 1.0
+        return colorsys.hls_to_rgb(hue, lightness, saturation)