From 31f85f8e3a7459892a9c8cfba1cca6168691405a Mon Sep 17 00:00:00 2001 From: wondervictor Date: Tue, 26 Mar 2024 00:50:14 +0800 Subject: [PATCH] update: finetune high-resolution models --- README.md | 11 +- configs/finetune_coco/README.md | 21 +- ...e-4_80e_8gpus_mask-refine_finetune_coco.py | 184 ++++++++++++++++ ...s_mask-refine_nocopypaste_finetune_coco.py | 185 ++++++++++++++++ ...s_mask-refine_nocopypaste_finetune_coco.py | 180 ++++++++++++++++ ...gpus_mask-refine_finetune_adddecay_coco.py | 184 ++++++++++++++++ ...e-4_80e_8gpus_mask-refine_finetune_coco.py | 6 +- ...e_4x8gpus_obj365v1_goldg_train_lvis_val.py | 171 +++++++++++++++ ...bj365v1_goldg_train_1280ft_lvis_minival.py | 198 ++++++++++++++++++ ...bj365v1_goldg_train_1280ft_lvis_minival.py | 195 +++++++++++++++++ ...8gpus_obj365v1_goldg_train_lvis_minival.py | 5 +- configs/pretrain_v1/README.md | 21 ++ ...8gpus_obj365v1_goldg_train_lvis_minival.py | 0 ...e_4x8gpus_obj365v1_goldg_train_lvis_val.py | 0 ...8gpus_obj365v1_goldg_train_lvis_minival.py | 0 ...8gpus_obj365v1_goldg_train_lvis_minival.py | 0 ...8gpus_obj365v1_goldg_train_lvis_minival.py | 0 configs/prompt_tuning_coco/READEME.md | 7 + ..._bn_2e-4_80e_8gpus_all_fine_tuning_coco.py | 45 +--- ..._8gpus_mask-refine_all_fine_tuning_coco.py | 2 +- ...pus_mask-refine_fine_prompt_tuning_coco.py | 156 ++++++++++++++ ...sgd_2e-4_80e_8gpus_all_fine_tuning_coco.py | 109 ++++++++++ configs/segmentation/README.md | 13 +- deploy/easydeploy/tools/image-demo.py | 2 +- deploy/image-demo.py | 152 ++++++++++++++ yolo_world/models/detectors/yolo_world.py | 10 +- 26 files changed, 1787 insertions(+), 70 deletions(-) create mode 100644 configs/finetune_coco/yolo_world_v2_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py create mode 100644 configs/finetune_coco/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_nocopypaste_finetune_coco.py create mode 100644 configs/finetune_coco/yolo_world_v2_s_vlpan_bn_sgd_2e-4_80e_8gpus_mask-refine_nocopypaste_finetune_coco.py create mode 100644 configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_adddecay_coco.py create mode 100644 configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py create mode 100644 configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py create mode 100644 configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py create mode 100644 configs/pretrain_v1/README.md rename configs/{pretrain => pretrain_v1}/yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py (100%) rename configs/{pretrain => pretrain_v1}/yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py (100%) rename configs/{pretrain => pretrain_v1}/yolo_world_m_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py (100%) rename configs/{pretrain => pretrain_v1}/yolo_world_s_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py (100%) rename configs/{pretrain => pretrain_v1}/yolo_world_x_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py (100%) create mode 100644 configs/prompt_tuning_coco/READEME.md create mode 100644 configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_fine_prompt_tuning_coco.py create mode 100644 configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_sgd_2e-4_80e_8gpus_all_fine_tuning_coco.py create mode 100644 deploy/image-demo.py diff --git a/README.md 
b/README.md index 79506400..5e33dbe7 100644 --- a/README.md +++ b/README.md @@ -88,16 +88,13 @@ We've pre-trained YOLO-World-S/M/L from scratch and evaluate on the `LVIS val-1. | model | Pre-train Data | Size | APmini | APr | APc | APf | APval | APr | APc | APf | weights | | :------------------------------------------------------------------------------------------------------------------- | :------------------- | :----------------- | :--------------: | :------------: | :------------: | :------------: | :-------------: | :------------: | :------------: | :------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| [YOLO-World-S](./configs/pretrain/yolo_world_s_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 640 | 24.3 | 16.6 | 22.1 | 27.7 | 17.8 | 11.0 | 14.8 | 24.0 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/resolve/main/yolo_world_s_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-18bea4d2.pth) | -| [YOLO-Worldv2-S](./configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) 🔥 | O365+GoldG | 640 | 22.7 | 16.3 | 20.8 | 25.5 | 17.3 | 11.3 | 14.9 | 22.7 |[HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_s_obj365v1_goldg_pretrain-55b943ea.pth)| -| [YOLO-World-M](./configs/pretrain/yolo_world_m_dual_l2norm_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 31.0 | 28.6 | 19.7 | 26.6 | 31.9 | 22.3 | 16.2 | 19.0 | 28.7 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/resolve/main/yolo_world_m_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-2b7bd1be.pth) | +| [YOLO-Worldv2-S](./configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) 🔥 | O365+GoldG | 640 | 22.7 | 16.3 | 20.8 | 25.5 | 17.3 | 11.3 | 14.9 | 22.7 |[HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_s_obj365v1_goldg_pretrain-55b943ea.pth)| +| [YOLO-Worldv2-S](./configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py) 🔥 | O365+GoldG | 1280🔸 | 24.1 | 18.7 | 22.0 | 26.9 | 18.8 | 14.1 | 16.3 | 23.8 |[HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_s_obj365v1_goldg_pretrain_1280ft-fc4ff4f7.pth)| | [YOLO-Worldv2-M](./configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) 🔥 | O365+GoldG | 640 | 30.0 | 25.0 | 27.2 | 33.4 | 23.5 | 17.1 | 20.0 | 30.1 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_m_obj365v1_goldg_pretrain-c6237d5b.pth)| -| [YOLO-World-L](./configs/pretrain/yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 640 | 32.5 | 22.3 | 30.6 | 36.1 | 24.8 | 17.8 | 22.4 | 32.5 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/resolve/main/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth) | -| [YOLO-World-L](./configs/pretrain/yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG+CC3M-Lite | 640 | 33.0 | 23.6 | 32.0 | 35.5 | 25.3 | 18.0 | 22.1 | 32.1 | [HF Checkpoints 
🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_cc3mlite_train_pretrained-7a5eea3b.pth) | +| [YOLO-Worldv2-M](./configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py) 🔥 | O365+GoldG | 1280🔸 | 31.6 | 24.5 | 29.0 | 35.1 | 25.3 | 19.3 | 22.0 | 31.7 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_m_obj365v1_goldg_pretrain_1280ft-77d0346d.pth)| | [YOLO-Worldv2-L](./configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) 🔥 | O365+GoldG | 640 | 33.0 | 22.6 | 32.0 | 35.8 | 26.0 | 18.6 | 23.0 | 32.6 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_obj365v1_goldg_pretrain-a82b1fe3.pth)| -| [YOLO-Worldv2-L](./configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py) 🔥 | O365+GoldG | 1280 🔸 | 34.6 | 29.2 | 32.8 | 37.2 | 27.6 | 21.9 | 24.2 | 34.0 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_obj365v1_goldg_pretrain_1280ft-9babe3f6.pth)| +| [YOLO-Worldv2-L](./configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py) 🔥 | O365+GoldG | 1280🔸 | 34.6 | 29.2 | 32.8 | 37.2 | 27.6 | 21.9 | 24.2 | 34.0 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_obj365v1_goldg_pretrain_1280ft-9babe3f6.pth)| | [YOLO-Worldv2-L](./configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) 🔥 | O365+GoldG+CC3M-Lite | 640 | 32.9 | 25.3 | 31.1 | 35.8 | 26.1 | 20.6 | 22.6 | 32.3 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_obj365v1_goldg_cc3mlite_pretrain-ca93cd1f.pth)| -| [YOLO-World-X](./configs/pretrain/yolo_world_x_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG+CC3M-Lite | 640 | 33.4 | 24.4 | 31.6 | 36.6 | 26.6 | 19.2 | 23.5 | 33.2 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_x_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_cc3mlite_train_pretrained-8cf6b025.pth) | | [YOLO-Worldv2-X](./configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) 🔥 | O365+GoldG+CC3M-Lite | 640 | 35.4 | 28.7 | 32.9 | 38.7 | 28.4 | 20.6 | 25.6 | 35.0 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain-8698fbfa.pth) | **NOTE:** diff --git a/configs/finetune_coco/README.md b/configs/finetune_coco/README.md index 72300bca..279e04dc 100644 --- a/configs/finetune_coco/README.md +++ b/configs/finetune_coco/README.md @@ -6,14 +6,17 @@ ##### NOTE: -1. Fine-tune models **without** `mask-refine` have some unknow errors and are under evaluation. -2. `X` and `S` models are coming soon. 
- -| model | `mask-refine`| efficient neck | AP | AP50 | AP75 | weights | log | -| :---- | :----------: |:-------------: | :-: | :--------------:| :-------------: |:------: | :-: | -| [YOLO-World-v2-S](./yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | ✔️ | ✖️ | 45.7 | 62.0 | 49.9 | [HF Checkpoints](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_ep80-e6c2261e.pth) | [log](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_20240119_121515.log) | -| [YOLO-World-v2-M](yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | ✔️ | ✖️ | 50.7 | 67.5 | 55.2 | [HF Checkpoints](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_ep80-c6232481.pth) | [log](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_20240320_204957.log) | -| [YOLO-World-v2-L](./yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | ✔️ | ✖️ | 53.3 | 70.3 | 58.0 | [HF Checkpoints](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_ep80-ac9177d6.pth) | [log](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_20240317_171126.log) | -| [ YOLO-World-v2-X](./yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | ✔️ | ✖️ | - | - | - | - | - | +1. APZS: AP evaluated in the zero-shot setting (w/o fine-tuning on the COCO dataset). +2. Fine-tuned models **without** `mask-refine` have some unknown errors and are under evaluation. +3. `X` models are coming soon. 
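A minimal launch sketch for one of these fine-tuning configs with mmengine's `Runner` is shown below. It assumes a local checkout of the repository (so the `third_party/mmyolo` base configs, the COCO layout under `data/coco`, and the checkpoint referenced by `load_from` all resolve); the 8-GPU results in the table below were produced with the usual distributed launcher, so treat this single-process run as an illustration rather than the exact recipe.

```python
# Sketch: single-GPU fine-tuning of YOLO-World-v2-S on COCO using a config from
# this patch. The work_dir and checkpoint paths are assumptions; adjust them to
# your local setup.
import yolo_world  # noqa: F401 -- importing registers YOLO-World modules in the mmengine registries
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile(
    'configs/finetune_coco/'
    'yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py')
cfg.work_dir = 'work_dirs/yolo_world_v2_s_mask-refine_finetune_coco'
# Override load_from if the pretrained checkpoint lives elsewhere, e.g.:
# cfg.load_from = '/path/to/yolo_world_v2_s_obj365v1_goldg_pretrain-55b943ea.pth'

runner = Runner.from_cfg(cfg)
runner.train()
```

With a single GPU, the learning rate and batch size baked into the config names may need adjusting to match the effective 8-GPU batch.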
+ + +| model | `mask-refine`| efficient neck | APZS| AP | AP50 | AP75 | weights | log | +| :---- | :----------: |:-------------: | :------------: | :-: | :--------------:| :-------------: |:------: | :-: | +| [YOLO-World-v2-S](./yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | ✔️ | ✖️ | 37.5 | 45.7 | 62.0 | 49.9 | [HF Checkpoints](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_ep80-e6c2261e.pth) | [log](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_20240119_121515.log) | +| [YOLO-World-v2-M](yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | ✔️ | ✖️ | 42.8 |50.7 | 67.5 | 55.2 | [HF Checkpoints](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_ep80-c6232481.pth) | [log](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_20240320_204957.log) | +| [YOLO-World-v2-L](./yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | ✔️ | ✖️ | 45.1 | 53.3 | 70.3 | 58.0 | [HF Checkpoints](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_ep80-ac9177d6.pth) | [log](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_20240317_171126.log) | +| [YOLO-World-v2-L](./yolo_world_v2_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | ✔️ | ✔️ | 45.1 | | | | [HF Checkpoints]() | [log]() | +| [ YOLO-World-v2-X](./yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | ✔️ | ✖️ | 46.8 | | - | - | [HF Checkpoints]() | [log]() | diff --git a/configs/finetune_coco/yolo_world_v2_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py b/configs/finetune_coco/yolo_world_v2_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py new file mode 100644 index 00000000..50fd7b3e --- /dev/null +++ b/configs/finetune_coco/yolo_world_v2_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py @@ -0,0 +1,184 @@ +_base_ = ( + '../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict( + imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.05 +train_batch_size_per_gpu = 16 +load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth' +text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +# text_model_name = 'openai/clip-vit-base-patch32' +persistent_workers = False + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + 
neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='EfficientCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict( + type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, + 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] +train_pipeline = [ + *_base_.pre_transform, + *mosaic_affine_transform, + dict( + type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*_base_.pre_transform, + *mosaic_affine_transform]), + *_base_.last_transform[:-1], + *text_transform +] +train_pipeline_stage2 = [ + *_base_.train_pipeline_stage2[:-1], + *text_transform +] +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=train_pipeline) + +train_dataloader = dict( + persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict( + param_scheduler=dict( + scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict( + max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict( + max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - 
close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict( + optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict( + bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0)}), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +val_evaluator = dict( + _delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') diff --git a/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_nocopypaste_finetune_coco.py b/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_nocopypaste_finetune_coco.py new file mode 100644 index 00000000..b598ca86 --- /dev/null +++ b/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_nocopypaste_finetune_coco.py @@ -0,0 +1,185 @@ +_base_ = ( + '../../third_party/mmyolo/configs/yolov8/' + 'yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict( + imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.05 +train_batch_size_per_gpu = 16 +load_from = 'pretrained_models/yolo_world_s_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_train-55b943ea.pth' +text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +# text_model_name = 'openai/clip-vit-base-patch32' +persistent_workers = False +mixup_prob = 0.15 +copypaste_prob = 0.3 + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict( + type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, + 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, 
-_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] +train_pipeline = [ + *_base_.pre_transform, + *mosaic_affine_transform, + dict( + type='YOLOv5MultiModalMixUp', + prob=mixup_prob, + pre_transform=[*_base_.pre_transform, + *mosaic_affine_transform]), + *_base_.last_transform[:-1], + *text_transform +] +train_pipeline_stage2 = [ + *_base_.train_pipeline_stage2[:-1], + *text_transform +] +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=train_pipeline) + +train_dataloader = dict( + persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict( + param_scheduler=dict( + scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict( + max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict( + max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict( + optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict( + bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0)}), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +val_evaluator = dict( + _delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') diff --git a/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_sgd_2e-4_80e_8gpus_mask-refine_nocopypaste_finetune_coco.py b/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_sgd_2e-4_80e_8gpus_mask-refine_nocopypaste_finetune_coco.py new file mode 100644 index 00000000..8e0766ea --- /dev/null +++ b/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_sgd_2e-4_80e_8gpus_mask-refine_nocopypaste_finetune_coco.py @@ -0,0 +1,180 @@ +_base_ = ( + '../../third_party/mmyolo/configs/yolov8/' + 'yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict( + imports=['yolo_world'], + allow_failed_imports=False) + 
+# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 1e-3 +weight_decay = 0.0005 +train_batch_size_per_gpu = 16 +load_from = 'pretrained_models/yolo_world_s_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_train-55b943ea.pth' +text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +# text_model_name = 'openai/clip-vit-base-patch32' +persistent_workers = False +mixup_prob = 0.15 +copypaste_prob = 0.3 + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict( + type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, + 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] +train_pipeline = [ + *_base_.pre_transform, + *mosaic_affine_transform, + dict( + type='YOLOv5MultiModalMixUp', + prob=mixup_prob, + pre_transform=[*_base_.pre_transform, + *mosaic_affine_transform]), + *_base_.last_transform[:-1], + *text_transform +] +train_pipeline_stage2 = [ + *_base_.train_pipeline_stage2[:-1], + *text_transform +] +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=train_pipeline) + +train_dataloader = dict( + persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) 
+] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict( + param_scheduler=dict( + scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict( + max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict( + max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='SGD', + lr=base_lr, + momentum=0.937, + nesterov=True, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu)) + +# evaluation settings +val_evaluator = dict( + _delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') diff --git a/configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_adddecay_coco.py b/configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_adddecay_coco.py new file mode 100644 index 00000000..25c30fb0 --- /dev/null +++ b/configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_adddecay_coco.py @@ -0,0 +1,184 @@ +_base_ = ( + '../../third_party/mmyolo/configs/yolov8/' + 'yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict( + imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.05 +train_batch_size_per_gpu = 16 +load_from = 'pretrained_models/yolo_world_x_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc250k_train_lviseval-8698fbfa.pth' +text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +# text_model_name = 'openai/clip-vit-base-patch32' +persistent_workers = False + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + 
num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict( + type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, + 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] +train_pipeline = [ + *_base_.pre_transform, + *mosaic_affine_transform, + dict( + type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*_base_.pre_transform, + *mosaic_affine_transform]), + *_base_.last_transform[:-1], + *text_transform +] +train_pipeline_stage2 = [ + *_base_.train_pipeline_stage2[:-1], + *text_transform +] +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=train_pipeline) + +train_dataloader = dict( + persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] + +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict( + param_scheduler=dict( + scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict( + max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict( + max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict( + optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict( + custom_keys={'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': 
dict(weight_decay=0.0)}), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +val_evaluator = dict( + _delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') diff --git a/configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py b/configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py index c9f4c6f5..4b52bacc 100644 --- a/configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py +++ b/configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py @@ -18,8 +18,8 @@ weight_decay = 0.05 train_batch_size_per_gpu = 16 load_from = 'pretrained_models/yolo_world_x_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc250k_train_lviseval-8698fbfa.pth' -# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' -text_model_name = 'openai/clip-vit-base-patch32' +text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +# text_model_name = 'openai/clip-vit-base-patch32' persistent_workers = False # model settings @@ -112,6 +112,7 @@ batch_size=train_batch_size_per_gpu, collate_fn=dict(type='yolow_collate'), dataset=coco_train_dataset) + test_pipeline = [ *_base_.test_pipeline[:-1], dict(type='LoadText'), @@ -120,6 +121,7 @@ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'scale_factor', 'pad_param', 'texts')) ] + coco_val_dataset = dict( _delete_=True, type='MultiModalDataset', diff --git a/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py b/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py new file mode 100644 index 00000000..70b19b28 --- /dev/null +++ b/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py @@ -0,0 +1,171 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 +# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +text_model_name = 'openai/clip-vit-base-patch32' +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + 
dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_val.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_val.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + 
weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py b/configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py new file mode 100644 index 00000000..9a430e84 --- /dev/null +++ b/configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py @@ -0,0 +1,198 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_m_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 +text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +img_scale = (1280, 1280) + +# text_model_name = 'openai/clip-vit-base-patch32' +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] + +train_pipeline_stage2 = [ + *_base_.pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + 
border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform +] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py b/configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py new file mode 100644 
index 00000000..3afb76aa --- /dev/null +++ b/configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py @@ -0,0 +1,195 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_s_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.025 +train_batch_size_per_gpu = 4 +img_scale = (1280, 1280) + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='openai/clip-vit-base-patch32', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [ + *_base_.pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform +] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + 
filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] + +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py index ec8ccb57..a8e68ddc 100644 --- a/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py +++ b/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -15,7 +15,8 @@ base_lr = 2e-3 weight_decay = 0.05 / 2 train_batch_size_per_gpu = 16 - +text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +# text_model_name = 'openai/clip-vit-base-patch32' # model settings model = dict( type='YOLOWorldDetector', @@ -29,7 +30,7 @@ image_model={{_base_.model.backbone}}, text_model=dict( type='HuggingCLIPLanguageBackbone', - model_name='openai/clip-vit-base-patch32', + 
model_name=text_model_name, frozen_modules=['all'])), neck=dict(type='YOLOWorldPAFPN', guide_channels=text_channels, diff --git a/configs/pretrain_v1/README.md b/configs/pretrain_v1/README.md new file mode 100644 index 00000000..3290c7e0 --- /dev/null +++ b/configs/pretrain_v1/README.md @@ -0,0 +1,21 @@ +## Pre-training YOLO-World-v1 + +> YOLO-World-v1 is the initial version and is now nearly deprecated! We strongly suggest using the [latest version](../pretrain/). + + + +### Zero-shot Inference on LVIS dataset + +| model | Pre-train Data | Size | APmini | APr | APc | APf | APval | APr | APc | APf | weights | +| :------------------------------------------------------------------------------------------------------------------- | :------------------- | :----------------- | :--------------: | :------------: | :------------: | :------------: | :-------------: | :------------: | :------------: | :------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| [YOLO-World-S](./yolo_world_s_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 640 | 24.3 | 16.6 | 22.1 | 27.7 | 17.8 | 11.0 | 14.8 | 24.0 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/resolve/main/yolo_world_s_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-18bea4d2.pth) | +| [YOLO-World-M](./yolo_world_m_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 640 | 28.6 | 19.7 | 26.6 | 31.9 | 22.3 | 16.2 | 19.0 | 28.7 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/resolve/main/yolo_world_m_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-2b7bd1be.pth) | +| [YOLO-World-L](./yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 640 | 32.5 | 22.3 | 30.6 | 36.1 | 24.8 | 17.8 | 22.4 | 32.5 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/resolve/main/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth) | +| [YOLO-World-L](./yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG+CC3M-Lite | 640 | 33.0 | 23.6 | 32.0 | 35.5 | 25.3 | 18.0 | 22.1 | 32.1 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_cc3mlite_train_pretrained-7a5eea3b.pth) | +| [YOLO-World-X](./yolo_world_x_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG+CC3M-Lite | 640 | 33.4 | 24.4 | 31.6 | 36.6 | 26.6 | 19.2 | 23.5 | 33.2 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_x_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_cc3mlite_train_pretrained-8cf6b025.pth) | + + +**NOTE:** +1. APmini: evaluated on LVIS `minival`. +2. APval: evaluated on LVIS `val 1.0`. +3. [HuggingFace Mirror](https://hf-mirror.com/) provides a mirror of HuggingFace for users who are unable to reach huggingface.co directly.
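+
+### Evaluating a v1 checkpoint (example)
+
+The numbers above come from LVIS evaluation with the corresponding config and checkpoint. Below is a minimal programmatic sketch using `mmengine`; the checkpoint path and work directory are illustrative placeholders, and the usual `tools/test.py` / `dist_test.sh` entry points work as well.
+
+```python
+from mmengine.config import Config
+from mmengine.runner import Runner
+
+# the config registers the `yolo_world` modules via `custom_imports`
+cfg = Config.fromfile(
+    'configs/pretrain_v1/'
+    'yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py')
+cfg.work_dir = 'work_dirs/zero_shot_lvis_eval'          # illustrative
+cfg.load_from = 'path/to/yolo_world_l_checkpoint.pth'   # illustrative
+
+runner = Runner.from_cfg(cfg)
+runner.test()  # runs the LVIS evaluator defined in the config
+```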
\ No newline at end of file diff --git a/configs/pretrain/yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/configs/pretrain_v1/yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py similarity index 100% rename from configs/pretrain/yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py rename to configs/pretrain_v1/yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py diff --git a/configs/pretrain/yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py b/configs/pretrain_v1/yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py similarity index 100% rename from configs/pretrain/yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py rename to configs/pretrain_v1/yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py diff --git a/configs/pretrain/yolo_world_m_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/configs/pretrain_v1/yolo_world_m_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py similarity index 100% rename from configs/pretrain/yolo_world_m_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py rename to configs/pretrain_v1/yolo_world_m_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py diff --git a/configs/pretrain/yolo_world_s_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/configs/pretrain_v1/yolo_world_s_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py similarity index 100% rename from configs/pretrain/yolo_world_s_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py rename to configs/pretrain_v1/yolo_world_s_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py diff --git a/configs/pretrain/yolo_world_x_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/configs/pretrain_v1/yolo_world_x_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py similarity index 100% rename from configs/pretrain/yolo_world_x_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py rename to configs/pretrain_v1/yolo_world_x_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py diff --git a/configs/prompt_tuning_coco/READEME.md b/configs/prompt_tuning_coco/READEME.md new file mode 100644 index 00000000..dd63e752 --- /dev/null +++ b/configs/prompt_tuning_coco/READEME.md @@ -0,0 +1,7 @@ +## Prompt Tuning for YOLO-World + + +| Model | Config | AP | AP50 | AP75 | APS | APM | APL | +| :---- | :----: | :--: | :--: | :---: | :-: | :-: | :-: | +| YOLO-World-v2-L | Zero-shot | 45.7 | 61.6 | 49.8 | 29.9 | 50.0 | 60.8 | +| [YOLO-World-v2-L](./yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_fine_prompt_tuning_coco.py) | Prompt tuning | 47.9 | 64.3 | 52.5 | 31.9 | 52.6 | 61.3 | diff --git a/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_all_fine_tuning_coco.py b/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_all_fine_tuning_coco.py index 92f4d4cd..44c2fd98 100644 --- a/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_all_fine_tuning_coco.py +++ b/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_all_fine_tuning_coco.py @@ -11,7 +11,7 @@ text_channels = 512 neck_embed_channels = [128, 256, _base_.last_stage_out_channels //
2] neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] -base_lr = 2e-3 +base_lr = 2e-4 weight_decay = 0.05 train_batch_size_per_gpu = 16 load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth' @@ -48,60 +48,24 @@ train_cfg=dict(assigner=dict(num_classes=num_training_classes))) # dataset settings -final_transform = [ - dict(type='mmdet.PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', - 'flip_direction')) -] -mosaic_affine_transform = [ - dict(type='Mosaic', - img_scale=_base_.img_scale, - pad_val=114.0, - pre_transform=_base_.pre_transform), - dict( - type='YOLOv5RandomAffine', - max_rotate_degree=0.0, - max_shear_degree=0.0, - max_aspect_ratio=100., - scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), - # img_scale is (width, height) - border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), - border_val=(114, 114, 114)) -] -train_pipeline = [ - *_base_.pre_transform, *mosaic_affine_transform, - dict(type='YOLOv5MixUp', - prob=_base_.mixup_prob, - pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]), - *_base_.last_transform[:-1], *final_transform -] - -train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *final_transform] - coco_train_dataset = dict(type='YOLOv5CocoDataset', data_root='data/coco', ann_file='annotations/instances_train2017.json', data_prefix=dict(img='train2017/'), filter_cfg=dict(filter_empty_gt=False, min_size=32), - pipeline=train_pipeline) + pipeline=_base_.train_pipeline) train_dataloader = dict(persistent_workers=persistent_workers, batch_size=train_batch_size_per_gpu, collate_fn=dict(type='yolow_collate'), dataset=coco_train_dataset) -test_pipeline = [ - *_base_.test_pipeline[:-1], - dict(type='mmdet.PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', - 'scale_factor', 'pad_param')) -] coco_val_dataset = dict(type='YOLOv5CocoDataset', data_root='data/coco', ann_file='annotations/instances_val2017.json', data_prefix=dict(img='val2017/'), filter_cfg=dict(filter_empty_gt=False, min_size=32), - pipeline=test_pipeline) + pipeline=_base_.test_pipeline) val_dataloader = dict(dataset=coco_val_dataset) test_dataloader = val_dataloader @@ -121,12 +85,13 @@ priority=49), dict(type='mmdet.PipelineSwitchHook', switch_epoch=max_epochs - close_mosaic_epochs, - switch_pipeline=train_pipeline_stage2) + switch_pipeline=_base_.train_pipeline_stage2) ] train_cfg = dict(max_epochs=max_epochs, val_interval=5, dynamic_intervals=[((max_epochs - close_mosaic_epochs), _base_.val_interval_stage2)]) + optim_wrapper = dict(optimizer=dict( _delete_=True, type='AdamW', diff --git a/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_all_fine_tuning_coco.py b/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_all_fine_tuning_coco.py index d86ff844..8d5629a4 100644 --- a/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_all_fine_tuning_coco.py +++ b/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_all_fine_tuning_coco.py @@ -11,7 +11,7 @@ text_channels = 512 neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] -base_lr = 2e-3 +base_lr = 2e-4 weight_decay = 0.05 train_batch_size_per_gpu = 16 load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth' diff 
--git a/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_fine_prompt_tuning_coco.py b/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_fine_prompt_tuning_coco.py new file mode 100644 index 00000000..f9358f4c --- /dev/null +++ b/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_fine_prompt_tuning_coco.py @@ -0,0 +1,156 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.05 +train_batch_size_per_gpu = 16 +load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth' +persistent_workers = False + +# model settings +model = dict(type='YOLOWorldPromptDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + embedding_path='embeddings/clip_vit_b32_coco_80_embeddings.npy', + prompt_dim=text_channels, + num_prompts=80, + freeze_prompt=False, + data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'), + backbone=dict(_delete_=True, + type='MultiModalYOLOBackbone', + text_model=None, + image_model={{_base_.model.backbone}}, + with_text_model=False), + neck=dict(type='YOLOWorldPAFPN', + freeze_all=False, + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict( + type='YOLOWorldHeadModule', + freeze_all=False, + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +final_transform = [ + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] +mosaic_affine_transform = [ + dict(type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] +train_pipeline = [ + *_base_.pre_transform, *mosaic_affine_transform, + dict(type='YOLOv5MixUp', + prob=_base_.mixup_prob, + pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]), + *_base_.last_transform[:-1], *final_transform +] + +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *final_transform] + +coco_train_dataset = dict(type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(persistent_workers=persistent_workers, + 
batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] +coco_val_dataset = dict(type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=test_pipeline) + +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict(param_scheduler=dict(scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict(max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0), + 'embeddings': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +val_evaluator = dict(_delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') diff --git a/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_sgd_2e-4_80e_8gpus_all_fine_tuning_coco.py b/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_sgd_2e-4_80e_8gpus_all_fine_tuning_coco.py new file mode 100644 index 00000000..46357ba0 --- /dev/null +++ b/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_sgd_2e-4_80e_8gpus_all_fine_tuning_coco.py @@ -0,0 +1,109 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 1e-3 +weight_decay = 0.0005 +train_batch_size_per_gpu = 16 +load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth' +persistent_workers = False + +# model settings +model = dict(type='YOLOWorldPromptDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + embedding_path='embeddings/clip_vit_b32_coco_80_embeddings.npy', + prompt_dim=text_channels, + num_prompts=80, + freeze_prompt=True, + data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'), + backbone=dict(_delete_=True, + type='MultiModalYOLOBackbone', + text_model=None, + image_model={{_base_.model.backbone}}, + with_text_model=False), + neck=dict(type='YOLOWorldPAFPN', + freeze_all=False, + 
guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict( + type='YOLOWorldHeadModule', + freeze_all=False, + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +coco_train_dataset = dict(type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline) + +train_dataloader = dict(persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) + +coco_val_dataset = dict(type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.test_pipeline) + +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict(param_scheduler=dict(scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict(max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=_base_.train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) + +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='SGD', + lr=base_lr, + momentum=0.937, + nesterov=True, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu)) + +# evaluation settings +val_evaluator = dict(_delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') diff --git a/configs/segmentation/README.md b/configs/segmentation/README.md index 5f816b96..8cfd3034 100644 --- a/configs/segmentation/README.md +++ b/configs/segmentation/README.md @@ -13,11 +13,14 @@ We provide two fine-tuning strategies YOLO-World towards open-vocabulary instanc | Model | Fine-tuning Data | Fine-tuning Modules| APmask | APr | APc | APf | Weights | | :---- | :--------------- | :----------------: | :--------------: | :------------: | :------------: | :------------: | :-----: | -| [YOLO-World-Seg-M](./configs/segmentation/yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py) | `LVIS-Base` | `all modules` | 25.9 | 13.4 | 24.9 | 32.6 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis-ca465825.pth) | -| [YOLO-World-Seg-L](./configs/segmentation/yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py) | `LVIS-Base` | `all modules` | 28.7 | 15.0 | 28.3 | 35.2| [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis-8c58c916.pth) | -| [YOLO-World-Seg-M](./configs/segmentation/yolo_seg_world_m_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis.py) | 
`LVIS-Base` | `seg head` | 16.7 | 12.6 | 14.6 | 20.8 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis-7bca59a7.pth) | -| [YOLO-World-Seg-L](./configs/segmentation/yolo_seg_world_l_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis.py) | `LVIS-Base` | `seg head` | 19.1 | 14.2 | 17.2 | 23.5 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis-5a642d30.pth) | - +| [YOLO-World-Seg-M](./yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py) | `LVIS-Base` | `all modules` | 25.9 | 13.4 | 24.9 | 32.6 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis-ca465825.pth) | +| [YOLO-World-v2-Seg-M](./yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py) | `LVIS-Base` | `all modules` | 25.9 | 13.4 | 24.9 | 32.6 | [HF Checkpoints 🤗]() | +| [YOLO-World-Seg-L](./yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py) | `LVIS-Base` | `all modules` | 28.7 | 15.0 | 28.3 | 35.2| [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis-8c58c916.pth) | +| [YOLO-World-v2-Seg-L](./yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py) | `LVIS-Base` | `all modules` | 28.7 | 15.0 | 28.3 | 35.2| [HF Checkpoints 🤗]() | +| [YOLO-World-Seg-M](./yolo_seg_world_m_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis.py) | `LVIS-Base` | `seg head` | 16.7 | 12.6 | 14.6 | 20.8 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis-7bca59a7.pth) | +| [YOLO-World-v2-Seg-M](./yolo_world_v2_seg_m_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py) | `LVIS-Base` | `seg head` | 17.8 | 13.9 | 15.5 | 22.0 | [HF Checkpoints 🤗]() | +| [YOLO-World-Seg-L](yolo_seg_world_l_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis.py) | `LVIS-Base` | `seg head` | 19.1 | 14.2 | 17.2 | 23.5 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis-5a642d30.pth) | +| [YOLO-World-v2-Seg-L](./yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py) | `LVIS-Base` | `seg head` | 19.8 | 17.2 | 17.5 | 23.6 | [HF Checkpoints 🤗]() | **NOTE:** 1. The mask AP are evaluated on the LVIS `val 1.0`. 2. All models are fine-tuned for 80 epochs on `LVIS-Base` (866 categories, `common + frequent`). diff --git a/deploy/easydeploy/tools/image-demo.py b/deploy/easydeploy/tools/image-demo.py index c85f31a0..12ebaddc 100644 --- a/deploy/easydeploy/tools/image-demo.py +++ b/deploy/easydeploy/tools/image-demo.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from projects.easydeploy.model import ORTWrapper, TRTWrapper # isort:skip +from easydeploy.model import ORTWrapper, TRTWrapper # isort:skip import os import random from argparse import ArgumentParser diff --git a/deploy/image-demo.py b/deploy/image-demo.py new file mode 100644 index 00000000..12ebaddc --- /dev/null +++ b/deploy/image-demo.py @@ -0,0 +1,152 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
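+#
+# Standalone copy of the easydeploy image demo: it runs an exported ONNX model
+# (via ORTWrapper) or a TensorRT engine (.engine/.plan, via TRTWrapper) on a
+# set of images and draws the predicted boxes. A hedged usage sketch follows;
+# the paths and config name below are illustrative, not shipped files:
+#
+#   python deploy/image-demo.py demo/sample_images \
+#       path/to/deploy_config.py yolo_world_v2_s.onnx \
+#       --out-dir output --device cuda:0
+#
+# The config supplies the test pipeline and, optionally, a `class_name` list
+# used to label the drawn detections.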
+from easydeploy.model import ORTWrapper, TRTWrapper # isort:skip +import os +import random +from argparse import ArgumentParser + +import cv2 +import mmcv +import numpy as np +import torch +from mmcv.transforms import Compose +from mmdet.utils import get_test_pipeline_cfg +from mmengine.config import Config, ConfigDict +from mmengine.utils import ProgressBar, path + +from mmyolo.utils import register_all_modules +from mmyolo.utils.misc import get_file_list + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument( + 'img', help='Image path, include image file, dir and URL.') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--out-dir', default='./output', help='Path to output file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--show', action='store_true', help='Show the detection results') + args = parser.parse_args() + return args + + +def preprocess(config): + data_preprocess = config.get('model', {}).get('data_preprocessor', {}) + mean = data_preprocess.get('mean', [0., 0., 0.]) + std = data_preprocess.get('std', [1., 1., 1.]) + mean = torch.tensor(mean, dtype=torch.float32).reshape(1, 3, 1, 1) + std = torch.tensor(std, dtype=torch.float32).reshape(1, 3, 1, 1) + + class PreProcess(torch.nn.Module): + + def __init__(self): + super().__init__() + + def forward(self, x): + x = x[None].float() + x -= mean.to(x.device) + x /= std.to(x.device) + return x + + return PreProcess().eval() + + +def main(): + args = parse_args() + + # register all modules in mmdet into the registries + register_all_modules() + + colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(1000)] + + # build the model from a config file and a checkpoint file + if args.checkpoint.endswith('.onnx'): + model = ORTWrapper(args.checkpoint, args.device) + elif args.checkpoint.endswith('.engine') or args.checkpoint.endswith( + '.plan'): + model = TRTWrapper(args.checkpoint, args.device) + else: + raise NotImplementedError + + model.to(args.device) + + cfg = Config.fromfile(args.config) + class_names = cfg.get('class_name') + + test_pipeline = get_test_pipeline_cfg(cfg) + test_pipeline[0] = ConfigDict({'type': 'mmdet.LoadImageFromNDArray'}) + test_pipeline = Compose(test_pipeline) + + pre_pipeline = preprocess(cfg) + + if not args.show: + path.mkdir_or_exist(args.out_dir) + + # get file list + files, source_type = get_file_list(args.img) + + # start detector inference + progress_bar = ProgressBar(len(files)) + for i, file in enumerate(files): + bgr = mmcv.imread(file) + rgb = mmcv.imconvert(bgr, 'bgr', 'rgb') + data, samples = test_pipeline(dict(img=rgb, img_id=i)).values() + pad_param = samples.get('pad_param', + np.array([0, 0, 0, 0], dtype=np.float32)) + h, w = samples.get('ori_shape', rgb.shape[:2]) + pad_param = torch.asarray( + [pad_param[2], pad_param[0], pad_param[2], pad_param[0]], + device=args.device) + scale_factor = samples.get('scale_factor', [1., 1]) + scale_factor = torch.asarray(scale_factor * 2, device=args.device) + data = pre_pipeline(data).to(args.device) + + result = model(data) + if source_type['is_dir']: + filename = os.path.relpath(file, args.img).replace('/', '_') + else: + filename = os.path.basename(file) + out_file = None if args.show else os.path.join(args.out_dir, filename) + + # Get candidate predict info by num_dets + num_dets, bboxes, scores, labels = result + scores = scores[0, :num_dets] + bboxes = bboxes[0, 
:num_dets] + labels = labels[0, :num_dets] + bboxes -= pad_param + bboxes /= scale_factor + + bboxes[:, 0::2].clamp_(0, w) + bboxes[:, 1::2].clamp_(0, h) + bboxes = bboxes.round().int() + + for (bbox, score, label) in zip(bboxes, scores, labels): + bbox = bbox.tolist() + color = colors[label] + + if class_names is not None: + label_name = class_names[label] + name = f'cls:{label_name}_score:{score:0.4f}' + else: + name = f'cls:{label}_score:{score:0.4f}' + + cv2.rectangle(bgr, bbox[:2], bbox[2:], color, 2) + cv2.putText( + bgr, + name, (bbox[0], bbox[1] - 2), + cv2.FONT_HERSHEY_SIMPLEX, + 2.0, [225, 255, 255], + thickness=3) + + if args.show: + mmcv.imshow(bgr, 'result', 0) + else: + mmcv.imwrite(bgr, out_file) + progress_bar.update() + + +if __name__ == '__main__': + main() diff --git a/yolo_world/models/detectors/yolo_world.py b/yolo_world/models/detectors/yolo_world.py index 91e869b4..12fa6010 100644 --- a/yolo_world/models/detectors/yolo_world.py +++ b/yolo_world/models/detectors/yolo_world.py @@ -43,7 +43,8 @@ def predict(self, img_feats, txt_feats = self.extract_feat(batch_inputs, batch_data_samples) - self.bbox_head.num_classes = txt_feats[0].shape[0] + self.bbox_head.num_classes = self.num_test_classes + # self.bbox_head.num_classes = txt_feats[0].shape[0] results_list = self.bbox_head.predict(img_feats, txt_feats, batch_data_samples, @@ -78,10 +79,13 @@ def extract_feat( if batch_data_samples is None: texts = self.texts txt_feats = self.text_feats - elif isinstance(batch_data_samples, dict): + elif isinstance(batch_data_samples, dict) and 'texts' in batch_data_samples: texts = batch_data_samples['texts'] - elif isinstance(batch_data_samples, list): + elif isinstance(batch_data_samples, list) and hasattr(batch_data_samples[0], 'texts'): texts = [data_sample.texts for data_sample in batch_data_samples] + elif hasattr(self, 'text_feats'): + texts = self.texts + txt_feats = self.text_feats else: raise TypeError('batch_data_samples should be dict or list.') if txt_feats is not None: