diff --git a/README.md b/README.md
index cc8bab26..93e380ff 100644
--- a/README.md
+++ b/README.md
@@ -91,16 +91,17 @@ We've pre-trained YOLO-World-S/M/L from scratch and evaluate on the `LVIS val-1.
| model | Pre-train Data | Size | APmini | APr | APc | APf | APval | APr | APc | APf | weights |
| :------------------------------------------------------------ | :------------------- | :------ | :----: | :--: | :--: | :--: | :---: | :--: | :--: | :--: | :------------------------------------------------------------: |
-| [YOLO-Worldv2-S](./configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) 🔥 | O365+GoldG | 640 | 22.7 | 16.3 | 20.8 | 25.5 | 17.3 | 11.3 | 14.9 | 22.7 |[HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_s_obj365v1_goldg_pretrain-55b943ea.pth)|
-| [YOLO-Worldv2-S](./configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py) 🔥 | O365+GoldG | 1280🔸 | 24.1 | 18.7 | 22.0 | 26.9 | 18.8 | 14.1 | 16.3 | 23.8 |[HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_s_obj365v1_goldg_pretrain_1280ft-fc4ff4f7.pth)|
-| [YOLO-Worldv2-M](./configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) 🔥 | O365+GoldG | 640 | 30.0 | 25.0 | 27.2 | 33.4 | 23.5 | 17.1 | 20.0 | 30.1 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_m_obj365v1_goldg_pretrain-c6237d5b.pth)|
-| [YOLO-Worldv2-M](./configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py) 🔥 | O365+GoldG | 1280🔸 | 31.6 | 24.5 | 29.0 | 35.1 | 25.3 | 19.3 | 22.0 | 31.7 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_m_obj365v1_goldg_pretrain_1280ft-77d0346d.pth)|
-| [YOLO-Worldv2-L](./configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) 🔥 | O365+GoldG | 640 | 33.0 | 22.6 | 32.0 | 35.8 | 26.0 | 18.6 | 23.0 | 32.6 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_obj365v1_goldg_pretrain-a82b1fe3.pth)|
-| [YOLO-Worldv2-L](./configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py) 🔥 | O365+GoldG | 1280🔸 | 34.6 | 29.2 | 32.8 | 37.2 | 27.6 | 21.9 | 24.2 | 34.0 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_obj365v1_goldg_pretrain_1280ft-9babe3f6.pth)|
-| [YOLO-Worldv2-L (CLIP-Large)](./configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) 🔥 | O365+GoldG | 640 | 33.0 | 22.6 | 32.0 | 35.8 | 26.0 | 18.6 | 23.0 | 32.6 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_obj365v1_goldg_pretrain-a82b1fe3.pth)|
-| [YOLO-Worldv2-L](./configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) 🔥 | O365+GoldG+CC3M-Lite | 640 | 32.9 | 25.3 | 31.1 | 35.8 | 26.1 | 20.6 | 22.6 | 32.3 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_obj365v1_goldg_cc3mlite_pretrain-ca93cd1f.pth)|
-| [YOLO-Worldv2-X](./configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) 🔥 | O365+GoldG+CC3M-Lite | 640 | 35.4 | 28.7 | 32.9 | 38.7 | 28.4 | 20.6 | 25.6 | 35.0 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain-8698fbfa.pth) |
-| [YOLO-Worldv2-XL](./configs/pretrain/yolo_world_v2_xl_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) 🔥 | O365+GoldG+CC3M-Lite | 640 | 36.0 | 25.8 | 34.1 | 39.5 | 29.1 | 21.1 | 26.3 | 35.8 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain-8698fbfa.pth) |
+| [YOLO-Worldv2-S](./configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 640 | 22.7 | 16.3 | 20.8 | 25.5 | 17.3 | 11.3 | 14.9 | 22.7 |[HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_s_obj365v1_goldg_pretrain-55b943ea.pth)|
+| [YOLO-Worldv2-S](./configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py) | O365+GoldG | 1280🔸 | 24.1 | 18.7 | 22.0 | 26.9 | 18.8 | 14.1 | 16.3 | 23.8 |[HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_s_obj365v1_goldg_pretrain_1280ft-fc4ff4f7.pth)|
+| [YOLO-Worldv2-M](./configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 640 | 30.0 | 25.0 | 27.2 | 33.4 | 23.5 | 17.1 | 20.0 | 30.1 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_m_obj365v1_goldg_pretrain-c6237d5b.pth)|
+| [YOLO-Worldv2-M](./configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py) | O365+GoldG | 1280🔸 | 31.6 | 24.5 | 29.0 | 35.1 | 25.3 | 19.3 | 22.0 | 31.7 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_m_obj365v1_goldg_pretrain_1280ft-77d0346d.pth)|
+| [YOLO-Worldv2-L](./configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 640 | 33.0 | 22.6 | 32.0 | 35.8 | 26.0 | 18.6 | 23.0 | 32.6 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_obj365v1_goldg_pretrain-a82b1fe3.pth)|
+| [YOLO-Worldv2-L](./configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py) | O365+GoldG | 1280🔸 | 34.6 | 29.2 | 32.8 | 37.2 | 27.6 | 21.9 | 24.2 | 34.0 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_obj365v1_goldg_pretrain_1280ft-9babe3f6.pth)|
+| [YOLO-Worldv2-L (CLIP-Large)](./configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) 🔥 | O365+GoldG | 640 | 34.0 | 22.0 | 32.6 | 37.4 | 27.1 | 19.9 | 23.9 | 33.9 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_clip_large_o365v1_goldg_pretrain-8ff2e744.pth)|
+| [YOLO-Worldv2-L (CLIP-Large)](./configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_800ft_lvis_minival.py) 🔥 | O365+GoldG | 800🔸 | 35.5 | 28.3 | 33.2 | 38.8 | 28.6 | 22.0 | 25.1 | 35.4 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_clip_large_o365v1_goldg_pretrain_800ft-9df82e55.pth)|
+| [YOLO-Worldv2-L](./configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG+CC3M-Lite | 640 | 32.9 | 25.3 | 31.1 | 35.8 | 26.1 | 20.6 | 22.6 | 32.3 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_obj365v1_goldg_cc3mlite_pretrain-ca93cd1f.pth)|
+| [YOLO-Worldv2-X](./configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG+CC3M-Lite | 640 | 35.4 | 28.7 | 32.9 | 38.7 | 28.4 | 20.6 | 25.6 | 35.0 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain-8698fbfa.pth) |
+| [YOLO-Worldv2-XL](./configs/pretrain/yolo_world_v2_xl_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG+CC3M-Lite | 640 | 36.0 | 25.8 | 34.1 | 39.5 | 29.1 | 21.1 | 26.3 | 35.8 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain-8698fbfa.pth) |
diff --git a/configs/finetune_coco/README.md b/configs/finetune_coco/README.md
index e1d91d1e..2d789467 100644
--- a/configs/finetune_coco/README.md
+++ b/configs/finetune_coco/README.md
@@ -15,12 +15,12 @@ BTW, the COCO fine-tuning results are updated with higher performance (with `mas
| model | Schedule | `mask-refine` | efficient neck | APZS| AP | AP50 | AP75 | weights | log |
| :---- | :-------: | :----------: |:-------------: | :------------: | :-: | :--------------:| :-------------: |:------: | :-: |
-| [YOLO-World-v2-S](./yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | AdamW, 2e-4, 80e | ✔️ | ✖️ | 37.5 | 45.7 | 62.0 | 49.9 | [HF Checkpoints]() | [log]() |
-| [YOLO-World-v2-M](./yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | AdamW, 2e-4, 80e | ✔️ | ✖️ | 42.8 | 51.0 | 67.5 | 55.2 | [HF Checkpoints]() | [log]() |
-| [YOLO-World-v2-L](./yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | AdamW, 2e-4, 80e | ✔️ | ✖️ | 45.1 | 53.9 | 70.9 | 58.8 | [HF Checkpoints]() | [log]() |
+| [YOLO-World-v2-S](./yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | AdamW, 2e-4, 80e | ✔️ | ✖️ | 37.5 | 46.1 | 62.0 | 49.9 | [HF Checkpoints](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_ep80-492dc329.pth) | [log](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_20240327_110411.log) |
+| [YOLO-World-v2-M](./yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | AdamW, 2e-4, 80e | ✔️ | ✖️ | 42.8 | 51.0 | 67.5 | 55.2 | [HF Checkpoints](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_ep80-69c27ac7.pth) | [log](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_20240327_110411.log) |
+| [YOLO-World-v2-L](./yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | AdamW, 2e-4, 80e | ✔️ | ✖️ | 45.1 | 53.9 | 70.9 | 58.8 | [HF Checkpoints](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_ep80-81c701ee.pth) | [log](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_20240326_160313.log) |
| [YOLO-World-v2-L](./yolo_world_v2_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | AdamW, 2e-4, 80e | ✔️ | ✔️ | 45.1 | | | | [HF Checkpoints]() | [log]() |
-| [YOLO-World-v2-X](./yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | AdamW, 2e-4, 80e | ✔️ | ✖️ | 46.8 | 54.7 | 71.6 | 59.6 | [HF Checkpoints]() | [log]() |
-| [YOLO-World-v2-L]() | SGD, 1e-3, 40e | ✖️ | ✖️ | 45.1 | | | | [HF Checkpoints]() | [log]() |
+| [YOLO-World-v2-X](./yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | AdamW, 2e-4, 80e | ✔️ | ✖️ | 46.8 | 54.7 | 71.6 | 59.6 | [HF Checkpoints](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_ep80-76bc0cbd.pth) | [log](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_20240322_181232.log) |
+| [YOLO-World-v2-L](./yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetune_coco.py) 🔥 | SGD, 1e-3, 40e | ✖️ | ✖️ | 45.1 | 52.8 | 69.5 | 57.8 | [HF Checkpoints](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetune_coco_ep80-e1288152.pth) | [log](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetuning_coco_20240327_014902.log) |
diff --git a/configs/finetune_coco/yolo_world_l_dual_vlpan_2e-4_80e_8gpus_finetune_coco.py b/configs/finetune_coco/yolo_world_l_dual_vlpan_2e-4_80e_8gpus_finetune_coco.py
index eb338e75..101a571d 100644
--- a/configs/finetune_coco/yolo_world_l_dual_vlpan_2e-4_80e_8gpus_finetune_coco.py
+++ b/configs/finetune_coco/yolo_world_l_dual_vlpan_2e-4_80e_8gpus_finetune_coco.py
@@ -166,8 +166,6 @@
         weight_decay=weight_decay,
         batch_size_per_gpu=train_batch_size_per_gpu),
     paramwise_cfg=dict(
-        bias_decay_mult=0.0,
-        norm_decay_mult=0.0,
         custom_keys={'backbone.text_model': dict(lr_mult=0.01),
                      'logit_scale': dict(weight_decay=0.0)}),
     constructor='YOLOWv5OptimizerConstructor')
diff --git a/configs/finetune_coco/yolo_world_l_dual_vlpan_2e-4_80e_8gpus_mask-refine_finetune_coco.py b/configs/finetune_coco/yolo_world_l_dual_vlpan_2e-4_80e_8gpus_mask-refine_finetune_coco.py
index 064abe97..2ddbe50d 100644
--- a/configs/finetune_coco/yolo_world_l_dual_vlpan_2e-4_80e_8gpus_mask-refine_finetune_coco.py
+++ b/configs/finetune_coco/yolo_world_l_dual_vlpan_2e-4_80e_8gpus_mask-refine_finetune_coco.py
@@ -169,12 +169,9 @@
         weight_decay=weight_decay,
         batch_size_per_gpu=train_batch_size_per_gpu),
     paramwise_cfg=dict(
-        bias_decay_mult=0.0,
-        norm_decay_mult=0.0,
         custom_keys={'backbone.text_model': dict(lr_mult=0.01),
                      'logit_scale': dict(weight_decay=0.0)}),
     constructor='YOLOWv5OptimizerConstructor')
-
 # evaluation settings
 val_evaluator = dict(
     _delete_=True,
diff --git a/configs/finetune_coco/yolo_world_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py b/configs/finetune_coco/yolo_world_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py
index 28553855..b5cdca50 100644
--- a/configs/finetune_coco/yolo_world_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py
+++ b/configs/finetune_coco/yolo_world_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py
@@ -139,21 +139,17 @@
                  val_interval=5,
                  dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                      _base_.val_interval_stage2)])
-optim_wrapper = dict(optimizer=dict(
-    _delete_=True,
-    type='AdamW',
-    lr=base_lr,
-    weight_decay=weight_decay,
-    batch_size_per_gpu=train_batch_size_per_gpu),
-
paramwise_cfg=dict(bias_decay_mult=0.0, - norm_decay_mult=0.0, - custom_keys={ - 'backbone.text_model': - dict(lr_mult=0.01), - 'logit_scale': - dict(weight_decay=0.0) - }), - constructor='YOLOWv5OptimizerConstructor') +optim_wrapper = dict( + optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict( + custom_keys={'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0)}), + constructor='YOLOWv5OptimizerConstructor') # evaluation settings val_evaluator = dict(_delete_=True, diff --git a/configs/finetune_coco/yolo_world_v2_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py b/configs/finetune_coco/yolo_world_v2_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py index 50fd7b3e..27531183 100644 --- a/configs/finetune_coco/yolo_world_v2_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py +++ b/configs/finetune_coco/yolo_world_v2_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py @@ -169,8 +169,6 @@ weight_decay=weight_decay, batch_size_per_gpu=train_batch_size_per_gpu), paramwise_cfg=dict( - bias_decay_mult=0.0, - norm_decay_mult=0.0, custom_keys={'backbone.text_model': dict(lr_mult=0.01), 'logit_scale': dict(weight_decay=0.0)}), constructor='YOLOWv5OptimizerConstructor') diff --git a/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_finetune_coco_womixup.py b/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_finetune_coco_womixup.py deleted file mode 100644 index 5490fd54..00000000 --- a/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_finetune_coco_womixup.py +++ /dev/null @@ -1,160 +0,0 @@ -_base_ = ('../../third_party/mmyolo/configs/yolov8/' - 'yolov8_l_syncbn_fast_8xb16-500e_coco.py') -custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) - -# hyper-parameters -num_classes = 80 -num_training_classes = 80 -max_epochs = 80 # Maximum training epochs -close_mosaic_epochs = 10 -save_epoch_intervals = 5 -text_channels = 512 -neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] -neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] -base_lr = 2e-4 -weight_decay = 0.05 -train_batch_size_per_gpu = 16 -load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth' -text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' -# text_model_name = 'openai/clip-vit-base-patch32' -persistent_workers = False - -# model settings -model = dict(type='YOLOWorldDetector', - mm_neck=True, - num_train_classes=num_training_classes, - num_test_classes=num_classes, - data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), - backbone=dict(_delete_=True, - type='MultiModalYOLOBackbone', - image_model={{_base_.model.backbone}}, - text_model=dict(type='HuggingCLIPLanguageBackbone', - model_name=text_model_name, - frozen_modules=['all'])), - neck=dict(type='YOLOWorldPAFPN', - guide_channels=text_channels, - embed_channels=neck_embed_channels, - num_heads=neck_num_heads, - block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), - bbox_head=dict(type='YOLOWorldHead', - head_module=dict( - type='YOLOWorldHeadModule', - use_bn_head=True, - embed_dims=text_channels, - num_classes=num_training_classes)), - train_cfg=dict(assigner=dict(num_classes=num_training_classes))) - -# dataset settings -text_transform = [ - dict(type='RandomLoadText', - num_neg_samples=(num_classes, num_classes), - 
max_num_samples=num_training_classes, - padding_to_max=True, - padding_value=''), - dict(type='mmdet.PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', - 'flip_direction', 'texts')) -] -mosaic_affine_transform = [ - dict(type='MultiModalMosaic', - img_scale=_base_.img_scale, - pad_val=114.0, - pre_transform=_base_.pre_transform), - dict( - type='YOLOv5RandomAffine', - max_rotate_degree=0.0, - max_shear_degree=0.0, - max_aspect_ratio=100., - scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), - # img_scale is (width, height) - border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), - border_val=(114, 114, 114)) -] -train_pipeline = [ - *_base_.pre_transform, *mosaic_affine_transform, - # dict(type='YOLOv5MultiModalMixUp', - # prob=_base_.mixup_prob, - # pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]), - *_base_.last_transform[:-1], *text_transform -] -train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] - -coco_train_dataset = dict(_delete_=True, - type='MultiModalDataset', - dataset=dict( - type='YOLOv5CocoDataset', - data_root='data/coco', - ann_file='annotations/instances_train2017.json', - data_prefix=dict(img='train2017/'), - filter_cfg=dict(filter_empty_gt=False, - min_size=32)), - class_text_path='data/texts/coco_class_texts.json', - pipeline=train_pipeline) - -train_dataloader = dict(persistent_workers=persistent_workers, - batch_size=train_batch_size_per_gpu, - collate_fn=dict(type='yolow_collate'), - dataset=coco_train_dataset) -test_pipeline = [ - *_base_.test_pipeline[:-1], - dict(type='LoadText'), - dict(type='mmdet.PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', - 'scale_factor', 'pad_param', 'texts')) -] -coco_val_dataset = dict( - _delete_=True, - type='MultiModalDataset', - dataset=dict(type='YOLOv5CocoDataset', - data_root='data/coco', - ann_file='annotations/instances_val2017.json', - data_prefix=dict(img='val2017/'), - filter_cfg=dict(filter_empty_gt=False, min_size=32)), - class_text_path='data/texts/coco_class_texts.json', - pipeline=test_pipeline) -val_dataloader = dict(dataset=coco_val_dataset) -test_dataloader = val_dataloader -# training settings -default_hooks = dict(param_scheduler=dict(scheduler_type='linear', - lr_factor=0.01, - max_epochs=max_epochs), - checkpoint=dict(max_keep_ckpts=-1, - save_best=None, - interval=save_epoch_intervals)) -custom_hooks = [ - dict(type='EMAHook', - ema_type='ExpMomentumEMA', - momentum=0.0001, - update_buffers=True, - strict_load=False, - priority=49), - dict(type='mmdet.PipelineSwitchHook', - switch_epoch=max_epochs - close_mosaic_epochs, - switch_pipeline=train_pipeline_stage2) -] -train_cfg = dict(max_epochs=max_epochs, - val_interval=5, - dynamic_intervals=[((max_epochs - close_mosaic_epochs), - _base_.val_interval_stage2)]) -optim_wrapper = dict(optimizer=dict( - _delete_=True, - type='AdamW', - lr=base_lr, - weight_decay=weight_decay, - batch_size_per_gpu=train_batch_size_per_gpu), - paramwise_cfg=dict(bias_decay_mult=0.0, - norm_decay_mult=0.0, - custom_keys={ - 'backbone.text_model': - dict(lr_mult=0.01), - 'logit_scale': - dict(weight_decay=0.0) - }), - constructor='YOLOWv5OptimizerConstructor') - -# evaluation settings -val_evaluator = dict(_delete_=True, - type='mmdet.CocoMetric', - proposal_nums=(100, 1, 10), - ann_file='data/coco/annotations/instances_val2017.json', - metric='bbox') diff --git 
a/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_adddecay_coco.py b/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_adddecay_coco.py deleted file mode 100644 index 79414898..00000000 --- a/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_adddecay_coco.py +++ /dev/null @@ -1,182 +0,0 @@ -_base_ = ( - '../../third_party/mmyolo/configs/yolov8/' - 'yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py') -custom_imports = dict( - imports=['yolo_world'], - allow_failed_imports=False) - -# hyper-parameters -num_classes = 80 -num_training_classes = 80 -max_epochs = 80 # Maximum training epochs -close_mosaic_epochs = 10 -save_epoch_intervals = 5 -text_channels = 512 -neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] -neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] -base_lr = 2e-4 -weight_decay = 0.05 -train_batch_size_per_gpu = 16 -load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth' -text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' -# text_model_name = 'openai/clip-vit-base-patch32' -persistent_workers = False - -# model settings -model = dict( - type='YOLOWorldDetector', - mm_neck=True, - num_train_classes=num_training_classes, - num_test_classes=num_classes, - data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), - backbone=dict( - _delete_=True, - type='MultiModalYOLOBackbone', - image_model={{_base_.model.backbone}}, - text_model=dict( - type='HuggingCLIPLanguageBackbone', - model_name=text_model_name, - frozen_modules=['all'])), - neck=dict(type='YOLOWorldPAFPN', - guide_channels=text_channels, - embed_channels=neck_embed_channels, - num_heads=neck_num_heads, - block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), - bbox_head=dict(type='YOLOWorldHead', - head_module=dict(type='YOLOWorldHeadModule', - use_bn_head=True, - embed_dims=text_channels, - num_classes=num_training_classes)), - train_cfg=dict(assigner=dict(num_classes=num_training_classes))) - -# dataset settings -text_transform = [ - dict(type='RandomLoadText', - num_neg_samples=(num_classes, num_classes), - max_num_samples=num_training_classes, - padding_to_max=True, - padding_value=''), - dict(type='mmdet.PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', - 'flip_direction', 'texts')) -] -mosaic_affine_transform = [ - dict( - type='MultiModalMosaic', - img_scale=_base_.img_scale, - pad_val=114.0, - pre_transform=_base_.pre_transform), - dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), - dict( - type='YOLOv5RandomAffine', - max_rotate_degree=0.0, - max_shear_degree=0.0, - max_aspect_ratio=100., - scaling_ratio_range=(1 - _base_.affine_scale, - 1 + _base_.affine_scale), - # img_scale is (width, height) - border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), - border_val=(114, 114, 114), - min_area_ratio=_base_.min_area_ratio, - use_mask_refine=_base_.use_mask2refine) -] -train_pipeline = [ - *_base_.pre_transform, - *mosaic_affine_transform, - dict( - type='YOLOv5MultiModalMixUp', - prob=_base_.mixup_prob, - pre_transform=[*_base_.pre_transform, - *mosaic_affine_transform]), - *_base_.last_transform[:-1], - *text_transform -] -train_pipeline_stage2 = [ - *_base_.train_pipeline_stage2[:-1], - *text_transform -] -coco_train_dataset = dict( - _delete_=True, - type='MultiModalDataset', - dataset=dict( - type='YOLOv5CocoDataset', - 
data_root='data/coco', - ann_file='annotations/instances_train2017.json', - data_prefix=dict(img='train2017/'), - filter_cfg=dict(filter_empty_gt=False, min_size=32)), - class_text_path='data/texts/coco_class_texts.json', - pipeline=train_pipeline) - -train_dataloader = dict( - persistent_workers=persistent_workers, - batch_size=train_batch_size_per_gpu, - collate_fn=dict(type='yolow_collate'), - dataset=coco_train_dataset) -test_pipeline = [ - *_base_.test_pipeline[:-1], - dict(type='LoadText'), - dict( - type='mmdet.PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', - 'scale_factor', 'pad_param', 'texts')) -] -coco_val_dataset = dict( - _delete_=True, - type='MultiModalDataset', - dataset=dict( - type='YOLOv5CocoDataset', - data_root='data/coco', - ann_file='annotations/instances_val2017.json', - data_prefix=dict(img='val2017/'), - filter_cfg=dict(filter_empty_gt=False, min_size=32)), - class_text_path='data/texts/coco_class_texts.json', - pipeline=test_pipeline) -val_dataloader = dict(dataset=coco_val_dataset) -test_dataloader = val_dataloader -# training settings -default_hooks = dict( - param_scheduler=dict( - scheduler_type='linear', - lr_factor=0.01, - max_epochs=max_epochs), - checkpoint=dict( - max_keep_ckpts=-1, - save_best=None, - interval=save_epoch_intervals)) -custom_hooks = [ - dict( - type='EMAHook', - ema_type='ExpMomentumEMA', - momentum=0.0001, - update_buffers=True, - strict_load=False, - priority=49), - dict( - type='mmdet.PipelineSwitchHook', - switch_epoch=max_epochs - close_mosaic_epochs, - switch_pipeline=train_pipeline_stage2) -] -train_cfg = dict( - max_epochs=max_epochs, - val_interval=5, - dynamic_intervals=[((max_epochs - close_mosaic_epochs), - _base_.val_interval_stage2)]) -optim_wrapper = dict( - optimizer=dict( - _delete_=True, - type='AdamW', - lr=base_lr, - weight_decay=weight_decay, - batch_size_per_gpu=train_batch_size_per_gpu), - paramwise_cfg=dict( - custom_keys={'backbone.text_model': dict(lr_mult=0.01), - 'logit_scale': dict(weight_decay=0.0)}), - constructor='YOLOWv5OptimizerConstructor') - -# evaluation settings -val_evaluator = dict( - _delete_=True, - type='mmdet.CocoMetric', - proposal_nums=(100, 1, 10), - ann_file='data/coco/annotations/instances_val2017.json', - metric='bbox') diff --git a/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py b/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py index 48e8b94e..d01affe6 100644 --- a/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py +++ b/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py @@ -169,12 +169,9 @@ weight_decay=weight_decay, batch_size_per_gpu=train_batch_size_per_gpu), paramwise_cfg=dict( - bias_decay_mult=0.0, - norm_decay_mult=0.0, custom_keys={'backbone.text_model': dict(lr_mult=0.01), 'logit_scale': dict(weight_decay=0.0)}), constructor='YOLOWv5OptimizerConstructor') - # evaluation settings val_evaluator = dict( _delete_=True, diff --git a/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_finetune_coco.py b/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetune_coco.py similarity index 90% rename from configs/finetune_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_finetune_coco.py rename to configs/finetune_coco/yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetune_coco.py index 8c02a5b8..3978469f 100644 --- 
a/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_finetune_coco.py +++ b/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetune_coco.py @@ -5,17 +5,17 @@ # hyper-parameters num_classes = 80 num_training_classes = 80 -max_epochs = 80 # Maximum training epochs -close_mosaic_epochs = 10 +max_epochs = 40 # Maximum training epochs +close_mosaic_epochs = 30 save_epoch_intervals = 5 text_channels = 512 neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] -base_lr = 2e-4 -weight_decay = 0.05 +base_lr = 1e-3 +weight_decay = 0.0005 train_batch_size_per_gpu = 16 load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth' -#text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' # text_model_name = 'openai/clip-vit-base-patch32' persistent_workers = False @@ -70,6 +70,7 @@ border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), border_val=(114, 114, 114)) ] + train_pipeline = [ *_base_.pre_transform, *mosaic_affine_transform, dict(type='YOLOv5MultiModalMixUp', @@ -138,18 +139,17 @@ _base_.val_interval_stage2)]) optim_wrapper = dict(optimizer=dict( _delete_=True, - type='AdamW', + type='SGD', lr=base_lr, + momentum=0.937, + nesterov=True, weight_decay=weight_decay, batch_size_per_gpu=train_batch_size_per_gpu), - paramwise_cfg=dict(bias_decay_mult=0.0, - norm_decay_mult=0.0, - custom_keys={ - 'backbone.text_model': - dict(lr_mult=0.01), - 'logit_scale': - dict(weight_decay=0.0) - }), + paramwise_cfg=dict( + custom_keys={ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), constructor='YOLOWv5OptimizerConstructor') # evaluation settings diff --git a/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_sgd_1e-3_80e_8gpus_mask-refine_finetune_adddecay_coco.py b/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_sgd_1e-3_80e_8gpus_mask-refine_finetune_coco.py similarity index 100% rename from configs/finetune_coco/yolo_world_v2_l_vlpan_bn_sgd_1e-3_80e_8gpus_mask-refine_finetune_adddecay_coco.py rename to configs/finetune_coco/yolo_world_v2_l_vlpan_bn_sgd_1e-3_80e_8gpus_mask-refine_finetune_coco.py diff --git a/configs/finetune_coco/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_adddecay_coco.py b/configs/finetune_coco/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_adddecay_coco.py deleted file mode 100644 index 830eebf5..00000000 --- a/configs/finetune_coco/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_adddecay_coco.py +++ /dev/null @@ -1,182 +0,0 @@ -_base_ = ( - '../../third_party/mmyolo/configs/yolov8/' - 'yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py') -custom_imports = dict( - imports=['yolo_world'], - allow_failed_imports=False) - -# hyper-parameters -num_classes = 80 -num_training_classes = 80 -max_epochs = 80 # Maximum training epochs -close_mosaic_epochs = 10 -save_epoch_intervals = 5 -text_channels = 512 -neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] -neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] -base_lr = 2e-4 -weight_decay = 0.05 -train_batch_size_per_gpu = 16 -load_from = 'pretrained_models/yolo_world_m_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_train-c6237d5b.pth' -text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' -# text_model_name = 'openai/clip-vit-base-patch32' 
-persistent_workers = False - -# model settings -model = dict( - type='YOLOWorldDetector', - mm_neck=True, - num_train_classes=num_training_classes, - num_test_classes=num_classes, - data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), - backbone=dict( - _delete_=True, - type='MultiModalYOLOBackbone', - image_model={{_base_.model.backbone}}, - text_model=dict( - type='HuggingCLIPLanguageBackbone', - model_name=text_model_name, - frozen_modules=['all'])), - neck=dict(type='YOLOWorldPAFPN', - guide_channels=text_channels, - embed_channels=neck_embed_channels, - num_heads=neck_num_heads, - block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), - bbox_head=dict(type='YOLOWorldHead', - head_module=dict(type='YOLOWorldHeadModule', - use_bn_head=True, - embed_dims=text_channels, - num_classes=num_training_classes)), - train_cfg=dict(assigner=dict(num_classes=num_training_classes))) - -# dataset settings -text_transform = [ - dict(type='RandomLoadText', - num_neg_samples=(num_classes, num_classes), - max_num_samples=num_training_classes, - padding_to_max=True, - padding_value=''), - dict(type='mmdet.PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', - 'flip_direction', 'texts')) -] -mosaic_affine_transform = [ - dict( - type='MultiModalMosaic', - img_scale=_base_.img_scale, - pad_val=114.0, - pre_transform=_base_.pre_transform), - dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), - dict( - type='YOLOv5RandomAffine', - max_rotate_degree=0.0, - max_shear_degree=0.0, - max_aspect_ratio=100., - scaling_ratio_range=(1 - _base_.affine_scale, - 1 + _base_.affine_scale), - # img_scale is (width, height) - border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), - border_val=(114, 114, 114), - min_area_ratio=_base_.min_area_ratio, - use_mask_refine=_base_.use_mask2refine) -] -train_pipeline = [ - *_base_.pre_transform, - *mosaic_affine_transform, - dict( - type='YOLOv5MultiModalMixUp', - prob=_base_.mixup_prob, - pre_transform=[*_base_.pre_transform, - *mosaic_affine_transform]), - *_base_.last_transform[:-1], - *text_transform -] -train_pipeline_stage2 = [ - *_base_.train_pipeline_stage2[:-1], - *text_transform -] -coco_train_dataset = dict( - _delete_=True, - type='MultiModalDataset', - dataset=dict( - type='YOLOv5CocoDataset', - data_root='data/coco', - ann_file='annotations/instances_train2017.json', - data_prefix=dict(img='train2017/'), - filter_cfg=dict(filter_empty_gt=False, min_size=32)), - class_text_path='data/texts/coco_class_texts.json', - pipeline=train_pipeline) - -train_dataloader = dict( - persistent_workers=persistent_workers, - batch_size=train_batch_size_per_gpu, - collate_fn=dict(type='yolow_collate'), - dataset=coco_train_dataset) -test_pipeline = [ - *_base_.test_pipeline[:-1], - dict(type='LoadText'), - dict( - type='mmdet.PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', - 'scale_factor', 'pad_param', 'texts')) -] -coco_val_dataset = dict( - _delete_=True, - type='MultiModalDataset', - dataset=dict( - type='YOLOv5CocoDataset', - data_root='data/coco', - ann_file='annotations/instances_val2017.json', - data_prefix=dict(img='val2017/'), - filter_cfg=dict(filter_empty_gt=False, min_size=32)), - class_text_path='data/texts/coco_class_texts.json', - pipeline=test_pipeline) -val_dataloader = dict(dataset=coco_val_dataset) -test_dataloader = val_dataloader -# training settings -default_hooks = dict( - param_scheduler=dict( - scheduler_type='linear', - lr_factor=0.01, - max_epochs=max_epochs), - checkpoint=dict( 
- max_keep_ckpts=-1, - save_best=None, - interval=save_epoch_intervals)) -custom_hooks = [ - dict( - type='EMAHook', - ema_type='ExpMomentumEMA', - momentum=0.0001, - update_buffers=True, - strict_load=False, - priority=49), - dict( - type='mmdet.PipelineSwitchHook', - switch_epoch=max_epochs - close_mosaic_epochs, - switch_pipeline=train_pipeline_stage2) -] -train_cfg = dict( - max_epochs=max_epochs, - val_interval=5, - dynamic_intervals=[((max_epochs - close_mosaic_epochs), - _base_.val_interval_stage2)]) -optim_wrapper = dict( - optimizer=dict( - _delete_=True, - type='AdamW', - lr=base_lr, - weight_decay=weight_decay, - batch_size_per_gpu=train_batch_size_per_gpu), - paramwise_cfg=dict( - custom_keys={'backbone.text_model': dict(lr_mult=0.01), - 'logit_scale': dict(weight_decay=0.0)}), - constructor='YOLOWv5OptimizerConstructor') - -# evaluation settings -val_evaluator = dict( - _delete_=True, - type='mmdet.CocoMetric', - proposal_nums=(100, 1, 10), - ann_file='data/coco/annotations/instances_val2017.json', - metric='bbox') diff --git a/configs/finetune_coco/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py b/configs/finetune_coco/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py index 5e017c84..32fcc51c 100644 --- a/configs/finetune_coco/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py +++ b/configs/finetune_coco/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py @@ -169,8 +169,6 @@ weight_decay=weight_decay, batch_size_per_gpu=train_batch_size_per_gpu), paramwise_cfg=dict( - bias_decay_mult=0.0, - norm_decay_mult=0.0, custom_keys={'backbone.text_model': dict(lr_mult=0.01), 'logit_scale': dict(weight_decay=0.0)}), constructor='YOLOWv5OptimizerConstructor') diff --git a/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_adddecay_coco.py b/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_adddecay_coco.py deleted file mode 100644 index 43cdda82..00000000 --- a/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_adddecay_coco.py +++ /dev/null @@ -1,184 +0,0 @@ -_base_ = ( - '../../third_party/mmyolo/configs/yolov8/' - 'yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py') -custom_imports = dict( - imports=['yolo_world'], - allow_failed_imports=False) - -# hyper-parameters -num_classes = 80 -num_training_classes = 80 -max_epochs = 80 # Maximum training epochs -close_mosaic_epochs = 10 -save_epoch_intervals = 5 -text_channels = 512 -neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] -neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] -base_lr = 2e-4 -weight_decay = 0.05 -train_batch_size_per_gpu = 16 -load_from = 'pretrained_models/yolo_world_s_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_train-55b943ea.pth' -text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' -# text_model_name = 'openai/clip-vit-base-patch32' -persistent_workers = False -mixup_prob = 0.15 -copypaste_prob = 0.3 - -# model settings -model = dict( - type='YOLOWorldDetector', - mm_neck=True, - num_train_classes=num_training_classes, - num_test_classes=num_classes, - data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), - backbone=dict( - _delete_=True, - type='MultiModalYOLOBackbone', - image_model={{_base_.model.backbone}}, - text_model=dict( - type='HuggingCLIPLanguageBackbone', - model_name=text_model_name, - frozen_modules=['all'])), - neck=dict(type='YOLOWorldPAFPN', - 
guide_channels=text_channels, - embed_channels=neck_embed_channels, - num_heads=neck_num_heads, - block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), - bbox_head=dict(type='YOLOWorldHead', - head_module=dict(type='YOLOWorldHeadModule', - use_bn_head=True, - embed_dims=text_channels, - num_classes=num_training_classes)), - train_cfg=dict(assigner=dict(num_classes=num_training_classes))) - -# dataset settings -text_transform = [ - dict(type='RandomLoadText', - num_neg_samples=(num_classes, num_classes), - max_num_samples=num_training_classes, - padding_to_max=True, - padding_value=''), - dict(type='mmdet.PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', - 'flip_direction', 'texts')) -] -mosaic_affine_transform = [ - dict( - type='MultiModalMosaic', - img_scale=_base_.img_scale, - pad_val=114.0, - pre_transform=_base_.pre_transform), - dict(type='YOLOv5CopyPaste', prob=copypaste_prob), - dict( - type='YOLOv5RandomAffine', - max_rotate_degree=0.0, - max_shear_degree=0.0, - max_aspect_ratio=100., - scaling_ratio_range=(1 - _base_.affine_scale, - 1 + _base_.affine_scale), - # img_scale is (width, height) - border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), - border_val=(114, 114, 114), - min_area_ratio=_base_.min_area_ratio, - use_mask_refine=_base_.use_mask2refine) -] -train_pipeline = [ - *_base_.pre_transform, - *mosaic_affine_transform, - dict( - type='YOLOv5MultiModalMixUp', - prob=mixup_prob, - pre_transform=[*_base_.pre_transform, - *mosaic_affine_transform]), - *_base_.last_transform[:-1], - *text_transform -] -train_pipeline_stage2 = [ - *_base_.train_pipeline_stage2[:-1], - *text_transform -] -coco_train_dataset = dict( - _delete_=True, - type='MultiModalDataset', - dataset=dict( - type='YOLOv5CocoDataset', - data_root='data/coco', - ann_file='annotations/instances_train2017.json', - data_prefix=dict(img='train2017/'), - filter_cfg=dict(filter_empty_gt=False, min_size=32)), - class_text_path='data/texts/coco_class_texts.json', - pipeline=train_pipeline) - -train_dataloader = dict( - persistent_workers=persistent_workers, - batch_size=train_batch_size_per_gpu, - collate_fn=dict(type='yolow_collate'), - dataset=coco_train_dataset) -test_pipeline = [ - *_base_.test_pipeline[:-1], - dict(type='LoadText'), - dict( - type='mmdet.PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', - 'scale_factor', 'pad_param', 'texts')) -] -coco_val_dataset = dict( - _delete_=True, - type='MultiModalDataset', - dataset=dict( - type='YOLOv5CocoDataset', - data_root='data/coco', - ann_file='annotations/instances_val2017.json', - data_prefix=dict(img='val2017/'), - filter_cfg=dict(filter_empty_gt=False, min_size=32)), - class_text_path='data/texts/coco_class_texts.json', - pipeline=test_pipeline) -val_dataloader = dict(dataset=coco_val_dataset) -test_dataloader = val_dataloader -# training settings -default_hooks = dict( - param_scheduler=dict( - scheduler_type='linear', - lr_factor=0.01, - max_epochs=max_epochs), - checkpoint=dict( - max_keep_ckpts=-1, - save_best=None, - interval=save_epoch_intervals)) -custom_hooks = [ - dict( - type='EMAHook', - ema_type='ExpMomentumEMA', - momentum=0.0001, - update_buffers=True, - strict_load=False, - priority=49), - dict( - type='mmdet.PipelineSwitchHook', - switch_epoch=max_epochs - close_mosaic_epochs, - switch_pipeline=train_pipeline_stage2) -] -train_cfg = dict( - max_epochs=max_epochs, - val_interval=5, - dynamic_intervals=[((max_epochs - close_mosaic_epochs), - _base_.val_interval_stage2)]) 
-optim_wrapper = dict( - optimizer=dict( - _delete_=True, - type='AdamW', - lr=base_lr, - weight_decay=weight_decay, - batch_size_per_gpu=train_batch_size_per_gpu), - paramwise_cfg=dict( - custom_keys={'backbone.text_model': dict(lr_mult=0.01), - 'logit_scale': dict(weight_decay=0.0)}), - constructor='YOLOWv5OptimizerConstructor') - -# evaluation settings -val_evaluator = dict( - _delete_=True, - type='mmdet.CocoMetric', - proposal_nums=(100, 1, 10), - ann_file='data/coco/annotations/instances_val2017.json', - metric='bbox') diff --git a/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py b/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py index b5645a96..49d2e4bd 100644 --- a/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py +++ b/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py @@ -171,8 +171,6 @@ weight_decay=weight_decay, batch_size_per_gpu=train_batch_size_per_gpu), paramwise_cfg=dict( - bias_decay_mult=0.0, - norm_decay_mult=0.0, custom_keys={'backbone.text_model': dict(lr_mult=0.01), 'logit_scale': dict(weight_decay=0.0)}), constructor='YOLOWv5OptimizerConstructor') diff --git a/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_nocopypaste_finetune_coco.py b/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_nocopypaste_finetune_coco.py deleted file mode 100644 index b598ca86..00000000 --- a/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_nocopypaste_finetune_coco.py +++ /dev/null @@ -1,185 +0,0 @@ -_base_ = ( - '../../third_party/mmyolo/configs/yolov8/' - 'yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py') -custom_imports = dict( - imports=['yolo_world'], - allow_failed_imports=False) - -# hyper-parameters -num_classes = 80 -num_training_classes = 80 -max_epochs = 80 # Maximum training epochs -close_mosaic_epochs = 10 -save_epoch_intervals = 5 -text_channels = 512 -neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] -neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] -base_lr = 2e-4 -weight_decay = 0.05 -train_batch_size_per_gpu = 16 -load_from = 'pretrained_models/yolo_world_s_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_train-55b943ea.pth' -text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' -# text_model_name = 'openai/clip-vit-base-patch32' -persistent_workers = False -mixup_prob = 0.15 -copypaste_prob = 0.3 - -# model settings -model = dict( - type='YOLOWorldDetector', - mm_neck=True, - num_train_classes=num_training_classes, - num_test_classes=num_classes, - data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), - backbone=dict( - _delete_=True, - type='MultiModalYOLOBackbone', - image_model={{_base_.model.backbone}}, - text_model=dict( - type='HuggingCLIPLanguageBackbone', - model_name=text_model_name, - frozen_modules=['all'])), - neck=dict(type='YOLOWorldPAFPN', - guide_channels=text_channels, - embed_channels=neck_embed_channels, - num_heads=neck_num_heads, - block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), - bbox_head=dict(type='YOLOWorldHead', - head_module=dict(type='YOLOWorldHeadModule', - use_bn_head=True, - embed_dims=text_channels, - num_classes=num_training_classes)), - train_cfg=dict(assigner=dict(num_classes=num_training_classes))) - -# dataset settings -text_transform = [ - dict(type='RandomLoadText', - num_neg_samples=(num_classes, num_classes), - 
max_num_samples=num_training_classes, - padding_to_max=True, - padding_value=''), - dict(type='mmdet.PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', - 'flip_direction', 'texts')) -] -mosaic_affine_transform = [ - dict( - type='MultiModalMosaic', - img_scale=_base_.img_scale, - pad_val=114.0, - pre_transform=_base_.pre_transform), - dict( - type='YOLOv5RandomAffine', - max_rotate_degree=0.0, - max_shear_degree=0.0, - max_aspect_ratio=100., - scaling_ratio_range=(1 - _base_.affine_scale, - 1 + _base_.affine_scale), - # img_scale is (width, height) - border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), - border_val=(114, 114, 114), - min_area_ratio=_base_.min_area_ratio, - use_mask_refine=_base_.use_mask2refine) -] -train_pipeline = [ - *_base_.pre_transform, - *mosaic_affine_transform, - dict( - type='YOLOv5MultiModalMixUp', - prob=mixup_prob, - pre_transform=[*_base_.pre_transform, - *mosaic_affine_transform]), - *_base_.last_transform[:-1], - *text_transform -] -train_pipeline_stage2 = [ - *_base_.train_pipeline_stage2[:-1], - *text_transform -] -coco_train_dataset = dict( - _delete_=True, - type='MultiModalDataset', - dataset=dict( - type='YOLOv5CocoDataset', - data_root='data/coco', - ann_file='annotations/instances_train2017.json', - data_prefix=dict(img='train2017/'), - filter_cfg=dict(filter_empty_gt=False, min_size=32)), - class_text_path='data/texts/coco_class_texts.json', - pipeline=train_pipeline) - -train_dataloader = dict( - persistent_workers=persistent_workers, - batch_size=train_batch_size_per_gpu, - collate_fn=dict(type='yolow_collate'), - dataset=coco_train_dataset) -test_pipeline = [ - *_base_.test_pipeline[:-1], - dict(type='LoadText'), - dict( - type='mmdet.PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', - 'scale_factor', 'pad_param', 'texts')) -] -coco_val_dataset = dict( - _delete_=True, - type='MultiModalDataset', - dataset=dict( - type='YOLOv5CocoDataset', - data_root='data/coco', - ann_file='annotations/instances_val2017.json', - data_prefix=dict(img='val2017/'), - filter_cfg=dict(filter_empty_gt=False, min_size=32)), - class_text_path='data/texts/coco_class_texts.json', - pipeline=test_pipeline) -val_dataloader = dict(dataset=coco_val_dataset) -test_dataloader = val_dataloader -# training settings -default_hooks = dict( - param_scheduler=dict( - scheduler_type='linear', - lr_factor=0.01, - max_epochs=max_epochs), - checkpoint=dict( - max_keep_ckpts=-1, - save_best=None, - interval=save_epoch_intervals)) -custom_hooks = [ - dict( - type='EMAHook', - ema_type='ExpMomentumEMA', - momentum=0.0001, - update_buffers=True, - strict_load=False, - priority=49), - dict( - type='mmdet.PipelineSwitchHook', - switch_epoch=max_epochs - close_mosaic_epochs, - switch_pipeline=train_pipeline_stage2) -] -train_cfg = dict( - max_epochs=max_epochs, - val_interval=5, - dynamic_intervals=[((max_epochs - close_mosaic_epochs), - _base_.val_interval_stage2)]) -optim_wrapper = dict( - optimizer=dict( - _delete_=True, - type='AdamW', - lr=base_lr, - weight_decay=weight_decay, - batch_size_per_gpu=train_batch_size_per_gpu), - paramwise_cfg=dict( - bias_decay_mult=0.0, - norm_decay_mult=0.0, - custom_keys={'backbone.text_model': dict(lr_mult=0.01), - 'logit_scale': dict(weight_decay=0.0)}), - constructor='YOLOWv5OptimizerConstructor') - -# evaluation settings -val_evaluator = dict( - _delete_=True, - type='mmdet.CocoMetric', - proposal_nums=(100, 1, 10), - ann_file='data/coco/annotations/instances_val2017.json', - 
metric='bbox') diff --git a/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_sgd_2e-4_80e_8gpus_mask-refine_nocopypaste_finetune_coco.py b/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_sgd_2e-4_80e_8gpus_mask-refine_nocopypaste_finetune_coco.py deleted file mode 100644 index 8e0766ea..00000000 --- a/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_sgd_2e-4_80e_8gpus_mask-refine_nocopypaste_finetune_coco.py +++ /dev/null @@ -1,180 +0,0 @@ -_base_ = ( - '../../third_party/mmyolo/configs/yolov8/' - 'yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py') -custom_imports = dict( - imports=['yolo_world'], - allow_failed_imports=False) - -# hyper-parameters -num_classes = 80 -num_training_classes = 80 -max_epochs = 80 # Maximum training epochs -close_mosaic_epochs = 10 -save_epoch_intervals = 5 -text_channels = 512 -neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] -neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] -base_lr = 1e-3 -weight_decay = 0.0005 -train_batch_size_per_gpu = 16 -load_from = 'pretrained_models/yolo_world_s_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_train-55b943ea.pth' -text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' -# text_model_name = 'openai/clip-vit-base-patch32' -persistent_workers = False -mixup_prob = 0.15 -copypaste_prob = 0.3 - -# model settings -model = dict( - type='YOLOWorldDetector', - mm_neck=True, - num_train_classes=num_training_classes, - num_test_classes=num_classes, - data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), - backbone=dict( - _delete_=True, - type='MultiModalYOLOBackbone', - image_model={{_base_.model.backbone}}, - text_model=dict( - type='HuggingCLIPLanguageBackbone', - model_name=text_model_name, - frozen_modules=['all'])), - neck=dict(type='YOLOWorldPAFPN', - guide_channels=text_channels, - embed_channels=neck_embed_channels, - num_heads=neck_num_heads, - block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), - bbox_head=dict(type='YOLOWorldHead', - head_module=dict(type='YOLOWorldHeadModule', - use_bn_head=True, - embed_dims=text_channels, - num_classes=num_training_classes)), - train_cfg=dict(assigner=dict(num_classes=num_training_classes))) - -# dataset settings -text_transform = [ - dict(type='RandomLoadText', - num_neg_samples=(num_classes, num_classes), - max_num_samples=num_training_classes, - padding_to_max=True, - padding_value=''), - dict(type='mmdet.PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', - 'flip_direction', 'texts')) -] -mosaic_affine_transform = [ - dict( - type='MultiModalMosaic', - img_scale=_base_.img_scale, - pad_val=114.0, - pre_transform=_base_.pre_transform), - dict( - type='YOLOv5RandomAffine', - max_rotate_degree=0.0, - max_shear_degree=0.0, - max_aspect_ratio=100., - scaling_ratio_range=(1 - _base_.affine_scale, - 1 + _base_.affine_scale), - # img_scale is (width, height) - border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), - border_val=(114, 114, 114), - min_area_ratio=_base_.min_area_ratio, - use_mask_refine=_base_.use_mask2refine) -] -train_pipeline = [ - *_base_.pre_transform, - *mosaic_affine_transform, - dict( - type='YOLOv5MultiModalMixUp', - prob=mixup_prob, - pre_transform=[*_base_.pre_transform, - *mosaic_affine_transform]), - *_base_.last_transform[:-1], - *text_transform -] -train_pipeline_stage2 = [ - *_base_.train_pipeline_stage2[:-1], - *text_transform -] -coco_train_dataset = dict( - _delete_=True, - type='MultiModalDataset', - dataset=dict( - type='YOLOv5CocoDataset', - 
data_root='data/coco', - ann_file='annotations/instances_train2017.json', - data_prefix=dict(img='train2017/'), - filter_cfg=dict(filter_empty_gt=False, min_size=32)), - class_text_path='data/texts/coco_class_texts.json', - pipeline=train_pipeline) - -train_dataloader = dict( - persistent_workers=persistent_workers, - batch_size=train_batch_size_per_gpu, - collate_fn=dict(type='yolow_collate'), - dataset=coco_train_dataset) -test_pipeline = [ - *_base_.test_pipeline[:-1], - dict(type='LoadText'), - dict( - type='mmdet.PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', - 'scale_factor', 'pad_param', 'texts')) -] -coco_val_dataset = dict( - _delete_=True, - type='MultiModalDataset', - dataset=dict( - type='YOLOv5CocoDataset', - data_root='data/coco', - ann_file='annotations/instances_val2017.json', - data_prefix=dict(img='val2017/'), - filter_cfg=dict(filter_empty_gt=False, min_size=32)), - class_text_path='data/texts/coco_class_texts.json', - pipeline=test_pipeline) -val_dataloader = dict(dataset=coco_val_dataset) -test_dataloader = val_dataloader -# training settings -default_hooks = dict( - param_scheduler=dict( - scheduler_type='linear', - lr_factor=0.01, - max_epochs=max_epochs), - checkpoint=dict( - max_keep_ckpts=-1, - save_best=None, - interval=save_epoch_intervals)) -custom_hooks = [ - dict( - type='EMAHook', - ema_type='ExpMomentumEMA', - momentum=0.0001, - update_buffers=True, - strict_load=False, - priority=49), - dict( - type='mmdet.PipelineSwitchHook', - switch_epoch=max_epochs - close_mosaic_epochs, - switch_pipeline=train_pipeline_stage2) -] -train_cfg = dict( - max_epochs=max_epochs, - val_interval=5, - dynamic_intervals=[((max_epochs - close_mosaic_epochs), - _base_.val_interval_stage2)]) -optim_wrapper = dict(optimizer=dict( - _delete_=True, - type='SGD', - lr=base_lr, - momentum=0.937, - nesterov=True, - weight_decay=weight_decay, - batch_size_per_gpu=train_batch_size_per_gpu)) - -# evaluation settings -val_evaluator = dict( - _delete_=True, - type='mmdet.CocoMetric', - proposal_nums=(100, 1, 10), - ann_file='data/coco/annotations/instances_val2017.json', - metric='bbox') diff --git a/configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_adddecay_coco.py b/configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_adddecay_coco.py deleted file mode 100644 index 25c30fb0..00000000 --- a/configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_adddecay_coco.py +++ /dev/null @@ -1,184 +0,0 @@ -_base_ = ( - '../../third_party/mmyolo/configs/yolov8/' - 'yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py') -custom_imports = dict( - imports=['yolo_world'], - allow_failed_imports=False) - -# hyper-parameters -num_classes = 80 -num_training_classes = 80 -max_epochs = 80 # Maximum training epochs -close_mosaic_epochs = 10 -save_epoch_intervals = 5 -text_channels = 512 -neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] -neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] -base_lr = 2e-4 -weight_decay = 0.05 -train_batch_size_per_gpu = 16 -load_from = 'pretrained_models/yolo_world_x_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc250k_train_lviseval-8698fbfa.pth' -text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' -# text_model_name = 'openai/clip-vit-base-patch32' -persistent_workers = False - -# model settings -model = dict( - type='YOLOWorldDetector', - mm_neck=True, - num_train_classes=num_training_classes, - 
num_test_classes=num_classes, - data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), - backbone=dict( - _delete_=True, - type='MultiModalYOLOBackbone', - image_model={{_base_.model.backbone}}, - text_model=dict( - type='HuggingCLIPLanguageBackbone', - model_name=text_model_name, - frozen_modules=['all'])), - neck=dict(type='YOLOWorldPAFPN', - guide_channels=text_channels, - embed_channels=neck_embed_channels, - num_heads=neck_num_heads, - block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), - bbox_head=dict(type='YOLOWorldHead', - head_module=dict(type='YOLOWorldHeadModule', - use_bn_head=True, - embed_dims=text_channels, - num_classes=num_training_classes)), - train_cfg=dict(assigner=dict(num_classes=num_training_classes))) - -# dataset settings -text_transform = [ - dict(type='RandomLoadText', - num_neg_samples=(num_classes, num_classes), - max_num_samples=num_training_classes, - padding_to_max=True, - padding_value=''), - dict(type='mmdet.PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', - 'flip_direction', 'texts')) -] -mosaic_affine_transform = [ - dict( - type='MultiModalMosaic', - img_scale=_base_.img_scale, - pad_val=114.0, - pre_transform=_base_.pre_transform), - dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), - dict( - type='YOLOv5RandomAffine', - max_rotate_degree=0.0, - max_shear_degree=0.0, - max_aspect_ratio=100., - scaling_ratio_range=(1 - _base_.affine_scale, - 1 + _base_.affine_scale), - # img_scale is (width, height) - border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), - border_val=(114, 114, 114), - min_area_ratio=_base_.min_area_ratio, - use_mask_refine=_base_.use_mask2refine) -] -train_pipeline = [ - *_base_.pre_transform, - *mosaic_affine_transform, - dict( - type='YOLOv5MultiModalMixUp', - prob=_base_.mixup_prob, - pre_transform=[*_base_.pre_transform, - *mosaic_affine_transform]), - *_base_.last_transform[:-1], - *text_transform -] -train_pipeline_stage2 = [ - *_base_.train_pipeline_stage2[:-1], - *text_transform -] -coco_train_dataset = dict( - _delete_=True, - type='MultiModalDataset', - dataset=dict( - type='YOLOv5CocoDataset', - data_root='data/coco', - ann_file='annotations/instances_train2017.json', - data_prefix=dict(img='train2017/'), - filter_cfg=dict(filter_empty_gt=False, min_size=32)), - class_text_path='data/texts/coco_class_texts.json', - pipeline=train_pipeline) - -train_dataloader = dict( - persistent_workers=persistent_workers, - batch_size=train_batch_size_per_gpu, - collate_fn=dict(type='yolow_collate'), - dataset=coco_train_dataset) - -test_pipeline = [ - *_base_.test_pipeline[:-1], - dict(type='LoadText'), - dict( - type='mmdet.PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', - 'scale_factor', 'pad_param', 'texts')) -] - -coco_val_dataset = dict( - _delete_=True, - type='MultiModalDataset', - dataset=dict( - type='YOLOv5CocoDataset', - data_root='data/coco', - ann_file='annotations/instances_val2017.json', - data_prefix=dict(img='val2017/'), - filter_cfg=dict(filter_empty_gt=False, min_size=32)), - class_text_path='data/texts/coco_class_texts.json', - pipeline=test_pipeline) -val_dataloader = dict(dataset=coco_val_dataset) -test_dataloader = val_dataloader -# training settings -default_hooks = dict( - param_scheduler=dict( - scheduler_type='linear', - lr_factor=0.01, - max_epochs=max_epochs), - checkpoint=dict( - max_keep_ckpts=-1, - save_best=None, - interval=save_epoch_intervals)) -custom_hooks = [ - dict( - type='EMAHook', - ema_type='ExpMomentumEMA', - 
momentum=0.0001, - update_buffers=True, - strict_load=False, - priority=49), - dict( - type='mmdet.PipelineSwitchHook', - switch_epoch=max_epochs - close_mosaic_epochs, - switch_pipeline=train_pipeline_stage2) -] -train_cfg = dict( - max_epochs=max_epochs, - val_interval=5, - dynamic_intervals=[((max_epochs - close_mosaic_epochs), - _base_.val_interval_stage2)]) -optim_wrapper = dict( - optimizer=dict( - _delete_=True, - type='AdamW', - lr=base_lr, - weight_decay=weight_decay, - batch_size_per_gpu=train_batch_size_per_gpu), - paramwise_cfg=dict( - custom_keys={'backbone.text_model': dict(lr_mult=0.01), - 'logit_scale': dict(weight_decay=0.0)}), - constructor='YOLOWv5OptimizerConstructor') - -# evaluation settings -val_evaluator = dict( - _delete_=True, - type='mmdet.CocoMetric', - proposal_nums=(100, 1, 10), - ann_file='data/coco/annotations/instances_val2017.json', - metric='bbox') diff --git a/configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py b/configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py index 4b52bacc..6ce88a89 100644 --- a/configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py +++ b/configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py @@ -171,12 +171,9 @@ weight_decay=weight_decay, batch_size_per_gpu=train_batch_size_per_gpu), paramwise_cfg=dict( - bias_decay_mult=0.0, - norm_decay_mult=0.0, custom_keys={'backbone.text_model': dict(lr_mult=0.01), 'logit_scale': dict(weight_decay=0.0)}), constructor='YOLOWv5OptimizerConstructor') - # evaluation settings val_evaluator = dict( _delete_=True, diff --git a/configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_800ft_lvis_minival.py b/configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_800ft_lvis_minival.py new file mode 100644 index 00000000..316f7cd1 --- /dev/null +++ b/configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_800ft_lvis_minival.py @@ -0,0 +1,200 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.0125 +train_batch_size_per_gpu = 16 +# text_model_name = '../pretrained_models/clip-vit-large-patch14-336' +text_model_name = 'openai/clip-vit-large-patch14-336' +img_scale = (800, 800) + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + 
use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] + +train_pipeline_stage2 = [ + *_base_.pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform +] + +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict(type='LoadTextFixed'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] + +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader 
= dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py new file mode 100644 index 00000000..ddbba83a --- /dev/null +++ b/configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -0,0 +1,171 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.0125 +train_batch_size_per_gpu = 16 +# text_model_name = '../pretrained_models/clip-vit-large-patch14-336' +text_model_name = 'openai/clip-vit-large-patch14-336' +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + 
*_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + 
constructor='YOLOWv5OptimizerConstructor')
diff --git a/configs/prompt_tuning_coco/READEME.md b/configs/prompt_tuning_coco/READEME.md
index dd63e752..2888d1bf 100644
--- a/configs/prompt_tuning_coco/READEME.md
+++ b/configs/prompt_tuning_coco/READEME.md
@@ -1,5 +1,10 @@
 ## Prompt Tuning for YOLO-World
+### NOTE:
+
+This folder contains many experimental config files, which will be removed in a later release.
+
+### Experimental Results
 | Model | Config | AP | AP50 | AP75 | APS | APM | APL |
 | :---- | :----: | :--: | :--: | :---: | :-: | :-: | :-: |
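As a quick orientation for the new CLIP-Large pre-training configs added above, here is a minimal sketch of how such a config could be launched with MMEngine's `Runner`. The config path is taken from this diff; the `work_dir` value and the standalone-launch flow are illustrative assumptions, not part of this change.

```python
# Illustrative sketch only: run one of the new CLIP-Large pre-training configs
# through MMEngine's Runner. Assumes YOLO-World and its dependencies
# (mmengine, mmdet, mmyolo) are installed and that the datasets referenced by
# the config are prepared under data/. The work_dir below is a made-up example.
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile(
    'configs/pretrain/'
    'yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py')
cfg.work_dir = 'work_dirs/yolo_world_v2_l_clip_large_pretrain'  # hypothetical output dir

runner = Runner.from_cfg(cfg)  # builds the model, dataloaders and hooks from the config
runner.train()  # for evaluation only, set cfg.load_from to a checkpoint and call runner.test()
```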