@@ -964,11 +964,13 @@ def _convert_openai_clip(
             v = v.unsqueeze(0)
             if v.shape[1] != model.pos_embed.shape[1]:
                 # To resize pos embedding when using model at different size from pretrained weights
-                v = resize_pos_embed(
+                num_prefix_tokens = 0 if getattr(model, 'no_embed_class', False) \
+                    else getattr(model, 'num_prefix_tokens', 1)
+                v = resample_abs_pos_embed(
                     v,
-                    model.pos_embed,
-                    0 if getattr(model, 'no_embed_class') else getattr(model, 'num_prefix_tokens', 1),
-                    model.patch_embed.grid_size
+                    new_size=model.patch_embed.grid_size,
+                    num_prefix_tokens=num_prefix_tokens,
+                    verbose=True,
                 )
         out_dict[k] = v
     return out_dict
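For orientation, the hunk above swaps the old resize_pos_embed call for resample_abs_pos_embed, passing the target grid and the number of prefix (class) tokens explicitly. Below is a standalone sketch of that call, assuming a recent timm where resample_abs_pos_embed is importable from timm.layers; the shapes are illustrative and not taken from this diff:

import torch
from timm.layers import resample_abs_pos_embed

# Fake CLIP-style positional embedding: 1 class token + a 14x14 patch grid, width 768.
posemb = torch.randn(1, 1 + 14 * 14, 768)

# Resample to a 21x21 grid (e.g. 336x336 input with 16x16 patches) while leaving
# the single prefix token untouched; verbose=True logs the old -> new grid size.
posemb_new = resample_abs_pos_embed(
    posemb,
    new_size=(21, 21),
    num_prefix_tokens=1,
    verbose=True,
)
print(posemb_new.shape)  # torch.Size([1, 1 + 21 * 21, 768])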
@@ -1015,8 +1017,6 @@ def checkpoint_filter_fn(
         return _convert_openai_clip(state_dict, model)
     elif 'module.visual.class_embedding' in state_dict:
         return _convert_openai_clip(state_dict, model, prefix='module.visual.')
-    elif '_image_encoder.module.visual.class_embedding' in state_dict:
-        return _convert_openai_clip(state_dict, model, prefix='_image_encoder.module.visual.')
 
     if "mask_token" in state_dict:
         state_dict = _convert_dinov2(state_dict, model)
@@ -1737,20 +1737,24 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         input_size=(3, 384, 384),
         num_classes=0),
 
-    'vit_8m_patch16_tinyclip_224.yfcc15m': _cfg(
-        url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-8M-16-Text-3M-YFCC15M.pt',
+    'vit_xsmall_patch16_clip_224.tinyclip_yfcc15m': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='mit',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-    'vit_39m_patch16_tinyclip_224.yfcc15m': _cfg(
-        url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-39M-16-Text-19M-YFCC15M.pt',
+    'vit_medium_patch32_clip_224.tinyclip_laion400m': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='mit',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-    'vit_40m_patch32_tinyclip_224.laion400m': _cfg(
-        url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-40M-32-Text-19M-LAION400M.pt',
+    'vit_medium_patch16_clip_224.tinyclip_yfcc15m': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='mit',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-    'vit_61m_patch32_tinyclip_224.laion400m': _cfg(
-        url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-61M-32-Text-29M-LAION400M.pt',
+    'vit_betwixt_patch32_clip_224.tinyclip_laion400m': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='mit',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
 
@@ -2092,6 +2096,44 @@ def vit_giant_patch16_gap_224(pretrained: bool = False, **kwargs) -> VisionTrans
     return model
 
 
+@register_model
+def vit_xsmall_patch16_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # TinyCLIP 8M
+    model_args = dict(embed_dim=256, depth=10, num_heads=4, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_xsmall_patch16_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_medium_patch32_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # TinyCLIP 40M
+    model_args = dict(
+        patch_size=32, embed_dim=512, depth=12, num_heads=8, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_medium_patch32_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_medium_patch16_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # TinyCLIP 39M
+    model_args = dict(embed_dim=512, depth=12, num_heads=8, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_medium_patch16_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_betwixt_patch32_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # TinyCLIP 61M
+    model_args = dict(
+        patch_size=32, embed_dim=640, depth=12, num_heads=10, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_betwixt_patch32_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_base_patch32_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     """ ViT-B/32 CLIP image tower @ 224x224
@@ -2640,40 +2682,6 @@ def vit_so400m_patch14_siglip_384(pretrained: bool = False, **kwargs) -> VisionT
     return model
 
 
-@register_model
-def vit_8m_patch16_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
-    model_args = dict(embed_dim=256, depth=10, num_heads=4, pre_norm=True, norm_layer=nn.LayerNorm)
-    model = _create_vision_transformer(
-        'vit_8m_patch16_tinyclip_224', pretrained=pretrained, **dict(model_args, **kwargs))
-    return model
-
-
-@register_model
-def vit_39m_patch16_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
-    model_args = dict(embed_dim=512, depth=12, num_heads=8, pre_norm=True, norm_layer=nn.LayerNorm)
-    model = _create_vision_transformer(
-        'vit_39m_patch16_tinyclip_224', pretrained=pretrained, **dict(model_args, **kwargs))
-    return model
-
-
-@register_model
-def vit_40m_patch32_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
-    model_args = dict(
-        patch_size=32, embed_dim=512, depth=12, num_heads=8, pre_norm=True, norm_layer=nn.LayerNorm)
-    model = _create_vision_transformer(
-        'vit_40m_patch32_tinyclip_224', pretrained=pretrained, **dict(model_args, **kwargs))
-    return model
-
-
-@register_model
-def vit_61m_patch32_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
-    model_args = dict(
-        patch_size=32, embed_dim=640, depth=12, num_heads=10, pre_norm=True, norm_layer=nn.LayerNorm)
-    model = _create_vision_transformer(
-        'vit_61m_patch32_tinyclip_224', pretrained=pretrained, **dict(model_args, **kwargs))
-    return model
-
-
 @register_model
 def vit_medium_patch16_reg4_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
     model_args = dict(