
Commit 256cf19

Rename tinyclip models to fit existing 'clip' variants, use consistently mapped OpenCLIP compatible checkpoint on hf hub
1 parent 1a1d07d commit 256cf19

1 file changed, +56 -48 lines changed

timm/models/vision_transformer.py

@@ -964,11 +964,13 @@ def _convert_openai_clip(
             v = v.unsqueeze(0)
             if v.shape[1] != model.pos_embed.shape[1]:
                 # To resize pos embedding when using model at different size from pretrained weights
-                v = resize_pos_embed(
+                num_prefix_tokens = 0 if getattr(model, 'no_embed_class', False) \
+                    else getattr(model, 'num_prefix_tokens', 1)
+                v = resample_abs_pos_embed(
                     v,
-                    model.pos_embed,
-                    0 if getattr(model, 'no_embed_class') else getattr(model, 'num_prefix_tokens', 1),
-                    model.patch_embed.grid_size
+                    new_size=model.patch_embed.grid_size,
+                    num_prefix_tokens=num_prefix_tokens,
+                    verbose=True,
                 )
         out_dict[k] = v
     return out_dict
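
Note on the hunk above: the positional call to resize_pos_embed is replaced with resample_abs_pos_embed using explicit keyword arguments, and the prefix-token count is computed up front (0 for no_embed_class models, otherwise num_prefix_tokens). A minimal sketch of that call pattern follows; the 14x14 -> 16x16 grid and dim=256 are illustrative assumptions, not values from this diff.

    # Hedged sketch: resample a [1, prefix + H*W, dim] absolute pos embed to a new patch grid.
    import torch
    from timm.layers import resample_abs_pos_embed

    pos_embed = torch.randn(1, 1 + 14 * 14, 256)   # illustrative sizes
    pos_embed = resample_abs_pos_embed(
        pos_embed,
        new_size=(16, 16),        # e.g. model.patch_embed.grid_size
        num_prefix_tokens=1,      # 0 when the model has no_embed_class set
        verbose=True,             # log the resize
    )
    print(pos_embed.shape)        # torch.Size([1, 257, 256])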
@@ -1015,8 +1017,6 @@ def checkpoint_filter_fn(
         return _convert_openai_clip(state_dict, model)
     elif 'module.visual.class_embedding' in state_dict:
         return _convert_openai_clip(state_dict, model, prefix='module.visual.')
-    elif '_image_encoder.module.visual.class_embedding' in state_dict:
-        return _convert_openai_clip(state_dict, model, prefix='_image_encoder.module.visual.')
 
     if "mask_token" in state_dict:
         state_dict = _convert_dinov2(state_dict, model)
@@ -1737,20 +1737,24 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         input_size=(3, 384, 384),
         num_classes=0),
 
-    'vit_8m_patch16_tinyclip_224.yfcc15m': _cfg(
-        url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-8M-16-Text-3M-YFCC15M.pt',
+    'vit_xsmall_patch16_clip_224.tinyclip_yfcc15m': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='mit',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-    'vit_39m_patch16_tinyclip_224.yfcc15m': _cfg(
-        url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-39M-16-Text-19M-YFCC15M.pt',
+    'vit_medium_patch32_clip_224.tinyclip_laion400m': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='mit',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-    'vit_40m_patch32_tinyclip_224.laion400m': _cfg(
-        url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-40M-32-Text-19M-LAION400M.pt',
+    'vit_medium_patch16_clip_224.tinyclip_yfcc15m': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='mit',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-    'vit_61m_patch32_tinyclip_224.laion400m': _cfg(
-        url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-61M-32-Text-29M-LAION400M.pt',
+    'vit_betwixt_patch32_clip_224.tinyclip_laion400m': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='mit',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
 
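With the cfg changes above, the renamed pretrained tags no longer pull the original .pt files from the TinyCLIP GitHub releases; they point at OpenCLIP-format checkpoints (open_clip_pytorch_model.bin) on the Hugging Face hub, with hf_hub_id='timm/' resolving, as I understand timm's convention, to a hub repo named after the model and tag. A hedged usage sketch, not part of the diff:

    import timm

    # The old tag 'vit_8m_patch16_tinyclip_224.yfcc15m' is now served as:
    model = timm.create_model(
        'vit_xsmall_patch16_clip_224.tinyclip_yfcc15m',
        pretrained=True,   # fetches open_clip_pytorch_model.bin from the hub
    )
    model = model.eval()
    # num_classes=512 in the cfg: the head maps to the 512-dim CLIP embedding space.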
@@ -2092,6 +2096,44 @@ def vit_giant_patch16_gap_224(pretrained: bool = False, **kwargs) -> VisionTrans
     return model
 
 
+@register_model
+def vit_xsmall_patch16_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # TinyCLIP 8M
+    model_args = dict(embed_dim=256, depth=10, num_heads=4, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_xsmall_patch16_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_medium_patch32_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # TinyCLIP 40M
+    model_args = dict(
+        patch_size=32, embed_dim=512, depth=12, num_heads=8, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_medium_patch32_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_medium_patch16_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # TinyCLIP 39M
+    model_args = dict(embed_dim=512, depth=12, num_heads=8, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_medium_patch16_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_betwixt_patch32_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    # TinyCLIP 61M
+    model_args = dict(
+        patch_size=32, embed_dim=640, depth=12, num_heads=10, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_betwixt_patch32_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_base_patch32_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     """ ViT-B/32 CLIP image tower @ 224x224
@@ -2640,40 +2682,6 @@ def vit_so400m_patch14_siglip_384(pretrained: bool = False, **kwargs) -> VisionT
     return model
 
 
-@register_model
-def vit_8m_patch16_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
-    model_args = dict(embed_dim=256, depth=10, num_heads=4, pre_norm=True, norm_layer=nn.LayerNorm)
-    model = _create_vision_transformer(
-        'vit_8m_patch16_tinyclip_224', pretrained=pretrained, **dict(model_args, **kwargs))
-    return model
-
-
-@register_model
-def vit_39m_patch16_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
-    model_args = dict(embed_dim=512, depth=12, num_heads=8, pre_norm=True, norm_layer=nn.LayerNorm)
-    model = _create_vision_transformer(
-        'vit_39m_patch16_tinyclip_224', pretrained=pretrained, **dict(model_args, **kwargs))
-    return model
-
-
-@register_model
-def vit_40m_patch32_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
-    model_args = dict(
-        patch_size=32, embed_dim=512, depth=12, num_heads=8, pre_norm=True, norm_layer=nn.LayerNorm)
-    model = _create_vision_transformer(
-        'vit_40m_patch32_tinyclip_224', pretrained=pretrained, **dict(model_args, **kwargs))
-    return model
-
-
-@register_model
-def vit_61m_patch32_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
-    model_args = dict(
-        patch_size=32, embed_dim=640, depth=12, num_heads=10, pre_norm=True, norm_layer=nn.LayerNorm)
-    model = _create_vision_transformer(
-        'vit_61m_patch32_tinyclip_224', pretrained=pretrained, **dict(model_args, **kwargs))
-    return model
-
-
 @register_model
 def vit_medium_patch16_reg4_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
     model_args = dict(
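
The hunk above removes the old TinyCLIP entry points outright, and this diff adds no deprecation aliases for them, so downstream code using the old names needs a one-time rename. A hedged migration sketch assembled from this diff; the parameter-count check at the end is my own addition, purely to show the renamed towers still line up with the old 8M/39M/40M/61M labels.

    import timm

    # Old registered name (removed in this hunk) -> new name + pretrained tag (added earlier in this diff)
    TINYCLIP_RENAMES = {
        'vit_8m_patch16_tinyclip_224.yfcc15m':    'vit_xsmall_patch16_clip_224.tinyclip_yfcc15m',
        'vit_39m_patch16_tinyclip_224.yfcc15m':   'vit_medium_patch16_clip_224.tinyclip_yfcc15m',
        'vit_40m_patch32_tinyclip_224.laion400m': 'vit_medium_patch32_clip_224.tinyclip_laion400m',
        'vit_61m_patch32_tinyclip_224.laion400m': 'vit_betwixt_patch32_clip_224.tinyclip_laion400m',
    }

    # Sanity check: parameter counts are computed locally, not quoted from the TinyCLIP release.
    for old_name, new_name in TINYCLIP_RENAMES.items():
        m = timm.create_model(new_name.split('.')[0], pretrained=False, num_classes=512)
        n_params = sum(p.numel() for p in m.parameters())
        print(f'{old_name} -> {new_name}: {n_params / 1e6:.1f}M params')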
