Update some clip pretrained weights to point to new hub locations #2311

Merged · 3 commits · Oct 24, 2024
39 changes: 39 additions & 0 deletions timm/models/byobnet.py
@@ -2315,6 +2315,27 @@ def _cfgr(url='', **kwargs):
fixed_input_size=True, input_size=(3, 448, 448), pool_size=(14, 14),
classifier='head.proj',
),
'resnet50_clip.cc12m': _cfgr(
hf_hub_id='timm/',
hf_hub_filename='open_clip_pytorch_model.bin',
num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
classifier='head.proj',
),
'resnet50_clip.yfcc15m': _cfgr(
hf_hub_id='timm/',
hf_hub_filename='open_clip_pytorch_model.bin',
num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
classifier='head.proj',
),
'resnet101_clip.yfcc15m': _cfgr(
hf_hub_id='timm/',
hf_hub_filename='open_clip_pytorch_model.bin',
num_classes=512, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
classifier='head.proj',
),

# avg-pool w/ optional standard classifier head variants
'resnet50_clip_gap.openai': _cfgr(
@@ -2347,6 +2368,24 @@ def _cfgr(url='', **kwargs):
num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
input_size=(3, 448, 448), pool_size=(14, 14),
),
'resnet50_clip_gap.cc12m': _cfgr(
hf_hub_id='timm/resnet50_clip.cc12m',
hf_hub_filename='open_clip_pytorch_model.bin',
num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
input_size=(3, 224, 224), pool_size=(7, 7),
),
'resnet50_clip_gap.yfcc15m': _cfgr(
hf_hub_id='timm/resnet50_clip.yfcc15m',
hf_hub_filename='open_clip_pytorch_model.bin',
num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
input_size=(3, 224, 224), pool_size=(7, 7),
),
'resnet101_clip_gap.yfcc15m': _cfgr(
hf_hub_id='timm/resnet101_clip.yfcc15m',
hf_hub_filename='open_clip_pytorch_model.bin',
num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
input_size=(3, 224, 224), pool_size=(7, 7),
),

'resnet50_mlp.untrained': _cfgr(
input_size=(3, 256, 256), pool_size=(8, 8),
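
For context, the new `resnet50_clip.*` and `resnet50_clip_gap.*` entries above load like any other timm pretrained config once the weights live under the `timm/` org on the Hub. A minimal usage sketch (assuming the hub uploads these cfgs point at are in place; output sizes follow the `num_classes` values in the diff):

```python
import timm
import torch

# Projection-head variant: returns the 1024-dim CLIP image embedding (num_classes=1024 above).
model = timm.create_model('resnet50_clip.cc12m', pretrained=True).eval()

# Global-average-pool variant with num_classes=0: plain feature extractor, no CLIP projection head.
gap_model = timm.create_model('resnet50_clip_gap.cc12m', pretrained=True).eval()

x = torch.randn(1, 3, 224, 224)  # the proj variant is fixed_input_size at 224x224
with torch.no_grad():
    emb = model(x)        # shape (1, 1024), projected embedding
    feats = gap_model(x)  # pooled ResNet-50 backbone features (2048-dim)
```
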
101 changes: 73 additions & 28 deletions timm/models/vision_transformer.py
@@ -23,6 +23,7 @@

Hacked together by / Copyright 2020, Ross Wightman
"""
import copy
import logging
import math
from collections import OrderedDict
@@ -1601,6 +1602,21 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
hf_hub_filename='open_clip_pytorch_model.bin',
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),

'vit_base_patch32_clip_224.laion400m_e32': _cfg(
hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
notes=('natively QuickGELU, use quickgelu model variant for original results',),
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
'vit_base_patch16_clip_224.laion400m_e32': _cfg(
hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
'vit_base_patch16_plus_clip_240.laion400m_e32': _cfg(
hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
input_size=(3, 240, 240), crop_pct=1.0, num_classes=512),
'vit_large_patch14_clip_224.laion400m_e32': _cfg(
hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),

'vit_base_patch32_clip_224.datacompxl': _cfg(
hf_hub_id='laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K',
hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1641,44 +1657,60 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
crop_pct=1.0, input_size=(3, 378, 378), num_classes=1024),

'vit_base_patch32_clip_224.metaclip_2pt5b': _cfg(
-   hf_hub_id='facebook/metaclip-b32-fullcc2.5b',
-   hf_hub_filename='metaclip_b32_fullcc2.5b.bin',
+   hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
    license='cc-by-nc-4.0',
    notes=('natively QuickGELU, use quickgelu model variant for original results',),
    mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
'vit_base_patch16_clip_224.metaclip_2pt5b': _cfg(
-   hf_hub_id='facebook/metaclip-b16-fullcc2.5b',
-   hf_hub_filename='metaclip_b16_fullcc2.5b.bin',
+   hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
    license='cc-by-nc-4.0',
    notes=('natively QuickGELU, use quickgelu model variant for original results',),
    mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
'vit_large_patch14_clip_224.metaclip_2pt5b': _cfg(
-   hf_hub_id='facebook/metaclip-l14-fullcc2.5b',
-   hf_hub_filename='metaclip_l14_fullcc2.5b.bin',
+   hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
    license='cc-by-nc-4.0',
    notes=('natively QuickGELU, use quickgelu model variant for original results',),
    mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
'vit_huge_patch14_clip_224.metaclip_2pt5b': _cfg(
-   hf_hub_id='facebook/metaclip-h14-fullcc2.5b',
-   hf_hub_filename='metaclip_h14_fullcc2.5b.bin',
+   hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
    license='cc-by-nc-4.0',
    notes=('natively QuickGELU, use quickgelu model variant for original results',),
    mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
'vit_gigantic_patch14_clip_224.metaclip_2pt5b': _cfg(
hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
license='cc-by-nc-4.0',
notes=('natively QuickGELU, use quickgelu model variant for original results',),
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),
'vit_base_patch32_clip_224.metaclip_400m': _cfg(
hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
license='cc-by-nc-4.0',
notes=('natively QuickGELU, use quickgelu model variant for original results',),
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
'vit_base_patch16_clip_224.metaclip_400m': _cfg(
hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
license='cc-by-nc-4.0',
notes=('natively QuickGELU, use quickgelu model variant for original results',),
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
'vit_large_patch14_clip_224.metaclip_400m': _cfg(
hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
license='cc-by-nc-4.0',
notes=('natively QuickGELU, use quickgelu model variant for original results',),
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),

'vit_base_patch32_clip_224.openai': _cfg(
-   hf_hub_id='timm/vit_base_patch32_clip_224.openai',
+   hf_hub_id='timm/',
    notes=('natively QuickGELU, use quickgelu model variant for original results',),
    mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
'vit_base_patch16_clip_224.openai': _cfg(
-   hf_hub_id='timm/vit_base_patch16_clip_224.openai',
+   hf_hub_id='timm/',
    notes=('natively QuickGELU, use quickgelu model variant for original results',),
    mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
'vit_large_patch14_clip_224.openai': _cfg(
-   hf_hub_id='timm/vit_large_patch14_clip_224.openai',
+   hf_hub_id='timm/',
    notes=('natively QuickGELU, use quickgelu model variant for original results',),
    mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
'vit_large_patch14_clip_336.openai': _cfg(
-   hf_hub_id='timm/vit_large_patch14_clip_336.openai', hf_hub_filename='open_clip_pytorch_model.bin',
+   hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
    notes=('natively QuickGELU, use quickgelu model variant for original results',),
    mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
    crop_pct=1.0, input_size=(3, 336, 336), num_classes=768),
@@ -2071,22 +2103,13 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
input_size=(3, 160, 160), crop_pct=0.95),
}

- _quick_gelu_cfgs = [
-     'vit_large_patch14_clip_224.dfn2b',
-     'vit_huge_patch14_clip_224.dfn5b',
-     'vit_huge_patch14_clip_378.dfn5b',
-     'vit_base_patch32_clip_224.metaclip_2pt5b',
-     'vit_base_patch16_clip_224.metaclip_2pt5b',
-     'vit_large_patch14_clip_224.metaclip_2pt5b',
-     'vit_huge_patch14_clip_224.metaclip_2pt5b',
-     'vit_base_patch32_clip_224.openai',
-     'vit_base_patch16_clip_224.openai',
-     'vit_large_patch14_clip_224.openai',
-     'vit_large_patch14_clip_336.openai',
- ]
- default_cfgs.update({
-     n.replace('_clip_', '_clip_quickgelu_'): default_cfgs[n] for n in _quick_gelu_cfgs
- })
+ _quick_gelu_cfgs = [n for n, c in default_cfgs.items() if c.get('notes', ()) and 'quickgelu' in c['notes'][0]]
+ for n in _quick_gelu_cfgs:
+     # generate quickgelu default cfgs based on contents of notes field
+     c = copy.deepcopy(default_cfgs[n])
+     if c['hf_hub_id'] == 'timm/':
+         c['hf_hub_id'] = 'timm/' + n  # need to use non-quickgelu model name for hub id
+     default_cfgs[n.replace('_clip_', '_clip_quickgelu_')] = c
default_cfgs = generate_default_cfgs(default_cfgs)
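
The new loop above derives the quickgelu aliases from each cfg's `notes` field instead of a hand-maintained list, and rewrites `hf_hub_id='timm/'` so the alias still resolves to the non-quickgelu repo name on the Hub. A standalone sketch of that mapping (simplified stand-in dicts, not the real cfgs; the second entry is hypothetical):

```python
import copy

# Trimmed stand-ins for two default_cfgs entries, keeping only the keys the loop touches.
default_cfgs = {
    'vit_base_patch16_clip_224.openai': {
        'hf_hub_id': 'timm/',
        'notes': ('natively QuickGELU, use quickgelu model variant for original results',),
    },
    'vit_base_patch16_clip_224.example_gelu': {  # hypothetical non-QuickGELU entry
        'hf_hub_id': 'timm/',
        'notes': (),
    },
}

# Same selection rule as the diff: any cfg whose first note mentions quickgelu.
_quick_gelu_cfgs = [n for n, c in default_cfgs.items() if c.get('notes', ()) and 'quickgelu' in c['notes'][0]]
for n in _quick_gelu_cfgs:
    c = copy.deepcopy(default_cfgs[n])
    if c['hf_hub_id'] == 'timm/':
        c['hf_hub_id'] = 'timm/' + n  # quickgelu alias must point at the non-quickgelu hub repo
    default_cfgs[n.replace('_clip_', '_clip_quickgelu_')] = c

print(default_cfgs['vit_base_patch16_clip_quickgelu_224.openai']['hf_hub_id'])
# timm/vit_base_patch16_clip_224.openai
```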


@@ -2510,6 +2533,16 @@ def vit_base_patch16_clip_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
return model


@register_model
def vit_base_patch16_plus_clip_240(pretrained: bool = False, **kwargs) -> VisionTransformer:
""" ViT-Base (ViT-B/16+) CLIP image tower @ 240x240
"""
model_args = dict(patch_size=16, embed_dim=896, depth=12, num_heads=14, pre_norm=True, norm_layer=nn.LayerNorm)
model = _create_vision_transformer(
'vit_base_patch16_plus_clip_240', pretrained=pretrained, **dict(model_args, **kwargs))
return model
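
A quick sanity check for the new registration (a hedged sketch; assumes the `laion400m_e32` weights referenced in the cfg above are available under `timm/` on the Hub):

```python
import timm
import torch

# ViT-B/16+ @ 240x240: embed_dim=896, depth=12, heads=14, per the model_args above.
m = timm.create_model('vit_base_patch16_plus_clip_240.laion400m_e32', pretrained=True, num_classes=0).eval()
with torch.no_grad():
    feats = m(torch.randn(1, 3, 240, 240))
print(feats.shape)  # torch.Size([1, 896]) -- pooled features ahead of the 512-dim CLIP projection
```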


@register_model
def vit_large_patch14_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
""" ViT-Large model (ViT-L/14) CLIP image tower
@@ -2656,6 +2689,18 @@ def vit_huge_patch14_clip_quickgelu_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
return model


@register_model
def vit_gigantic_patch14_clip_quickgelu_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
""" ViT-bigG model (ViT-G/14) w/ QuickGELU act
"""
model_args = dict(
patch_size=14, embed_dim=1664, mlp_ratio=64/13, depth=48, num_heads=16, pre_norm=True,
norm_layer=nn.LayerNorm, act_layer='quick_gelu')
model = _create_vision_transformer(
'vit_gigantic_patch14_clip_quickgelu_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
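
For reference on the unusual `mlp_ratio=64/13` above (a quick arithmetic check, not part of the diff): with `embed_dim=1664` it reproduces the 8192-wide MLP of the OpenCLIP ViT-bigG tower.

```python
embed_dim, mlp_ratio = 1664, 64 / 13
print(round(embed_dim * mlp_ratio))  # 8192 -- ViT-bigG/14 MLP hidden dim
```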


# Experimental models below

@register_model