Commit: Readme update
timojl committed Dec 15, 2021
1 parent 82e7df3 commit 2e35e50
Showing 9 changed files with 81 additions and 37 deletions.
26 changes: 21 additions & 5 deletions Readme.md
@@ -1,10 +1,11 @@
# Image Segmentation Using Text and Image Prompts
This repository contains the code used in the paper "Image Segmentation Using Text and Image Prompts".

<img src="overview.png" alt="drawing" height="200em"/>

### Dependencies
This code base depends on pytorch, torchvision and clip (`pip install git+https://github.com/openai/CLIP.git`).
Additional dependencies are hidden for double blind review.
The system allows you to create segmentation models without training, based on:
- An arbitrary text query
- Or an image with a mask highlighting stuff or an object.

### Quick Start

@@ -13,6 +14,11 @@ It can also be used interactively using [MyBinder](https://mybinder.org/v2/gh/ti
(please note that the VM does not use a GPU, thus inference takes a few seconds).
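
For a concrete starting point, here is a minimal text-prompt inference sketch based on the `CLIPDensePredT` signature and weight files touched in this commit. The choice of `reduce_dim=64` for `rd64-uni.pth`, the 352-pixel input size, the ImageNet normalization, and `strict=False` (the released checkpoints omit the frozen CLIP backbone) are assumptions, not documented defaults.

```
# Minimal sketch, not the official quick start: file names and sizes are assumptions.
import torch
from PIL import Image
from torchvision import transforms
from models.clipseg import CLIPDensePredT

model = CLIPDensePredT(version='ViT-B/16', reduce_dim=64)  # rd64 -> reduce_dim=64 (assumed)
model.eval()
# strict=False: the checkpoint contains only the decoder; the CLIP backbone keeps its pretrained weights
model.load_state_dict(torch.load('weights/rd64-uni.pth', map_location='cpu'), strict=False)

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    transforms.Resize((352, 352)),
])
img = transform(Image.open('example.jpg').convert('RGB')).unsqueeze(0)

with torch.no_grad():
    pred = model(img, ['a glass'])[0]   # forward(inp_image, conditional=...)
mask = torch.sigmoid(pred[0][0])        # per-pixel foreground probability
```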


### Dependencies
This code base depends on pytorch, torchvision and clip (`pip install git+https://github.com/openai/CLIP.git`).
Additional dependencies are hidden for double blind review.


### Datasets

* `PhraseCut` and `PhraseCutPlus`: Referring expression dataset
@@ -33,8 +39,8 @@ For some of the datasets third party dependencies are required. Run the followin
`git clone https://github.com/juhongm999/hsnet.git`

### Weights
CLIPSeg-rd64, CLIPSeg-rd16

- [CLIPSeg-D64](https://github.com/timojl/clipseg/raw/master/weights/rd64-uni.pth) (4.1MB, without CLIP weights)
- [CLIPSeg-D16](https://github.com/timojl/clipseg/raw/master/weights/rd16-uni.pth)
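
As a hedged illustration of how these checkpoints relate to the model definition in `models/clipseg.py`: the `rd64`/`rd16` suffix presumably corresponds to the `reduce_dim` argument, and because the CLIP backbone is excluded, loading non-strictly and inspecting the missing keys is a reasonable sanity check.

```
# Sketch only: rd16 -> reduce_dim=16 is inferred from the file name, not documented.
import torch
from models.clipseg import CLIPDensePredT

model = CLIPDensePredT(version='ViT-B/16', reduce_dim=16)
missing, unexpected = model.load_state_dict(
    torch.load('weights/rd16-uni.pth', map_location='cpu'), strict=False)
# `missing` should list only CLIP backbone parameters, which keep the weights loaded by clip.load().
```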

### Training

@@ -45,3 +51,13 @@ See the experiment folder for yaml definitions of the training configurations. T
In order to use the dataset and model wrappers for PFENet, the PFENet repository needs to be cloned to the root folder.
`git clone https://github.com/Jia-Research-Lab/PFENet.git `

### Citation

```
@article{lueddecke21,
title={Image Segmentation Using Text and Image Prompts},
author={Timo Lüddecke and Alexander Ecker},
journal={...},
year={2021}
}
```
5 changes: 3 additions & 2 deletions Tables.ipynb
@@ -327,8 +327,9 @@
"hash": "800ed241f7db2bd3aa6942aa3be6809cdb30ee6b0a9e773dfecfa9fef1f4c586"
},
"kernelspec": {
"display_name": "Python 3.8.8 64-bit ('env2': conda)",
"name": "python3"
"display_name": "env2",
"language": "python",
"name": "env2"
},
"language_info": {
"codemirror_mode": {
26 changes: 13 additions & 13 deletions Visual_Feature_Engineering.ipynb
@@ -174,23 +174,23 @@
"outputs": [],
"source": [
"preprocessing_functions = [\n",
" ['clip mask CLS L11', lambda x: {'x_inp': x[1].cuda(), 'mask': (11, 'cls_token', x[2].cuda())}],\n",
" ['clip mask CLS all', lambda x: {'x_inp': x[1].cuda(), 'mask': ('all', 'cls_token', x[2].cuda())}],\n",
" ['clip mask all all', lambda x: {'x_inp': x[1].cuda(), 'mask': ('all', 'all', x[2].cuda())}],\n",
" ['colorize object red', partial(img_preprocess, colorize=True)],\n",
" ['add red outline', partial(img_preprocess, outline=True)],\n",
"# ['clip mask CLS L11', lambda x: {'x_inp': x[1].cuda(), 'mask': (11, 'cls_token', x[2].cuda())}],\n",
"# ['clip mask CLS all', lambda x: {'x_inp': x[1].cuda(), 'mask': ('all', 'cls_token', x[2].cuda())}],\n",
"# ['clip mask all all', lambda x: {'x_inp': x[1].cuda(), 'mask': ('all', 'all', x[2].cuda())}],\n",
"# ['colorize object red', partial(img_preprocess, colorize=True)],\n",
"# ['add red outline', partial(img_preprocess, outline=True)],\n",
" \n",
" ['BG brightness 50%', partial(img_preprocess, bg_fac=0.5)],\n",
" ['BG brightness 10%', partial(img_preprocess, bg_fac=0.1)],\n",
" ['BG brightness 0%', partial(img_preprocess, bg_fac=0.0)],\n",
" ['BG blur', partial(img_preprocess, blur=3)],\n",
" ['BG blur & intensity 10%', partial(img_preprocess, blur=3, bg_fac=0.1)],\n",
"# ['BG brightness 50%', partial(img_preprocess, bg_fac=0.5)],\n",
"# ['BG brightness 10%', partial(img_preprocess, bg_fac=0.1)],\n",
"# ['BG brightness 0%', partial(img_preprocess, bg_fac=0.0)],\n",
"# ['BG blur', partial(img_preprocess, blur=3)],\n",
"# ['BG blur & intensity 10%', partial(img_preprocess, blur=3, bg_fac=0.1)],\n",
" \n",
" ['crop large context', partial(img_preprocess, center_context=0.5)],\n",
" ['crop small context', partial(img_preprocess, center_context=0.1)],\n",
"# ['crop large context', partial(img_preprocess, center_context=0.5)],\n",
"# ['crop small context', partial(img_preprocess, center_context=0.1)],\n",
" ['crop & background blur', partial(img_preprocess, blur=3, center_context=0.5)],\n",
" ['crop & intensity 10%', partial(img_preprocess, blur=3, bg_fac=0.1)],\n",
" ['crop & background blur & intensity 10%', partial(img_preprocess, blur=3, center_context=0.1, bg_fac=0.1)],\n",
"# ['crop & background blur & intensity 10%', partial(img_preprocess, blur=3, center_context=0.1, bg_fac=0.1)],\n",
"]\n",
"\n",
"preprocessing_functions = preprocessing_functions\n",
2 changes: 1 addition & 1 deletion datasets/coco_wrapper.py
@@ -51,7 +51,7 @@ def build_img_metadata_classwise(self):
transforms.Normalize(mean, std)
])

self.coco = DatasetCOCO('/user/tluedde/datasets/COCO-20i/', fold, transform, split, 1, False)
self.coco = DatasetCOCO(expanduser('~/datasets/COCO-20i/'), fold, transform, split, 1, False)

self.all_classes = [self.coco.class_ids]
self.coco.base_path = join(expanduser('~/datasets/COCO-20i'))
6 changes: 3 additions & 3 deletions datasets/pascal_zeroshot.py
@@ -7,9 +7,9 @@
from general_utils import log
from torchvision import transforms

PASCAL_VOC_CLASSES_ZS = [['cattle.n.01', 'motorcycle.n.01'], ['aeroplane.n.01', 'sofa.n.01'],
['cat.n.01', 'television.n.03'], ['train.n.01', 'bottle.n.01'],
['chair.n.01', 'pot_plant.n.01']]
# PASCAL_VOC_CLASSES_ZS = [['cattle.n.01', 'motorcycle.n.01'], ['aeroplane.n.01', 'sofa.n.01'],
# ['cat.n.01', 'television.n.03'], ['train.n.01', 'bottle.n.01'],
# ['chair.n.01', 'pot_plant.n.01']]


class PascalZeroShot(object):
3 changes: 2 additions & 1 deletion experiment_setup.py
@@ -337,7 +337,7 @@ def score(config, train_checkpoint_id, train_config):
metric_args['resize_to'] = config.resize_to

if 'sigmoid' in config:
metric_args['sigmoid'] = config.sigmoid
metric_args['sigmoid'] = config.sigmoid

if 'custom_threshold' in config:
metric_args['custom_threshold'] = config.custom_threshold
@@ -449,6 +449,7 @@ def score(config, train_checkpoint_id, train_config):

only_visual = config.only_visual is not None and config.only_visual
with_visual = config.with_visual is not None and config.with_visual

dataset = PhraseCut('test',
image_size=train_config.image_size,
mask=config.mask,
46 changes: 36 additions & 10 deletions models/clipseg.py
@@ -106,7 +106,7 @@ def forward_multihead_attention(x, b, with_aff=False, attn_mask=None):

class CLIPDenseBase(nn.Module):

def __init__(self, version, reduce_cond, reduce_dim, prompt):
def __init__(self, version, reduce_cond, reduce_dim, prompt, n_tokens):
super().__init__()

import clip
Expand All @@ -115,6 +115,9 @@ def __init__(self, version, reduce_cond, reduce_dim, prompt):
self.clip_model, _ = clip.load(version, device='cpu', jit=False)
self.model = self.clip_model.visual

# if not None, scale conv weights such that we obtain n_tokens.
self.n_tokens = n_tokens

for p in self.clip_model.parameters():
p.requires_grad_(False)

@@ -150,8 +153,18 @@ def rescaled_pos_emb(self, new_size):

def visual_forward(self, x_inp, extract_layers=(), skip=False, mask=None):


with torch.no_grad():
x = self.model.conv1(x_inp) # shape = [*, width, grid, grid]

inp_size = x_inp.shape[2:]

if self.n_tokens is not None:
stride2 = x_inp.shape[2] // self.n_tokens
conv_weight2 = nnf.interpolate(self.model.conv1.weight, (stride2, stride2), mode='bilinear', align_corners=True)
x = nnf.conv2d(x_inp, conv_weight2, bias=self.model.conv1.bias, stride=stride2, dilation=self.model.conv1.dilation)
else:
x = self.model.conv1(x_inp) # shape = [*, width, grid, grid]

x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]

@@ -189,6 +202,10 @@ def visual_forward(self, x_inp, extract_layers=(), skip=False, mask=None):

if i in extract_layers:
affinities += [aff_per_head]

#if self.n_tokens is not None:
# activations += [nnf.interpolate(x, inp_size, mode='bilinear', align_corners=True)]
#else:
activations += [x]

if len(extract_layers) > 0 and i == max(extract_layers) and skip:
@@ -283,16 +300,16 @@ class CLIPDensePredT(CLIPDenseBase):
def __init__(self, version='ViT-B/32', extract_layers=(3, 6, 9), cond_layer=0, reduce_dim=128, n_heads=4, prompt='fixed',
extra_blocks=0, reduce_cond=None, fix_shift=False,
learn_trans_conv_only=False, limit_to_clip_only=False, upsample=False,
add_calibration=False, rev_actionvations=False, process_cond=None, not_pretrained=False):
add_calibration=False, rev_activations=False, trans_conv=None, n_tokens=None):

super().__init__(version, reduce_cond, reduce_dim, prompt)
super().__init__(version, reduce_cond, reduce_dim, prompt, n_tokens)
# device = 'cpu'

self.extract_layers = extract_layers
self.cond_layer = cond_layer
self.limit_to_clip_only = limit_to_clip_only
self.process_cond = None
self.rev_activations = None
self.rev_activations = rev_activations

depth = len(extract_layers)

@@ -314,7 +331,12 @@ def __init__(self, version='ViT-B/32', extract_layers=(3, 6, 9), cond_layer=0, r
else:
self.shift_vector = None

trans_conv_ks = {'ViT-B/32': (32, 32), 'ViT-B/16': (16, 16)}[version]
if trans_conv is None:
trans_conv_ks = {'ViT-B/32': (32, 32), 'ViT-B/16': (16, 16)}[version]
else:
# explicitly define transposed conv kernel size
trans_conv_ks = (trans_conv, trans_conv)

self.trans_conv = nn.ConvTranspose2d(reduce_dim, 1, trans_conv_ks, stride=trans_conv_ks)

assert len(self.extract_layers) == depth
@@ -382,8 +404,12 @@ def forward(self, inp_image, conditional=None, return_features=False, mask=None)
size = int(math.sqrt(a.shape[2]))

a = a.view(bs, a.shape[1], size, size)

a = self.trans_conv(a)

if self.n_tokens is not None:
a = nnf.interpolate(a, x_inp.shape[2:], mode='bilinear', align_corners=True)

if self.upsample_proj is not None:
a = self.upsample_proj(a)
a = nnf.interpolate(a, x_inp.shape[2:], mode='bilinear')
@@ -399,13 +425,13 @@ class CLIPDensePredTMasked(CLIPDensePredT):

def __init__(self, version='ViT-B/32', extract_layers=(3, 6, 9), cond_layer=0, reduce_dim=128, n_heads=4,
prompt='fixed', extra_blocks=0, reduce_cond=None, fix_shift=False, learn_trans_conv_only=False,
refine=None, limit_to_clip_only=False, upsample=False, add_calibration=False, process_cond=None):
refine=None, limit_to_clip_only=False, upsample=False, add_calibration=False, n_tokens=None):

super().__init__(version=version, extract_layers=extract_layers, cond_layer=cond_layer, reduce_dim=reduce_dim,
n_heads=n_heads, prompt=prompt, extra_blocks=extra_blocks, reduce_cond=reduce_cond,
fix_shift=fix_shift, learn_trans_conv_only=learn_trans_conv_only,
limit_to_clip_only=limit_to_clip_only, upsample=upsample, add_calibration=add_calibration,
process_cond=process_cond)
n_tokens=n_tokens)

def visual_forward_masked(self, img_s, seg_s):
return super().visual_forward(img_s, mask=('all', 'cls_token', seg_s))
@@ -428,9 +454,9 @@ class CLIPDenseBaseline(CLIPDenseBase):

def __init__(self, version='ViT-B/32', cond_layer=0,
extract_layer=9, reduce_dim=128, reduce2_dim=None, prompt='fixed',
reduce_cond=None, limit_to_clip_only=False):
reduce_cond=None, limit_to_clip_only=False, n_tokens=None):

super().__init__(version, reduce_cond, reduce_dim, prompt)
super().__init__(version, reduce_cond, reduce_dim, prompt, n_tokens)
device = 'cpu'

# self.cond_layer = cond_layer
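
Aside for readers skimming the diff above: the new `n_tokens` branch in `visual_forward` rescales CLIP's patch-embedding convolution so the token grid has roughly `n_tokens` entries per side regardless of input resolution (the decoder output is then interpolated back to the input size in `forward`). A standalone sketch of that arithmetic, using a stand-in for CLIP ViT-B/16's `conv1`:

```
# Illustrative only: mirrors the n_tokens branch with a stand-in patch-embedding conv.
import torch
import torch.nn as nn
import torch.nn.functional as nnf

conv1 = nn.Conv2d(3, 768, kernel_size=16, stride=16, bias=False)  # stand-in for self.model.conv1
x_inp = torch.randn(1, 3, 352, 352)

n_tokens = 16
stride2 = x_inp.shape[2] // n_tokens                       # 352 // 16 = 22
conv_weight2 = nnf.interpolate(conv1.weight, (stride2, stride2),
                               mode='bilinear', align_corners=True)
x = nnf.conv2d(x_inp, conv_weight2, bias=conv1.bias, stride=stride2)
print(x.shape)  # [1, 768, 16, 16] -> 256 tokens instead of the default 22*22 = 484
```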
Binary file added overview.png
4 changes: 2 additions & 2 deletions weights/rd16-uni.pth
100755 → 100644
Git LFS file not shown
