Commit: Readme update
timojl committed Dec 15, 2021
1 parent 82e7df3 commit 2e35e50
Showing 9 changed files with 81 additions and 37 deletions.
26 changes: 21 additions & 5 deletions Readme.md
@@ -1,10 +1,11 @@
# Image Segmentation Using Text and Image Prompts
This repository contains the code used in the paper "Image Segmentation Using Text and Image Prompts".

<img src="overview.png" alt="drawing" height="200em"/>

### Dependencies
This code base depends on pytorch, torchvision and clip (`pip install git+https://github.com/openai/CLIP.git`).
Additional dependencies are hidden for double blind review.
The system allows you to create segmentation models without training, based on:
- An arbitrary text query
- Or an image with a mask highlighting stuff or an object.

### Quick Start

@@ -13,6 +14,11 @@ It can also be used interactively using [MyBinder](https://mybinder.org/v2/gh/ti
(please note that the VM does not use a GPU, thus inference takes a few seconds).
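
For a concrete starting point, here is a minimal text-prompt inference sketch based on the `CLIPDensePredT` signature and weight files touched in this commit. The choice of `reduce_dim=64` for `rd64-uni.pth`, the 352-pixel input size, the ImageNet normalization, and `strict=False` (the released checkpoints omit the frozen CLIP backbone) are assumptions, not documented defaults.

```
# Minimal sketch, not the official quick start: file names and sizes are assumptions.
import torch
from PIL import Image
from torchvision import transforms
from models.clipseg import CLIPDensePredT

model = CLIPDensePredT(version='ViT-B/16', reduce_dim=64)  # rd64 -> reduce_dim=64 (assumed)
model.eval()
# strict=False: the checkpoint contains only the decoder; the CLIP backbone keeps its pretrained weights
model.load_state_dict(torch.load('weights/rd64-uni.pth', map_location='cpu'), strict=False)

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    transforms.Resize((352, 352)),
])
img = transform(Image.open('example.jpg').convert('RGB')).unsqueeze(0)

with torch.no_grad():
    pred = model(img, ['a glass'])[0]   # forward(inp_image, conditional=...)
mask = torch.sigmoid(pred[0][0])        # per-pixel foreground probability
```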


### Dependencies
This code base depends on pytorch, torchvision and clip (`pip install git+https://github.com/openai/CLIP.git`).
Additional dependencies are hidden for double blind review.


### Datasets

* `PhraseCut` and `PhraseCutPlus`: Referring expression dataset
@@ -33,8 +39,8 @@ For some of the datasets third party dependencies are required. Run the followin
`git clone https://github.com/juhongm999/hsnet.git`

### Weights
CLIPSeg-rd64, CLIPSeg-rd16

- [CLIPSeg-D64](https://github.com/timojl/clipseg/raw/master/weights/rd64-uni.pth) (4.1MB, without CLIP weights)
- [CLIPSeg-D16](https://github.com/timojl/clipseg/raw/master/weights/rd16-uni.pth)
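
As a hedged illustration of how these checkpoints relate to the model definition in `models/clipseg.py`: the `rd64`/`rd16` suffix presumably corresponds to the `reduce_dim` argument, and because the CLIP backbone is excluded, loading non-strictly and inspecting the missing keys is a reasonable sanity check.

```
# Sketch only: rd16 -> reduce_dim=16 is inferred from the file name, not documented.
import torch
from models.clipseg import CLIPDensePredT

model = CLIPDensePredT(version='ViT-B/16', reduce_dim=16)
missing, unexpected = model.load_state_dict(
    torch.load('weights/rd16-uni.pth', map_location='cpu'), strict=False)
# `missing` should list only CLIP backbone parameters, which keep the weights loaded by clip.load().
```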

### Training

@@ -45,3 +51,13 @@ See the experiment folder for yaml definitions of the training configurations. T
In order to use the dataset and model wrappers for PFENet, the PFENet repository needs to be cloned to the root folder.
`git clone https://github.com/Jia-Research-Lab/PFENet.git `

### Citation

```
@article{lueddecke21,
title={Image Segmentation Using Text and Image Prompts},
author={Timo Lüddecke and Alexander Ecker},
journal={...},
year={2021}
}
```
5 changes: 3 additions & 2 deletions Tables.ipynb
@@ -327,8 +327,9 @@
"hash": "800ed241f7db2bd3aa6942aa3be6809cdb30ee6b0a9e773dfecfa9fef1f4c586"
},
"kernelspec": {
"display_name": "Python 3.8.8 64-bit ('env2': conda)",
"name": "python3"
"display_name": "env2",
"language": "python",
"name": "env2"
},
"language_info": {
"codemirror_mode": {
26 changes: 13 additions & 13 deletions Visual_Feature_Engineering.ipynb
@@ -174,23 +174,23 @@
"outputs": [],
"source": [
"preprocessing_functions = [\n",
" ['clip mask CLS L11', lambda x: {'x_inp': x[1].cuda(), 'mask': (11, 'cls_token', x[2].cuda())}],\n",
" ['clip mask CLS all', lambda x: {'x_inp': x[1].cuda(), 'mask': ('all', 'cls_token', x[2].cuda())}],\n",
" ['clip mask all all', lambda x: {'x_inp': x[1].cuda(), 'mask': ('all', 'all', x[2].cuda())}],\n",
" ['colorize object red', partial(img_preprocess, colorize=True)],\n",
" ['add red outline', partial(img_preprocess, outline=True)],\n",
"# ['clip mask CLS L11', lambda x: {'x_inp': x[1].cuda(), 'mask': (11, 'cls_token', x[2].cuda())}],\n",
"# ['clip mask CLS all', lambda x: {'x_inp': x[1].cuda(), 'mask': ('all', 'cls_token', x[2].cuda())}],\n",
"# ['clip mask all all', lambda x: {'x_inp': x[1].cuda(), 'mask': ('all', 'all', x[2].cuda())}],\n",
"# ['colorize object red', partial(img_preprocess, colorize=True)],\n",
"# ['add red outline', partial(img_preprocess, outline=True)],\n",
" \n",
" ['BG brightness 50%', partial(img_preprocess, bg_fac=0.5)],\n",
" ['BG brightness 10%', partial(img_preprocess, bg_fac=0.1)],\n",
" ['BG brightness 0%', partial(img_preprocess, bg_fac=0.0)],\n",
" ['BG blur', partial(img_preprocess, blur=3)],\n",
" ['BG blur & intensity 10%', partial(img_preprocess, blur=3, bg_fac=0.1)],\n",
"# ['BG brightness 50%', partial(img_preprocess, bg_fac=0.5)],\n",
"# ['BG brightness 10%', partial(img_preprocess, bg_fac=0.1)],\n",
"# ['BG brightness 0%', partial(img_preprocess, bg_fac=0.0)],\n",
"# ['BG blur', partial(img_preprocess, blur=3)],\n",
"# ['BG blur & intensity 10%', partial(img_preprocess, blur=3, bg_fac=0.1)],\n",
" \n",
" ['crop large context', partial(img_preprocess, center_context=0.5)],\n",
" ['crop small context', partial(img_preprocess, center_context=0.1)],\n",
"# ['crop large context', partial(img_preprocess, center_context=0.5)],\n",
"# ['crop small context', partial(img_preprocess, center_context=0.1)],\n",
" ['crop & background blur', partial(img_preprocess, blur=3, center_context=0.5)],\n",
" ['crop & intensity 10%', partial(img_preprocess, blur=3, bg_fac=0.1)],\n",
" ['crop & background blur & intensity 10%', partial(img_preprocess, blur=3, center_context=0.1, bg_fac=0.1)],\n",
"# ['crop & background blur & intensity 10%', partial(img_preprocess, blur=3, center_context=0.1, bg_fac=0.1)],\n",
"]\n",
"\n",
"preprocessing_functions = preprocessing_functions\n",
2 changes: 1 addition & 1 deletion datasets/coco_wrapper.py
@@ -51,7 +51,7 @@ def build_img_metadata_classwise(self):
transforms.Normalize(mean, std)
])

self.coco = DatasetCOCO('/user/tluedde/datasets/COCO-20i/', fold, transform, split, 1, False)
self.coco = DatasetCOCO(expanduser('~/datasets/COCO-20i/'), fold, transform, split, 1, False)

self.all_classes = [self.coco.class_ids]
self.coco.base_path = join(expanduser('~/datasets/COCO-20i'))
6 changes: 3 additions & 3 deletions datasets/pascal_zeroshot.py
@@ -7,9 +7,9 @@
from general_utils import log
from torchvision import transforms

PASCAL_VOC_CLASSES_ZS = [['cattle.n.01', 'motorcycle.n.01'], ['aeroplane.n.01', 'sofa.n.01'],
['cat.n.01', 'television.n.03'], ['train.n.01', 'bottle.n.01'],
['chair.n.01', 'pot_plant.n.01']]
# PASCAL_VOC_CLASSES_ZS = [['cattle.n.01', 'motorcycle.n.01'], ['aeroplane.n.01', 'sofa.n.01'],
# ['cat.n.01', 'television.n.03'], ['train.n.01', 'bottle.n.01'],
# ['chair.n.01', 'pot_plant.n.01']]


class PascalZeroShot(object):
3 changes: 2 additions & 1 deletion experiment_setup.py
@@ -337,7 +337,7 @@ def score(config, train_checkpoint_id, train_config):
metric_args['resize_to'] = config.resize_to

if 'sigmoid' in config:
metric_args['sigmoid'] = config.sigmoid
metric_args['sigmoid'] = config.sigmoid

if 'custom_threshold' in config:
metric_args['custom_threshold'] = config.custom_threshold
@@ -449,6 +449,7 @@ def score(config, train_checkpoint_id, train_config):

only_visual = config.only_visual is not None and config.only_visual
with_visual = config.with_visual is not None and config.with_visual

dataset = PhraseCut('test',
image_size=train_config.image_size,
mask=config.mask,
46 changes: 36 additions & 10 deletions models/clipseg.py
@@ -106,7 +106,7 @@ def forward_multihead_attention(x, b, with_aff=False, attn_mask=None):

class CLIPDenseBase(nn.Module):

def __init__(self, version, reduce_cond, reduce_dim, prompt):
def __init__(self, version, reduce_cond, reduce_dim, prompt, n_tokens):
super().__init__()

import clip
Expand All @@ -115,6 +115,9 @@ def __init__(self, version, reduce_cond, reduce_dim, prompt):
self.clip_model, _ = clip.load(version, device='cpu', jit=False)
self.model = self.clip_model.visual

# if not None, scale conv weights such that we obtain n_tokens.
self.n_tokens = n_tokens

for p in self.clip_model.parameters():
p.requires_grad_(False)

@@ -150,8 +153,18 @@ def rescaled_pos_emb(self, new_size):

def visual_forward(self, x_inp, extract_layers=(), skip=False, mask=None):


with torch.no_grad():
x = self.model.conv1(x_inp) # shape = [*, width, grid, grid]

inp_size = x_inp.shape[2:]

if self.n_tokens is not None:
stride2 = x_inp.shape[2] // self.n_tokens
conv_weight2 = nnf.interpolate(self.model.conv1.weight, (stride2, stride2), mode='bilinear', align_corners=True)
x = nnf.conv2d(x_inp, conv_weight2, bias=self.model.conv1.bias, stride=stride2, dilation=self.model.conv1.dilation)
else:
x = self.model.conv1(x_inp) # shape = [*, width, grid, grid]

x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]

@@ -189,6 +202,10 @@ def visual_forward(self, x_inp, extract_layers=(), skip=False, mask=None):

if i in extract_layers:
affinities += [aff_per_head]

#if self.n_tokens is not None:
# activations += [nnf.interpolate(x, inp_size, mode='bilinear', align_corners=True)]
#else:
activations += [x]

if len(extract_layers) > 0 and i == max(extract_layers) and skip:
@@ -283,16 +300,16 @@ class CLIPDensePredT(CLIPDenseBase):
def __init__(self, version='ViT-B/32', extract_layers=(3, 6, 9), cond_layer=0, reduce_dim=128, n_heads=4, prompt='fixed',
extra_blocks=0, reduce_cond=None, fix_shift=False,
learn_trans_conv_only=False, limit_to_clip_only=False, upsample=False,
add_calibration=False, rev_actionvations=False, process_cond=None, not_pretrained=False):
add_calibration=False, rev_activations=False, trans_conv=None, n_tokens=None):

super().__init__(version, reduce_cond, reduce_dim, prompt)
super().__init__(version, reduce_cond, reduce_dim, prompt, n_tokens)
# device = 'cpu'

self.extract_layers = extract_layers
self.cond_layer = cond_layer
self.limit_to_clip_only = limit_to_clip_only
self.process_cond = None
self.rev_activations = None
self.rev_activations = rev_activations

depth = len(extract_layers)

@@ -314,7 +331,12 @@ def __init__(self, version='ViT-B/32', extract_layers=(3, 6, 9), cond_layer=0, r
else:
self.shift_vector = None

trans_conv_ks = {'ViT-B/32': (32, 32), 'ViT-B/16': (16, 16)}[version]
if trans_conv is None:
trans_conv_ks = {'ViT-B/32': (32, 32), 'ViT-B/16': (16, 16)}[version]
else:
# explicitly define transposed conv kernel size
trans_conv_ks = (trans_conv, trans_conv)

self.trans_conv = nn.ConvTranspose2d(reduce_dim, 1, trans_conv_ks, stride=trans_conv_ks)

assert len(self.extract_layers) == depth
@@ -382,8 +404,12 @@ def forward(self, inp_image, conditional=None, return_features=False, mask=None)
size = int(math.sqrt(a.shape[2]))

a = a.view(bs, a.shape[1], size, size)

a = self.trans_conv(a)

if self.n_tokens is not None:
a = nnf.interpolate(a, x_inp.shape[2:], mode='bilinear', align_corners=True)

if self.upsample_proj is not None:
a = self.upsample_proj(a)
a = nnf.interpolate(a, x_inp.shape[2:], mode='bilinear')
@@ -399,13 +425,13 @@ class CLIPDensePredTMasked(CLIPDensePredT):

def __init__(self, version='ViT-B/32', extract_layers=(3, 6, 9), cond_layer=0, reduce_dim=128, n_heads=4,
prompt='fixed', extra_blocks=0, reduce_cond=None, fix_shift=False, learn_trans_conv_only=False,
refine=None, limit_to_clip_only=False, upsample=False, add_calibration=False, process_cond=None):
refine=None, limit_to_clip_only=False, upsample=False, add_calibration=False, n_tokens=None):

super().__init__(version=version, extract_layers=extract_layers, cond_layer=cond_layer, reduce_dim=reduce_dim,
n_heads=n_heads, prompt=prompt, extra_blocks=extra_blocks, reduce_cond=reduce_cond,
fix_shift=fix_shift, learn_trans_conv_only=learn_trans_conv_only,
limit_to_clip_only=limit_to_clip_only, upsample=upsample, add_calibration=add_calibration,
process_cond=process_cond)
n_tokens=n_tokens)

def visual_forward_masked(self, img_s, seg_s):
return super().visual_forward(img_s, mask=('all', 'cls_token', seg_s))
@@ -428,9 +454,9 @@ class CLIPDenseBaseline(CLIPDenseBase):

def __init__(self, version='ViT-B/32', cond_layer=0,
extract_layer=9, reduce_dim=128, reduce2_dim=None, prompt='fixed',
reduce_cond=None, limit_to_clip_only=False):
reduce_cond=None, limit_to_clip_only=False, n_tokens=None):

super().__init__(version, reduce_cond, reduce_dim, prompt)
super().__init__(version, reduce_cond, reduce_dim, prompt, n_tokens)
device = 'cpu'

# self.cond_layer = cond_layer
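
Aside for readers skimming the diff above: the new `n_tokens` branch in `visual_forward` rescales CLIP's patch-embedding convolution so the token grid has roughly `n_tokens` entries per side regardless of input resolution (the decoder output is then interpolated back to the input size in `forward`). A standalone sketch of that arithmetic, using a stand-in for CLIP ViT-B/16's `conv1`:

```
# Illustrative only: mirrors the n_tokens branch with a stand-in patch-embedding conv.
import torch
import torch.nn as nn
import torch.nn.functional as nnf

conv1 = nn.Conv2d(3, 768, kernel_size=16, stride=16, bias=False)  # stand-in for self.model.conv1
x_inp = torch.randn(1, 3, 352, 352)

n_tokens = 16
stride2 = x_inp.shape[2] // n_tokens                       # 352 // 16 = 22
conv_weight2 = nnf.interpolate(conv1.weight, (stride2, stride2),
                               mode='bilinear', align_corners=True)
x = nnf.conv2d(x_inp, conv_weight2, bias=conv1.bias, stride=stride2)
print(x.shape)  # [1, 768, 16, 16] -> 256 tokens instead of the default 22*22 = 484
```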
Binary file added overview.png
4 changes: 2 additions & 2 deletions weights/rd16-uni.pth
100755 → 100644
Git LFS file not shown
