add requirements

AILab-CVC · Apr 28, 2024 · 0d1dd63 · 0d1dd63
1 parent 2bb6516
commit 0d1dd63
Show file tree

Hide file tree

Showing 3 changed files with 60 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -127,7 +127,7 @@ We provide the pre-training logs of `YOLO-World-v2`. Due to the unexpected error
 
 ### 1. Installation
 
-YOLO-World is developed based on `torch==1.11.0` `mmyolo==0.6.0` and `mmdetection==3.0.0`.
+YOLO-World is developed based on `torch==1.11.0`, `mmyolo==0.6.0`, and `mmdetection==3.0.0`. For more details about `requirements` and `mmcv`, see [docs/installation](./docs/installation.md).
 
 #### Clone Project 
 

diff --git a/docs/installation.md b/docs/installation.md
@@ -17,7 +17,6 @@ see more in [official guide](https://github.com/open-mmlab/mmcv/tree/master?tab=
 ```bash
 pip install openmim
 mim install mmcv==2.0.0 
-
 ```
 
 **2. using `pip`**:

diff --git a/tools/generate_image_prompts.py b/tools/generate_image_prompts.py
@@ -0,0 +1,59 @@
+import os
+import tqdm
+import argparse
+import os.path as osp
+import numpy as np
+from PIL import Image
+from transformers import (AutoTokenizer, AutoProcessor,
+                          CLIPVisionModelWithProjection,
+                          CLIPTextModelWithProjection)
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--model',
+        type=str,
+        default='../pretrained_models/open-ai-clip-vit-base-patch32')
+    parser.add_argument('--image-dir', type=str, default='data/samples.txt')
+    parser.add_argument('--out-dir', type=str, default='')
+    parser.add_argument('--out-file', type=str)
+
+    args = parser.parse_args()
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    vision_model = CLIPVisionModelWithProjection.from_pretrained(args.model)
+    text_model = CLIPTextModelWithProjection.from_pretrained(args.model)
+    processor = AutoProcessor.from_pretrained(args.model)
+
+    # padding prompts
+    device = 'cuda:0'
+    text_model.to(device)
+    texts = tokenizer(text=[' '], return_tensors='pt', padding=True)
+    texts = texts.to(device)
+    text_outputs = text_model(**texts)
+    txt_feats = text_outputs.text_embeds
+    txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
+    txt_feats = txt_feats.reshape(-1, txt_feats.shape[-1]).cpu().data.numpy()
+
+    images = os.listdir(args.image_dir)
+    category_embeds = []
+
+    def _forward_vision_model(image_name):
+        image_path = osp.join(args.image_dir, image_name)
+        # category = image_name.split('-')[1]
+        image = Image.open(image_path).convert("RGB")
+        inputs = processor(images=image, return_tensors="pt", padding=True)
+        image_outputs = vision_model(**inputs)
+        img_feats = image_outputs.image_embeds
+        # img_feats
+        img_feats = img_feats / img_feats.norm(p=2, dim=-1, keepdim=True)
+        img_feats = img_feats.reshape(
+            -1, img_feats.shape[-1])[0].cpu().data.numpy()
+        category_embeds.append(img_feats)
+
+    for image_ in tqdm.tqdm(images):
+        _forward_vision_model(image_)
+    category_embeds.append(txt_feats)
+    category_embeds = np.stack(category_embeds)
+    np.save(osp.join(args.out_dir, args.out_file), category_embeds)