Skip to content

Commit

Permalink
add requirements
Browse files Browse the repository at this point in the history
  • Loading branch information
wondervictor committed Apr 28, 2024
1 parent 2bb6516 commit 0d1dd63
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 2 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ We provide the pre-training logs of `YOLO-World-v2`. Due to the unexpected error

### 1. Installation

YOLO-World is developed based on `torch==1.11.0` `mmyolo==0.6.0` and `mmdetection==3.0.0`.
YOLO-World is developed based on `torch==1.11.0`, `mmyolo==0.6.0`, and `mmdetection==3.0.0`. For more details about `requirements` and `mmcv`, see [docs/installation](./docs/installation.md).

#### Clone Project

Expand Down
1 change: 0 additions & 1 deletion docs/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ see more in [official guide](https://github.com/open-mmlab/mmcv/tree/master?tab=
```bash
pip install openmim
mim install mmcv==2.0.0

```

**2. using `pip`**:
Expand Down
59 changes: 59 additions & 0 deletions tools/generate_image_prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import os
import tqdm
import argparse
import os.path as osp
import numpy as np
from PIL import Image
from transformers import (AutoTokenizer, AutoProcessor,
CLIPVisionModelWithProjection,
CLIPTextModelWithProjection)

def parse_args(argv=None):
    """Parse the command-line options of the image-prompt generator.

    Args:
        argv: Optional list of argument strings. ``None`` lets argparse
            read ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``model``, ``image_dir``,
        ``out_dir``, ``out_file`` and ``device``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model',
        type=str,
        default='../pretrained_models/open-ai-clip-vit-base-patch32')
    parser.add_argument('--image-dir', type=str, default='data/samples.txt')
    parser.add_argument('--out-dir', type=str, default='')
    # Required: a missing value used to surface only at the final save,
    # as a TypeError from osp.join(out_dir, None), after all the work.
    parser.add_argument('--out-file', type=str, required=True)
    parser.add_argument(
        '--device',
        type=str,
        default=None,
        help='torch device; defaults to cuda:0 when available, else cpu')
    return parser.parse_args(argv)


def stack_prompt_embeds(image_embeds, pad_embed):
    """Stack per-image embeddings and the padding embedding row-wise.

    Every entry is flattened to one dimension before stacking, so a
    padding embedding of shape ``(1, D)`` can be combined with image
    embeddings of shape ``(D,)``; ``np.stack`` rejects mixed shapes.

    Args:
        image_embeds: Sequence of array-likes, one embedding per image.
        pad_embed: Array-like holding the single padding embedding.

    Returns:
        np.ndarray of shape ``(len(image_embeds) + 1, D)`` whose last row
        is the padding embedding.
    """
    rows = [np.asarray(embed).reshape(-1) for embed in image_embeds]
    rows.append(np.asarray(pad_embed).reshape(-1))
    return np.stack(rows)


def main():
    """Embed every image of ``--image-dir`` and save the embeddings.

    The saved ``.npy`` array holds one L2-normalised image embedding per
    row, in sorted file-name order, followed by one final row with the
    L2-normalised text embedding of ``' '`` (the padding prompt).
    """
    args = parse_args()

    # Imported here because the file's import header does not import torch.
    import torch

    # List the directory first so a bad --image-dir fails before any model
    # is loaded; sort because os.listdir returns entries in arbitrary order
    # and the row order of the output follows this list.
    images = sorted(os.listdir(args.image_dir))

    device = args.device
    if device is None:
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    vision_model = CLIPVisionModelWithProjection.from_pretrained(args.model)
    text_model = CLIPTextModelWithProjection.from_pretrained(args.model)
    processor = AutoProcessor.from_pretrained(args.model)
    text_model.to(device)
    vision_model.to(device)

    category_embeds = []
    # Inference only: skip building autograd graphs.
    with torch.no_grad():
        # padding prompts
        texts = tokenizer(text=[' '], return_tensors='pt', padding=True)
        texts = texts.to(device)
        txt_feats = text_model(**texts).text_embeds
        txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
        pad_embed = txt_feats.cpu().numpy()

        for image_name in tqdm.tqdm(images):
            image_path = osp.join(args.image_dir, image_name)
            # Close the file handle as soon as the pixels are converted.
            with Image.open(image_path) as handle:
                image = handle.convert("RGB")
            inputs = processor(
                images=image, return_tensors="pt", padding=True)
            inputs = inputs.to(device)
            img_feats = vision_model(**inputs).image_embeds
            img_feats = img_feats / img_feats.norm(
                p=2, dim=-1, keepdim=True)
            img_feats = img_feats.reshape(-1, img_feats.shape[-1])[0]
            category_embeds.append(img_feats.cpu().numpy())

    prompt_embeds = stack_prompt_embeds(category_embeds, pad_embed)
    if args.out_dir:
        os.makedirs(args.out_dir, exist_ok=True)
    np.save(osp.join(args.out_dir, args.out_file), prompt_embeds)


if __name__ == "__main__":
    main()

0 comments on commit 0d1dd63

Please sign in to comment.