
Commit 01c5286: added BLIP

1 parent 8bb2894 commit 01c5286

File tree

2 files changed (+374, -7 lines)


Model Notebooks/BLIP/blip.ipynb

Lines changed: 353 additions & 0 deletions

@@ -0,0 +1,353 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5f7ca252",
   "metadata": {},
   "source": [
    "[![Labellerr](https://storage.googleapis.com/labellerr-cdn/%200%20Labellerr%20template/notebook.webp)](https://www.labellerr.com)\n",
    "\n",
    "# BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation\n",
    "\n",
    "---\n",
    "\n",
    "[![labellerr](https://img.shields.io/badge/Labellerr-BLOG-black.svg)](https://www.labellerr.com/blog/<BLOG_NAME>)\n",
    "[![Youtube](https://img.shields.io/badge/Labellerr-YouTube-b31b1b.svg)](https://www.youtube.com/@Labellerr)\n",
    "[![Github](https://img.shields.io/badge/Labellerr-GitHub-green.svg)](https://github.com/Labellerr/Hands-On-Learning-in-Computer-Vision)\n",
    "[![Scientific Paper](https://img.shields.io/badge/Official-Paper-blue.svg)](<PAPER LINK>)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3ab4c690",
   "metadata": {},
   "source": [
    "## Installing Required Libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9de785ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install torch transformers pillow\n",
    "%pip install accelerate"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "58f26a11",
   "metadata": {},
   "source": [
    "## Importing Libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a38f97d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering\n",
    "import requests\n",
    "from PIL import Image\n",
    "from io import BytesIO\n",
    "from IPython.display import display\n",
    "import os\n",
    "from transformers.utils import logging\n",
    "\n",
    "# Suppress unnecessary logs\n",
    "logging.set_verbosity_error()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0d98e278",
   "metadata": {},
   "source": [
    "## Helper Function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5d5fe17c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def show_image(source):\n",
    "    \"\"\"\n",
    "    Display an image from a URL or a local file path.\n",
    "\n",
    "    Args:\n",
    "        source (str): The URL or local file path of the image.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        if source.startswith(\"http://\") or source.startswith(\"https://\"):\n",
    "            # Load image from URL\n",
    "            response = requests.get(source)\n",
    "            response.raise_for_status()  # Raise exception for bad response\n",
    "            img = Image.open(BytesIO(response.content))\n",
    "        elif os.path.exists(source):\n",
    "            # Load image from local file path\n",
    "            img = Image.open(source)\n",
    "        else:\n",
    "            raise ValueError(\"Invalid source. Provide a valid URL or local file path.\")\n",
    "\n",
    "        display(img)\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f\"Error displaying image: {e}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "82f9c591",
   "metadata": {},
   "source": [
    "## Implementing BLIP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7aa31226",
   "metadata": {},
   "outputs": [],
   "source": [
    "def blip(ques: str, img_url: str) -> str:\n",
    "    \"\"\"Perform visual question answering on an image URL using the BLIP VQA model.\"\"\"\n",
    "    # Note: the processor and model are reloaded on every call; caching them once\n",
    "    # outside the function (see the sketch in the next cell) avoids repeated loading.\n",
    "    processor = AutoProcessor.from_pretrained(\"Salesforce/blip-vqa-base\")\n",
    "    model = AutoModelForVisualQuestionAnswering.from_pretrained(\n",
    "        \"Salesforce/blip-vqa-base\",\n",
    "        torch_dtype=torch.float16,\n",
    "        device_map=\"auto\"\n",
    "    )\n",
    "\n",
    "    # Fetch the image and move the inputs to the same device/dtype as the model\n",
    "    # (the original hard-coded \"cuda\", which assumes a GPU is available).\n",
    "    image = Image.open(requests.get(img_url, stream=True).raw).convert(\"RGB\")\n",
    "    inputs = processor(images=image, text=ques, return_tensors=\"pt\").to(model.device, torch.float16)\n",
    "\n",
    "    output = model.generate(**inputs)\n",
    "    answer = processor.batch_decode(output, skip_special_tokens=True)[0]\n",
    "    return answer"
   ]
  },
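  {
   "cell_type": "markdown",
   "id": "added-caching-note",
   "metadata": {},
   "source": [
    "The function above reloads the processor and model weights on every call. The next cell is a minimal sketch, not part of the original notebook, that loads them once and reuses them across questions; `cached_processor`, `cached_model`, and `blip_cached` are hypothetical names, and a GPU is assumed via `device_map=\"auto\"`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "added-caching-sketch",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: load the BLIP VQA processor and model once, then reuse them for every question.\n",
    "# Assumes the imports from the \"Importing Libraries\" cell above have already been run.\n",
    "cached_processor = AutoProcessor.from_pretrained(\"Salesforce/blip-vqa-base\")\n",
    "cached_model = AutoModelForVisualQuestionAnswering.from_pretrained(\n",
    "    \"Salesforce/blip-vqa-base\",\n",
    "    torch_dtype=torch.float16,\n",
    "    device_map=\"auto\"\n",
    ")\n",
    "\n",
    "def blip_cached(ques: str, img_url: str) -> str:\n",
    "    \"\"\"Answer a question about an image URL, reusing the cached processor and model.\"\"\"\n",
    "    image = Image.open(requests.get(img_url, stream=True).raw).convert(\"RGB\")\n",
    "    inputs = cached_processor(images=image, text=ques, return_tensors=\"pt\").to(\n",
    "        cached_model.device, torch.float16\n",
    "    )\n",
    "    output = cached_model.generate(**inputs)\n",
    "    return cached_processor.batch_decode(output, skip_special_tokens=True)[0]"
   ]
  },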
140+
{
141+
"cell_type": "code",
142+
"execution_count": null,
143+
"id": "e10ff77c",
144+
"metadata": {},
145+
"outputs": [],
146+
"source": [
147+
"# def blip(ques: str, img: str) -> str:\n",
148+
"# \"\"\"\n",
149+
"# Perform visual question answering using the BLIP model.\n",
150+
"\n",
151+
"# Args:\n",
152+
"# ques (str): The question to ask about the image.\n",
153+
"# image (str): The URL or local file path of the image.\n",
154+
"\n",
155+
"# Returns:\n",
156+
"# str: The answer to the question.\n",
157+
"# \"\"\"\n",
158+
"# blip_pipeline = pipeline(\n",
159+
"# task=\"visual-question-answering\",\n",
160+
"# model=\"Salesforce/blip-vqa-base\",\n",
161+
"# torch_dtype=torch.float16,\n",
162+
"# device=0\n",
163+
"# )\n",
164+
" \n",
165+
"# answer = blip_pipeline(question=ques, image=img)[0]['answer']\n",
166+
"# return answer"
167+
]
168+
},
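  {
   "cell_type": "markdown",
   "id": "added-pipeline-note",
   "metadata": {},
   "source": [
    "For reference, the next cell is a minimal runnable sketch, not part of the original notebook, of the pipeline-based alternative shown commented out above; `vqa_pipeline` and `blip_via_pipeline` are hypothetical names and a CUDA GPU at `device=0` is assumed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "added-pipeline-sketch",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "# Build the VQA pipeline once; it wraps the same Salesforce/blip-vqa-base checkpoint.\n",
    "vqa_pipeline = pipeline(\n",
    "    task=\"visual-question-answering\",\n",
    "    model=\"Salesforce/blip-vqa-base\",\n",
    "    torch_dtype=torch.float16,\n",
    "    device=0  # first CUDA GPU (assumed available)\n",
    ")\n",
    "\n",
    "def blip_via_pipeline(ques: str, img: str) -> str:\n",
    "    \"\"\"Answer a question about an image (URL or local path) using the VQA pipeline.\"\"\"\n",
    "    return vqa_pipeline(question=ques, image=img)[0][\"answer\"]"
   ]
  },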
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e93aa608",
   "metadata": {},
   "outputs": [],
   "source": [
    "url = \"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg\"\n",
    "show_image(url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2bcbcbd2",
   "metadata": {},
   "outputs": [],
   "source": [
    "blip(\"What is the weather in this image?\", url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eecfb69d",
   "metadata": {},
   "outputs": [],
   "source": [
    "url1 = \"https://farm9.staticflickr.com/8198/8233776747_b27f40f3c2_z.jpg\"\n",
    "show_image(url1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85200d60",
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"how many animals in this image?\"\n",
    "blip(question, url1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "63e3bcd5",
   "metadata": {},
   "outputs": [],
   "source": [
    "ques_list = [\n",
    "    \"What is the weather in this image?\",\n",
    "    \"how many animals in this image?\",\n",
    "    \"which animal is in the image?\",\n",
    "    \"what type of terrain in the image?\",\n",
    "    \"any flowers in the image?\",\n",
    "    \"which time of day it is\"\n",
    "]\n",
    "\n",
    "for ques in ques_list:\n",
    "    print(f\"Question: {ques}\")\n",
    "    answer = blip(ques, url1)\n",
    "    print(f\"Answer: {answer}\\n\")"
   ]
  },
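  {
   "cell_type": "markdown",
   "id": "added-batching-note",
   "metadata": {},
   "source": [
    "Instead of looping and running one forward pass per question, the questions can also be batched into a single `generate` call. The next cell is a rough sketch of that idea, not part of the original notebook; `blip_batched` is a hypothetical name and it reuses the `cached_processor` and `cached_model` defined in the caching sketch above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "added-batching-sketch",
   "metadata": {},
   "outputs": [],
   "source": [
    "def blip_batched(questions: list[str], img_url: str) -> list[str]:\n",
    "    \"\"\"Answer several questions about one image in a single batched generate call.\"\"\"\n",
    "    image = Image.open(requests.get(img_url, stream=True).raw).convert(\"RGB\")\n",
    "    # Repeat the image once per question and pad the questions to a common length.\n",
    "    inputs = cached_processor(\n",
    "        images=[image] * len(questions),\n",
    "        text=questions,\n",
    "        return_tensors=\"pt\",\n",
    "        padding=True\n",
    "    ).to(cached_model.device, torch.float16)\n",
    "    outputs = cached_model.generate(**inputs)\n",
    "    return cached_processor.batch_decode(outputs, skip_special_tokens=True)\n",
    "\n",
    "for q, a in zip(ques_list, blip_batched(ques_list, url1)):\n",
    "    print(f\"Question: {q}\\nAnswer: {a}\\n\")"
   ]
  },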
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b3b137ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "url2 = \"https://i.pinimg.com/1200x/c4/01/99/c40199e777e9467353f41432c351c90a.jpg\"\n",
    "show_image(url2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd1b1661",
   "metadata": {},
   "outputs": [],
   "source": [
    "ques_list = [\n",
    "    \"Numbers of posters in this image\",\n",
    "    \"Name of the device in this image\",\n",
    "    \"On right-side poster, what is written on it?\",\n",
    "    \"Any plant in this image?\"\n",
    "]\n",
    "\n",
    "for ques in ques_list:\n",
    "    print(f\"Question: {ques}\")\n",
    "    answer = blip(ques, url2)\n",
    "    print(f\"Answer: {answer}\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "753b7b2b",
   "metadata": {},
   "outputs": [],
   "source": [
    "url3 = \"https://i.pinimg.com/1200x/0b/41/71/0b417194ea4f479af82c1269b96a81d2.jpg\"\n",
    "show_image(url3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "034910e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "ques_list = [\n",
    "    \"Numbers of coins in this image\",\n",
    "    \"what is the color of coins in this image\",\n",
    "    \"Value written on the coin\",\n",
    "    \"which currency does the coins belong to?\",\n",
    "    \"which currency is written on the coin?\"\n",
    "]\n",
    "\n",
    "for ques in ques_list:\n",
    "    print(f\"Question: {ques}\")\n",
    "    answer = blip(ques, url3)\n",
    "    print(f\"Answer: {answer}\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19c2b834",
   "metadata": {},
   "outputs": [],
   "source": [
    "url4 = \"https://i.pinimg.com/736x/f9/0a/08/f90a0858d9271593f2be424cd62b38ba.jpg\"\n",
    "show_image(url4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "687425cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "ques_list = [\n",
    "    \"which vehicle is in the image?\",\n",
    "    \"what is the color of the vehicle?\",\n",
    "    \"what is the brand of vehicle?\",\n",
    "    \"Numbers of person in the image?\",\n",
    "    \"where is the persons in the image?\",\n",
    "    \"which place is in the image?\",\n",
    "    \"what time of day is in the image\",\n",
    "    \"what is the van plate vehicle ID?\"\n",
    "]\n",
    "\n",
    "for ques in ques_list:\n",
    "    print(f\"Question: {ques}\")\n",
    "    answer = blip(ques, url4)\n",
    "    print(f\"Answer: {answer}\\n\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "VLM",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
