Skip to content

Commit

Permalink
move llava notebook on openvino genai (#2445)
Browse files Browse the repository at this point in the history
CVS-147996
  • Loading branch information
eaidova authored Oct 21, 2024
1 parent 395b629 commit ca3248d
Show file tree
Hide file tree
Showing 8 changed files with 840 additions and 1,723 deletions.
2 changes: 1 addition & 1 deletion .ci/ignore_convert_execution.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ notebooks/llm-chatbot/llm-chatbot.ipynb
notebooks/llm-rag-langchain/llm-rag-langchain.ipynb
notebooks/mms-massively-multilingual-speech/mms-massively-multilingual-speech.ipynb
notebooks/bark-text-to-audio/bark-text-to-audio.ipynb
notebooks/llava-multimodal-chatbot/llava-multimodal-chatbot.ipynb
notebooks/llava-multimodal-chatbot/llava-multimodal-chatbot-genai.ipynb
notebooks/llava-multimodal-chatbot/videollava-multimodal-chatbot.ipynb
notebooks/pix2struct-docvqa/pix2struct-docvqa.ipynb
notebooks/softvc-voice-conversion/softvc-voice-conversion.ipynb
Expand Down
2 changes: 1 addition & 1 deletion .ci/skipped_notebooks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@
- ubuntu-20.04
- ubuntu-22.04
- windows-2019
- notebook: notebooks/llava-multimodal-chatbot/llava-multimodal-chatbot.ipynb
- notebook: notebooks/llava-multimodal-chatbot/llava-multimodal-chatbot-genai.ipynb
skips:
- os:
- macos-12
Expand Down
1 change: 1 addition & 0 deletions .ci/spellcheck/.pyspelling.wordlist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,7 @@ MultiHeadAttention
multilayer
multimodal
Multimodality
multinomial
MusicGen
MuRAG
Müller
Expand Down
2 changes: 0 additions & 2 deletions notebooks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@
- [Create Function-calling Agent using OpenVINO and Qwen-Agent](./llm-agent-functioncall/llm-agent-functioncall-qwen.ipynb)
- [Visual-language assistant with LLaVA Next and OpenVINO](./llava-next-multimodal-chatbot/llava-next-multimodal-chatbot.ipynb)
- [Visual-language assistant with Video-LLaVA and OpenVINO](./llava-multimodal-chatbot/videollava-multimodal-chatbot.ipynb)
- [Visual-language assistant with LLaVA and OpenVINO](./llava-multimodal-chatbot/llava-multimodal-chatbot.ipynb)
- [Text-to-Image Generation with LCM LoRA and ControlNet Conditioning](./latent-consistency-models-image-generation/lcm-lora-controlnet.ipynb)
- [Latent Consistency Model using Optimum-Intel OpenVINO](./latent-consistency-models-image-generation/latent-consistency-models-optimum-demo.ipynb)
- [Image generation with Latent Consistency Model and OpenVINO](./latent-consistency-models-image-generation/latent-consistency-models-image-generation.ipynb)
Expand Down Expand Up @@ -245,7 +244,6 @@
- [Create Function-calling Agent using OpenVINO and Qwen-Agent](./llm-agent-functioncall/llm-agent-functioncall-qwen.ipynb)
- [Visual-language assistant with LLaVA Next and OpenVINO](./llava-next-multimodal-chatbot/llava-next-multimodal-chatbot.ipynb)
- [Visual-language assistant with Video-LLaVA and OpenVINO](./llava-multimodal-chatbot/videollava-multimodal-chatbot.ipynb)
- [Visual-language assistant with LLaVA and OpenVINO](./llava-multimodal-chatbot/llava-multimodal-chatbot.ipynb)
- [Text-to-Image Generation with LCM LoRA and ControlNet Conditioning](./latent-consistency-models-image-generation/lcm-lora-controlnet.ipynb)
- [Latent Consistency Model using Optimum-Intel OpenVINO](./latent-consistency-models-image-generation/latent-consistency-models-optimum-demo.ipynb)
- [Image generation with Latent Consistency Model and OpenVINO](./latent-consistency-models-image-generation/latent-consistency-models-image-generation.ipynb)
Expand Down
211 changes: 126 additions & 85 deletions notebooks/llava-multimodal-chatbot/gradio_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,91 +3,132 @@
import gradio as gr


def make_demo_llava(handle_user_message: Callable, run_chatbot: Callable, clear_history: Callable):
    """Build the (legacy) Gradio Blocks demo for the LLaVA chatbot.

    Args:
        handle_user_message: callback wired to textbox submit / button click;
            takes ``[textbox, chatbot]`` and returns updated ``[textbox, chatbot]``.
        run_chatbot: generation callback chained after the message handler;
            takes ``[imagebox, chatbot, temperature, top_p, max_output_tokens]``
            and streams updates into ``chatbot``.
        clear_history: callback for the "Clear history" button; takes
            ``[textbox, imagebox, chatbot]`` and resets ``[chatbot, textbox, imagebox]``.
    """
    title_markdown = """
# 🌋 LLaVA: Large Language and Vision Assistant
"""

    tos_markdown = """
### Terms of use
By using this service, users are required to agree to the following terms:
The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
"""
    # Close any demos left over from a previous notebook cell execution.
    gr.close_all()
    with gr.Blocks(title="LLaVA") as demo:
        gr.Markdown(title_markdown)

        with gr.Row():
            # Left column: image input plus collapsible sampling parameters.
            with gr.Column():
                imagebox = gr.Image(type="pil")
                with gr.Accordion("Parameters", open=False, visible=True) as parameter_row:
                    temperature = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.2,
                        step=0.1,
                        interactive=True,
                        label="Temperature",
                    )
                    top_p = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.7,
                        step=0.1,
                        interactive=True,
                        label="Top P",
                    )
                    max_output_tokens = gr.Slider(
                        minimum=0,
                        maximum=1024,
                        value=512,
                        step=64,
                        interactive=True,
                        label="Max output tokens",
                    )

            # Right column: chat history, text input, and control buttons.
            # NOTE(review): nesting below is reconstructed from a
            # whitespace-mangled source — confirm against the original file.
            with gr.Column(scale=3):
                with gr.Column(scale=6):
                    chatbot = gr.Chatbot(height=400)
                    with gr.Row():
                        with gr.Column(scale=8):
                            textbox = gr.Textbox(
                                show_label=False,
                                placeholder="Enter text and press ENTER",
                                visible=True,
                                container=False,
                            )
                        with gr.Column(scale=1, min_width=60):
                            submit_btn = gr.Button(value="Submit", visible=True)
                with gr.Row(visible=True) as button_row:
                    clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)

        gr.Markdown(tos_markdown)

        # ENTER in the textbox: first stage the user message (not queued),
        # then stream the model reply into the chatbot (queued).
        submit_event = textbox.submit(
            fn=handle_user_message,
            inputs=[textbox, chatbot],
            outputs=[textbox, chatbot],
            queue=False,
        ).then(
            fn=run_chatbot,
            inputs=[imagebox, chatbot, temperature, top_p, max_output_tokens],
            outputs=chatbot,
            queue=True,
        )
        # Register listeners
        clear_btn.click(fn=clear_history, inputs=[textbox, imagebox, chatbot], outputs=[chatbot, textbox, imagebox])
        # Submit button mirrors the ENTER-key pipeline above.
        submit_click_event = submit_btn.click(
            fn=handle_user_message,
            inputs=[textbox, chatbot],
            outputs=[textbox, chatbot],
            queue=False,
        ).then(
            fn=run_chatbot,
            inputs=[imagebox, chatbot, temperature, top_p, max_output_tokens],
            outputs=chatbot,
            queue=True,
        )
        # NOTE(review): the diff view appears truncated here — the function
        # presumably returns `demo` in lines not shown; verify in the repo.
import inspect
from pathlib import Path
from queue import Queue
from threading import Event, Thread
from typing import Callable

import numpy as np
import requests
from PIL import Image

# Example images referenced by the chat demo's `examples`; fetched once at
# import time so the files exist locally when the UI is launched.
example_image_urls = [
    (
        "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/1d6a0188-5613-418d-a1fd-4560aae1d907",
        "bee.jpg",
    ),
    (
        "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/6cc7feeb-0721-4b5d-8791-2576ed9d2863",
        "baklava.png",
    ),
]
for url, file_name in example_image_urls:
    # Skip files already downloaded on a previous run; the timeout keeps an
    # unresponsive host from hanging module import indefinitely.
    if not Path(file_name).exists():
        Image.open(requests.get(url, stream=True, timeout=60).raw).save(file_name)


def make_demo_llava(model):
    """Build a Gradio ChatInterface demo around an OpenVINO GenAI pipeline.

    Args:
        model: pipeline object exposing ``start_chat()`` and
            ``generate(prompt=..., generation_config=..., streamer=...,
            image=...)`` (e.g. ``openvino_genai.VLMPipeline``).

    Returns:
        A ``gr.ChatInterface`` instance ready to be launched.
    """
    import openvino_genai
    import openvino as ov

    # Older gradio releases do not accept undo/retry button kwargs on
    # ChatInterface; probe the signature so we only pass them when supported.
    has_additional_buttons = "undo_button" in inspect.signature(gr.ChatInterface.__init__).parameters

    def read_image(path: str) -> ov.Tensor:
        """
        Args:
            path: The path to the image.
        Returns: the ov.Tensor containing the image.
        """
        pic = Image.open(path).convert("RGB")
        # NOTE(review): getdata() yields H*W rows of (R, G, B), so reshaping
        # the flat buffer to (1, 3, H, W) does not actually transpose to
        # channels-first; np.byte is also signed, so pixel values > 127 wrap
        # negative. Confirm this is the layout/dtype the target
        # openvino_genai release expects (its samples use (1, H, W, 3) uint8).
        image_data = np.array(pic.getdata()).reshape(1, 3, pic.size[1], pic.size[0]).astype(np.byte)
        return ov.Tensor(image_data)

    class TextQueue:
        """Thread-safe token stream bridging the generation thread and the UI.

        Acts as a GenAI streamer (``__call__`` receives each chunk) and as an
        iterator consumed by the Gradio callback until the stop sentinel.
        """

        def __init__(self) -> None:
            self.text_queue = Queue()
            self.stop_signal = None  # sentinel value pushed by end()
            self.stop_tokens = []

        def __call__(self, text):
            # Invoked by the pipeline for every newly generated text chunk.
            self.text_queue.put(text)

        def __iter__(self):
            return self

        def __next__(self):
            # Blocks until the generator thread produces a chunk or signals end.
            value = self.text_queue.get()
            if value == self.stop_signal or value in self.stop_tokens:
                raise StopIteration()
            else:
                return value

        def reset(self):
            self.text_queue = Queue()

        def end(self):
            # Push the sentinel so the consuming iterator terminates.
            self.text_queue.put(self.stop_signal)

    def bot_streaming(message, history):
        """Gradio callback: stream the model's reply for the latest message."""
        print(f"message is - {message}")
        print(f"history is - {history}")

        if not history:
            # First turn of a fresh conversation — reset pipeline chat state.
            model.start_chat()
        generation_config = openvino_genai.GenerationConfig()
        generation_config.max_new_tokens = 128
        # `message` is a dict in newer gradio, an object with attributes in older.
        files = message["files"] if isinstance(message, dict) else message.files
        message_text = message["text"] if isinstance(message, dict) else message.text

        image = None
        if files:
            # message["files"][-1] is a Dict or just a string
            if isinstance(files[-1], dict):
                image = files[-1]["path"]
            else:
                if isinstance(files[-1], (str, Path)):
                    image = files[-1]
                else:
                    image = files[-1] if isinstance(files[-1], (list, tuple)) else files[-1].path
        if image is not None:
            image = read_image(image)
        streamer = TextQueue()
        stream_complete = Event()

        def generate_and_signal_complete():
            """
            generation function for single thread
            """
            streamer.reset()
            generation_kwargs = {"prompt": message_text, "generation_config": generation_config, "streamer": streamer}
            if image is not None:
                generation_kwargs["image"] = image
            model.generate(**generation_kwargs)
            stream_complete.set()
            streamer.end()

        # Run generation in a worker thread; this callback yields partial
        # replies as the streamer receives them.
        t1 = Thread(target=generate_and_signal_complete)
        t1.start()

        buffer = ""
        for new_text in streamer:
            buffer += new_text
            yield buffer

    additional_buttons = {}
    if has_additional_buttons:
        # Explicitly disable undo/retry where the gradio version supports them.
        additional_buttons = {"undo_button": None, "retry_button": None}
    demo = gr.ChatInterface(
        fn=bot_streaming,
        title="LLaVA OpenVINO Chatbot",
        examples=[
            {"text": "What is on the flower?", "files": ["./bee.jpg"]},
            {"text": "How to make this pastry?", "files": ["./baklava.png"]},
        ],
        stop_btn=None,
        multimodal=True,
        **additional_buttons,
    )
    return demo


Expand Down
Loading

0 comments on commit ca3248d

Please sign in to comment.