omniparser (#2526)
eaidova authored Nov 18, 2024
1 parent db882f2 commit 3d47df5
Showing 11 changed files with 2,765 additions and 21 deletions.
5 changes: 4 additions & 1 deletion .ci/spellcheck/.pyspelling.wordlist.txt
@@ -237,6 +237,7 @@ DreamBooth
Tsinghua
Dreamshaper
dropdown
DynamiCrafter
EasyOCR
ECCV
editability
EfficientNet
@@ -318,6 +319,7 @@ GroundedSAM
GroundingDINO
gRPC
Gu
GUIs
Gutendex
Hafner
HugginFaceH
@@ -355,7 +357,6 @@ impactful
IMU
IMUs
InceptionResNetV
intialization
inferencing
InferRequest
InferRequests
@@ -370,6 +371,7 @@ instantiation
InstructGPT
InstructPix
intel
interactable
InternLM
internlm
Interpolative
@@ -579,6 +581,7 @@ ocr
OCRBench
OCRv
odometry
OmniParser
OMZ
OneFormer
oneformer
4 changes: 2 additions & 2 deletions notebooks/florence2/florence2.ipynb
@@ -247,7 +247,7 @@
"## Run model inference\n",
"[back to top ⬆️](#Table-of-contents:)\n",
"\n",
"`OvFlorence@Model` class defined in `ov_florence2_helper.py` provides convenient way for running model. It accepts directory with converted model and inference device as arguments. For running model we will use `generate` method."
"`OvFlorence2Model` class defined in `ov_florence2_helper.py` provides convenient way for running model. It accepts directory with converted model and inference device as arguments. For running model we will use `generate` method."
]
},
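A minimal usage sketch based on the description above; the model directory, device, and the `generate` call pattern (mirroring the transformers API) are assumptions for illustration, not code from this commit:

```python
from PIL import Image
from transformers import AutoProcessor

from ov_florence2_helper import OvFlorence2Model

# "florence2_ov" is an assumed path to an already-converted model
model = OvFlorence2Model("florence2_ov", "CPU")
processor = AutoProcessor.from_pretrained("florence2_ov", trust_remote_code=True)

image = Image.open("example.png")  # illustrative input image
inputs = processor(text="<OD>", images=image, return_tensors="pt")
# Assumed to mirror transformers' generate(); check the helper for the exact signature
generated_ids = model.generate(
    input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"], max_new_tokens=1024
)
```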
{
@@ -451,7 +451,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.11.4"
},
"openvino_notebooks": {
"imageUrl": "https://github.com/user-attachments/assets/b2469455-8ab6-4718-8fe0-3e9ea17ec1ce",
35 changes: 20 additions & 15 deletions notebooks/florence2/ov_florence2_helper.py
@@ -42,7 +42,23 @@ def cleanup_torchscript_cache():
    torch.jit._state._clear_class_state()


def convert_florence2(model_id, output_dir):
def download_original_model(model_id, orig_model_dir):
    hf_hub.snapshot_download(repo_id=model_id, local_dir=orig_model_dir)
    modeling_file = orig_model_dir / "modeling_florence2.py"
    orig_modeling_file = orig_model_dir / f"orig_{modeling_file.name}"
    # keep a backup of the original modeling file before patching it
    if not orig_modeling_file.exists():
        modeling_file.rename(orig_modeling_file)
    with orig_modeling_file.open("r") as f:
        content = f.read()
    # blank out flash-attn usage so the checkpoint loads without flash_attn installed
    content = content.replace("if is_flash_attn_2_available():", "")
    content = content.replace("    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input", "")
    content = content.replace("    from flash_attn import flash_attn_func, flash_attn_varlen_func", "")
    content = content.replace("    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input", "")
    with modeling_file.open("w") as out_f:
        out_f.write(content)


def convert_florence2(model_id, output_dir, orig_model_dir=None):
    output_dir = Path(output_dir)

    required_conversion = not all(
@@ -60,21 +76,10 @@ def convert_florence2(model_id, output_dir):

print(f"⌛ {model_id} conversion started. Be patient, it may takes some time.")
print("⌛ Load Original model")
orig_model_dir = output_dir / "chkpt"
if orig_model_dir is None:
orig_model_dir = output_dir / "chkpt"
if not orig_model_dir.exists():
hf_hub.snapshot_download(repo_id=model_id, local_dir=orig_model_dir)
modeling_file = orig_model_dir / "modeling_florence2.py"
orig_modeling_file = orig_model_dir / f"orig_{modeling_file.name}"
if not orig_modeling_file.exists():
modeling_file.rename(orig_modeling_file)
with orig_modeling_file.open("r") as f:
content = f.read()
content = content.replace("if is_flash_attn_2_available():", "")
content = content.replace(" from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input", "")
content = content.replace(" from flash_attn import flash_attn_func, flash_attn_varlen_func", "")
content = content.replace(" from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input", "")
with modeling_file.open("w") as out_f:
out_f.write(content)
download_original_model(model_id, orig_model_dir)

model = AutoModelForCausalLM.from_pretrained(orig_model_dir, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(orig_model_dir, trust_remote_code=True)
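A hedged usage sketch of the refactored converter (the model ID and paths are illustrative; the helper and its `orig_model_dir` default come from the diff above):

```python
from pathlib import Path

from ov_florence2_helper import convert_florence2

# Default behavior: the original checkpoint is downloaded into <output_dir>/chkpt
convert_florence2("microsoft/Florence-2-base", "florence2_ov")

# New in this commit: reuse an already-downloaded checkpoint directory
convert_florence2("microsoft/Florence-2-base", "florence2_ov", orig_model_dir=Path("florence2_chkpt"))
```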
@@ -306,7 +306,7 @@
"\n",
"After that, we can load model to Optimum Intel interface replacing the `AutoModelForXxx` class from transformers with the corresponding `OVModelForXxx`. Model conversion will be performed on the fly. For avoiding next time conversion, we can save model on disk using `save_pretrained` method and in the next time pass directory with already converted model as argument in `from_pretrained` method. We also specified `device` parameter for compiling the model on the specific device, if not provided, the default device will be used. The device can be changed later in runtime using `model.to(device)`, please note that it may require some time for model compilation on a newly selected device. In some cases, it can be useful to separate model initialization and compilation, for example, if you want to reshape the model using `reshape` method, you can postpone compilation, providing the parameter `compile=False` into `from_pretrained` method, compilation can be performed manually using `compile` method or will be performed automatically during first inference run.\n",
"\n",
"As example, we will use [nlp_bert_sentiment-analysis_english-base](https://modelscope.cn/models/iic/nlp_bert_sentiment-analysis_english-base). This model was trained for classification input text on 3 sentiment categories: negative, positive and neutral. In transformers, `AutoModelForSequenceClassification` should be used for model intialization, so for usage model with OpenVINO, it is enough just replace `AutoModelForSequenceClassification` to `OVModelForSequenceClassification`.\n"
"As example, we will use [nlp_bert_sentiment-analysis_english-base](https://modelscope.cn/models/iic/nlp_bert_sentiment-analysis_english-base). This model was trained for classification input text on 3 sentiment categories: negative, positive and neutral. In transformers, `AutoModelForSequenceClassification` should be used for model initialization, so for usage model with OpenVINO, it is enough just replace `AutoModelForSequenceClassification` to `OVModelForSequenceClassification`.\n"
]
},
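A sketch of the workflow described above, assuming the checkpoint was already downloaded from ModelScope into a local directory (the paths and input shapes are illustrative):

```python
from optimum.intel import OVModelForSequenceClassification

# "model_dir" is an assumed local directory holding the downloaded checkpoint;
# export=True converts the model to OpenVINO on the fly
model = OVModelForSequenceClassification.from_pretrained(
    "model_dir", export=True, compile=False, device="CPU"
)
model.reshape(1, 128)  # optional: fix input shapes before compiling
model.compile()        # or let the first inference run trigger compilation
model.save_pretrained("ov_model")  # next time: from_pretrained("ov_model")
```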
{
29 changes: 29 additions & 0 deletions notebooks/omniparser/README.md
@@ -0,0 +1,29 @@
# Screen Parsing with OmniParser and OpenVINO

Recent breakthroughs in visual language processing and large language models have made significant strides in understanding and interacting with the world through text and images. However, accurately parsing and understanding complex graphical user interfaces (GUIs) remains a significant challenge.
[OmniParser](https://microsoft.github.io/OmniParser/) is a comprehensive method for parsing user interface screenshots into structured and easy-to-understand elements. This enables more accurate and efficient interaction with GUIs, empowering AI agents to perform tasks across various platforms and applications.

![](https://microsoft.github.io/OmniParser/static/images/flow_merged0.png)

More details about the model can be found in the [Microsoft blog post](https://www.microsoft.com/en-us/research/articles/omniparser-for-pure-vision-based-gui-agent/), the [paper](https://arxiv.org/pdf/2408.00203), the [original repo](https://github.com/microsoft/OmniParser) and the [model card](https://huggingface.co/microsoft/OmniParser).

In this tutorial, we consider how to run OmniParser using OpenVINO.

## Notebook contents
The tutorial consists of the following steps:

- Install requirements
- Convert model
- Run OpenVINO model inference
- Launch Interactive demo

In this demonstration, you will run OmniParser to recognize UI elements in screenshots.


## Installation instructions
This is a self-contained example that relies solely on its own code.<br/>
We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.
For details, please refer to [Installation Guide](../../README.md).

<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=5b5a4db0-7875-4bfb-bdbd-01698b5b1a77&file=notebooks/omniparser/README.md" />
50 changes: 50 additions & 0 deletions notebooks/omniparser/gradio_helper.py
@@ -0,0 +1,50 @@
from pathlib import Path
import requests
from PIL import Image
import gradio as gr

MARKDOWN = """
# OpenVINO OmniParser for Pure Vision Based General GUI Agent 🔥
OmniParser is a screen parsing tool that converts general GUI screens into structured elements.
"""

example_images = [
    ("https://github.com/microsoft/OmniParser/blob/master/imgs/windows_home.png?raw=true", "examples/windows_home.png"),
    ("https://github.com/microsoft/OmniParser/blob/master/imgs/logo.png?raw=true", "examples/logo.png"),
    ("https://github.com/microsoft/OmniParser/blob/master/imgs/windows_multitab.png?raw=true", "examples/multitab.png"),
]


def make_demo(process_fn):
    examples_dir = Path("examples")
    examples_dir.mkdir(exist_ok=True, parents=True)
    for url, filename in example_images:
        if not Path(filename).exists():
            image = Image.open(requests.get(url, stream=True).raw)
            image.save(filename)

    with gr.Blocks() as demo:
        gr.Markdown(MARKDOWN)
        with gr.Row():
            with gr.Column():
                image_input_component = gr.Image(type="filepath", label="Upload image")
                # set the threshold for removing the bounding boxes with low confidence, default is 0.05
                box_threshold_component = gr.Slider(label="Box Threshold", minimum=0.01, maximum=1.0, step=0.01, value=0.05)
                # set the threshold for removing the bounding boxes with large overlap, default is 0.1
                iou_threshold_component = gr.Slider(label="IOU Threshold", minimum=0.01, maximum=1.0, step=0.01, value=0.1)
                imgsz_component = gr.Slider(label="Icon Detect Image Size", minimum=640, maximum=1920, step=32, value=640)
                submit_button_component = gr.Button(value="Submit", variant="primary")
            with gr.Column():
                image_output_component = gr.Image(type="pil", label="Image Output")
                text_output_component = gr.Textbox(label="Parsed screen elements", placeholder="Text Output")
        gr.Examples(
            examples=list(Path("examples").glob("*.png")),
            inputs=[image_input_component],
            label="Try examples",
        )
        submit_button_component.click(
            fn=process_fn,
            inputs=[image_input_component, box_threshold_component, iou_threshold_component, imgsz_component],
            outputs=[image_output_component, text_output_component],
        )
    return demo
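A minimal sketch of how this helper is typically wired up (the `process_fn` body here is a placeholder; the real callback, defined in the notebook, runs the OpenVINO OmniParser pipeline and returns the annotated image plus the parsed-elements text):

```python
from gradio_helper import make_demo


def process_fn(image_path, box_threshold, iou_threshold, imgsz):
    # Placeholder: run OmniParser on image_path with the given thresholds and
    # return (annotated PIL image, parsed elements string)
    raise NotImplementedError


demo = make_demo(process_fn)

if __name__ == "__main__":
    demo.launch()
```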