omniparser (#2526)
eaidova authored Nov 18, 2024
1 parent db882f2 commit 3d47df5
Showing 11 changed files with 2,765 additions and 21 deletions.
5 changes: 4 additions & 1 deletion .ci/spellcheck/.pyspelling.wordlist.txt
@@ -237,6 +237,7 @@ DreamBooth
Tsinghua
Dreamshaper
dropdown
DynamiCrafter
EasyOCR
ECCV
editability
EfficientNet
@@ -318,6 +319,7 @@ GroundedSAM
GroundingDINO
gRPC
Gu
GUIs
Gutendex
Hafner
HugginFaceH
@@ -355,7 +357,6 @@ impactful
IMU
IMUs
InceptionResNetV
intialization
inferencing
InferRequest
InferRequests
@@ -370,6 +371,7 @@ instantiation
InstructGPT
InstructPix
intel
interactable
InternLM
internlm
Interpolative
@@ -579,6 +581,7 @@ ocr
OCRBench
OCRv
odometry
OmniParser
OMZ
OneFormer
oneformer
4 changes: 2 additions & 2 deletions notebooks/florence2/florence2.ipynb
@@ -247,7 +247,7 @@
"## Run model inference\n",
"[back to top ⬆️](#Table-of-contents:)\n",
"\n",
"`OvFlorence@Model` class defined in `ov_florence2_helper.py` provides convenient way for running model. It accepts directory with converted model and inference device as arguments. For running model we will use `generate` method."
"`OvFlorence2Model` class defined in `ov_florence2_helper.py` provides convenient way for running model. It accepts directory with converted model and inference device as arguments. For running model we will use `generate` method."
]
},
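A minimal usage sketch based on the description above; the model directory, device, and the `generate` call pattern (mirroring the transformers API) are assumptions for illustration, not code from this commit:

```python
from PIL import Image
from transformers import AutoProcessor

from ov_florence2_helper import OvFlorence2Model

# "florence2_ov" is an assumed path to an already-converted model
model = OvFlorence2Model("florence2_ov", "CPU")
processor = AutoProcessor.from_pretrained("florence2_ov", trust_remote_code=True)

image = Image.open("example.png")  # illustrative input image
inputs = processor(text="<OD>", images=image, return_tensors="pt")
# Assumed to mirror transformers' generate(); check the helper for the exact signature
generated_ids = model.generate(
    input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"], max_new_tokens=1024
)
```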
{
@@ -451,7 +451,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.11.4"
},
"openvino_notebooks": {
"imageUrl": "https://github.com/user-attachments/assets/b2469455-8ab6-4718-8fe0-3e9ea17ec1ce",
35 changes: 20 additions & 15 deletions notebooks/florence2/ov_florence2_helper.py
@@ -42,7 +42,23 @@ def cleanup_torchscript_cache():
    torch.jit._state._clear_class_state()


def convert_florence2(model_id, output_dir):
def download_original_model(model_id, orig_model_dir):
    hf_hub.snapshot_download(repo_id=model_id, local_dir=orig_model_dir)
    modeling_file = orig_model_dir / "modeling_florence2.py"
    orig_modeling_file = orig_model_dir / f"orig_{modeling_file.name}"
    # keep a backup of the original modeling file before patching it
    if not orig_modeling_file.exists():
        modeling_file.rename(orig_modeling_file)
    with orig_modeling_file.open("r") as f:
        content = f.read()
    # blank out flash-attn usage so the checkpoint loads without flash_attn installed
    content = content.replace("if is_flash_attn_2_available():", "")
    content = content.replace("    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input", "")
    content = content.replace("    from flash_attn import flash_attn_func, flash_attn_varlen_func", "")
    content = content.replace("    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input", "")
    with modeling_file.open("w") as out_f:
        out_f.write(content)


def convert_florence2(model_id, output_dir, orig_model_dir=None):
    output_dir = Path(output_dir)

    required_conversion = not all(
@@ -60,21 +76,10 @@ def convert_florence2(model_id, output_dir):

print(f"⌛ {model_id} conversion started. Be patient, it may takes some time.")
print("⌛ Load Original model")
orig_model_dir = output_dir / "chkpt"
if orig_model_dir is None:
orig_model_dir = output_dir / "chkpt"
if not orig_model_dir.exists():
hf_hub.snapshot_download(repo_id=model_id, local_dir=orig_model_dir)
modeling_file = orig_model_dir / "modeling_florence2.py"
orig_modeling_file = orig_model_dir / f"orig_{modeling_file.name}"
if not orig_modeling_file.exists():
modeling_file.rename(orig_modeling_file)
with orig_modeling_file.open("r") as f:
content = f.read()
content = content.replace("if is_flash_attn_2_available():", "")
content = content.replace(" from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input", "")
content = content.replace(" from flash_attn import flash_attn_func, flash_attn_varlen_func", "")
content = content.replace(" from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input", "")
with modeling_file.open("w") as out_f:
out_f.write(content)
download_original_model(model_id, orig_model_dir)

model = AutoModelForCausalLM.from_pretrained(orig_model_dir, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(orig_model_dir, trust_remote_code=True)
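A hedged usage sketch of the refactored converter (the model ID and paths are illustrative; the helper and its `orig_model_dir` default come from the diff above):

```python
from pathlib import Path

from ov_florence2_helper import convert_florence2

# Default behavior: the original checkpoint is downloaded into <output_dir>/chkpt
convert_florence2("microsoft/Florence-2-base", "florence2_ov")

# New in this commit: reuse an already-downloaded checkpoint directory
convert_florence2("microsoft/Florence-2-base", "florence2_ov", orig_model_dir=Path("florence2_chkpt"))
```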
@@ -306,7 +306,7 @@
"\n",
"After that, we can load model to Optimum Intel interface replacing the `AutoModelForXxx` class from transformers with the corresponding `OVModelForXxx`. Model conversion will be performed on the fly. For avoiding next time conversion, we can save model on disk using `save_pretrained` method and in the next time pass directory with already converted model as argument in `from_pretrained` method. We also specified `device` parameter for compiling the model on the specific device, if not provided, the default device will be used. The device can be changed later in runtime using `model.to(device)`, please note that it may require some time for model compilation on a newly selected device. In some cases, it can be useful to separate model initialization and compilation, for example, if you want to reshape the model using `reshape` method, you can postpone compilation, providing the parameter `compile=False` into `from_pretrained` method, compilation can be performed manually using `compile` method or will be performed automatically during first inference run.\n",
"\n",
"As example, we will use [nlp_bert_sentiment-analysis_english-base](https://modelscope.cn/models/iic/nlp_bert_sentiment-analysis_english-base). This model was trained for classification input text on 3 sentiment categories: negative, positive and neutral. In transformers, `AutoModelForSequenceClassification` should be used for model intialization, so for usage model with OpenVINO, it is enough just replace `AutoModelForSequenceClassification` to `OVModelForSequenceClassification`.\n"
"As example, we will use [nlp_bert_sentiment-analysis_english-base](https://modelscope.cn/models/iic/nlp_bert_sentiment-analysis_english-base). This model was trained for classification input text on 3 sentiment categories: negative, positive and neutral. In transformers, `AutoModelForSequenceClassification` should be used for model initialization, so for usage model with OpenVINO, it is enough just replace `AutoModelForSequenceClassification` to `OVModelForSequenceClassification`.\n"
]
},
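A sketch of the workflow described above, assuming the checkpoint was already downloaded from ModelScope into a local directory (the paths and input shapes are illustrative):

```python
from optimum.intel import OVModelForSequenceClassification

# "model_dir" is an assumed local directory holding the downloaded checkpoint;
# export=True converts the model to OpenVINO on the fly
model = OVModelForSequenceClassification.from_pretrained(
    "model_dir", export=True, compile=False, device="CPU"
)
model.reshape(1, 128)  # optional: fix input shapes before compiling
model.compile()        # or let the first inference run trigger compilation
model.save_pretrained("ov_model")  # next time: from_pretrained("ov_model")
```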
{
29 changes: 29 additions & 0 deletions notebooks/omniparser/README.md
@@ -0,0 +1,29 @@
# Screen Parsing with OmniParser and OpenVINO

Recent breakthroughs in visual language processing and large language models have made significant strides in understanding and interacting with the world through text and images. However, accurately parsing and understanding complex graphical user interfaces (GUIs) remains a significant challenge.
[OmniParser](https://microsoft.github.io/OmniParser/) is a comprehensive method for parsing user interface screenshots into structured and easy-to-understand elements. This enables more accurate and efficient interaction with GUIs, empowering AI agents to perform tasks across various platforms and applications.

![](https://microsoft.github.io/OmniParser/static/images/flow_merged0.png)

More details about the model can be found in the [Microsoft blog post](https://www.microsoft.com/en-us/research/articles/omniparser-for-pure-vision-based-gui-agent/), the [paper](https://arxiv.org/pdf/2408.00203), the [original repo](https://github.com/microsoft/OmniParser) and the [model card](https://huggingface.co/microsoft/OmniParser).

In this tutorial, we consider how to run OmniParser using OpenVINO.

## Notebook contents
The tutorial consists of the following steps:

- Install requirements
- Convert model
- Run OpenVINO model inference
- Launch Interactive demo

In this demonstration, you will run OmniParser to recognize UI elements in screenshots.


## Installation instructions
This is a self-contained example that relies solely on its own code.<br/>
We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.
For details, please refer to [Installation Guide](../../README.md).

<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=5b5a4db0-7875-4bfb-bdbd-01698b5b1a77&file=notebooks/omniparser/README.md" />
50 changes: 50 additions & 0 deletions notebooks/omniparser/gradio_helper.py
@@ -0,0 +1,50 @@
from pathlib import Path
import requests
from PIL import Image
import gradio as gr

MARKDOWN = """
# OpenVINO OmniParser for Pure Vision Based General GUI Agent 🔥
OmniParser is a screen parsing tool that converts general GUI screens into structured elements.
"""

example_images = [
    ("https://github.com/microsoft/OmniParser/blob/master/imgs/windows_home.png?raw=true", "examples/windows_home.png"),
    ("https://github.com/microsoft/OmniParser/blob/master/imgs/logo.png?raw=true", "examples/logo.png"),
    ("https://github.com/microsoft/OmniParser/blob/master/imgs/windows_multitab.png?raw=true", "examples/multitab.png"),
]


def make_demo(process_fn):
    examples_dir = Path("examples")
    examples_dir.mkdir(exist_ok=True, parents=True)
    for url, filename in example_images:
        if not Path(filename).exists():
            image = Image.open(requests.get(url, stream=True).raw)
            image.save(filename)

    with gr.Blocks() as demo:
        gr.Markdown(MARKDOWN)
        with gr.Row():
            with gr.Column():
                image_input_component = gr.Image(type="filepath", label="Upload image")
                # set the threshold for removing the bounding boxes with low confidence, default is 0.05
                box_threshold_component = gr.Slider(label="Box Threshold", minimum=0.01, maximum=1.0, step=0.01, value=0.05)
                # set the threshold for removing the bounding boxes with large overlap, default is 0.1
                iou_threshold_component = gr.Slider(label="IOU Threshold", minimum=0.01, maximum=1.0, step=0.01, value=0.1)
                imgsz_component = gr.Slider(label="Icon Detect Image Size", minimum=640, maximum=1920, step=32, value=640)
                submit_button_component = gr.Button(value="Submit", variant="primary")
            with gr.Column():
                image_output_component = gr.Image(type="pil", label="Image Output")
                text_output_component = gr.Textbox(label="Parsed screen elements", placeholder="Text Output")
        gr.Examples(
            examples=list(Path("examples").glob("*.png")),
            inputs=[image_input_component],
            label="Try examples",
        )
        submit_button_component.click(
            fn=process_fn,
            inputs=[image_input_component, box_threshold_component, iou_threshold_component, imgsz_component],
            outputs=[image_output_component, text_output_component],
        )
    return demo
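A minimal sketch of how this helper is typically wired up (the `process_fn` body here is a placeholder; the real callback, defined in the notebook, runs the OpenVINO OmniParser pipeline and returns the annotated image plus the parsed-elements text):

```python
from gradio_helper import make_demo


def process_fn(image_path, box_threshold, iou_threshold, imgsz):
    # Placeholder: run OmniParser on image_path with the given thresholds and
    # return (annotated PIL image, parsed elements string)
    raise NotImplementedError


demo = make_demo(process_fn)

if __name__ == "__main__":
    demo.launch()
```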