diff --git a/autogen/agentchat/contrib/img_utils.py b/autogen/agentchat/contrib/img_utils.py
index 2d2592418747..a389c74b064d 100644
--- a/autogen/agentchat/contrib/img_utils.py
+++ b/autogen/agentchat/contrib/img_utils.py
@@ -1,14 +1,15 @@
import base64
import copy
-import mimetypes
import os
import re
from io import BytesIO
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Tuple, Union
import requests
from PIL import Image
+from autogen.agentchat import utils
+
def get_pil_image(image_file: Union[str, Image.Image]) -> Image.Image:
"""
@@ -179,13 +180,9 @@ def gpt4v_formatter(prompt: str, img_format: str = "uri") -> List[Union[str, dic
last_index = 0
image_count = 0
- # Regular expression pattern for matching <img ...> tags
- img_tag_pattern = re.compile(r"<img ([^>]+)>")
-
# Find all image tags
- for match in img_tag_pattern.finditer(prompt):
- image_location = match.group(1)
-
+ for parsed_tag in utils.parse_tags_from_content("img", prompt):
+ image_location = parsed_tag["attr"]["src"]
try:
if img_format == "pil":
img_data = get_pil_image(image_location)
@@ -202,12 +199,12 @@ def gpt4v_formatter(prompt: str, img_format: str = "uri") -> List[Union[str, dic
continue
# Add text before this image tag to output list
- output.append({"type": "text", "text": prompt[last_index : match.start()]})
+ output.append({"type": "text", "text": prompt[last_index : parsed_tag["match"].start()]})
# Add image data to output list
output.append({"type": "image_url", "image_url": {"url": img_data}})
- last_index = match.end()
+ last_index = parsed_tag["match"].end()
image_count += 1
# Add remaining text to output list
diff --git a/autogen/agentchat/utils.py b/autogen/agentchat/utils.py
index fde0b0b88b3a..eef3741605d8 100644
--- a/autogen/agentchat/utils.py
+++ b/autogen/agentchat/utils.py
@@ -1,4 +1,6 @@
-from typing import Any, List, Dict, Tuple, Callable
+import re
+from typing import Any, Callable, Dict, List, Tuple, Union
+
from .agent import Agent
@@ -76,3 +78,108 @@ def aggregate_summary(usage_summary: Dict[str, Any], agent_summary: Dict[str, An
aggregate_summary(actual_usage_summary, agent.client.actual_usage_summary)
return total_usage_summary, actual_usage_summary
+
+
+def parse_tags_from_content(tag: str, content: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Dict[str, str]]]:
+ """Parses HTML style tags from message contents.
+
+ The parsing is done by looking for patterns in the text that match the format of HTML tags. The tag to be parsed is
+ specified as an argument to the function. The function looks for this tag in the text and extracts its content. The
+ content of a tag is everything that is inside the tag, between the opening and closing angle brackets. The content
+ can be a single string or a set of attribute-value pairs.
+
+ Examples:
+ <img http://example.com/image.png> -> [{"tag": "img", "attr": {"src": "http://example.com/image.png"}, "match": re.Match}]
+