Skip to content

Commit

Permalink
Parse Any HTML-ish Style Tags (microsoft#2046)
Browse files Browse the repository at this point in the history
* tried implementing my own regex

* improves tests

* finally works

* removes prints

* fixed test

* adds start and end

* delete unused imports

* refactored to use new tool

* significantly improved algo

* tag content -> tag attr

* fix tests + adds new field

* return full match

* return remove start and end

* update docstrings

* update docstrings

* update docstrings

---------

Co-authored-by: Beibin Li <BeibinLi@users.noreply.github.com>
Co-authored-by: Chi Wang <wang.chi@microsoft.com>
  • Loading branch information
3 people authored Mar 26, 2024
1 parent 59a7790 commit 66d96dd
Show file tree
Hide file tree
Showing 3 changed files with 191 additions and 11 deletions.
17 changes: 7 additions & 10 deletions autogen/agentchat/contrib/img_utils.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
import base64
import copy
import mimetypes
import os
import re
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Dict, List, Tuple, Union

import requests
from PIL import Image

from autogen.agentchat import utils


def get_pil_image(image_file: Union[str, Image.Image]) -> Image.Image:
"""
Expand Down Expand Up @@ -179,13 +180,9 @@ def gpt4v_formatter(prompt: str, img_format: str = "uri") -> List[Union[str, dic
last_index = 0
image_count = 0

# Regular expression pattern for matching <img ...> tags
img_tag_pattern = re.compile(r"<img ([^>]+)>")

# Find all image tags
for match in img_tag_pattern.finditer(prompt):
image_location = match.group(1)

for parsed_tag in utils.parse_tags_from_content("img", prompt):
image_location = parsed_tag["attr"]["src"]
try:
if img_format == "pil":
img_data = get_pil_image(image_location)
Expand All @@ -202,12 +199,12 @@ def gpt4v_formatter(prompt: str, img_format: str = "uri") -> List[Union[str, dic
continue

# Add text before this image tag to output list
output.append({"type": "text", "text": prompt[last_index : match.start()]})
output.append({"type": "text", "text": prompt[last_index : parsed_tag["match"].start()]})

# Add image data to output list
output.append({"type": "image_url", "image_url": {"url": img_data}})

last_index = match.end()
last_index = parsed_tag["match"].end()
image_count += 1

# Add remaining text to output list
Expand Down
109 changes: 108 additions & 1 deletion autogen/agentchat/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from typing import Any, List, Dict, Tuple, Callable
import re
from typing import Any, Callable, Dict, List, Tuple, Union

from .agent import Agent


Expand Down Expand Up @@ -76,3 +78,108 @@ def aggregate_summary(usage_summary: Dict[str, Any], agent_summary: Dict[str, An
aggregate_summary(actual_usage_summary, agent.client.actual_usage_summary)

return total_usage_summary, actual_usage_summary


def parse_tags_from_content(tag: str, content: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
    """Parse HTML-style tags from message content.

    The text is scanned for occurrences of ``<tag ...>``. Everything between the tag name and the closing
    angle bracket is extracted and interpreted either as a single bare value (reported under ``"src"``) or
    as a set of quoted attribute-value pairs.

    Examples:
        <img http://example.com/image.png> ->
            [{"tag": "img", "attr": {"src": "http://example.com/image.png"}, "match": re.Match}]
        <audio text="Hello I'm a robot" prompt="whisper"> ->
            [{"tag": "audio", "attr": {"text": "Hello I'm a robot", "prompt": "whisper"}, "match": re.Match}]

    Args:
        tag (str): The HTML-style tag name to look for.
        content (Union[str, List[Dict[str, Any]]]): The message content to parse. Either a plain string or a
            list of content items. In a list, only items whose ``"type"`` is ``"text"`` are parsed; bare
            strings found in the list are treated as text as well.

    Returns:
        List[Dict[str, Any]]: One dictionary per tag found, in order of appearance. Each dictionary has three
        keys: ``"tag"`` (the tag name), ``"attr"`` (a dictionary of the parsed attributes) and ``"match"``
        (the regular expression match object for the whole tag).

    Raises:
        ValueError: If the content is neither a string nor a list.
    """
    results: List[Dict[str, Any]] = []
    if isinstance(content, str):
        results.extend(_parse_tags_from_text(tag, content))
    # Handles case for multimodal messages.
    elif isinstance(content, list):
        for item in content:
            if isinstance(item, str):
                # Multimodal lists may mix bare strings with dict items; a bare string is plain text.
                results.extend(_parse_tags_from_text(tag, item))
            elif item.get("type") == "text":
                results.extend(_parse_tags_from_text(tag, item["text"]))
    else:
        raise ValueError(f"content must be str or list, but got {type(content)}")

    return results


def _parse_tags_from_text(tag: str, text: str) -> List[Dict[str, Any]]:
    """Find every ``<tag ...>`` occurrence in *text* and parse its attributes.

    Args:
        tag (str): The literal tag name to search for.
        text (str): The text to scan.

    Returns:
        List[Dict[str, Any]]: One dictionary per match, in order of appearance, with the keys ``"tag"``
        (the tag name), ``"attr"`` (the parsed attribute dictionary) and ``"match"`` (the ``re.Match``
        covering the whole tag).
    """
    # The tag is escaped so regex metacharacters in it are matched literally rather than as a pattern.
    # The lazy group stops at the first ">"; without re.DOTALL a tag spanning several lines does not match.
    pattern = re.compile(f"<{re.escape(tag)} (.*?)>")

    return [
        {"tag": tag, "attr": _parse_attributes_from_tags(match.group(1).strip()), "match": match}
        for match in pattern.finditer(text)
    ]


def _parse_attributes_from_tags(tag_content: str) -> Dict[str, str]:
    """Convert the raw text found inside a tag into an attribute dictionary.

    The text is split on spaces, the fragments are regrouped by ``_reconstruct_attributes``, and each
    regrouped piece is then classified:

    * ``key='value'`` / ``key="value"`` becomes ``{key: value}`` with the surrounding quotes removed;
    * anything else (no ``=``, or an ``=`` not followed by a quote, such as a URL with a query string)
      is collected, space-joined, under the ``"src"`` key.

    Args:
        tag_content (str): The text between the tag name and the closing angle bracket.

    Returns:
        Dict[str, str]: The parsed attributes.
    """
    tokens = re.findall(r"([^ ]+)", tag_content)
    content: Dict[str, str] = {}

    def _append_src_value(value: str) -> None:
        # Unquoted pieces accumulate into a single space-separated "src" value.
        content["src"] = f"{content['src']} {value}" if "src" in content else value

    for attr in _reconstruct_attributes(tokens):
        key, separator, value = attr.partition("=")
        if not separator or not value.startswith(("'", '"')):
            _append_src_value(attr)
            continue

        opening_quote = value[0]
        if len(value) >= 2 and value.endswith(opening_quote):
            content[key] = value[1:-1]  # remove both quotes
        else:
            # No matching closing quote: drop only the opening one instead of
            # also chopping off a real character at the end of the value.
            content[key] = value[1:]

    return content


def _reconstruct_attributes(attrs: List[str]) -> List[str]:
    """Merge space-split fragments back into whole attributes.

    A fragment opens a new attribute when it looks like ``key='...`` or ``key="...``, i.e. it contains an
    ``=`` whose right-hand side begins with a quote. The very first fragment always opens a new entry.
    Every other fragment is glued, with a single space, onto the entry before it.

    Args:
        attrs (List[str]): Fragments obtained by splitting the tag content on spaces.

    Returns:
        List[str]: The regrouped attributes, in their original order.
    """
    merged: List[str] = []
    for fragment in attrs:
        _, separator, value = fragment.partition("=")
        opens_attribute = bool(separator) and value.startswith(("'", '"'))
        if opens_attribute or not merged:
            merged.append(fragment)
        else:
            merged[-1] += f" {fragment}"
    return merged
76 changes: 76 additions & 0 deletions test/agentchat/test_agentchat_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from typing import Dict, List, Union
from autogen import agentchat
import pytest

# Table of tag-parsing cases. "message" is the input text and "expected" lists the parsed tags
# without their "match" entry (test_tag_parsing strips it before comparing). Expected entries are
# ordered by the tag order the test iterates over: "img", then "audio", then "random".
TAG_PARSING_TESTS = [
# A bare URL inside the tag is reported under "src".
{
"message": "Hello agent, can you take a look at this image <img http://example.com/image.png>",
"expected": [{"tag": "img", "attr": {"src": "http://example.com/image.png"}}],
},
# An "=" inside an unquoted URL must not be mistaken for a key=value separator.
{
"message": "Can you transcribe this audio? <audio http://example.com/au=dio.mp3>",
"expected": [{"tag": "audio", "attr": {"src": "http://example.com/au=dio.mp3"}}],
},
# A quoted attribute keeps its key; an "=" inside the quoted value is preserved.
{
"message": "Can you describe what's in this image <img url='http://example.com/=image.png'>",
"expected": [{"tag": "img", "attr": {"url": "http://example.com/=image.png"}}],
},
# Two different tags in the same message.
{
"message": "Can you describe what's in this image <img http://example.com/image.png> and transcribe this audio? <audio http://example.com/audio.mp3>",
"expected": [
{"tag": "img", "attr": {"src": "http://example.com/image.png"}},
{"tag": "audio", "attr": {"src": "http://example.com/audio.mp3"}},
],
},
# A quoted value containing spaces and an apostrophe, followed by a second attribute.
{
"message": "Can you generate this audio? <audio text='Hello I'm a robot' prompt='whisper'>",
"expected": [{"tag": "audio", "attr": {"text": "Hello I'm a robot", "prompt": "whisper"}}],
},
# A bare URL combined with a quoted attribute, plus a second tag of the same kind.
{
"message": "Can you describe what's in this image <img http://example.com/image.png width='100'> and this image <img http://hello.com/image=.png>?",
"expected": [
{"tag": "img", "attr": {"src": "http://example.com/image.png", "width": "100"}},
{"tag": "img", "attr": {"src": "http://hello.com/image=.png"}},
],
},
# No tags at all yields an empty result.
{
"message": "Text with no tags",
"expected": [],
},
]


def _delete_unused_keys(d: Dict) -> None:
    """Remove the "match" entry from *d* in place; do nothing if it is absent."""
    d.pop("match", None)


@pytest.mark.parametrize("test_case", TAG_PARSING_TESTS)
def test_tag_parsing(test_case: Dict[str, Union[str, List[Dict[str, Union[str, Dict[str, str]]]]]]) -> None:
    """Check parse_tags_from_content against one entry of TAG_PARSING_TESTS.

    The message is parsed twice: once as a plain string and once wrapped in a single-item multimodal
    content list. Both forms must produce the expected tags once the "match" entries are removed.
    """
    message = test_case["message"]
    expected = test_case["expected"]

    content_forms = (message, [{"type": "text", "text": message}])
    for content in content_forms:
        result = []
        for tag in ("img", "audio", "random"):
            parsed_tags = agentchat.utils.parse_tags_from_content(tag, content)
            # Match objects cannot be spelled out in the expectations, so drop them first.
            for item in parsed_tags:
                _delete_unused_keys(item)
            result.extend(parsed_tags)
        assert result == expected


if __name__ == "__main__":
    # Running the file directly exercises every case in the table, not only the first one.
    for case in TAG_PARSING_TESTS:
        test_tag_parsing(case)

0 comments on commit 66d96dd

Please sign in to comment.