Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 87 additions & 7 deletions app/llm.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import base64
import io
import json
import math
from typing import Dict, List, Optional, Union
import os
from typing import Dict, List, Literal, Optional, Union

import tiktoken
from openai import (
Expand All @@ -11,6 +15,7 @@
RateLimitError,
)
from openai.types.chat import ChatCompletion, ChatCompletionMessage
from PIL import Image
from tenacity import (
retry,
retry_if_exception_type,
Expand All @@ -30,7 +35,6 @@
ToolChoice,
)


REASONING_MODELS = ["o1", "o3-mini"]
MULTIMODAL_MODELS = [
"gpt-4-vision-preview",
Expand Down Expand Up @@ -321,12 +325,17 @@ def format_messages(
for item in message["content"]
]

# Add the image to content
# Optimize the image before adding to content
optimized_image = LLM.optimize_image_for_api(
message["base64_image"]
)

# Add the optimized image to content
message["content"].append(
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{message['base64_image']}"
"url": f"data:image/webp;base64,{optimized_image}"
},
}
)
Expand All @@ -351,6 +360,79 @@ def format_messages(

return formatted_messages

@staticmethod
def optimize_image_for_api(
    base64_image: str, max_size: int = 2048, quality: int = 85
) -> str:
    """
    Re-encode a base64 image as WebP, downscaling it if it is too large.

    Args:
        base64_image: Base64 encoded image string
        max_size: Maximum dimension (width or height) in pixels
        quality: WebP quality (0-100, higher = better quality but larger file)

    Returns:
        str: Base64 encoded WebP image. If the input is already a WebP
        image that fits within ``max_size`` it is returned unchanged.
        If anything goes wrong while decoding or encoding, the ORIGINAL
        string is returned untouched, so the result is not guaranteed
        to be WebP in that case.

    Notes:
        - Transparent images are flattened onto a white background, so
          the encoded output is always RGB.
        - Failures are logged as warnings and never raised; this is a
          best-effort optimization.
    """
    try:
        # Decode base64 image
        image_data = base64.b64decode(base64_image)
        image = Image.open(io.BytesIO(image_data))

        width, height = image.size
        needs_resize = width > max_size or height > max_size

        # An image that is already WebP and small enough gains nothing
        # from another lossy pass; re-encoding would only degrade it.
        if image.format == "WEBP" and not needs_resize:
            return base64_image

        # Flatten any transparency onto white. Every alpha-capable mode
        # is normalised to RGBA first so its alpha channel is actually
        # used as the paste mask (pasting "LA" without a mask would
        # discard the alpha instead of compositing it).
        if image.mode in ("RGBA", "LA", "PA", "P"):
            rgba = image if image.mode == "RGBA" else image.convert("RGBA")
            background = Image.new("RGB", rgba.size, (255, 255, 255))
            background.paste(rgba, mask=rgba.split()[-1])
            image = background
        elif image.mode != "RGB":
            image = image.convert("RGB")

        # Resize if image is too large
        if needs_resize:
            # Scale by the tighter of the two constraints to keep the
            # aspect ratio; clamp to 1px so extreme aspect ratios cannot
            # round a side down to 0 (which makes resize() fail).
            ratio = min(max_size / width, max_size / height)
            new_width = max(1, int(width * ratio))
            new_height = max(1, int(height * ratio))
            image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
            logger.info(
                f"Resized image from {width}x{height} to {new_width}x{new_height}"
            )

        # Convert to WebP
        output_buffer = io.BytesIO()
        image.save(output_buffer, format="WEBP", quality=quality)
        optimized_data = output_buffer.getvalue()

        # Encode back to base64
        optimized_base64 = base64.b64encode(optimized_data).decode("utf-8")

        # Report the reduction in real (decoded) bytes, not base64 chars
        original_size = len(image_data)
        optimized_size = len(optimized_data)
        reduction = ((original_size - optimized_size) / original_size) * 100

        logger.info(
            f"Image optimization: {original_size} -> {optimized_size} bytes ({reduction:.1f}% reduction)"
        )

        return optimized_base64

    except Exception as e:
        # Deliberate best-effort: never let optimization break the caller.
        logger.warning(f"Failed to optimize image: {e}. Using original image.")
        return base64_image

@retry(
wait=wait_random_exponential(min=1, max=60),
stop=stop_after_attempt(6),
Expand Down Expand Up @@ -537,9 +619,7 @@ async def ask_with_images(
multimodal_content = (
[{"type": "text", "text": content}]
if isinstance(content, str)
else content
if isinstance(content, list)
else []
else content if isinstance(content, list) else []
)

# Add images to content
Expand Down
7 changes: 4 additions & 3 deletions app/tool/browser_use_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from app.tool.base import BaseTool, ToolResult
from app.tool.web_search import WebSearch


_BROWSER_DESCRIPTION = """\
A powerful browser automation tool that allows interaction with web pages through various actions.
* This tool provides commands for controlling a browser session, navigating web pages, and extracting information
Expand Down Expand Up @@ -508,7 +507,9 @@ async def get_current_state(
full_page=True, animations="disabled", type="jpeg", quality=100
)

screenshot = base64.b64encode(screenshot).decode("utf-8")
# Convert to base64 and optimize with WebP
screenshot_base64 = base64.b64encode(screenshot).decode("utf-8")
optimized_screenshot = LLM.optimize_image_for_api(screenshot_base64)

# Build the state info with all required fields
state_info = {
Expand All @@ -533,7 +534,7 @@ async def get_current_state(

return ToolResult(
output=json.dumps(state_info, indent=4, ensure_ascii=False),
base64_image=screenshot,
base64_image=optimized_screenshot,
)
except Exception as e:
return ToolResult(error=f"Failed to get browser state: {str(e)}")
Expand Down