FoundationAgents · kimasplund · Jul 22, 2025
diff --git a/app/llm.py b/app/llm.py
@@ -1,5 +1,9 @@
+import base64
+import io
+import json
 import math
-from typing import Dict, List, Optional, Union
+import os
+from typing import Dict, List, Literal, Optional, Union
 
 import tiktoken
 from openai import (
@@ -11,6 +15,7 @@
     RateLimitError,
 )
 from openai.types.chat import ChatCompletion, ChatCompletionMessage
+from PIL import Image
 from tenacity import (
     retry,
     retry_if_exception_type,
@@ -30,7 +35,6 @@
     ToolChoice,
 )
 
-
 REASONING_MODELS = ["o1", "o3-mini"]
 MULTIMODAL_MODELS = [
     "gpt-4-vision-preview",
@@ -321,12 +325,17 @@ def format_messages(
                             for item in message["content"]
                         ]
 
-                    # Add the image to content
+                    # Optimize the image before adding to content
+                    optimized_image = LLM.optimize_image_for_api(
+                        message["base64_image"]
+                    )
+
+                    # Add the optimized image to content
                     message["content"].append(
                         {
                             "type": "image_url",
                             "image_url": {
-                                "url": f"data:image/jpeg;base64,{message['base64_image']}"
+                                "url": f"data:image/webp;base64,{optimized_image}"
                             },
                         }
                     )
@@ -351,6 +360,79 @@ def format_messages(
 
         return formatted_messages
 
+    @staticmethod
+    def optimize_image_for_api(
+        base64_image: str, max_size: int = 2048, quality: int = 85
+    ) -> str:
+        """
+        Re-encode a base64 image as WebP, downscaling it if it is too large.
+
+        Transparent pixels are flattened onto a white background, so the
+        encoded result is always an opaque RGB image.
+
+        Args:
+            base64_image: Base64 encoded image string
+            max_size: Maximum dimension (width or height) in pixels
+            quality: WebP quality (0-100, higher = better quality but larger file)
+
+        Returns:
+            str: Base64 encoded WebP image. If decoding or encoding fails,
+            a warning is logged and ``base64_image`` is returned unchanged,
+            so the result is not guaranteed to be WebP.
+        """
+        try:
+            # Decode base64 image
+            image_data = base64.b64decode(base64_image)
+            image = Image.open(io.BytesIO(image_data))
+
+            # Flatten transparency onto white. "P" (palette) and "LA"
+            # (grayscale + alpha) go through RGBA first so that their alpha
+            # channel can serve as the paste mask.
+            if image.mode in ("RGBA", "LA", "P"):
+                rgba = image.convert("RGBA")
+                background = Image.new("RGB", rgba.size, (255, 255, 255))
+                background.paste(rgba, mask=rgba.split()[-1])
+                image = background
+            elif image.mode != "RGB":
+                image = image.convert("RGB")
+
+            # Resize if image is too large
+            width, height = image.size
+            if width > max_size or height > max_size:
+                # Scale by the tighter axis to keep the aspect ratio; clamp
+                # to 1px so extreme aspect ratios never yield a zero side.
+                ratio = min(max_size / width, max_size / height)
+                new_width = max(1, int(width * ratio))
+                new_height = max(1, int(height * ratio))
+                image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+                logger.info(
+                    f"Resized image from {width}x{height} to {new_width}x{new_height}"
+                )
+
+            # Convert to WebP
+            output_buffer = io.BytesIO()
+            image.save(output_buffer, format="WEBP", quality=quality, optimize=True)
+            optimized_data = output_buffer.getvalue()
+
+            # Encode back to base64
+            optimized_base64 = base64.b64encode(optimized_data).decode("utf-8")
+
+            # Report the change in base64 payload length
+            original_size = len(base64_image)
+            optimized_size = len(optimized_base64)
+            reduction = ((original_size - optimized_size) / original_size) * 100
+
+            logger.info(
+                f"Image optimization: {original_size} -> {optimized_size} bytes ({reduction:.1f}% reduction)"
+            )
+
+            return optimized_base64
+
+        except Exception as e:
+            # Deliberate best-effort: never fail the request over optimization.
+            logger.warning(f"Failed to optimize image: {e}. Using original image.")
+            return base64_image
+
     @retry(
         wait=wait_random_exponential(min=1, max=60),
         stop=stop_after_attempt(6),
@@ -537,9 +619,7 @@ async def ask_with_images(
             multimodal_content = (
                 [{"type": "text", "text": content}]
                 if isinstance(content, str)
-                else content
-                if isinstance(content, list)
-                else []
+                else content if isinstance(content, list) else []
             )
 
             # Add images to content

diff --git a/app/tool/browser_use_tool.py b/app/tool/browser_use_tool.py
@@ -15,7 +15,6 @@
 from app.tool.base import BaseTool, ToolResult
 from app.tool.web_search import WebSearch
 
-
 _BROWSER_DESCRIPTION = """\
 A powerful browser automation tool that allows interaction with web pages through various actions.
 * This tool provides commands for controlling a browser session, navigating web pages, and extracting information
@@ -508,7 +507,9 @@ async def get_current_state(
                 full_page=True, animations="disabled", type="jpeg", quality=100
             )
 
-            screenshot = base64.b64encode(screenshot).decode("utf-8")
+            # Convert to base64 and optimize with WebP
+            screenshot_base64 = base64.b64encode(screenshot).decode("utf-8")
+            optimized_screenshot = LLM.optimize_image_for_api(screenshot_base64)
 
             # Build the state info with all required fields
             state_info = {
@@ -533,7 +534,7 @@ async def get_current_state(
 
             return ToolResult(
                 output=json.dumps(state_info, indent=4, ensure_ascii=False),
-                base64_image=screenshot,
+                base64_image=optimized_screenshot,
             )
         except Exception as e:
             return ToolResult(error=f"Failed to get browser state: {str(e)}")