Add DocumentUrl and support document via BinaryContent #987

Merged 20 commits on Mar 13, 2025
55 changes: 53 additions & 2 deletions docs/input.md
@@ -1,6 +1,6 @@
# Image and Audio Input
# Image, Audio & Document Input

Some LLMs are now capable of understanding both audio and image content.
Some LLMs are now capable of understanding audio, image and document content.

## Image Input

@@ -51,3 +51,54 @@ print(result.data)
Some models do not support audio input. Please check the model's documentation to confirm whether it supports audio input.

You can provide audio input using either [`AudioUrl`][pydantic_ai.AudioUrl] or [`BinaryContent`][pydantic_ai.BinaryContent]. The process is analogous to the examples above.

## Document Input

!!! info
Some models do not support document input. Please check the model's documentation to confirm whether it supports document input.

!!! warning
When using Gemini models, the document content will always be sent as binary data, regardless of whether you use `DocumentUrl` or `BinaryContent`. This is due to differences in how Vertex AI and Google AI handle document inputs.

For more details, see [this discussion](https://discuss.ai.google.dev/t/i-am-using-google-generative-ai-model-gemini-1-5-pro-for-image-analysis-but-getting-error/34866/4).

If you are unsatisfied with this behavior, please let us know by opening an issue on
[GitHub](https://github.com/pydantic/pydantic-ai/issues).

You can provide document input using either [`DocumentUrl`][pydantic_ai.DocumentUrl] or [`BinaryContent`][pydantic_ai.BinaryContent]. The process is similar to the examples above.

If you have a direct URL for the document, you can use [`DocumentUrl`][pydantic_ai.DocumentUrl]:

```py {title="main.py" test="skip" lint="skip"}
from pydantic_ai import Agent, DocumentUrl

agent = Agent(model='anthropic:claude-3-sonnet')
result = agent.run_sync(
[
'What is the main content of this document?',
DocumentUrl(url='https://storage.googleapis.com/cloud-samples-data/generative-ai/pdf/2403.05530.pdf'),
]
)
print(result.data)
#> This document is the technical report introducing Gemini 1.5, Google's latest large language model...
```

The supported document formats vary by model.

You can also use [`BinaryContent`][pydantic_ai.BinaryContent] to pass document data directly:

```py {title="main.py" test="skip" lint="skip"}
from pathlib import Path
from pydantic_ai import Agent, BinaryContent

pdf_path = Path('document.pdf')
agent = Agent(model='anthropic:claude-3-sonnet')
result = agent.run_sync(
[
'What is the main content of this document?',
BinaryContent(data=pdf_path.read_bytes(), media_type='application/pdf'),
]
)
print(result.data)
#> The document discusses...
```
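If you don't know a local file's media type ahead of time, the standard library's `mimetypes` module can guess it from the filename before you construct `BinaryContent` (a sketch; the helper name is ours, not part of pydantic-ai):

```python
from mimetypes import guess_type
from pathlib import Path


def media_type_for_file(path: Path) -> str:
    """Guess a media type (e.g. 'application/pdf') from a file name."""
    media_type, _ = guess_type(path.name)
    if media_type is None:
        raise ValueError(f'Cannot guess media type for: {path.name}')
    return media_type


print(media_type_for_file(Path('document.pdf')))
#> application/pdf
```

Whether a guessed type is actually accepted still depends on the model, per the note above.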
3 changes: 2 additions & 1 deletion pydantic_ai_slim/pydantic_ai/__init__.py
@@ -10,7 +10,7 @@
UsageLimitExceeded,
UserError,
)
from .messages import AudioUrl, BinaryContent, ImageUrl
from .messages import AudioUrl, BinaryContent, DocumentUrl, ImageUrl
from .tools import RunContext, Tool

__all__ = (
@@ -33,6 +33,7 @@
# messages
'ImageUrl',
'AudioUrl',
'DocumentUrl',
'BinaryContent',
# tools
'Tool',
123 changes: 112 additions & 11 deletions pydantic_ai_slim/pydantic_ai/messages.py
@@ -4,6 +4,7 @@
from collections.abc import Sequence
from dataclasses import dataclass, field, replace
from datetime import datetime
from mimetypes import guess_type
from typing import Annotated, Any, Literal, Union, cast, overload

import pydantic
@@ -83,9 +84,57 @@ def media_type(self) -> ImageMediaType:
else:
raise ValueError(f'Unknown image file extension: {self.url}')

@property
def format(self) -> ImageFormat:
"""The file format of the image.

The choice of supported formats was based on the Bedrock Converse API. Other APIs don't require a format.
"""
return _image_format(self.media_type)


@dataclass
class DocumentUrl:
"""The URL of the document."""

url: str
"""The URL of the document."""

kind: Literal['document-url'] = 'document-url'
"""Type identifier, this is available on all parts as a discriminator."""

@property
def media_type(self) -> str:
"""Return the media type of the document, based on the url."""
type_, _ = guess_type(self.url)
if type_ is None:
raise RuntimeError(f'Unknown document file extension: {self.url}')
return type_

@property
def format(self) -> DocumentFormat:
"""The file format of the document.

The choice of supported formats was based on the Bedrock Converse API. Other APIs don't require a format.
"""
return _document_format(self.media_type)
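The `media_type` resolution above leans entirely on the stdlib; a minimal standalone sketch of the same logic (the function name here is illustrative, not part of the library):

```python
from mimetypes import guess_type


def document_media_type(url: str) -> str:
    """Mirror of DocumentUrl.media_type: guess the type from the URL's file extension."""
    type_, _ = guess_type(url)
    if type_ is None:
        raise RuntimeError(f'Unknown document file extension: {url}')
    return type_


print(document_media_type('https://example.com/report.csv'))
#> text/csv
```

Note that, like the property, this raises for extensionless URLs rather than returning a fallback.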


AudioMediaType: TypeAlias = Literal['audio/wav', 'audio/mpeg']
ImageMediaType: TypeAlias = Literal['image/jpeg', 'image/png', 'image/gif', 'image/webp']
DocumentMediaType: TypeAlias = Literal[
'application/pdf',
'text/plain',
'text/csv',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'text/html',
'text/markdown',
'application/vnd.ms-excel',
]
AudioFormat: TypeAlias = Literal['wav', 'mp3']
ImageFormat: TypeAlias = Literal['jpeg', 'png', 'gif', 'webp']
DocumentFormat: TypeAlias = Literal['csv', 'doc', 'docx', 'html', 'md', 'pdf', 'txt', 'xls', 'xlsx']


@dataclass
@@ -95,7 +144,7 @@ class BinaryContent:
data: bytes
"""The binary data."""

media_type: AudioMediaType | ImageMediaType | str
media_type: AudioMediaType | ImageMediaType | DocumentMediaType | str
"""The media type of the binary data."""

kind: Literal['binary'] = 'binary'
@@ -112,17 +161,69 @@ def is_image(self) -> bool:
return self.media_type.startswith('image/')

@property
def audio_format(self) -> Literal['mp3', 'wav']:
"""Return the audio format given the media type."""
if self.media_type == 'audio/mpeg':
return 'mp3'
elif self.media_type == 'audio/wav':
return 'wav'
else:
raise ValueError(f'Unknown audio media type: {self.media_type}')
def is_document(self) -> bool:
"""Return `True` if the media type is a document type."""
return self.media_type in {
'application/pdf',
'text/plain',
'text/csv',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'text/html',
'text/markdown',
'application/vnd.ms-excel',
}


UserContent: TypeAlias = 'str | ImageUrl | AudioUrl | BinaryContent'
@property
def format(self) -> str:
"""The file format of the binary content."""
if self.is_audio:
if self.media_type == 'audio/mpeg':
return 'mp3'
elif self.media_type == 'audio/wav':
return 'wav'
elif self.is_image:
return _image_format(self.media_type)
elif self.is_document:
return _document_format(self.media_type)
raise ValueError(f'Unknown media type: {self.media_type}')
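The `format` property dispatches on the media-type family: audio is handled inline, while images and documents delegate to the private helpers. A self-contained sketch of that dispatch (the tables and function name below are illustrative stand-ins, abbreviated to a few entries each):

```python
# Illustrative dispatch mirroring BinaryContent.format (not the library's internals).
IMAGE_FORMATS = {'image/jpeg': 'jpeg', 'image/png': 'png', 'image/gif': 'gif', 'image/webp': 'webp'}
DOCUMENT_FORMATS = {'application/pdf': 'pdf', 'text/plain': 'txt', 'text/csv': 'csv'}


def binary_format(media_type: str) -> str:
    # Audio formats are special-cased because 'audio/mpeg' maps to 'mp3'.
    if media_type == 'audio/mpeg':
        return 'mp3'
    if media_type == 'audio/wav':
        return 'wav'
    if media_type in IMAGE_FORMATS:
        return IMAGE_FORMATS[media_type]
    if media_type in DOCUMENT_FORMATS:
        return DOCUMENT_FORMATS[media_type]
    raise ValueError(f'Unknown media type: {media_type}')


print(binary_format('audio/mpeg'))
#> mp3
```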


UserContent: TypeAlias = 'str | ImageUrl | AudioUrl | DocumentUrl | BinaryContent'


def _document_format(media_type: str) -> DocumentFormat:
if media_type == 'application/pdf':
return 'pdf'
elif media_type == 'text/plain':
return 'txt'
elif media_type == 'text/csv':
return 'csv'
elif media_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
return 'docx'
elif media_type == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
return 'xlsx'
elif media_type == 'text/html':
return 'html'
elif media_type == 'text/markdown':
return 'md'
elif media_type == 'application/vnd.ms-excel':
return 'xls'
else:
raise ValueError(f'Unknown document media type: {media_type}')
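The `elif` chain above is equivalent to a dict lookup; one possible table-driven variant, sketched with the same mappings (a style alternative, not what the PR ships):

```python
# Lookup-table equivalent of _document_format (illustrative sketch).
_DOCUMENT_FORMATS = {
    'application/pdf': 'pdf',
    'text/plain': 'txt',
    'text/csv': 'csv',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
    'text/html': 'html',
    'text/markdown': 'md',
    'application/vnd.ms-excel': 'xls',
}


def document_format(media_type: str) -> str:
    try:
        return _DOCUMENT_FORMATS[media_type]
    except KeyError:
        raise ValueError(f'Unknown document media type: {media_type}') from None
```

The dict version keeps the media-type/format pairs in one place, at the cost of losing the exhaustiveness checking a type checker can do over literal returns.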


def _image_format(media_type: str) -> ImageFormat:
if media_type == 'image/jpeg':
return 'jpeg'
elif media_type == 'image/png':
return 'png'
elif media_type == 'image/gif':
return 'gif'
elif media_type == 'image/webp':
return 'webp'
else:
raise ValueError(f'Unknown image media type: {media_type}')


@dataclass
31 changes: 29 additions & 2 deletions pydantic_ai_slim/pydantic_ai/models/anthropic.py
@@ -9,13 +9,15 @@
from json import JSONDecodeError, loads as json_loads
from typing import Any, Literal, Union, cast, overload

from anthropic.types import DocumentBlockParam
from httpx import AsyncClient as AsyncHTTPClient
from typing_extensions import assert_never

from .. import ModelHTTPError, UnexpectedModelBehavior, _utils, usage
from .._utils import guard_tool_call_id as _guard_tool_call_id
from ..messages import (
BinaryContent,
DocumentUrl,
ImageUrl,
ModelMessage,
ModelRequest,
@@ -42,11 +44,13 @@
try:
from anthropic import NOT_GIVEN, APIStatusError, AsyncAnthropic, AsyncStream
from anthropic.types import (
Base64PDFSourceParam,
ContentBlock,
ImageBlockParam,
Message as AnthropicMessage,
MessageParam,
MetadataParam,
PlainTextSourceParam,
RawContentBlockDeltaEvent,
RawContentBlockStartEvent,
RawContentBlockStopEvent,
@@ -288,7 +292,9 @@ async def _map_message(self, messages: list[ModelMessage]) -> tuple[str, list[Me
anthropic_messages: list[MessageParam] = []
for m in messages:
if isinstance(m, ModelRequest):
user_content_params: list[ToolResultBlockParam | TextBlockParam | ImageBlockParam] = []
user_content_params: list[
ToolResultBlockParam | TextBlockParam | ImageBlockParam | DocumentBlockParam
] = []
for request_part in m.parts:
if isinstance(request_part, SystemPromptPart):
system_prompt += request_part.content
@@ -334,7 +340,9 @@ def
return system_prompt, anthropic_messages

@staticmethod
async def _map_user_prompt(part: UserPromptPart) -> AsyncGenerator[ImageBlockParam | TextBlockParam]:
async def _map_user_prompt(
part: UserPromptPart,
) -> AsyncGenerator[ImageBlockParam | TextBlockParam | DocumentBlockParam]:
if isinstance(part.content, str):
yield TextBlockParam(text=part.content, type='text')
else:
@@ -379,6 +387,25 @@ async def _map_user_prompt(part: UserPromptPart) -> AsyncGenerator[ImageBlockPar
)
else: # pragma: no cover
raise RuntimeError(f'Unsupported image type: {mime_type}')
elif isinstance(item, DocumentUrl):
response = await cached_async_http_client().get(item.url)
response.raise_for_status()
if item.media_type == 'application/pdf':
yield DocumentBlockParam(
source=Base64PDFSourceParam(
data=io.BytesIO(response.content),
media_type=item.media_type,
type='base64',
),
type='document',
)
elif item.media_type == 'text/plain':
yield DocumentBlockParam(
source=PlainTextSourceParam(data=response.text, media_type=item.media_type, type='text'),
type='document',
)
else: # pragma: no cover
raise RuntimeError(f'Unsupported media type: {item.media_type}')
else:
raise RuntimeError(f'Unsupported content type: {type(item)}')
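The `DocumentUrl` branch above fetches the URL and wraps the bytes in a `DocumentBlockParam`: PDFs become a base64 source, plain text becomes a text source. The shape of the resulting blocks can be sketched with plain dicts (illustrative only — the real code uses the `anthropic` SDK's typed params and its async HTTP client):

```python
import base64


def document_block(media_type: str, content: bytes) -> dict:
    """Sketch of the Anthropic document block built for a fetched DocumentUrl."""
    if media_type == 'application/pdf':
        return {
            'type': 'document',
            'source': {
                'type': 'base64',
                'media_type': media_type,
                # PDFs are base64-encoded for the API.
                'data': base64.b64encode(content).decode(),
            },
        }
    elif media_type == 'text/plain':
        return {
            'type': 'document',
            # Plain text is sent as-is, no base64 round-trip needed.
            'source': {'type': 'text', 'media_type': media_type, 'data': content.decode()},
        }
    raise RuntimeError(f'Unsupported media type: {media_type}')
```

This also makes the limitation visible: only `application/pdf` and `text/plain` are mapped for Anthropic here; anything else raises.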
