Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion hindsight-api/hindsight_api/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,8 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]:
ENV_FILE_STORAGE_AZURE_ACCOUNT_NAME = "HINDSIGHT_API_FILE_STORAGE_AZURE_ACCOUNT_NAME"
ENV_FILE_STORAGE_AZURE_ACCOUNT_KEY = "HINDSIGHT_API_FILE_STORAGE_AZURE_ACCOUNT_KEY"
ENV_FILE_PARSER = "HINDSIGHT_API_FILE_PARSER"
ENV_FILE_PARSER_IRIS_TOKEN = "HINDSIGHT_API_FILE_PARSER_IRIS_TOKEN"
ENV_FILE_PARSER_IRIS_ORG_ID = "HINDSIGHT_API_FILE_PARSER_IRIS_ORG_ID"
ENV_FILE_CONVERSION_MAX_BATCH_SIZE_MB = "HINDSIGHT_API_FILE_CONVERSION_MAX_BATCH_SIZE_MB"
ENV_FILE_CONVERSION_MAX_BATCH_SIZE = "HINDSIGHT_API_FILE_CONVERSION_MAX_BATCH_SIZE"
ENV_ENABLE_FILE_UPLOAD_API = "HINDSIGHT_API_ENABLE_FILE_UPLOAD_API"
Expand Down Expand Up @@ -645,7 +647,9 @@ class HindsightConfig:
file_storage_azure_container: str | None # Azure container name (required for azure storage)
file_storage_azure_account_name: str | None # Azure storage account name
file_storage_azure_account_key: str | None # Azure storage account key
file_parser: str # File parser to use (e.g., "markitdown")
file_parser: str # File parser to use (e.g., "markitdown", "iris")
file_parser_iris_token: str | None # Vectorize API token for iris parser (VECTORIZE_TOKEN)
file_parser_iris_org_id: str | None # Vectorize org ID for iris parser (VECTORIZE_ORG_ID)
file_conversion_max_batch_size_mb: int # Max total batch size in MB (all files combined)
file_conversion_max_batch_size: int # Max files per request
enable_file_upload_api: bool
Expand Down Expand Up @@ -712,6 +716,8 @@ class HindsightConfig:
"file_storage_s3_secret_access_key",
"file_storage_gcs_service_account_key",
"file_storage_azure_account_key",
# File parser credentials
"file_parser_iris_token",
}

# CONFIGURABLE_FIELDS: Safe behavioral settings that can be customized per-tenant/bank
Expand Down Expand Up @@ -1030,6 +1036,8 @@ def from_env(cls) -> "HindsightConfig":
file_storage_azure_account_name=os.getenv(ENV_FILE_STORAGE_AZURE_ACCOUNT_NAME) or None,
file_storage_azure_account_key=os.getenv(ENV_FILE_STORAGE_AZURE_ACCOUNT_KEY) or None,
file_parser=os.getenv(ENV_FILE_PARSER, DEFAULT_FILE_PARSER),
file_parser_iris_token=os.getenv(ENV_FILE_PARSER_IRIS_TOKEN) or None,
file_parser_iris_org_id=os.getenv(ENV_FILE_PARSER_IRIS_ORG_ID) or None,
file_conversion_max_batch_size_mb=int(
os.getenv(ENV_FILE_CONVERSION_MAX_BATCH_SIZE_MB, str(DEFAULT_FILE_CONVERSION_MAX_BATCH_SIZE_MB))
),
Expand Down
9 changes: 8 additions & 1 deletion hindsight-api/hindsight_api/engine/memory_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -1374,14 +1374,21 @@ async def verify_llm():
logger.debug(f"File storage initialized ({config.file_storage_type})")

# Initialize parser registry
from .parsers import FileParserRegistry, MarkitdownParser
from .parsers import FileParserRegistry, IrisParser, MarkitdownParser

self._parser_registry = FileParserRegistry()
try:
self._parser_registry.register(MarkitdownParser())
logger.debug("Registered markitdown parser")
except ImportError:
logger.warning("markitdown not available - file parsing disabled")
iris_token = config.file_parser_iris_token
iris_org_id = config.file_parser_iris_org_id
if iris_token and iris_org_id:
self._parser_registry.register(IrisParser(token=iris_token, org_id=iris_org_id))
logger.debug("Registered iris parser")
else:
logger.debug("Iris parser not registered (VECTORIZE_TOKEN or VECTORIZE_ORG_ID not set)")

# Set executor for task backend and initialize
self._task_backend.set_executor(self.execute_task)
Expand Down
8 changes: 5 additions & 3 deletions hindsight-api/hindsight_api/engine/parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
"""File parser implementations."""

from .base import FileParser
from .base import FileParser, UnsupportedFileTypeError
from .iris import IrisParser
from .markitdown import MarkitdownParser

__all__ = ["FileParser", "MarkitdownParser", "FileParserRegistry"]
__all__ = ["FileParser", "UnsupportedFileTypeError", "IrisParser", "MarkitdownParser", "FileParserRegistry"]


class FileParserRegistry:
Expand Down Expand Up @@ -43,7 +44,8 @@ def get_parser(
ValueError: If no suitable parser found
"""
if name:
# Explicit parser requested
# Explicit parser requested — return it directly, let the parser
# raise UnsupportedFileTypeError from convert() if needed
if name not in self._parsers:
raise ValueError(f"Parser '{name}' not found. Available: {list(self._parsers.keys())}")
return self._parsers[name]
Expand Down
19 changes: 14 additions & 5 deletions hindsight-api/hindsight_api/engine/parsers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@
from abc import ABC, abstractmethod


class UnsupportedFileTypeError(Exception):
    """Raised by a parser to signal that it cannot handle the given file type."""


class FileParser(ABC):
"""Abstract base for file to markdown parsers."""

Expand All @@ -19,24 +25,27 @@ async def convert(self, file_data: bytes, filename: str) -> str:
Markdown content as string

Raises:
ValueError: If file format is not supported
RuntimeError: If parsing fails
UnsupportedFileTypeError: If the file type is not supported by this parser
RuntimeError: If parsing fails for another reason
"""
pass

@abstractmethod
def supports(self, filename: str, content_type: str | None = None) -> bool:
"""
Check if parser supports this file type.

Override this for local/static extension-based filtering.
Parsers that delegate to a remote service should leave this as True
and raise UnsupportedFileTypeError from convert() instead.

Args:
filename: File name (used for extension check)
content_type: MIME type (optional)

Returns:
True if this parser can handle the file
True if this parser can handle the file (default: True)
"""
pass
return True

@abstractmethod
def name(self) -> str:
Expand Down
137 changes: 137 additions & 0 deletions hindsight-api/hindsight_api/engine/parsers/iris.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
"""Iris parser implementation using the Vectorize Iris HTTP API."""

import asyncio
import logging
import mimetypes
import time

import httpx

from .base import FileParser, UnsupportedFileTypeError

logger = logging.getLogger(__name__)

_IRIS_BASE_URL = "https://api.vectorize.io/v1"
_DEFAULT_POLL_INTERVAL = 2.0 # seconds
_DEFAULT_TIMEOUT = 300.0 # seconds


class IrisParser(FileParser):
    """
    Iris file parser using the Vectorize Iris cloud extraction service.

    Uploads files to the Vectorize Iris API, starts an extraction job,
    and polls until the text is ready. The API determines which file types
    are supported — UnsupportedFileTypeError is raised if the file is rejected.

    Authentication:
        Requires a Vectorize API token and organization ID, passed via the
        constructor (typically sourced from the
        HINDSIGHT_API_FILE_PARSER_IRIS_TOKEN and
        HINDSIGHT_API_FILE_PARSER_IRIS_ORG_ID environment variables by the
        caller that constructs this parser).
    """

    def __init__(
        self,
        token: str,
        org_id: str,
        poll_interval: float = _DEFAULT_POLL_INTERVAL,
        timeout: float = _DEFAULT_TIMEOUT,
    ):
        """
        Initialize iris parser.

        Args:
            token: Vectorize API token
            org_id: Vectorize organization ID
            poll_interval: Seconds between status poll requests (default: 2)
            timeout: Maximum seconds to wait for extraction (default: 300)
        """
        self._token = token
        self._org_id = org_id
        self._poll_interval = poll_interval
        self._timeout = timeout
        self._auth_headers = {"Authorization": f"Bearer {token}"}

    async def convert(self, file_data: bytes, filename: str) -> str:
        """
        Parse file to text using the Vectorize Iris API.

        Args:
            file_data: Raw file bytes to upload
            filename: Original file name (used for MIME type guessing and
                error messages)

        Returns:
            Extracted text content

        Raises:
            UnsupportedFileTypeError: If the Iris API rejects the file (4xx)
            RuntimeError: If extraction fails, times out, or yields no text
        """
        content_type = mimetypes.guess_type(filename)[0] or "application/octet-stream"

        async with httpx.AsyncClient() as client:
            # Step 1: Request a presigned upload URL
            init_resp = await client.post(
                f"{_IRIS_BASE_URL}/org/{self._org_id}/files",
                headers=self._auth_headers,
                json={"name": filename, "contentType": content_type},
            )
            _raise_for_status(init_resp, filename, "file upload init")
            init_data = init_resp.json()
            file_id: str = init_data["fileId"]
            upload_url: str = init_data["uploadUrl"]

            # Step 2: Upload the file bytes to the presigned URL (no auth header)
            upload_resp = await client.put(
                upload_url,
                content=file_data,
                headers={"Content-Type": content_type},
            )
            _raise_for_status(upload_resp, filename, "file upload")

            # Step 3: Start extraction
            extract_resp = await client.post(
                f"{_IRIS_BASE_URL}/org/{self._org_id}/extraction",
                headers=self._auth_headers,
                json={"fileId": file_id},
            )
            _raise_for_status(extract_resp, filename, "start extraction")
            extraction_id: str = extract_resp.json()["extractionId"]

            # Step 4: Poll until ready or timeout
            deadline = time.monotonic() + self._timeout
            while True:
                status_resp = await client.get(
                    f"{_IRIS_BASE_URL}/org/{self._org_id}/extraction/{extraction_id}",
                    headers=self._auth_headers,
                )
                _raise_for_status(status_resp, filename, "poll extraction status")
                status_data = status_resp.json()

                if status_data.get("ready"):
                    data = status_data.get("data", {})
                    if not data.get("success"):
                        error = data.get("error", "unknown error")
                        # Fix: interpolate the real filename (was the literal "(unknown)")
                        raise RuntimeError(f"Iris extraction failed for '{filename}': {error}")
                    text = data.get("text")
                    if not text:
                        raise RuntimeError(f"No content extracted from '{filename}'")
                    return text

                if time.monotonic() >= deadline:
                    raise RuntimeError(f"Iris extraction timed out after {self._timeout}s for '{filename}'")

                await asyncio.sleep(self._poll_interval)

    def name(self) -> str:
        """Get parser name."""
        return "iris"


def _raise_for_status(response: httpx.Response, filename: str, step: str) -> None:
    """
    Raise an appropriate error including the response body on HTTP errors.

    Args:
        response: The httpx response to inspect
        filename: File name, included in the error message for context
        step: Human-readable pipeline step name, included in the message

    Raises UnsupportedFileTypeError for 4xx responses (file rejected by the API),
    RuntimeError for other HTTP errors.
    """
    if not response.is_error:
        return
    body = response.text or "<empty>"
    # Fix: interpolate the real filename (was the literal "(unknown)")
    msg = f"Iris API error during {step} for '{filename}': {response.status_code} {response.reason_phrase} — {body}"
    if response.is_client_error:
        raise UnsupportedFileTypeError(msg)
    raise RuntimeError(msg)
2 changes: 2 additions & 0 deletions hindsight-api/hindsight_api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,8 @@ def main():
file_storage_azure_account_name=config.file_storage_azure_account_name,
file_storage_azure_account_key=config.file_storage_azure_account_key,
file_parser=config.file_parser,
file_parser_iris_token=config.file_parser_iris_token,
file_parser_iris_org_id=config.file_parser_iris_org_id,
file_conversion_max_batch_size_mb=config.file_conversion_max_batch_size_mb,
file_conversion_max_batch_size=config.file_conversion_max_batch_size,
enable_file_upload_api=config.enable_file_upload_api,
Expand Down
72 changes: 72 additions & 0 deletions hindsight-api/tests/test_iris_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""
Integration tests for the Iris file parser.

Tests are skipped automatically if HINDSIGHT_API_FILE_PARSER_IRIS_TOKEN
and HINDSIGHT_API_FILE_PARSER_IRIS_ORG_ID are not set in the environment.
"""

import os

import pytest

from hindsight_api.config import ENV_FILE_PARSER_IRIS_ORG_ID, ENV_FILE_PARSER_IRIS_TOKEN
from hindsight_api.engine.parsers.iris import IrisParser

_token = os.getenv(ENV_FILE_PARSER_IRIS_TOKEN)
_org_id = os.getenv(ENV_FILE_PARSER_IRIS_ORG_ID)

pytestmark = pytest.mark.skipif(
not (_token and _org_id),
reason="HINDSIGHT_API_FILE_PARSER_IRIS_TOKEN and HINDSIGHT_API_FILE_PARSER_IRIS_ORG_ID not set",
)

# Minimal valid PDF with the text "Hello from Hindsight"
_SAMPLE_PDF = b"""%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792]
/Contents 4 0 R /Resources << /Font << /F1 << /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> >> >> >>
endobj
4 0 obj
<< /Length 44 >>
stream
BT /F1 12 Tf 100 700 Td (Hello from Hindsight) Tj ET
endstream
endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000274 00000 n
trailer << /Size 5 /Root 1 0 R >>
startxref
369
%%EOF"""


@pytest.fixture
def iris_parser() -> IrisParser:
    # _token/_org_id are non-None here: the module-level skipif guarantees
    # both env vars are set before any test (and thus this fixture) runs.
    return IrisParser(token=_token, org_id=_org_id)


@pytest.mark.asyncio
async def test_iris_parser_converts_pdf(iris_parser: IrisParser):
    """IrisParser should extract non-empty text from a valid PDF."""
    extracted = await iris_parser.convert(_SAMPLE_PDF, "sample.pdf")
    assert isinstance(extracted, str)
    assert extracted


@pytest.mark.asyncio
async def test_iris_parser_name(iris_parser: IrisParser):
    """IrisParser.name() should return 'iris'."""
    parser_name = iris_parser.name()
    assert parser_name == "iris"


26 changes: 24 additions & 2 deletions hindsight-docs/docs/developer/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -636,12 +636,34 @@ Configuration for the file upload and conversion pipeline (used by `POST /v1/def
| Variable | Description | Default |
|----------|-------------|---------|
| `HINDSIGHT_API_ENABLE_FILE_UPLOAD_API` | Enable the file upload API endpoint | `true` |
| `HINDSIGHT_API_FILE_PARSER` | File parser to use (`markitdown`) | `markitdown` |
| `HINDSIGHT_API_FILE_PARSER` | File parser to use (`markitdown`, `iris`) | `markitdown` |
| `HINDSIGHT_API_FILE_CONVERSION_MAX_BATCH_SIZE` | Max files per upload request | `10` |
| `HINDSIGHT_API_FILE_CONVERSION_MAX_BATCH_SIZE_MB` | Max total upload size per request (MB) | `100` |
| `HINDSIGHT_API_FILE_DELETE_AFTER_RETAIN` | Delete stored files after memory extraction completes | `true` |

**Supported formats (via markitdown):** PDF, DOCX, DOC, PPTX, PPT, XLSX, XLS, images (JPG, PNG, GIF — OCR), audio (MP3, WAV — transcription), HTML, TXT, MD, CSV, and more.
#### Parser: markitdown (default)

Local file-to-markdown conversion using [Microsoft's markitdown](https://github.com/microsoft/markitdown). No external service required.

**Supported formats:** PDF, DOCX, DOC, PPTX, PPT, XLSX, XLS, images (JPG, PNG — OCR), audio (MP3, WAV — transcription), HTML, TXT, MD, CSV.

#### Parser: iris

Cloud-based extraction via [Vectorize Iris](https://docs.vectorize.io/build-deploy/extract-information/understanding-iris/). Higher quality extraction for complex documents, powered by a remote AI service.

| Variable | Description | Default |
|----------|-------------|---------|
| `HINDSIGHT_API_FILE_PARSER_IRIS_TOKEN` | Vectorize API token | — |
| `HINDSIGHT_API_FILE_PARSER_IRIS_ORG_ID` | Vectorize organization ID | — |

**Supported formats:** PDF, DOCX, DOC, PPTX, PPT, XLSX, XLS, images (JPG, JPEG, PNG, GIF, BMP, TIFF, WEBP), HTML, TXT, MD, CSV.

```bash
# Use iris parser (requires Vectorize account)
export HINDSIGHT_API_FILE_PARSER=iris
export HINDSIGHT_API_FILE_PARSER_IRIS_TOKEN=your-vectorize-token
export HINDSIGHT_API_FILE_PARSER_IRIS_ORG_ID=your-org-id
```

```bash
# Increase batch limits for large file imports
Expand Down
Loading