Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion hindsight-api/hindsight_api/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,8 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]:
ENV_FILE_STORAGE_AZURE_ACCOUNT_NAME = "HINDSIGHT_API_FILE_STORAGE_AZURE_ACCOUNT_NAME"
ENV_FILE_STORAGE_AZURE_ACCOUNT_KEY = "HINDSIGHT_API_FILE_STORAGE_AZURE_ACCOUNT_KEY"
ENV_FILE_PARSER = "HINDSIGHT_API_FILE_PARSER"
ENV_FILE_PARSER_IRIS_TOKEN = "HINDSIGHT_API_FILE_PARSER_IRIS_TOKEN"
ENV_FILE_PARSER_IRIS_ORG_ID = "HINDSIGHT_API_FILE_PARSER_IRIS_ORG_ID"
ENV_FILE_CONVERSION_MAX_BATCH_SIZE_MB = "HINDSIGHT_API_FILE_CONVERSION_MAX_BATCH_SIZE_MB"
ENV_FILE_CONVERSION_MAX_BATCH_SIZE = "HINDSIGHT_API_FILE_CONVERSION_MAX_BATCH_SIZE"
ENV_ENABLE_FILE_UPLOAD_API = "HINDSIGHT_API_ENABLE_FILE_UPLOAD_API"
Expand Down Expand Up @@ -645,7 +647,9 @@ class HindsightConfig:
file_storage_azure_container: str | None # Azure container name (required for azure storage)
file_storage_azure_account_name: str | None # Azure storage account name
file_storage_azure_account_key: str | None # Azure storage account key
file_parser: str # File parser to use (e.g., "markitdown")
file_parser: str # File parser to use (e.g., "markitdown", "iris")
file_parser_iris_token: str | None # Vectorize API token for iris parser (VECTORIZE_TOKEN)
file_parser_iris_org_id: str | None # Vectorize org ID for iris parser (VECTORIZE_ORG_ID)
file_conversion_max_batch_size_mb: int # Max total batch size in MB (all files combined)
file_conversion_max_batch_size: int # Max files per request
enable_file_upload_api: bool
Expand Down Expand Up @@ -712,6 +716,8 @@ class HindsightConfig:
"file_storage_s3_secret_access_key",
"file_storage_gcs_service_account_key",
"file_storage_azure_account_key",
# File parser credentials
"file_parser_iris_token",
}

# CONFIGURABLE_FIELDS: Safe behavioral settings that can be customized per-tenant/bank
Expand Down Expand Up @@ -1030,6 +1036,8 @@ def from_env(cls) -> "HindsightConfig":
file_storage_azure_account_name=os.getenv(ENV_FILE_STORAGE_AZURE_ACCOUNT_NAME) or None,
file_storage_azure_account_key=os.getenv(ENV_FILE_STORAGE_AZURE_ACCOUNT_KEY) or None,
file_parser=os.getenv(ENV_FILE_PARSER, DEFAULT_FILE_PARSER),
file_parser_iris_token=os.getenv(ENV_FILE_PARSER_IRIS_TOKEN) or None,
file_parser_iris_org_id=os.getenv(ENV_FILE_PARSER_IRIS_ORG_ID) or None,
file_conversion_max_batch_size_mb=int(
os.getenv(ENV_FILE_CONVERSION_MAX_BATCH_SIZE_MB, str(DEFAULT_FILE_CONVERSION_MAX_BATCH_SIZE_MB))
),
Expand Down
9 changes: 8 additions & 1 deletion hindsight-api/hindsight_api/engine/memory_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -1374,14 +1374,21 @@ async def verify_llm():
logger.debug(f"File storage initialized ({config.file_storage_type})")

# Initialize parser registry
from .parsers import FileParserRegistry, MarkitdownParser
from .parsers import FileParserRegistry, IrisParser, MarkitdownParser

self._parser_registry = FileParserRegistry()
try:
self._parser_registry.register(MarkitdownParser())
logger.debug("Registered markitdown parser")
except ImportError:
logger.warning("markitdown not available - file parsing disabled")
iris_token = config.file_parser_iris_token
iris_org_id = config.file_parser_iris_org_id
if iris_token and iris_org_id:
self._parser_registry.register(IrisParser(token=iris_token, org_id=iris_org_id))
logger.debug("Registered iris parser")
else:
logger.debug("Iris parser not registered (VECTORIZE_TOKEN or VECTORIZE_ORG_ID not set)")

# Set executor for task backend and initialize
self._task_backend.set_executor(self.execute_task)
Expand Down
8 changes: 5 additions & 3 deletions hindsight-api/hindsight_api/engine/parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
"""File parser implementations."""

from .base import FileParser
from .base import FileParser, UnsupportedFileTypeError
from .iris import IrisParser
from .markitdown import MarkitdownParser

__all__ = ["FileParser", "MarkitdownParser", "FileParserRegistry"]
__all__ = ["FileParser", "UnsupportedFileTypeError", "IrisParser", "MarkitdownParser", "FileParserRegistry"]


class FileParserRegistry:
Expand Down Expand Up @@ -43,7 +44,8 @@ def get_parser(
ValueError: If no suitable parser found
"""
if name:
# Explicit parser requested
# Explicit parser requested — return it directly, let the parser
# raise UnsupportedFileTypeError from convert() if needed
if name not in self._parsers:
raise ValueError(f"Parser '{name}' not found. Available: {list(self._parsers.keys())}")
return self._parsers[name]
Expand Down
19 changes: 14 additions & 5 deletions hindsight-api/hindsight_api/engine/parsers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@
from abc import ABC, abstractmethod


class UnsupportedFileTypeError(Exception):
    """Raised by a parser to signal that it cannot handle the given file type."""


class FileParser(ABC):
"""Abstract base for file to markdown parsers."""

Expand All @@ -19,24 +25,27 @@ async def convert(self, file_data: bytes, filename: str) -> str:
Markdown content as string

Raises:
ValueError: If file format is not supported
RuntimeError: If parsing fails
UnsupportedFileTypeError: If the file type is not supported by this parser
RuntimeError: If parsing fails for another reason
"""
pass

@abstractmethod
def supports(self, filename: str, content_type: str | None = None) -> bool:
"""
Check if parser supports this file type.

Override this for local/static extension-based filtering.
Parsers that delegate to a remote service should leave this as True
and raise UnsupportedFileTypeError from convert() instead.

Args:
filename: File name (used for extension check)
content_type: MIME type (optional)

Returns:
True if this parser can handle the file
True if this parser can handle the file (default: True)
"""
pass
return True

@abstractmethod
def name(self) -> str:
Expand Down
137 changes: 137 additions & 0 deletions hindsight-api/hindsight_api/engine/parsers/iris.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
"""Iris parser implementation using the Vectorize Iris HTTP API."""

import asyncio
import logging
import mimetypes
import time

import httpx

from .base import FileParser, UnsupportedFileTypeError

logger = logging.getLogger(__name__)

_IRIS_BASE_URL = "https://api.vectorize.io/v1"
_DEFAULT_POLL_INTERVAL = 2.0 # seconds
_DEFAULT_TIMEOUT = 300.0 # seconds


class IrisParser(FileParser):
    """
    Iris file parser using the Vectorize Iris cloud extraction service.

    Uploads files to the Vectorize Iris API, starts an extraction job,
    and polls until the text is ready. The API determines which file types
    are supported — UnsupportedFileTypeError is raised if the file is rejected.

    Authentication:
        Requires a Vectorize API token and organization ID, passed via the
        constructor (typically sourced from the
        HINDSIGHT_API_FILE_PARSER_IRIS_TOKEN and
        HINDSIGHT_API_FILE_PARSER_IRIS_ORG_ID environment variables by the
        caller that constructs this parser).
    """

    def __init__(
        self,
        token: str,
        org_id: str,
        poll_interval: float = _DEFAULT_POLL_INTERVAL,
        timeout: float = _DEFAULT_TIMEOUT,
    ):
        """
        Initialize iris parser.

        Args:
            token: Vectorize API token
            org_id: Vectorize organization ID
            poll_interval: Seconds between status poll requests (default: 2)
            timeout: Maximum seconds to wait for extraction (default: 300)
        """
        self._token = token
        self._org_id = org_id
        self._poll_interval = poll_interval
        self._timeout = timeout
        self._auth_headers = {"Authorization": f"Bearer {token}"}

    async def convert(self, file_data: bytes, filename: str) -> str:
        """
        Parse file to text using the Vectorize Iris API.

        Args:
            file_data: Raw file bytes to upload
            filename: Original file name (used for MIME type guessing and
                error messages)

        Returns:
            Extracted text content

        Raises:
            UnsupportedFileTypeError: If the Iris API rejects the file (4xx)
            RuntimeError: If extraction fails, times out, or yields no text
        """
        content_type = mimetypes.guess_type(filename)[0] or "application/octet-stream"

        async with httpx.AsyncClient() as client:
            # Step 1: Request a presigned upload URL
            init_resp = await client.post(
                f"{_IRIS_BASE_URL}/org/{self._org_id}/files",
                headers=self._auth_headers,
                json={"name": filename, "contentType": content_type},
            )
            _raise_for_status(init_resp, filename, "file upload init")
            init_data = init_resp.json()
            file_id: str = init_data["fileId"]
            upload_url: str = init_data["uploadUrl"]

            # Step 2: Upload the file bytes to the presigned URL (no auth header)
            upload_resp = await client.put(
                upload_url,
                content=file_data,
                headers={"Content-Type": content_type},
            )
            _raise_for_status(upload_resp, filename, "file upload")

            # Step 3: Start extraction
            extract_resp = await client.post(
                f"{_IRIS_BASE_URL}/org/{self._org_id}/extraction",
                headers=self._auth_headers,
                json={"fileId": file_id},
            )
            _raise_for_status(extract_resp, filename, "start extraction")
            extraction_id: str = extract_resp.json()["extractionId"]

            # Step 4: Poll until ready or timeout
            deadline = time.monotonic() + self._timeout
            while True:
                status_resp = await client.get(
                    f"{_IRIS_BASE_URL}/org/{self._org_id}/extraction/{extraction_id}",
                    headers=self._auth_headers,
                )
                _raise_for_status(status_resp, filename, "poll extraction status")
                status_data = status_resp.json()

                if status_data.get("ready"):
                    data = status_data.get("data", {})
                    if not data.get("success"):
                        error = data.get("error", "unknown error")
                        # Fix: interpolate the real filename (was the literal "(unknown)")
                        raise RuntimeError(f"Iris extraction failed for '{filename}': {error}")
                    text = data.get("text")
                    if not text:
                        raise RuntimeError(f"No content extracted from '{filename}'")
                    return text

                if time.monotonic() >= deadline:
                    raise RuntimeError(f"Iris extraction timed out after {self._timeout}s for '{filename}'")

                await asyncio.sleep(self._poll_interval)

    def name(self) -> str:
        """Get parser name."""
        return "iris"


def _raise_for_status(response: httpx.Response, filename: str, step: str) -> None:
    """
    Raise an appropriate error including the response body on HTTP errors.

    Args:
        response: The httpx response to inspect
        filename: File name, included in the error message for context
        step: Human-readable pipeline step name, included in the message

    Raises UnsupportedFileTypeError for 4xx responses (file rejected by the API),
    RuntimeError for other HTTP errors.
    """
    if not response.is_error:
        return
    body = response.text or "<empty>"
    # Fix: interpolate the real filename (was the literal "(unknown)")
    msg = f"Iris API error during {step} for '{filename}': {response.status_code} {response.reason_phrase} — {body}"
    if response.is_client_error:
        raise UnsupportedFileTypeError(msg)
    raise RuntimeError(msg)
2 changes: 2 additions & 0 deletions hindsight-api/hindsight_api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,8 @@ def main():
file_storage_azure_account_name=config.file_storage_azure_account_name,
file_storage_azure_account_key=config.file_storage_azure_account_key,
file_parser=config.file_parser,
file_parser_iris_token=config.file_parser_iris_token,
file_parser_iris_org_id=config.file_parser_iris_org_id,
file_conversion_max_batch_size_mb=config.file_conversion_max_batch_size_mb,
file_conversion_max_batch_size=config.file_conversion_max_batch_size,
enable_file_upload_api=config.enable_file_upload_api,
Expand Down
72 changes: 72 additions & 0 deletions hindsight-api/tests/test_iris_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""
Integration tests for the Iris file parser.

Tests are skipped automatically if HINDSIGHT_API_FILE_PARSER_IRIS_TOKEN
and HINDSIGHT_API_FILE_PARSER_IRIS_ORG_ID are not set in the environment.
"""

import os

import pytest

from hindsight_api.config import ENV_FILE_PARSER_IRIS_ORG_ID, ENV_FILE_PARSER_IRIS_TOKEN
from hindsight_api.engine.parsers.iris import IrisParser

_token = os.getenv(ENV_FILE_PARSER_IRIS_TOKEN)
_org_id = os.getenv(ENV_FILE_PARSER_IRIS_ORG_ID)

pytestmark = pytest.mark.skipif(
not (_token and _org_id),
reason="HINDSIGHT_API_FILE_PARSER_IRIS_TOKEN and HINDSIGHT_API_FILE_PARSER_IRIS_ORG_ID not set",
)

# Minimal valid PDF with the text "Hello from Hindsight"
_SAMPLE_PDF = b"""%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792]
/Contents 4 0 R /Resources << /Font << /F1 << /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> >> >> >>
endobj
4 0 obj
<< /Length 44 >>
stream
BT /F1 12 Tf 100 700 Td (Hello from Hindsight) Tj ET
endstream
endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000274 00000 n
trailer << /Size 5 /Root 1 0 R >>
startxref
369
%%EOF"""


@pytest.fixture
def iris_parser() -> IrisParser:
    # _token/_org_id are non-None here: the module-level skipif guarantees
    # both env vars are set before any test (and thus this fixture) runs.
    return IrisParser(token=_token, org_id=_org_id)


@pytest.mark.asyncio
async def test_iris_parser_converts_pdf(iris_parser: IrisParser):
    """IrisParser should extract non-empty text from a valid PDF."""
    extracted = await iris_parser.convert(_SAMPLE_PDF, "sample.pdf")
    assert isinstance(extracted, str)
    assert extracted


@pytest.mark.asyncio
async def test_iris_parser_name(iris_parser: IrisParser):
    """IrisParser.name() should return 'iris'."""
    parser_name = iris_parser.name()
    assert parser_name == "iris"


26 changes: 24 additions & 2 deletions hindsight-docs/docs/developer/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -636,12 +636,34 @@ Configuration for the file upload and conversion pipeline (used by `POST /v1/def
| Variable | Description | Default |
|----------|-------------|---------|
| `HINDSIGHT_API_ENABLE_FILE_UPLOAD_API` | Enable the file upload API endpoint | `true` |
| `HINDSIGHT_API_FILE_PARSER` | File parser to use (`markitdown`) | `markitdown` |
| `HINDSIGHT_API_FILE_PARSER` | File parser to use (`markitdown`, `iris`) | `markitdown` |
| `HINDSIGHT_API_FILE_CONVERSION_MAX_BATCH_SIZE` | Max files per upload request | `10` |
| `HINDSIGHT_API_FILE_CONVERSION_MAX_BATCH_SIZE_MB` | Max total upload size per request (MB) | `100` |
| `HINDSIGHT_API_FILE_DELETE_AFTER_RETAIN` | Delete stored files after memory extraction completes | `true` |

**Supported formats (via markitdown):** PDF, DOCX, DOC, PPTX, PPT, XLSX, XLS, images (JPG, PNG, GIF — OCR), audio (MP3, WAV — transcription), HTML, TXT, MD, CSV, and more.
#### Parser: markitdown (default)

Local file-to-markdown conversion using [Microsoft's markitdown](https://github.com/microsoft/markitdown). No external service required.

**Supported formats:** PDF, DOCX, DOC, PPTX, PPT, XLSX, XLS, images (JPG, PNG — OCR), audio (MP3, WAV — transcription), HTML, TXT, MD, CSV.

#### Parser: iris

Cloud-based extraction via [Vectorize Iris](https://docs.vectorize.io/build-deploy/extract-information/understanding-iris/). Higher quality extraction for complex documents, powered by a remote AI service.

| Variable | Description | Default |
|----------|-------------|---------|
| `HINDSIGHT_API_FILE_PARSER_IRIS_TOKEN` | Vectorize API token | — |
| `HINDSIGHT_API_FILE_PARSER_IRIS_ORG_ID` | Vectorize organization ID | — |

**Supported formats:** PDF, DOCX, DOC, PPTX, PPT, XLSX, XLS, images (JPG, JPEG, PNG, GIF, BMP, TIFF, WEBP), HTML, TXT, MD, CSV.

```bash
# Use iris parser (requires Vectorize account)
export HINDSIGHT_API_FILE_PARSER=iris
export HINDSIGHT_API_FILE_PARSER_IRIS_TOKEN=your-vectorize-token
export HINDSIGHT_API_FILE_PARSER_IRIS_ORG_ID=your-org-id
```

```bash
# Increase batch limits for large file imports
Expand Down
Loading