Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions examples/bedrock_llm_extraction/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Postgres database address for cocoindex
COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex

# Fallback to CPU for operations not supported by MPS on Mac.
# It's no-op for other platforms.
PYTORCH_ENABLE_MPS_FALLBACK=1

# AWS Bedrock credentials
BEDROCK_API_KEY=your_bedrock_api_key
BEDROCK_REGION=your_bedrock_region
1 change: 1 addition & 0 deletions examples/bedrock_llm_extraction/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.env
72 changes: 72 additions & 0 deletions examples/bedrock_llm_extraction/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Structured Data Extraction from PDF with AWS Bedrock and CocoIndex

In this example, we

* Convert PDFs (generated from a few Python docs) into Markdown.
* Extract structured information from the Markdown using an AWS Bedrock LLM.
* Use a custom function to further extract information from the structured output.

Please give [Cocoindex on Github](https://github.com/cocoindex-io/cocoindex) a star to support us if you like our work. Thank you so much with a warm coconut hug 🥥🤗. [![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex)

## Prerequisite

Before running the example, you need to:

* [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.
* Configure your AWS Bedrock credentials. This example uses AWS Bedrock as the LLM provider; follow [this guide](https://docs.aws.amazon.com/bedrock/latest/userguide/api-keys.html) to create an API key. Alternatively, you can follow the comments in the source code to switch to another LLM.

First, copy the example environment file:

```bash
cp .env.example .env
```

Then, open the `.env` file and fill in your AWS Bedrock credentials. The `.env` file is ignored by git, so your secrets will not be committed.

## Run


### Build the index

Install dependencies:

```bash
pip install -e .
```

Setup:

```bash
cocoindex setup main.py
```

Update index:

```bash
cocoindex update main.py
```

### Query the index

After the index is built, you have a table named `modules_info`. You can query it at any time, e.g. by starting a Postgres shell:

```bash
psql postgres://cocoindex:cocoindex@localhost/cocoindex
```

And run the SQL query:

```sql
SELECT filename, module_info->'title' AS title, module_summary FROM modules_info;
```
You should see results like:

```
filename | title | module_summary
---------------------+------------------------+--------------------------
manuals/asyncio.pdf | "asyncio — Asynchronous" | {"num_classes": 0, "num_methods": 0}
manuals/json.pdf | "json — JSON encoder" | {"num_classes": 0, "num_methods": 0}
(2 rows)
```

The output may vary depending on the model you are using. The important part is that the `module_info` and `module_summary` columns are populated with the extracted data.
138 changes: 138 additions & 0 deletions examples/bedrock_llm_extraction/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import dataclasses
import os
import tempfile

from marker.config.parser import ConfigParser
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

import cocoindex


class PdfToMarkdown(cocoindex.op.FunctionSpec):
    """Function spec for turning the bytes of a PDF document into Markdown text."""


@cocoindex.op.executor_class(gpu=True, cache=True, behavior_version=1)
class PdfToMarkdownExecutor:
    """Executor for PdfToMarkdown.

    A single marker ``PdfConverter`` is created in ``prepare`` and reused by
    every ``__call__``.
    """

    spec: PdfToMarkdown
    _converter: PdfConverter

    def prepare(self):
        """Create the marker converter from an empty marker configuration."""
        config_parser = ConfigParser({})
        self._converter = PdfConverter(
            create_model_dict(), config=config_parser.generate_config_dict()
        )

    def __call__(self, content: bytes) -> str:
        """Convert the PDF given as raw bytes into Markdown text.

        Args:
            content: The raw bytes of one PDF file.

        Returns:
            The text produced by marker for that PDF.
        """
        # The converter takes a file path, so the bytes must be written to
        # disk first. The file is written and *closed* inside a temporary
        # directory before it is handed over: a still-open NamedTemporaryFile
        # cannot be opened a second time by name on Windows.
        with tempfile.TemporaryDirectory() as temp_dir:
            pdf_path = os.path.join(temp_dir, "input.pdf")
            with open(pdf_path, "wb") as pdf_file:
                pdf_file.write(content)
            rendered = self._converter(pdf_path)
            # text_from_rendered returns a 3-tuple; only the text is used.
            text, _, _ = text_from_rendered(rendered)
            return text


@dataclasses.dataclass
class ArgInfo:
    """Describes a single argument accepted by a method."""

    # The argument's name.
    name: str
    # Free-text description of the argument.
    description: str


@dataclasses.dataclass
class MethodInfo:
    """Describes a single method: its name, its arguments and what it does."""

    # The method's name.
    name: str
    # The method's arguments, one ArgInfo per argument.
    args: list[ArgInfo]
    # Free-text description of the method.
    description: str


@dataclasses.dataclass
class ClassInfo:
    """Describes a single class together with the methods it defines."""

    # The class's name.
    name: str
    # Free-text description of the class.
    description: str
    # The class's methods, one MethodInfo per method.
    methods: list[MethodInfo]


@dataclasses.dataclass
class ModuleInfo:
    """Describes a Python module: title, description, classes and methods."""

    # Title of the module.
    title: str
    # Free-text description of the module.
    description: str
    # Classes of the module, one ClassInfo per class.
    classes: list[ClassInfo]
    # Methods of the module, one MethodInfo per method.
    methods: list[MethodInfo]


@dataclasses.dataclass
class ModuleSummary:
    """Aggregate counts describing a Python module."""

    # Number of classes in the module.
    num_classes: int
    # Number of methods in the module.
    num_methods: int


@cocoindex.op.function()
def summarize_module(module_info: ModuleInfo) -> ModuleSummary:
    """Count the classes and the methods listed in ``module_info``."""
    class_count = len(module_info.classes)
    method_count = len(module_info.methods)
    return ModuleSummary(num_classes=class_count, num_methods=method_count)


@cocoindex.flow_def(name="ManualExtraction")
def manual_extraction_flow(
    flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
):
    """
    Define an example flow that extracts module information from PDF manuals.

    Each file under ``manuals`` is converted to Markdown, passed to an LLM to
    extract a ``ModuleInfo``, summarized into a ``ModuleSummary``, and the
    collected rows are exported to the Postgres table ``modules_info``.
    """
    # Source: the files in the local "manuals" directory, read as binary.
    pdf_source = cocoindex.sources.LocalFile(path="manuals", binary=True)
    data_scope["documents"] = flow_builder.add_source(pdf_source)

    module_rows = data_scope.add_collector()

    with data_scope["documents"].row() as pdf_doc:
        pdf_doc["markdown"] = pdf_doc["content"].transform(PdfToMarkdown())

        # This example uses AWS Bedrock. To use another provider, pass one of
        # the following as `llm_spec` instead:
        #   OpenAI:
        #     cocoindex.LlmSpec(
        #         api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o")
        #   Gemini:
        #     cocoindex.LlmSpec(
        #         api_type=cocoindex.LlmApiType.GEMINI, model="gemini-2.0-flash")
        #   Anthropic:
        #     cocoindex.LlmSpec(
        #         api_type=cocoindex.LlmApiType.ANTHROPIC, model="claude-3-5-sonnet-latest")
        #   Ollama:
        #     cocoindex.LlmSpec(
        #         api_type=cocoindex.LlmApiType.OLLAMA, model="llama3.2")
        bedrock_spec = cocoindex.LlmSpec(
            api_type=cocoindex.LlmApiType.BEDROCK,
            model="anthropic.claude-3-haiku-20240307-v1:0",
        )
        extract_module_info = cocoindex.functions.ExtractByLlm(
            llm_spec=bedrock_spec,
            output_type=ModuleInfo,
            instruction="Please extract Python module information from the manual.",
        )
        pdf_doc["module_info"] = pdf_doc["markdown"].transform(extract_module_info)

        pdf_doc["module_summary"] = pdf_doc["module_info"].transform(
            summarize_module
        )

        # One collected row per document.
        module_rows.collect(
            filename=pdf_doc["filename"],
            module_info=pdf_doc["module_info"],
            module_summary=pdf_doc["module_summary"],
        )

    # Export the collected rows to Postgres, keyed by file name.
    postgres_target = cocoindex.targets.Postgres(table_name="modules_info")
    module_rows.export(
        "modules",
        postgres_target,
        primary_key_fields=["filename"],
    )
Binary file not shown.
Binary file not shown.
Binary file added examples/bedrock_llm_extraction/manuals/copy.pdf
Binary file not shown.
Binary file added examples/bedrock_llm_extraction/manuals/glob.pdf
Binary file not shown.
9 changes: 9 additions & 0 deletions examples/bedrock_llm_extraction/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[project]
name = "manuals-llm-extraction"
version = "0.1.0"
description = "Simple example for cocoindex: extract structured information from a Markdown file using LLM."
requires-python = ">=3.11"
dependencies = ["cocoindex>=0.2.8", "marker-pdf>=1.8.5"]

[tool.setuptools]
packages = []
1 change: 1 addition & 0 deletions python/cocoindex/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class LlmApiType(Enum):
OPEN_ROUTER = "OpenRouter"
VOYAGE = "Voyage"
VLLM = "Vllm"
BEDROCK = "Bedrock"


@dataclass
Expand Down
19 changes: 19 additions & 0 deletions python/cocoindex/tests/test_engine_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -1064,6 +1064,25 @@ def test_full_roundtrip_vector_numeric_types() -> None:
validate_full_roundtrip(value_u64, Vector[np.uint64, Literal[3]])


def test_llm_api_type_bedrock() -> None:
    """Check the BEDROCK member of LlmApiType and an LlmSpec built with it."""
    from cocoindex.llm import LlmApiType, LlmSpec

    # The enum exposes the member under the expected name and value.
    assert hasattr(LlmApiType, "BEDROCK")
    assert LlmApiType.BEDROCK.value == "Bedrock"

    # A spec built with only api_type and model keeps both values and leaves
    # the remaining checked fields as None.
    model_id = "us.anthropic.claude-3-5-haiku-20241022-v1:0"
    bedrock_spec = LlmSpec(api_type=LlmApiType.BEDROCK, model=model_id)

    assert bedrock_spec.api_type == LlmApiType.BEDROCK
    assert bedrock_spec.model == model_id
    assert bedrock_spec.address is None
    assert bedrock_spec.api_config is None


def test_full_roundtrip_vector_of_vector() -> None:
"""Test full roundtrip for vector of vector."""
value_f32 = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32)
Expand Down
Loading