Skip to content

Commit 736b580

Browse files
committed
feat: add bedrock llm extraction example
1 parent 4b4f6f4 commit 736b580

File tree

9 files changed

+230
-0
lines changed

9 files changed

+230
-0
lines changed
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Postgres database address for cocoindex
2+
COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
3+
4+
# Fallback to CPU for operations not supported by MPS on Mac.
5+
# It's a no-op on other platforms.
6+
PYTORCH_ENABLE_MPS_FALLBACK=1
7+
8+
# AWS Bedrock credentials
9+
BEDROCK_API_KEY=your_bedrock_api_key
10+
BEDROCK_REGION=your_bedrock_region
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
.env
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# Structured Data Extraction from PDF with AWS Bedrock and CocoIndex
2+
3+
In this example, we
4+
5+
* Convert PDFs (generated from a few Python docs) into Markdown.
6+
* Extract structured information from the Markdown using an AWS Bedrock LLM.
7+
* Use a custom function to further extract information from the structured output.
8+
9+
Please give [CocoIndex on GitHub](https://github.com/cocoindex-io/cocoindex) a star to support us if you like our work. Thank you so much with a warm coconut hug 🥥🤗. [![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex)
10+
11+
## Prerequisite
12+
13+
Before running the example, you need to:
14+
15+
* [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.
16+
* Configure your AWS Bedrock credentials. This example uses AWS Bedrock as the LLM provider; follow [this guide](https://docs.aws.amazon.com/bedrock/latest/userguide/api-keys.html) to create an API key. Alternatively, you can follow the comments in the source code to switch to another LLM.
17+
18+
First, copy the example environment file:
19+
20+
```bash
21+
cp .env.example .env
22+
```
23+
24+
Then, open the `.env` file and fill in your AWS Bedrock credentials. The `.env` file is ignored by git, so your secrets will not be committed.
25+
26+
## Run
27+
28+
29+
### Build the index
30+
31+
Install dependencies:
32+
33+
```bash
34+
pip install -e .
35+
```
36+
37+
Setup:
38+
39+
```bash
40+
cocoindex setup main.py
41+
```
42+
43+
Update index:
44+
45+
```bash
46+
cocoindex update main.py
47+
```
48+
49+
### Query the index
50+
51+
After the index is built, you have a table named `modules_info`. You can query it at any time, e.g. by starting a Postgres shell:
52+
53+
```bash
54+
psql postgres://cocoindex:cocoindex@localhost/cocoindex
55+
```
56+
57+
And run the SQL query:
58+
59+
```sql
60+
SELECT filename, module_info->'title' AS title, module_summary FROM modules_info;
61+
```
62+
You should see results like:
63+
64+
```
65+
filename | title | module_summary
66+
---------------------+------------------------+--------------------------
67+
manuals/asyncio.pdf | "asyncio — Asynchronous" | {"num_classes": 0, "num_methods": 0}
68+
manuals/json.pdf | "json — JSON encoder" | {"num_classes": 0, "num_methods": 0}
69+
(2 rows)
70+
```
71+
72+
The output may vary depending on the model you are using. The important part is that the `module_info` and `module_summary` columns are populated with the extracted data.
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
import tempfile
2+
import dataclasses
3+
4+
from marker.converters.pdf import PdfConverter
5+
from marker.models import create_model_dict
6+
from marker.output import text_from_rendered
7+
from marker.config.parser import ConfigParser
8+
9+
import cocoindex
10+
11+
12+
class PdfToMarkdown(cocoindex.op.FunctionSpec):
    """Function spec for turning a PDF document into Markdown text.

    The spec declares no fields of its own.
    """
14+
15+
16+
@cocoindex.op.executor_class(gpu=True, cache=True, behavior_version=1)
class PdfToMarkdownExecutor:
    """Executor for PdfToMarkdown.

    A single ``PdfConverter`` is built in ``prepare`` and reused for every
    PDF handed to ``__call__``.
    """

    spec: PdfToMarkdown
    _converter: PdfConverter

    def prepare(self) -> None:
        """Build the PDF converter from an empty marker options dict."""
        config_parser = ConfigParser({})
        self._converter = PdfConverter(
            create_model_dict(), config=config_parser.generate_config_dict()
        )

    def __call__(self, content: bytes) -> str:
        """Convert the raw bytes of one PDF into text.

        Args:
            content: Raw bytes of a PDF document.

        Returns:
            The text produced by ``text_from_rendered`` for the converted
            document.
        """
        import os  # local import: only needed to build the temporary path

        # The converter is invoked with a file path, so the bytes are written
        # to disk first. The file is fully written and *closed* before the
        # converter opens it: a NamedTemporaryFile that is still open cannot
        # be reopened by name on every platform (notably Windows).
        with tempfile.TemporaryDirectory() as temp_dir:
            pdf_path = os.path.join(temp_dir, "input.pdf")
            with open(pdf_path, "wb") as pdf_file:
                pdf_file.write(content)
            text, _, _ = text_from_rendered(self._converter(pdf_path))
            return text
35+
36+
37+
@dataclasses.dataclass
class ArgInfo:
    """Information about an argument of a method."""

    # Leaf record of the extraction schema: MethodInfo.args is a list of these,
    # and ModuleInfo (which reaches this type through MethodInfo) is passed as
    # `output_type` to the LLM extractor.
    # NOTE(review): field names/order (and presumably the docstring) shape what
    # the LLM is asked to return -- confirm before renaming or rewording.
    name: str  # the argument's name
    description: str  # free-text description of the argument
43+
44+
45+
@dataclasses.dataclass
class MethodInfo:
    """Information about a method."""

    # Used both for ClassInfo.methods and for ModuleInfo.methods, so it is part
    # of the `output_type` schema handed to the LLM extractor.
    # NOTE(review): field names/order (and presumably the docstring) shape what
    # the LLM is asked to return -- confirm before renaming or rewording.
    name: str  # the method's name
    args: list[ArgInfo]  # one entry per argument of the method
    description: str  # free-text description of the method
52+
53+
54+
@dataclasses.dataclass
class ClassInfo:
    """Information about a class."""

    # Element type of ModuleInfo.classes, so it is part of the `output_type`
    # schema handed to the LLM extractor.
    # NOTE(review): field names/order (and presumably the docstring) shape what
    # the LLM is asked to return -- confirm before renaming or rewording.
    name: str  # the class's name
    description: str  # free-text description of the class
    methods: list[MethodInfo]  # methods belonging to this class
61+
62+
63+
@dataclasses.dataclass
class ModuleInfo:
    """Information about a Python module."""

    # Root of the extraction schema: this class is passed as `output_type` to
    # the LLM extractor in the flow definition, and is the input type of
    # `summarize_module`.
    # NOTE(review): field names/order (and presumably the docstring) shape what
    # the LLM is asked to return -- confirm before renaming or rewording.
    title: str  # module title (queried as module_info->'title' in the README)
    description: str  # free-text description of the module
    classes: list[ClassInfo]  # classes listed for the module
    methods: list[MethodInfo]  # methods listed directly on the module
71+
72+
73+
@dataclasses.dataclass
class ModuleSummary:
    """Summary info about a Python module."""

    # Return type of `summarize_module`; collected into the `module_summary`
    # field of the exported rows.
    num_classes: int  # len(ModuleInfo.classes)
    # len(ModuleInfo.methods): counts only the module's own `methods` list,
    # not the methods nested inside each ClassInfo.
    num_methods: int
79+
80+
81+
@cocoindex.op.function()
def summarize_module(module_info: ModuleInfo) -> ModuleSummary:
    """Reduce extracted module information to simple counts.

    Args:
        module_info: Extracted information about one module.

    Returns:
        A ModuleSummary holding the length of ``module_info.classes`` and the
        length of ``module_info.methods`` (methods nested inside classes are
        not included in the latter).
    """
    class_count = len(module_info.classes)
    method_count = len(module_info.methods)
    return ModuleSummary(num_classes=class_count, num_methods=method_count)
88+
89+
90+
@cocoindex.flow_def(name="ManualExtraction")
def manual_extraction_flow(
    flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
):
    """
    Define an example flow that converts PDF manuals to Markdown, extracts
    Python module information from the Markdown with an LLM, and exports the
    result to Postgres.

    Args:
        flow_builder: Builder used to register the source of the flow.
        data_scope: Scope that holds the "documents" source and the collector.
    """
    # Source: files under the local "manuals" directory, read as binary.
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="manuals", binary=True)
    )

    # Collector that gathers one entry per document for the export below.
    modules_index = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        # Step 1: PDF bytes -> Markdown text.
        doc["markdown"] = doc["content"].transform(PdfToMarkdown())
        # Step 2: Markdown -> structured ModuleInfo via an LLM.
        doc["module_info"] = doc["markdown"].transform(
            cocoindex.functions.ExtractByLlm(
                # Default: a model served through AWS Bedrock.
                llm_spec=cocoindex.LlmSpec(
                    api_type=cocoindex.LlmApiType.BEDROCK,
                    model="anthropic.claude-3-haiku-20240307-v1:0",
                ),
                # To use a different provider, replace the Bedrock spec above
                # with one of the specs below.
                #
                # OpenAI API model:
                # llm_spec=cocoindex.LlmSpec(
                #     api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
                # Gemini API model:
                # llm_spec=cocoindex.LlmSpec(
                #     api_type=cocoindex.LlmApiType.GEMINI, model="gemini-2.0-flash"),
                # Anthropic API model:
                # llm_spec=cocoindex.LlmSpec(
                #     api_type=cocoindex.LlmApiType.ANTHROPIC, model="claude-3-5-sonnet-latest"),
                # Ollama API model:
                # llm_spec=cocoindex.LlmSpec(
                #     api_type=cocoindex.LlmApiType.OLLAMA, model="llama3.2"),
                output_type=ModuleInfo,
                instruction="Please extract Python module information from the manual.",
            )
        )
        # Step 3: derive class/method counts with the custom function.
        doc["module_summary"] = doc["module_info"].transform(summarize_module)
        modules_index.collect(
            filename=doc["filename"],
            module_info=doc["module_info"],
            module_summary=doc["module_summary"],
        )

    # Export the collected entries to the Postgres table "modules_info",
    # keyed by filename.
    modules_index.export(
        "modules",
        cocoindex.targets.Postgres(table_name="modules_info"),
        primary_key_fields=["filename"],
    )
371 KB
Binary file not shown.
424 KB
Binary file not shown.
190 KB
Binary file not shown.
320 KB
Binary file not shown.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
[project]
2+
name = "manuals-llm-extraction"
3+
version = "0.1.0"
4+
description = "Simple example for cocoindex: extract structured information from a Markdown file using LLM."
5+
requires-python = ">=3.11"
6+
dependencies = ["cocoindex>=0.2.8", "marker-pdf>=1.8.5"]
7+
8+
[tool.setuptools]
9+
packages = []

0 commit comments

Comments
 (0)