import tempfile
import dataclasses

from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from marker.config.parser import ConfigParser

import cocoindex


class PdfToMarkdown(cocoindex.op.FunctionSpec):
    """Convert a PDF to markdown."""


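# The executor class below does the actual work declared by the PdfToMarkdown spec.
# In the decorator, gpu=True marks the op as GPU-bound, cache=True caches results
# across runs, and bumping behavior_version invalidates previously cached results.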
@cocoindex.op.executor_class(gpu=True, cache=True, behavior_version=1)
class PdfToMarkdownExecutor:
    """Executor for PdfToMarkdown."""

    spec: PdfToMarkdown
    _converter: PdfConverter

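    # prepare() is the one-time setup hook: marker's models are loaded here once
    # and reused for every document instead of being reloaded per call.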
    def prepare(self):
        config_parser = ConfigParser({})
        self._converter = PdfConverter(
            create_model_dict(), config=config_parser.generate_config_dict()
        )

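    # Marker's converter takes a file path, so the incoming PDF bytes are written to a
    # temporary file first (re-opening it by name while still open works on POSIX systems).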
    def __call__(self, content: bytes) -> str:
        with tempfile.NamedTemporaryFile(delete=True, suffix=".pdf") as temp_file:
            temp_file.write(content)
            temp_file.flush()
            text, _, _ = text_from_rendered(self._converter(temp_file.name))
            return text


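# The dataclasses below define the structure the LLM is asked to fill in;
# ModuleInfo is used as the output_type for ExtractByLlm in the flow further down.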
@dataclasses.dataclass
class ArgInfo:
    """Information about an argument of a method."""

    name: str
    description: str


@dataclasses.dataclass
class MethodInfo:
    """Information about a method."""

    name: str
    args: list[ArgInfo]
    description: str


@dataclasses.dataclass
class ClassInfo:
    """Information about a class."""

    name: str
    description: str
    methods: list[MethodInfo]


@dataclasses.dataclass
class ModuleInfo:
    """Information about a Python module."""

    title: str
    description: str
    classes: list[ClassInfo]
    methods: list[MethodInfo]


@dataclasses.dataclass
class ModuleSummary:
    """Summary info about a Python module."""

    num_classes: int
    num_methods: int


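# A plain function decorated with @cocoindex.op.function() becomes a CocoIndex op;
# its type hints describe the input and output schema.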
@cocoindex.op.function()
def summarize_module(module_info: ModuleInfo) -> ModuleSummary:
    """Summarize a Python module."""
    return ModuleSummary(
        num_classes=len(module_info.classes),
        num_methods=len(module_info.methods),
    )


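# The flow below ties everything together: read PDFs from the local "manuals"
# directory, convert each to Markdown, extract a structured ModuleInfo with an LLM,
# summarize it, and export the collected rows to Postgres.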
@cocoindex.flow_def(name="ManualExtraction")
def manual_extraction_flow(
    flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
):
    """
    Define an example flow that converts PDF manuals to Markdown and extracts
    Python module information from them.
    """
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="manuals", binary=True)
    )

    modules_index = data_scope.add_collector()

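    # For each document: PDF bytes -> Markdown -> structured ModuleInfo -> summary.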
    with data_scope["documents"].row() as doc:
        doc["markdown"] = doc["content"].transform(PdfToMarkdown())
        doc["module_info"] = doc["markdown"].transform(
            cocoindex.functions.ExtractByLlm(
                llm_spec=cocoindex.LlmSpec(
                    api_type=cocoindex.LlmApiType.BEDROCK,
                    model="anthropic.claude-3-haiku-20240307-v1:0",
                ),
                # Replace llm_spec with one of the specs below to use a different provider:
                # OpenAI:
                # llm_spec=cocoindex.LlmSpec(
                #     api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
                # Gemini:
                # llm_spec=cocoindex.LlmSpec(
                #     api_type=cocoindex.LlmApiType.GEMINI, model="gemini-2.0-flash"),
                # Anthropic:
                # llm_spec=cocoindex.LlmSpec(
                #     api_type=cocoindex.LlmApiType.ANTHROPIC, model="claude-3-5-sonnet-latest"),
                # Ollama:
                # llm_spec=cocoindex.LlmSpec(
                #     api_type=cocoindex.LlmApiType.OLLAMA, model="llama3.2"),
                output_type=ModuleInfo,
                instruction="Please extract Python module information from the manual.",
            )
        )
        doc["module_summary"] = doc["module_info"].transform(summarize_module)
        modules_index.collect(
            filename=doc["filename"],
            module_info=doc["module_info"],
            module_summary=doc["module_summary"],
        )

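    # Write the collected rows into the Postgres table "modules_info", keyed by filename.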
    modules_index.export(
        "modules",
        cocoindex.targets.Postgres(table_name="modules_info"),
        primary_key_fields=["filename"],
    )
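
# To run the flow (an assumption for illustration: this file is saved as main.py and
# the CocoIndex CLI is installed; exact commands may differ between versions):
#   cocoindex setup main.py    # create the target Postgres table
#   cocoindex update main.py   # build / refresh the extracted index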