Skip to content
Merged

Dev #60

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ members = ["rust", "python"]
resolver = "2"

[workspace.package]
version = "0.1.25"
version = "0.1.26"
edition = "2024"
authors = ["zTgx <beautifularea@gmail.com>"]
license = "Apache-2.0"
Expand Down
28 changes: 28 additions & 0 deletions examples/batch_indexing/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Batch Indexing Example

Demonstrates indexing multiple documents at once using:
- `from_paths` -- explicit list of file paths
- `from_dir` -- all supported files in a directory
- `from_bytes` -- raw in-memory content

Also shows cross-document querying with `with_doc_ids`.

## Setup

```bash
pip install vectorless
```

## Run

```bash
python main.py
```

## Environment Variables

| Variable | Description | Default |
|------------------------|----------------------|-----------|
| `VECTORLESS_API_KEY` | LLM API key | `sk-...` |
| `VECTORLESS_MODEL` | LLM model name | `gpt-4o` |
| `VECTORLESS_ENDPOINT` | Custom API endpoint | `None` |
183 changes: 183 additions & 0 deletions examples/batch_indexing/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
"""
Batch indexing example -- demonstrates indexing multiple documents at once
using from_paths, from_dir, and from_bytes.

Usage:
pip install vectorless
python main.py
"""

import asyncio
import os

from vectorless import (
Engine,
IndexContext,
IndexOptions,
QueryContext,
VectorlessError,
)

# --- Configuration ---
# Every setting can be overridden via environment variables; the defaults let
# the example run out of the box ("sk-..." is a placeholder, not a real key).
API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...")
MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o")
ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None)  # None -> provider default endpoint
WORKSPACE = "./workspace"  # directory where the engine persists its index state

# Sample documents for demonstration.
# Maps filename -> markdown content; written to disk by write_sample_docs().
DOCS = {
    "alpha.md": """\
# Alpha Report

## Summary

Alpha is a distributed key-value store designed for low-latency reads.
It uses a log-structured merge tree for storage.

## Architecture

Write requests go through a write-ahead log, then are buffered in memory.
When the buffer is full, it is flushed to disk as an immutable SSTable.
""",
    "beta.md": """\
# Beta Report

## Summary

Beta is a stream processing engine that consumes events from Kafka topics
and applies real-time transformations using a DAG-based execution model.

## Performance

Beta processes up to 2 million events per second per node on commodity hardware.
""",
    "gamma.md": """\
# Gamma Report

## Summary

Gamma is a feature store that bridges the gap between offline feature
computation and online serving. Features are computed in Spark and served
via a low-latency gRPC endpoint.

## Integration

Gamma integrates with Alpha for feature metadata storage and Beta for
real-time feature updates.
""",
}


def write_sample_docs(base_dir: str) -> list[str]:
"""Write sample markdown files and return their paths."""
paths = []
for name, content in DOCS.items():
path = os.path.join(base_dir, name)
with open(path, "w") as f:
f.write(content)
paths.append(path)
return paths


async def main() -> None:
    """Run the batch-indexing walkthrough end to end.

    Demonstrates the three batch constructors (``from_paths``, ``from_dir``,
    ``from_bytes``), cross-document querying via ``with_doc_ids``, and
    cleanup. Performs network calls through the configured LLM endpoint.
    """
    engine = Engine(
        workspace=WORKSPACE,
        api_key=API_KEY,
        model=MODEL,
        endpoint=ENDPOINT,
    )

    # Create a temp directory with sample documents
    docs_dir = "./batch_docs"
    os.makedirs(docs_dir, exist_ok=True)
    paths = write_sample_docs(docs_dir)

    # ---- 1. Index multiple files at once via from_paths ----
    print("=" * 50)
    print(" from_paths -- index a list of files")
    print("=" * 50)

    ctx = IndexContext.from_paths(paths)
    result = await engine.index(ctx)

    # Batch results expose successfully indexed docs in `items` and
    # per-source failures in `failed` (checked via has_failures()).
    print(f" Indexed {len(result.items)} document(s)")
    for item in result.items:
        print(f" - {item.name} ({item.doc_id[:8]}...)")
    if result.has_failures():
        for f in result.failed:
            print(f" ! Failed: {f.source} -- {f.error}")
    print()

    # Collect the ids so the next query can span all indexed documents.
    doc_ids = [item.doc_id for item in result.items]

    # ---- 2. Query across all batch-indexed documents ----
    print("=" * 50)
    print(" Query across multiple documents")
    print("=" * 50)

    answer = await engine.query(
        QueryContext(
            "Which system processes the most events per second?"
        ).with_doc_ids(doc_ids)
    )
    for item in answer.items:
        print(f" [{item.doc_id[:8]}...] score={item.score:.2f}")
        print(f" {item.content[:200]}...")
    print()

    # ---- 3. Index a directory via from_dir ----
    print("=" * 50)
    print(" from_dir -- index all supported files in a directory")
    print("=" * 50)

    # Clear first so we see fresh results
    await engine.clear()

    # with_options enables LLM-generated summaries/descriptions per document.
    ctx = IndexContext.from_dir(docs_dir).with_options(
        IndexOptions(generate_summaries=True, generate_description=True)
    )
    result = await engine.index(ctx)

    print(f" Indexed {len(result.items)} document(s)")
    for item in result.items:
        # description may be None when generation produced nothing.
        desc = item.description[:80] if item.description else "N/A"
        print(f" - {item.name}: {desc}...")
    print()

    # ---- 4. Index from raw bytes via from_bytes ----
    print("=" * 50)
    print(" from_bytes -- index in-memory content")
    print("=" * 50)

    md_bytes = b"""# Delta Notes

## Key Points

- Delta uses CRDTs for conflict-free replication.
- Writes are locally committed then asynchronously propagated.
- Read repair ensures eventual consistency across all replicas.
"""

    # The second argument is the content format; with_name labels the doc
    # since raw bytes carry no filename.
    ctx = IndexContext.from_bytes(md_bytes, "markdown").with_name("delta")
    result = await engine.index(ctx)

    # NOTE(review): unlike the batch results above, this result appears to
    # expose `doc_id` directly (single-document shape) -- confirm in the API.
    print(f" Indexed: {result.doc_id}")
    print()

    # ---- Cleanup ----
    print("=" * 50)
    print(" Cleanup")
    print("=" * 50)

    removed = await engine.clear()
    print(f" Removed {removed} document(s)")

    # Remove temp files
    for p in paths:
        os.remove(p)
    os.rmdir(docs_dir)
    print(f" Cleaned up {docs_dir}/")


if __name__ == "__main__":
    asyncio.run(main())
28 changes: 28 additions & 0 deletions examples/document_management/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Document Management Example

Demonstrates CRUD operations on indexed documents:

- `engine.list()` -- list all documents
- `engine.exists(doc_id)` -- check if a document exists
- `engine.remove(doc_id)` -- remove a single document
- `engine.clear()` -- remove all documents

## Setup

```bash
pip install vectorless
```

## Run

```bash
python main.py
```

## Environment Variables

| Variable | Description | Default |
|------------------------|----------------------|-----------|
| `VECTORLESS_API_KEY` | LLM API key | `sk-...` |
| `VECTORLESS_MODEL` | LLM model name | `gpt-4o` |
| `VECTORLESS_ENDPOINT` | Custom API endpoint | `None` |
135 changes: 135 additions & 0 deletions examples/document_management/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
"""
Document management example -- demonstrates CRUD operations on indexed documents:
list, exists, remove, and clear.

Usage:
pip install vectorless
python main.py
"""

import asyncio
import os

from vectorless import (
Engine,
IndexContext,
QueryContext,
VectorlessError,
)

# --- Configuration ---
# Every setting can be overridden via environment variables; the defaults let
# the example run out of the box ("sk-..." is a placeholder, not a real key).
API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...")
MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o")
ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None)  # None -> provider default endpoint
WORKSPACE = "./workspace"  # directory where the engine persists its index state

# Sample documents: two small markdown docs indexed as "alpha" and "beta".
SAMPLE_A = """\
# Project Alpha

## Overview

Project Alpha is a next-generation database engine written in Rust.
It supports ACID transactions and serializable isolation.

## Features

- MVCC concurrency control
- B-tree and LSM storage engines
- Query planner with cost-based optimization
"""

SAMPLE_B = """\
# Project Beta

## Overview

Project Beta is a web framework for building real-time applications.
It uses WebSocket-based communication and server-side rendering.

## Features

- Hot module reloading
- Built-in authentication middleware
- Automatic code splitting
"""


async def main() -> None:
    """Walk through document CRUD end to end.

    Indexes two sample documents, then demonstrates ``list``, ``exists``,
    a doc-scoped query, ``remove``, and ``clear``. Performs network calls
    through the configured LLM endpoint.
    """
    engine = Engine(
        workspace=WORKSPACE,
        api_key=API_KEY,
        model=MODEL,
        endpoint=ENDPOINT,
    )

    # ---- Index two documents ----
    print("Indexing two documents...")

    # from_content takes the text plus its format; with_name labels the doc.
    result_a = await engine.index(
        IndexContext.from_content(SAMPLE_A, "markdown").with_name("alpha")
    )
    doc_id_a = result_a.doc_id
    print(f" A: {doc_id_a}")

    result_b = await engine.index(
        IndexContext.from_content(SAMPLE_B, "markdown").with_name("beta")
    )
    doc_id_b = result_b.doc_id
    print(f" B: {doc_id_b}")
    print()

    # ---- list() -- show all indexed documents ----
    print("--- list() ---")
    docs = await engine.list()
    for doc in docs:
        # page_count / line_count are optional metadata; only show when set.
        pages = f", pages={doc.page_count}" if doc.page_count else ""
        lines = f", lines={doc.line_count}" if doc.line_count else ""
        print(f" {doc.name} id={doc.id[:8]}... format={doc.format}{pages}{lines}")
    print(f" Total: {len(docs)} document(s)\n")

    # ---- exists() -- check if a document is indexed ----
    print("--- exists() ---")
    # The third entry is a deliberately unknown id to show the False case.
    for did, label in [(doc_id_a, "A"), (doc_id_b, "B"), ("nonexistent-id", "?")]:
        found = await engine.exists(did)
        print(f" {label}: exists={found}")
    print()

    # ---- Query a specific document ----
    print("--- query(doc_id_a) ---")
    # with_doc_id scopes the query to document A only.
    answer = await engine.query(
        QueryContext("What storage engines does Alpha support?").with_doc_id(doc_id_a)
    )
    # single() returns the sole answer item, or a falsy value when empty.
    item = answer.single()
    if item:
        print(f" Score: {item.score:.2f}")
        print(f" Answer: {item.content[:200]}...\n")

    # ---- remove() -- delete a single document ----
    print("--- remove(doc_id_a) ---")
    removed = await engine.remove(doc_id_a)
    print(f" Removed A: {removed}")

    # Verify it's gone
    exists_a = await engine.exists(doc_id_a)
    print(f" exists(A) after removal: {exists_a}")
    print()

    # ---- list() again -- only B should remain ----
    print("--- list() after removal ---")
    docs = await engine.list()
    for doc in docs:
        print(f" {doc.name} id={doc.id[:8]}...")
    print(f" Total: {len(docs)} document(s)\n")

    # ---- clear() -- remove all remaining documents ----
    print("--- clear() ---")
    cleared = await engine.clear()
    print(f" Cleared {cleared} document(s)")

    docs = await engine.list()
    print(f" Remaining: {len(docs)} document(s)")


if __name__ == "__main__":
    asyncio.run(main())
Loading