Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions protos/table.proto
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,20 @@ message IndexMetadata {
// The base path index of the data file. Used when the file is imported or referred from another dataset.
// Lance uses it as the key into the base_paths field in Manifest to determine the actual base path of the data file.
optional uint32 base_id = 9;

// List of files and their sizes for this index segment.
// This enables skipping HEAD calls when opening indices and allows reporting
// of index sizes without extra IO.
// If this is empty, the index file sizes are unknown.
repeated IndexFile files = 10;
}

// Metadata about a single file within an index segment.
//
// Entries of this type make up the repeated `files` field of IndexMetadata.
message IndexFile {
// Path of the file, relative to the index directory (e.g., "index.idx", "auxiliary.idx").
string path = 1;
// Size of the file, in bytes.
uint64 size_bytes = 2;
}

// Index Section, containing a list of index metadata for one dataset version.
Expand Down
2 changes: 2 additions & 0 deletions python/python/lance/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
DataStatistics,
FieldStatistics,
Index,
IndexFile,
LanceDataset,
LanceOperation,
LanceScanner,
Expand Down Expand Up @@ -58,6 +59,7 @@
"FieldStatistics",
"FragmentMetadata",
"Index",
"IndexFile",
"LanceDataset",
"LanceFragment",
"LanceOperation",
Expand Down
9 changes: 9 additions & 0 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3946,6 +3946,14 @@ class ExecuteResult(TypedDict):
num_deleted_rows: int


@dataclass
class IndexFile:
    """Metadata about a file in an index segment.

    Attributes
    ----------
    path : str
        Path of the file, relative to the index directory
        (for example ``"index.idx"``).
    size_bytes : int
        Size of the file in bytes.
    """

    path: str
    size_bytes: int


@dataclass
class Index:
"""Represents an index in the dataset."""
Expand All @@ -3958,6 +3966,7 @@ class Index:
index_version: int
created_at: Optional[datetime] = None
base_id: Optional[int] = None
files: Optional[List["IndexFile"]] = None


class AutoCleanupConfig(TypedDict):
Expand Down
2 changes: 2 additions & 0 deletions python/python/lance/lance/indices/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ class IndexSegmentDescription:
fragment_ids: set[int]
index_version: int
created_at: Optional[datetime]
size_bytes: Optional[int]

def __repr__(self) -> str: ...

Expand All @@ -69,5 +70,6 @@ class IndexDescription:
field_names: list[str]
segments: list[IndexSegmentDescription]
details: dict
total_size_bytes: Optional[int]

def __repr__(self) -> str: ...
79 changes: 79 additions & 0 deletions python/python/tests/test_commit_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,82 @@ def test_commit_index(dataset_with_index, test_table, tmp_path):
)
plan = scanner.explain_plan()
assert "ScalarIndexQuery: query=[meta = hello]@meta_idx" in plan


def test_commit_index_with_files(dataset_with_index, test_table, tmp_path):
    """Check that ``Index.files`` survives a ``CreateIndex`` commit round trip."""
    from lance.dataset import Index, IndexFile

    # The fixture is expected to carry exactly one index; reuse its UUID below.
    indices_before = dataset_with_index.list_indices()
    assert len(indices_before) == 1
    index_id = indices_before[0]["uuid"]

    # The description of the fixture's index should report a positive size.
    original_size = dataset_with_index.describe_indices()[0].total_size_bytes
    assert original_size is not None and original_size > 0

    # Write the same table again, this time without building an index.
    target = lance.write_dataset(test_table, tmp_path / "dataset_without_index")

    # Copy the index directory from the indexed dataset into the new one.
    shutil.copytree(
        Path(dataset_with_index.uri) / "_indices" / index_id,
        Path(target.uri) / "_indices" / index_id,
    )

    meta_field_id = _get_field_id_by_name(target.lance_schema, "meta")

    # Deliberately arbitrary sizes: the test checks that whatever is supplied
    # here is what comes back from the committed transaction.
    index_files = [
        IndexFile(path="index.idx", size_bytes=1024),
        IndexFile(path="auxiliary.bin", size_bytes=2048),
    ]

    index = Index(
        uuid=index_id,
        name="meta_idx",
        fields=[meta_field_id],
        dataset_version=target.version,
        fragment_ids={frag.fragment_id for frag in target.get_fragments()},
        index_version=0,
        files=index_files,
    )

    create_index_op = lance.LanceOperation.CreateIndex(
        new_indices=[index],
        removed_indices=[],
    )
    target = lance.LanceDataset.commit(
        target.uri,
        create_index_op,
        read_version=target.version,
    )

    # The commit must have registered the index.
    assert len(target.list_indices()) == 1

    # Inspect the recorded transaction to see what was actually stored.
    transactions = target.get_transactions(1)
    assert len(transactions) == 1
    transaction = transactions[0]
    assert transaction is not None
    assert transaction.operation is not None

    # The operation should expose our single index, including its files.
    new_indices = transaction.operation.new_indices
    assert len(new_indices) == 1
    committed_files = new_indices[0].files
    assert committed_files is not None
    assert len(committed_files) == 2

    # Sizes must match the values supplied before the commit.
    sizes_by_path = {f.path: f.size_bytes for f in committed_files}
    assert sizes_by_path["index.idx"] == 1024
    assert sizes_by_path["auxiliary.bin"] == 2048
13 changes: 11 additions & 2 deletions python/src/indices.rs
Original file line number Diff line number Diff line change
Expand Up @@ -481,11 +481,14 @@ pub struct PyIndexSegmentDescription {
pub index_version: i32,
/// The timestamp when the index segment was created
pub created_at: Option<DateTime<Utc>>,
/// The total size in bytes of all files in this segment
/// (None for backward compatibility with indices created before file tracking)
pub size_bytes: Option<u64>,
}

impl PyIndexSegmentDescription {
pub fn __repr__(&self) -> String {
format!("IndexSegmentDescription(uuid={}, dataset_version_at_last_update={}, fragment_ids={:?}, index_version={}, created_at={:?})", self.uuid, self.dataset_version_at_last_update, self.fragment_ids, self.index_version, self.created_at)
format!("IndexSegmentDescription(uuid={}, dataset_version_at_last_update={}, fragment_ids={:?}, index_version={}, created_at={:?}, size_bytes={:?})", self.uuid, self.dataset_version_at_last_update, self.fragment_ids, self.index_version, self.created_at, self.size_bytes)
}
}

Expand All @@ -507,6 +510,9 @@ pub struct PyIndexDescription {
pub details: PyJson,
/// The segments of the index
pub segments: Vec<PyIndexSegmentDescription>,
/// The total size in bytes of all files across all segments
/// (None for backward compatibility with indices created before file tracking)
pub total_size_bytes: Option<u64>,
}

impl PyIndexDescription {
Expand All @@ -532,12 +538,14 @@ impl PyIndexDescription {
.as_ref()
.map(|bitmap| bitmap.iter().collect::<HashSet<_>>())
.unwrap_or_default();
let size_bytes = segment.total_size_bytes();
PyIndexSegmentDescription {
uuid: segment.uuid.to_string(),
dataset_version_at_last_update: segment.dataset_version,
fragment_ids,
index_version: segment.index_version,
created_at: segment.created_at,
size_bytes,
}
})
.collect();
Expand All @@ -553,14 +561,15 @@ impl PyIndexDescription {
type_url: index.type_url().to_string(),
num_rows_indexed: index.rows_indexed(),
details: PyJson(details),
total_size_bytes: index.total_size_bytes(),
}
}
}

#[pymethods]
impl PyIndexDescription {
    /// Builds the `repr()` text for an index description.
    ///
    /// The output lists the name, type URL, number of indexed rows, field ids,
    /// field names, the number of segments and `total_size_bytes` (printed
    /// with `{:?}`, so an unknown size shows as `None`).
    pub fn __repr__(&self) -> String {
        // Only the format that includes `total_size_bytes` is kept; the earlier
        // variant without it was superseded and must not remain next to it.
        format!(
            "IndexDescription(name={}, type_url={}, num_rows_indexed={}, fields={:?}, field_names={:?}, num_segments={}, total_size_bytes={:?})",
            self.name,
            self.type_url,
            self.num_rows_indexed,
            self.fields,
            self.field_names,
            self.segments.len(),
            self.total_size_bytes
        )
    }
}

Expand Down
53 changes: 51 additions & 2 deletions python/src/transaction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use lance::dataset::transaction::{
UpdateMapEntry, UpdateMode,
};
use lance::datatypes::Schema;
use lance_table::format::{BasePath, DataFile, Fragment, IndexMetadata};
use lance_table::format::{BasePath, DataFile, Fragment, IndexFile, IndexMetadata};
use pyo3::exceptions::PyValueError;
use pyo3::types::PySet;
use pyo3::{intern, prelude::*};
Expand All @@ -21,7 +21,43 @@ use std::collections::HashMap;
use std::sync::Arc;
use uuid::Uuid;

// Add Index bindings
// IndexFile bindings
impl FromPyObject<'_> for PyLance<IndexFile> {
    /// Reads the `path` and `size_bytes` attributes of a Python object and
    /// wraps them in an `IndexFile`.
    ///
    /// Any attribute lookup or conversion failure is returned as the
    /// underlying `PyErr`.
    fn extract_bound(ob: &Bound<'_, PyAny>) -> PyResult<Self> {
        // Struct fields are evaluated in source order, so `path` is still
        // read before `size_bytes`.
        let index_file = IndexFile {
            path: ob.getattr("path")?.extract()?,
            size_bytes: ob.getattr("size_bytes")?.extract()?,
        };
        Ok(Self(index_file))
    }
}

impl<'py> IntoPyObject<'py> for PyLance<&IndexFile> {
    type Target = PyAny;
    type Output = Bound<'py, Self::Target>;
    type Error = PyErr;

    /// Builds a Python `lance.IndexFile` instance from a borrowed `IndexFile`.
    ///
    /// The class is called positionally as `IndexFile(path, size_bytes)`.
    ///
    /// # Errors
    ///
    /// Returns the `PyErr` raised when the `lance` module cannot be imported,
    /// when it has no `IndexFile` attribute, or when the constructor call
    /// fails. These cases are reported to the caller rather than panicking,
    /// since the function already returns a `Result`.
    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
        let namespace = py.import(intern!(py, "lance"))?;
        let cls = namespace.getattr(intern!(py, "IndexFile"))?;
        // Borrow the path: the conversion to a Python string copies it anyway.
        cls.call1((self.0.path.as_str(), self.0.size_bytes))
    }
}

impl<'py> IntoPyObject<'py> for PyLance<IndexFile> {
    type Target = PyAny;
    type Output = Bound<'py, Self::Target>;
    type Error = PyErr;

    /// Converts an owned `IndexFile` by delegating to the conversion
    /// implemented for the borrowed `PyLance<&IndexFile>` wrapper.
    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
        let borrowed: PyLance<&IndexFile> = PyLance(&self.0);
        borrowed.into_pyobject(py)
    }
}

// IndexMetadata bindings
impl FromPyObject<'_> for PyLance<IndexMetadata> {
fn extract_bound(ob: &Bound<'_, PyAny>) -> PyResult<Self> {
let uuid = ob.getattr("uuid")?.to_string();
Expand All @@ -44,6 +80,11 @@ impl FromPyObject<'_> for PyLance<IndexMetadata> {
.extract::<Option<i64>>()?
.map(|id| id as u32);

let files: Option<Vec<IndexFile>> = ob
.getattr("files")?
.extract::<Option<Vec<PyLance<IndexFile>>>>()?
.map(|v| v.into_iter().map(|f| f.0).collect());

Ok(Self(IndexMetadata {
uuid: Uuid::parse_str(&uuid).map_err(|e| PyValueError::new_err(e.to_string()))?,
name,
Expand All @@ -54,6 +95,7 @@ impl FromPyObject<'_> for PyLance<IndexMetadata> {
index_version,
created_at,
base_id,
files,
}))
}
}
Expand Down Expand Up @@ -85,6 +127,12 @@ impl<'py> IntoPyObject<'py> for PyLance<&IndexMetadata> {
);
let created_at = self.0.created_at;
let base_id = self.0.base_id.map(|id| id as i64);
let files = self
.0
.files
.as_ref()
.map(|f| export_vec(py, f.as_slice()))
.transpose()?;

let cls = namespace
.getattr("Index")
Expand All @@ -98,6 +146,7 @@ impl<'py> IntoPyObject<'py> for PyLance<&IndexMetadata> {
index_version,
created_at,
base_id,
files,
))
}
}
Expand Down
12 changes: 12 additions & 0 deletions rust/lance-index/src/scalar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ use snafu::location;
use crate::metrics::MetricsCollector;
use crate::scalar::registry::TrainingCriteria;
use crate::{Index, IndexParams, IndexType};
pub use lance_table::format::IndexFile;

pub mod bitmap;
pub mod bloomfilter;
Expand Down Expand Up @@ -227,6 +228,12 @@ pub trait IndexStore: std::fmt::Debug + Send + Sync + DeepSizeOf {

/// Delete an index file (used in the tmp spill store to keep tmp size down)
async fn delete_index_file(&self, name: &str) -> Result<()>;

/// List all files in the index directory with their sizes.
///
/// Returns one [`IndexFile`] per file; each entry carries the file's `path`
/// and its `size_bytes`.
/// Used to capture file metadata after index creation/modification.
async fn list_files_with_sizes(&self) -> Result<Vec<IndexFile>>;
}

/// Different scalar indices may support different kinds of queries
Expand Down Expand Up @@ -741,6 +748,11 @@ pub struct CreatedIndex {
///
/// This can be used to determine if a reader is able to load the index.
pub index_version: u32,
/// List of files and their sizes for this index
///
/// This enables skipping HEAD calls when opening indices and provides
/// visibility into index storage size via describe_indices().
pub files: Option<Vec<IndexFile>>,
}

/// The criteria that specifies how to update an index
Expand Down
3 changes: 3 additions & 0 deletions rust/lance-index/src/scalar/bitmap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,7 @@ impl ScalarIndex for BitmapIndex {
index_details: prost_types::Any::from_msg(&pbold::BitmapIndexDetails::default())
.unwrap(),
index_version: BITMAP_INDEX_VERSION,
files: Some(dest_store.list_files_with_sizes().await?),
})
}

Expand Down Expand Up @@ -609,6 +610,7 @@ impl ScalarIndex for BitmapIndex {
index_details: prost_types::Any::from_msg(&pbold::BitmapIndexDetails::default())
.unwrap(),
index_version: BITMAP_INDEX_VERSION,
files: Some(dest_store.list_files_with_sizes().await?),
})
}

Expand Down Expand Up @@ -803,6 +805,7 @@ impl ScalarIndexPlugin for BitmapIndexPlugin {
index_details: prost_types::Any::from_msg(&pbold::BitmapIndexDetails::default())
.unwrap(),
index_version: BITMAP_INDEX_VERSION,
files: Some(index_store.list_files_with_sizes().await?),
})
}

Expand Down
2 changes: 2 additions & 0 deletions rust/lance-index/src/scalar/bloomfilter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,7 @@ impl ScalarIndex for BloomFilterIndex {
index_details: prost_types::Any::from_msg(&pb::BloomFilterIndexDetails::default())
.unwrap(),
index_version: BLOOMFILTER_INDEX_VERSION,
files: Some(dest_store.list_files_with_sizes().await?),
})
}

Expand Down Expand Up @@ -1129,6 +1130,7 @@ impl ScalarIndexPlugin for BloomFilterIndexPlugin {
index_details: prost_types::Any::from_msg(&pb::BloomFilterIndexDetails::default())
.unwrap(),
index_version: BLOOMFILTER_INDEX_VERSION,
files: Some(index_store.list_files_with_sizes().await?),
})
}

Expand Down
Loading
Loading