Skip to content

Commit 0488686

Browse files
authored
Use manifest_files_v2 field in the flatbuffers (#1909)
1 parent c471c2c commit 0488686

File tree

211 files changed

+119
-50
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

211 files changed

+119
-50
lines changed

icechunk-format/flatbuffers/snapshot.fbs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ table Snapshot {
123123
id: ObjectId12 (required);
124124

125125
// the id of the parent snapshot, can be null for a root snapshot
126+
// This is left empty on V2 spec snapshots
126127
parent_id: ObjectId12;
127128

128129
// sorted by ascending order of NodeSnapshot.path
@@ -141,14 +142,15 @@ table Snapshot {
141142

142143
// the list of all manifest files this snapshot points to
143144
// sorted in ascending order of ManifestFileInfo.id
145+
// This is left empty on V2 spec snapshots
144146
manifest_files: [ManifestFileInfo] (required);
145147

146-
// Reserved for future use. Opaque byte vector for extensibility.
147148
// Introduced in spec version 2
148-
extra: [uint8];
149+
manifest_files_v2: [ManifestFileInfoV2];
149150

151+
// Reserved for future use. Opaque byte vector for extensibility.
150152
// Introduced in spec version 2
151-
manifest_files_v2: [ManifestFileInfoV2];
153+
extra: [uint8];
152154
}
153155

154156
root_type Snapshot;

icechunk-format/src/snapshot.rs

Lines changed: 92 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
//! Repository state at a point in time (arrays, groups, and manifest references).
22
3-
use std::{collections::BTreeMap, ops::Range, sync::Arc};
3+
use std::{borrow::Cow, collections::BTreeMap, ops::Range, sync::Arc};
44

55
use bytes::Bytes;
66
use chrono::{DateTime, Utc};
77
use flatbuffers::{
88
FlatBufferBuilder, ForwardsUOffset, UnionWIPOffset, Vector, VerifierOptions,
99
WIPOffset,
1010
};
11-
use itertools::Itertools as _;
11+
use itertools::{Either, Itertools as _};
1212
use quick_cache::sync::{Cache, GuardResult};
1313
use serde::{Deserialize, Serialize};
1414
use serde_json::Value;
@@ -286,6 +286,26 @@ impl From<&generated::ManifestFileInfo> for ManifestFileInfo {
286286
}
287287
}
288288

289+
impl TryFrom<&generated::ManifestFileInfoV2<'_>> for ManifestFileInfo {
290+
type Error = IcechunkFormatError;
291+
292+
fn try_from(value: &generated::ManifestFileInfoV2<'_>) -> Result<Self, Self::Error> {
293+
let id = value.id().map(|id| id.into()).ok_or_else(|| {
294+
IcechunkFormatError::capture(IcechunkFormatErrorKind::InvalidFlatBuffer(
295+
flatbuffers::InvalidFlatbuffer::MissingRequiredField {
296+
required: Cow::Borrowed("id"),
297+
error_trace: Default::default(),
298+
},
299+
))
300+
})?;
301+
Ok(Self {
302+
id,
303+
size_bytes: value.size_bytes(),
304+
num_chunk_refs: value.num_chunk_refs(),
305+
})
306+
}
307+
}
308+
289309
pub type SnapshotProperties = BTreeMap<String, Value>;
290310

291311
/// Insert a key-value pair into the `__icechunk` namespace object within snapshot properties.
@@ -349,7 +369,10 @@ impl std::fmt::Debug for Snapshot {
349369
.field("parent_id", &self.parent_id())
350370
.field("flushed_at", &self.flushed_at())
351371
.field("nodes", &nodes)
352-
.field("manifests", &self.manifest_files().collect::<Vec<_>>())
372+
.field(
373+
"manifests",
374+
&self.manifest_files().collect::<IcechunkResult<Vec<_>>>(),
375+
)
353376
.field("message", &self.message())
354377
.field("metadata", &self.metadata())
355378
.finish_non_exhaustive()
@@ -445,14 +468,43 @@ impl Snapshot {
445468
let mut builder = FlatBufferBuilder::with_capacity(4_096);
446469

447470
manifest_files.sort_by(|a, b| a.id.cmp(&b.id));
448-
let manifest_files = manifest_files
449-
.iter()
450-
.map(|mfi| {
451-
let id = generated::ObjectId12::new(&mfi.id.0);
452-
generated::ManifestFileInfo::new(&id, mfi.size_bytes, mfi.num_chunk_refs)
453-
})
454-
.collect::<Vec<_>>();
455-
let manifest_files = builder.create_vector(&manifest_files);
471+
let (manifest_files_v1, manifest_files_v2) = match spec_version {
472+
SpecVersionBin::V1 => {
473+
let ms1 = manifest_files
474+
.iter()
475+
.map(|mfi| {
476+
let id = generated::ObjectId12::new(&mfi.id.0);
477+
generated::ManifestFileInfo::new(
478+
&id,
479+
mfi.size_bytes,
480+
mfi.num_chunk_refs,
481+
)
482+
})
483+
.collect::<Vec<_>>();
484+
let ms1 = builder.create_vector(&ms1);
485+
let ms2 = None;
486+
(ms1, ms2)
487+
}
488+
SpecVersionBin::V2 => {
489+
let ms2 = manifest_files
490+
.iter()
491+
.map(|mfi| {
492+
let id = generated::ObjectId12::new(&mfi.id.0);
493+
494+
let args = generated::ManifestFileInfoV2Args {
495+
id: Some(&id),
496+
size_bytes: mfi.size_bytes,
497+
num_chunk_refs: mfi.num_chunk_refs,
498+
extra: None,
499+
};
500+
generated::ManifestFileInfoV2::create(&mut builder, &args)
501+
})
502+
.collect::<Vec<_>>();
503+
let ms2 = builder.create_vector(&ms2);
504+
let ms1 = builder.create_vector::<generated::ManifestFileInfo>(&[]);
505+
(ms1, Some(ms2))
506+
}
507+
};
456508

457509
let metadata_items: Vec<_> = properties
458510
.unwrap_or_default()
@@ -495,7 +547,8 @@ impl Snapshot {
495547
flushed_at,
496548
message: Some(message),
497549
metadata: Some(metadata_items),
498-
manifest_files: Some(manifest_files),
550+
manifest_files: Some(manifest_files_v1),
551+
manifest_files_v2,
499552
..Default::default()
500553
},
501554
);
@@ -583,18 +636,15 @@ impl Snapshot {
583636
self.root().message().to_string()
584637
}
585638

586-
pub fn get_manifest_file(&self, id: &ManifestId) -> Option<ManifestFileInfo> {
587-
self.root().manifest_files().iter().find(|mf| mf.id().0 == id.0.as_slice()).map(
588-
|mf| ManifestFileInfo {
589-
id: ManifestId::new(mf.id().0),
590-
size_bytes: mf.size_bytes(),
591-
num_chunk_refs: mf.num_chunk_refs(),
592-
},
593-
)
594-
}
595-
596-
pub fn manifest_files(&self) -> impl Iterator<Item = ManifestFileInfo> + '_ {
597-
self.root().manifest_files().iter().map(|mf| mf.into())
639+
pub fn manifest_files(
640+
&self,
641+
) -> impl Iterator<Item = IcechunkResult<ManifestFileInfo>> + '_ {
642+
let root = self.root();
643+
if let Some(mf2) = root.manifest_files_v2() {
644+
Either::Left(mf2.iter().map(|mf| (&mf).try_into()))
645+
} else {
646+
Either::Right(root.manifest_files().iter().map(|mf| Ok(mf.into())))
647+
}
598648
}
599649

600650
/// Cretase a new `Snapshot` with all the same data as `new_child` but `self` as parent
@@ -612,7 +662,7 @@ impl Snapshot {
612662
SpecVersionBin::V1, // 2.0 doesn't use adopt
613663
&new_child.message(),
614664
Some(new_child.metadata()?.clone()),
615-
new_child.manifest_files().collect(),
665+
new_child.manifest_files().try_collect()?,
616666
Some(new_child.flushed_at()?),
617667
new_child.iter(),
618668
)
@@ -673,12 +723,23 @@ impl Snapshot {
673723
self.len() == 0
674724
}
675725

676-
pub fn manifest_info(&self, id: &ManifestId) -> Option<ManifestFileInfo> {
677-
self.root()
678-
.manifest_files()
679-
.iter()
680-
.find(|mi| mi.id().0 == id.0)
681-
.map(|man| man.into())
726+
pub fn manifest_info(
727+
&self,
728+
id: &ManifestId,
729+
) -> IcechunkResult<Option<ManifestFileInfo>> {
730+
let root = self.root();
731+
if let Some(mf2) = root.manifest_files_v2() {
732+
mf2.iter()
733+
.find(|mf| mf.id().is_some_and(|mid| mid.0 == id.0))
734+
.map(|mf| (&mf).try_into())
735+
.transpose()
736+
} else {
737+
Ok(root
738+
.manifest_files()
739+
.iter()
740+
.find(|mi| mi.id().0 == id.0)
741+
.map(|man| man.into()))
742+
}
682743
}
683744
}
684745

0 Bytes
Binary file not shown.

icechunk-python/tests/data/split-repo-v2/chunks/16QBJ188VFA289DRZS60 renamed to icechunk-python/tests/data/split-repo-v2/chunks/1330XWDZH3JFNZ5NRWRG

File renamed without changes.

icechunk-python/tests/data/split-repo-v2/chunks/20W80PTE6M0J21M3FN8G renamed to icechunk-python/tests/data/split-repo-v2/chunks/1TRFX0DVC5Z7HQVDA8E0

File renamed without changes.

icechunk-python/tests/data/split-repo-v2/chunks/AZ91V75J10182ZS2S3DG renamed to icechunk-python/tests/data/split-repo-v2/chunks/1WH6BK0GYK1DE8GQMQA0

File renamed without changes.

icechunk-python/tests/data/split-repo-v2/chunks/2CHZ77CJCKYQAJS52EVG renamed to icechunk-python/tests/data/split-repo-v2/chunks/1ZB4JNCQA9WT4Q2QAJJ0

File renamed without changes.

icechunk-python/tests/data/split-repo-v2/chunks/3HPHVS6B3TPW8YGQT1H0 renamed to icechunk-python/tests/data/split-repo-v2/chunks/27ZN85YWQ4PJZFBWBM70

File renamed without changes.

icechunk-python/tests/data/split-repo-v2/chunks/2JRQ6HN31ZST2WGGYXY0 renamed to icechunk-python/tests/data/split-repo-v2/chunks/2983MBZARQHQTJ2KG410

File renamed without changes.

icechunk-python/tests/data/split-repo-v2/chunks/EXCRAX9CPA9WD2RY1RTG renamed to icechunk-python/tests/data/split-repo-v2/chunks/2X6Q41707C8F81F6KP70

File renamed without changes.

0 commit comments

Comments
 (0)