Skip to content

Commit 228263c

Browse files
committed
PR comments
1 parent 3efd28e commit 228263c

File tree

2 files changed

+44
-46
lines changed

2 files changed

+44
-46
lines changed

pyiceberg/table/puffin.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -152,27 +152,34 @@ def to_vector(self) -> dict[str, "pa.ChunkedArray"]:
152152
class PuffinWriter:
153153
_blobs: list[PuffinBlobMetadata]
154154
_blob_payloads: list[bytes]
155+
_created_by: str | None
155156

156-
def __init__(self) -> None:
157+
def __init__(self, created_by: str | None = None) -> None:
157158
self._blobs = []
158159
self._blob_payloads = []
160+
self._created_by = created_by
159161

160-
def add(
162+
def set_blob(
161163
self,
162164
positions: Iterable[int],
163165
referenced_data_file: str,
164166
) -> None:
167+
# We only support one blob at the moment
168+
self._blobs = []
169+
self._blob_payloads = []
170+
165171
# 1. Create bitmaps from positions
166172
bitmaps: dict[int, BitMap] = {}
167-
cardinality = 0
168173
for pos in positions:
169-
cardinality += 1
170174
key = pos >> 32
171175
low_bits = pos & 0xFFFFFFFF
172176
if key not in bitmaps:
173177
bitmaps[key] = BitMap()
174178
bitmaps[key].add(low_bits)
175179

180+
# Calculate the cardinality from the bitmaps
181+
cardinality = sum(len(bm) for bm in bitmaps.values())
182+
176183
# 2. Serialize bitmaps for the vector payload
177184
vector_payload = _serialize_bitmaps(bitmaps)
178185

@@ -204,13 +211,13 @@ def add(
204211
self._blobs.append(
205212
PuffinBlobMetadata(
206213
type="deletion-vector-v1",
207-
fields=[],
214+
fields=[2147483645], # Java INT_MAX - 2, reserved field id for deletion vectors
208215
snapshot_id=-1,
209216
sequence_number=-1,
210-
offset=0, # Will be set later
211-
length=0, # Will be set later
217+
offset=0, # TODO: Use DeleteFileIndex data
218+
length=0, # TODO: Use DeleteFileIndex data
212219
properties=properties,
213-
compression_codec=None, # Explicitly None
220+
compression_codec=None,
214221
)
215222
)
216223

@@ -229,12 +236,15 @@ def finish(self) -> bytes:
229236
updated_blobs_metadata.append(PuffinBlobMetadata(**original_metadata_dict))
230237
current_offset += len(blob_payload)
231238

232-
footer = Footer(blobs=updated_blobs_metadata)
239+
footer = Footer(
240+
blobs=updated_blobs_metadata, properties={"created-by": self._created_by} if self._created_by else {}
241+
)
233242
footer_payload_bytes = footer.model_dump_json(by_alias=True, exclude_none=True).encode("utf-8")
234243

235244
# Final assembly
236245
out.write(MAGIC_BYTES)
237246
out.write(payload_buffer.getvalue())
247+
out.write(MAGIC_BYTES)
238248
out.write(footer_payload_bytes)
239249
out.write(len(footer_payload_bytes).to_bytes(4, "little"))
240250
out.write((0).to_bytes(4, "little")) # flags

tests/table/test_puffin.py

Lines changed: 25 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -74,68 +74,55 @@ def test_map_high_vals() -> None:
7474

7575

7676
def test_puffin_round_trip() -> None:
77-
# Define some deletion positions for multiple files
78-
deletions1 = [10, 20, 30]
79-
deletions2 = [5, (1 << 32) + 1] # Test with a high-bit position
77+
# Define some deletion positions for a file
78+
deletions = [5, (1 << 32) + 1, 5] # Test with a high-bit position and duplicate
8079

81-
file1_path = "path/to/data1.parquet"
82-
file2_path = "path/to/data2.parquet"
80+
file_path = "path/to/data.parquet"
8381

8482
# Write the Puffin file
85-
writer = PuffinWriter()
86-
writer.add(positions=deletions1, referenced_data_file=file1_path)
87-
writer.add(positions=deletions2, referenced_data_file=file2_path)
83+
writer = PuffinWriter(created_by="my-test-app")
84+
writer.set_blob(positions=deletions, referenced_data_file=file_path)
8885
puffin_bytes = writer.finish()
8986

9087
# Read the Puffin file back
9188
reader = PuffinFile(puffin_bytes)
9289

9390
# Assert footer metadata
94-
assert len(reader.footer.blobs) == 2
95-
96-
blob1_meta = reader.footer.blobs[0]
97-
assert blob1_meta.properties[PROPERTY_REFERENCED_DATA_FILE] == file1_path
98-
assert blob1_meta.properties["cardinality"] == str(len(deletions1))
91+
assert reader.footer.properties["created-by"] == "my-test-app"
92+
assert len(reader.footer.blobs) == 1
9993

100-
blob2_meta = reader.footer.blobs[1]
101-
assert blob2_meta.properties[PROPERTY_REFERENCED_DATA_FILE] == file2_path
102-
assert blob2_meta.properties["cardinality"] == str(len(deletions2))
94+
blob_meta = reader.footer.blobs[0]
95+
assert blob_meta.properties[PROPERTY_REFERENCED_DATA_FILE] == file_path
96+
assert blob_meta.properties["cardinality"] == str(len(set(deletions)))
10397

10498
# Assert the content of deletion vectors
10599
read_vectors = reader.to_vector()
106100

107-
assert file1_path in read_vectors
108-
assert file2_path in read_vectors
109-
110-
assert read_vectors[file1_path].to_pylist() == sorted(deletions1)
111-
assert read_vectors[file2_path].to_pylist() == sorted(deletions2)
101+
assert file_path in read_vectors
102+
assert read_vectors[file_path].to_pylist() == sorted(list(set(deletions)))
112103

113104

114105
def test_write_and_read_puffin_file() -> None:
115106
writer = PuffinWriter()
116-
writer.add(positions=[1, 2, 3], referenced_data_file="file1.parquet")
117-
writer.add(positions=[4, 5, 6], referenced_data_file="file2.parquet")
107+
writer.set_blob(positions=[1, 2, 3], referenced_data_file="file1.parquet")
108+
writer.set_blob(positions=[4, 5, 6], referenced_data_file="file2.parquet")
118109
puffin_bytes = writer.finish()
119110

120111
reader = PuffinFile(puffin_bytes)
121112

122-
assert len(reader.footer.blobs) == 2
123-
blob1 = reader.footer.blobs[0]
124-
blob2 = reader.footer.blobs[1]
125-
126-
assert blob1.properties["referenced-data-file"] == "file1.parquet"
127-
assert blob1.properties["cardinality"] == "3"
128-
assert blob1.type == "deletion-vector-v1"
129-
assert blob1.snapshot_id == -1
130-
assert blob1.sequence_number == -1
131-
assert blob1.compression_codec is None
113+
assert len(reader.footer.blobs) == 1
114+
blob = reader.footer.blobs[0]
132115

133-
assert blob2.properties["referenced-data-file"] == "file2.parquet"
134-
assert blob2.properties["cardinality"] == "3"
116+
assert blob.properties["referenced-data-file"] == "file2.parquet"
117+
assert blob.properties["cardinality"] == "3"
118+
assert blob.type == "deletion-vector-v1"
119+
assert blob.snapshot_id == -1
120+
assert blob.sequence_number == -1
121+
assert blob.compression_codec is None
135122

136123
vectors = reader.to_vector()
137-
assert len(vectors) == 2
138-
assert vectors["file1.parquet"].to_pylist() == [1, 2, 3]
124+
assert len(vectors) == 1
125+
assert "file1.parquet" not in vectors
139126
assert vectors["file2.parquet"].to_pylist() == [4, 5, 6]
140127

141128

@@ -146,3 +133,4 @@ def test_puffin_file_with_no_blobs() -> None:
146133
reader = PuffinFile(puffin_bytes)
147134
assert len(reader.footer.blobs) == 0
148135
assert len(reader.to_vector()) == 0
136+
assert "created-by" not in reader.footer.properties

0 commit comments

Comments
 (0)