@@ -74,68 +74,55 @@ def test_map_high_vals() -> None:
7474
7575
7676def test_puffin_round_trip () -> None :
77- # Define some deletion positions for multiple files
78- deletions1 = [10 , 20 , 30 ]
79- deletions2 = [5 , (1 << 32 ) + 1 ] # Test with a high-bit position
77+ # Define some deletion positions for a file
78+ deletions = [5 , (1 << 32 ) + 1 , 5 ] # Test with a high-bit position and duplicate
8079
81- file1_path = "path/to/data1.parquet"
82- file2_path = "path/to/data2.parquet"
80+ file_path = "path/to/data.parquet"
8381
8482 # Write the Puffin file
85- writer = PuffinWriter ()
86- writer .add (positions = deletions1 , referenced_data_file = file1_path )
87- writer .add (positions = deletions2 , referenced_data_file = file2_path )
83+ writer = PuffinWriter (created_by = "my-test-app" )
84+ writer .set_blob (positions = deletions , referenced_data_file = file_path )
8885 puffin_bytes = writer .finish ()
8986
9087 # Read the Puffin file back
9188 reader = PuffinFile (puffin_bytes )
9289
9390 # Assert footer metadata
94- assert len (reader .footer .blobs ) == 2
95-
96- blob1_meta = reader .footer .blobs [0 ]
97- assert blob1_meta .properties [PROPERTY_REFERENCED_DATA_FILE ] == file1_path
98- assert blob1_meta .properties ["cardinality" ] == str (len (deletions1 ))
91+ assert reader .footer .properties ["created-by" ] == "my-test-app"
92+ assert len (reader .footer .blobs ) == 1
9993
100- blob2_meta = reader .footer .blobs [1 ]
101- assert blob2_meta .properties [PROPERTY_REFERENCED_DATA_FILE ] == file2_path
102- assert blob2_meta .properties ["cardinality" ] == str (len (deletions2 ))
94+ blob_meta = reader .footer .blobs [0 ]
95+ assert blob_meta .properties [PROPERTY_REFERENCED_DATA_FILE ] == file_path
96+ assert blob_meta .properties ["cardinality" ] == str (len (set ( deletions ) ))
10397
10498 # Assert the content of deletion vectors
10599 read_vectors = reader .to_vector ()
106100
107- assert file1_path in read_vectors
108- assert file2_path in read_vectors
109-
110- assert read_vectors [file1_path ].to_pylist () == sorted (deletions1 )
111- assert read_vectors [file2_path ].to_pylist () == sorted (deletions2 )
101+ assert file_path in read_vectors
102+ assert read_vectors [file_path ].to_pylist () == sorted (list (set (deletions )))
112103
113104
114105def test_write_and_read_puffin_file () -> None :
115106 writer = PuffinWriter ()
116- writer .add (positions = [1 , 2 , 3 ], referenced_data_file = "file1.parquet" )
117- writer .add (positions = [4 , 5 , 6 ], referenced_data_file = "file2.parquet" )
107+ writer .set_blob (positions = [1 , 2 , 3 ], referenced_data_file = "file1.parquet" )
108+ writer .set_blob (positions = [4 , 5 , 6 ], referenced_data_file = "file2.parquet" )
118109 puffin_bytes = writer .finish ()
119110
120111 reader = PuffinFile (puffin_bytes )
121112
122- assert len (reader .footer .blobs ) == 2
123- blob1 = reader .footer .blobs [0 ]
124- blob2 = reader .footer .blobs [1 ]
125-
126- assert blob1 .properties ["referenced-data-file" ] == "file1.parquet"
127- assert blob1 .properties ["cardinality" ] == "3"
128- assert blob1 .type == "deletion-vector-v1"
129- assert blob1 .snapshot_id == - 1
130- assert blob1 .sequence_number == - 1
131- assert blob1 .compression_codec is None
113+ assert len (reader .footer .blobs ) == 1
114+ blob = reader .footer .blobs [0 ]
132115
133- assert blob2 .properties ["referenced-data-file" ] == "file2.parquet"
134- assert blob2 .properties ["cardinality" ] == "3"
116+ assert blob .properties ["referenced-data-file" ] == "file2.parquet"
117+ assert blob .properties ["cardinality" ] == "3"
118+ assert blob .type == "deletion-vector-v1"
119+ assert blob .snapshot_id == - 1
120+ assert blob .sequence_number == - 1
121+ assert blob .compression_codec is None
135122
136123 vectors = reader .to_vector ()
137- assert len (vectors ) == 2
138- assert vectors [ "file1.parquet" ]. to_pylist () == [ 1 , 2 , 3 ]
124+ assert len (vectors ) == 1
125+ assert "file1.parquet" not in vectors
139126 assert vectors ["file2.parquet" ].to_pylist () == [4 , 5 , 6 ]
140127
141128
@@ -146,3 +133,4 @@ def test_puffin_file_with_no_blobs() -> None:
146133 reader = PuffinFile (puffin_bytes )
147134 assert len (reader .footer .blobs ) == 0
148135 assert len (reader .to_vector ()) == 0
136+ assert "created-by" not in reader .footer .properties
0 commit comments