@@ -732,3 +732,98 @@ def test_add_files_subset_of_schema(spark: SparkSession, session_catalog: Catalo
     for column in written_arrow_table.column_names:
         for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
             assert left == right
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_add_files_with_duplicate_files_in_file_paths(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None:
+    identifier = f"default.test_table_duplicate_add_files_v{format_version}"
+    tbl = _create_table(session_catalog, identifier, format_version)
+    file_path = f"s3://warehouse/default/unpartitioned/v{format_version}/test-1.parquet"
+    file_paths = [file_path, file_path]
+
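+    # Assumption: add_files validates file_paths for uniqueness up front, so the
+    # failed call below should not create a snapshot.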
+    # try to add the same parquet file twice as data files
+    with pytest.raises(ValueError) as exc_info:
+        tbl.add_files(file_paths=file_paths)
+    assert "File paths must be unique" in str(exc_info.value)
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_add_files_that_referenced_by_current_snapshot(
+    spark: SparkSession, session_catalog: Catalog, format_version: int
+) -> None:
+    identifier = f"default.test_table_add_referenced_file_v{format_version}"
+    tbl = _create_table(session_catalog, identifier, format_version)
+
+    file_paths = [f"s3://warehouse/default/unpartitioned/v{format_version}/test-{i}.parquet" for i in range(5)]
+
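+    # tbl.io is the table's FileIO; new_output() returns an OutputFile whose
+    # create() stream pq.ParquetWriter can write straight into.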
+    # write parquet files
+    for file_path in file_paths:
+        fo = tbl.io.new_output(file_path)
+        with fo.create(overwrite=True) as fos:
+            with pq.ParquetWriter(fos, schema=ARROW_SCHEMA) as writer:
+                writer.write_table(ARROW_TABLE)
+
+    # add the parquet files as data files
+    tbl.add_files(file_paths=file_paths)
+    existing_files_in_table = tbl.inspect.files().to_pylist().pop()["file_path"]
+
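+    # inspect.files() reports the current snapshot's data files as a pyarrow Table;
+    # .to_pylist().pop() takes the last row, so this is one already-referenced path.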
+    with pytest.raises(ValueError) as exc_info:
+        tbl.add_files(file_paths=[existing_files_in_table])
+    assert f"Cannot add files that are already referenced by table, files: {existing_files_in_table}" in str(exc_info.value)
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_add_files_that_referenced_by_current_snapshot_with_check_duplicate_files_false(
+    spark: SparkSession, session_catalog: Catalog, format_version: int
+) -> None:
+    identifier = f"default.test_table_add_referenced_file_v{format_version}"
+    tbl = _create_table(session_catalog, identifier, format_version)
+
+    file_paths = [f"s3://warehouse/default/unpartitioned/v{format_version}/test-{i}.parquet" for i in range(5)]
+    # write parquet files
+    for file_path in file_paths:
+        fo = tbl.io.new_output(file_path)
+        with fo.create(overwrite=True) as fos:
+            with pq.ParquetWriter(fos, schema=ARROW_SCHEMA) as writer:
+                writer.write_table(ARROW_TABLE)
+
+    # add the parquet files as data files
+    tbl.add_files(file_paths=file_paths)
+    existing_files_in_table = tbl.inspect.files().to_pylist().pop()["file_path"]
+    tbl.add_files(file_paths=[existing_files_in_table], check_duplicate_files=False)
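+    # with check_duplicate_files=False the second commit goes through, so the
+    # current snapshot now references the same parquet file twice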
+    rows = spark.sql(
+        f"""
+        SELECT added_data_files_count, existing_data_files_count, deleted_data_files_count
+        FROM {identifier}.all_manifests
+        """
+    ).collect()
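+    # three manifest rows are expected: the 5-file manifest from snapshot 1, the
+    # 1-file manifest added by snapshot 2, and snapshot 2's re-listing of the
+    # original manifest (presumably in that order), hence added counts [5, 1, 5]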
+    assert [row.added_data_files_count for row in rows] == [5, 1, 5]
+    assert [row.existing_data_files_count for row in rows] == [0, 0, 0]
+    assert [row.deleted_data_files_count for row in rows] == [0, 0, 0]
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_add_files_that_referenced_by_current_snapshot_with_check_duplicate_files_true(
+    spark: SparkSession, session_catalog: Catalog, format_version: int
+) -> None:
+    identifier = f"default.test_table_add_referenced_file_v{format_version}"
+    tbl = _create_table(session_catalog, identifier, format_version)
+
+    file_paths = [f"s3://warehouse/default/unpartitioned/v{format_version}/test-{i}.parquet" for i in range(5)]
+    # write parquet files
+    for file_path in file_paths:
+        fo = tbl.io.new_output(file_path)
+        with fo.create(overwrite=True) as fos:
+            with pq.ParquetWriter(fos, schema=ARROW_SCHEMA) as writer:
+                writer.write_table(ARROW_TABLE)
+
+    # add the parquet files as data files
+    tbl.add_files(file_paths=file_paths)
+    existing_files_in_table = tbl.inspect.files().to_pylist().pop()["file_path"]
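+    # check_duplicate_files=True (presumably the default) should reject the
+    # already-referenced file, matching test_add_files_that_referenced_by_current_snapshot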
+    with pytest.raises(ValueError) as exc_info:
+        tbl.add_files(file_paths=[existing_files_in_table], check_duplicate_files=True)
+    assert f"Cannot add files that are already referenced by table, files: {existing_files_in_table}" in str(exc_info.value)