@@ -3971,6 +3971,75 @@ def test_csv_pandas_df_with_row_id_and_version_etag_in_index(
39713971 # THEN assert the dataframe is equal to the expected dataframe
39723972 pd .testing .assert_frame_equal (df , expected_df )
39733973
@pytest.mark.parametrize(
    "list_column_types",
    [
        {"empty_list": "INTEGER_LIST"},
        {"empty_list": "BOOLEAN_LIST"},
        {"empty_list": "STRING_LIST"},
        {"empty_list": "USERID_LIST"},
        {"empty_list": "ENTITYID_LIST"},
        None,
    ],
    ids=[
        "INTEGER_LIST",
        "BOOLEAN_LIST",
        "STRING_LIST",
        "USERID_LIST",
        "ENTITYID_LIST",
        "no_types",
    ],
)
def test_csv_to_pandas_df_all_na_list_column(self, list_column_types):
    """Check that an all-NA list column is returned as a column of empty lists.

    Reproducer for the bug where querying a table with a list column whose
    values are all NA in the result set raised
    TypeError: Invalid value '[]' for dtype 'Int64'.

    pandas' read_csv().convert_dtypes() infers an all-empty column as the
    nullable Int64 dtype; the previous fillna({col: '[]'}) implementation
    could not store a string into that column.

    The case is exercised once per list column type, plus once with no
    type mapping at all (``list_column_types=None``).
    """
    # GIVEN a CSV where every row has an empty value for the list column.
    # Rows are separated by a bare "\n": any whitespace after the newline
    # would become part of the next row's first field (pandas does not skip
    # initial whitespace by default) and break the `name` assertion below.
    csv_content = (
        "name,empty_list\n"
        "Alice,\n"
        "Bob,\n"
        "Charlie,"
    )
    csv_file = BytesIO(csv_content.encode("utf-8"))

    # WHEN csv_to_pandas_df is called for that list column
    df = csv_to_pandas_df(
        filepath=csv_file,
        list_columns=["empty_list"],
        list_column_types=list_column_types,
    )

    # THEN the all-NA column should become a column of empty lists, and the
    # other columns should still parse normally
    assert list(df["name"]) == ["Alice", "Bob", "Charlie"]
    assert list(df["empty_list"]) == [[], [], []]
4016+
def test_csv_to_pandas_df_mixed_all_na_and_populated_list_columns(self):
    """Check that a populated list column parses next to an all-NA one.

    When two list columns are present and only one is all-NA, the
    populated one must still parse correctly, and the all-NA one must
    come back as empty lists.
    """
    # GIVEN a CSV with one populated list column and one all-NA list column.
    # Rows are separated by a bare "\n": whitespace after the newline would
    # be read as part of the next row's first field.
    csv_content = (
        "name,populated_list,empty_list\n"
        'Alice,"[1, 2, 3]",\n'
        'Bob,"[4, 5]",\n'
        'Charlie,"[6]",'
    )
    csv_file = BytesIO(csv_content.encode("utf-8"))

    # WHEN csv_to_pandas_df is called
    df = csv_to_pandas_df(
        filepath=csv_file,
        list_columns=["populated_list", "empty_list"],
        list_column_types={
            "populated_list": "INTEGER_LIST",
            "empty_list": "INTEGER_LIST",
        },
    )

    # THEN both columns should have the correct contents
    assert list(df["populated_list"]) == [[1, 2, 3], [4, 5], [6]]
    assert list(df["empty_list"]) == [[], [], []]
4042+
39744043
39754044class TestConvertDtypesToJsonSerializable :
39764045 """Tests for convert_dtypes_to_json_serializable function"""
0 commit comments