Skip to content

Commit 586983f

Browse files
authored
Merge branch 'develop' into SYNPY-1760
2 parents 2db50df + 97efeed commit 586983f

2 files changed

Lines changed: 77 additions & 4 deletions

File tree

synapseclient/models/mixins/table_components.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -4581,10 +4581,14 @@ def csv_to_pandas_df(
45814581
# Turn list columns into lists and convert items to their proper types
45824582
if list_columns:
45834583
for col in list_columns:
4584-
# Fill NA values with empty lists, it must be a string for json.loads to work
4585-
# json.loads will convert null values in boolean list, string list to None.
4586-
df.fillna({col: "[]"}, inplace=True)
4587-
df[col] = df[col].apply(json.loads)
4584+
# A CSV cell for a list column is either a JSON string like "[1, 2]"
4585+
# or NA. When every value is NA, convert_dtypes() infers a typed
4586+
# dtype (e.g. Int64) into which the string "[]" cannot be written,
4587+
# so fillna({col: "[]"}) raises. Parse strings and substitute []
4588+
# for NA in a single pass.
4589+
df[col] = df[col].apply(
4590+
lambda x: json.loads(x) if isinstance(x, str) else []
4591+
)
45884592
# Convert list items to their proper types based on column type
45894593
if list_column_types and col in list_column_types:
45904594
column_type = list_column_types[col]

tests/unit/synapseclient/mixins/unit_test_table_components.py

Lines changed: 69 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3971,6 +3971,75 @@ def test_csv_pandas_df_with_row_id_and_version_etag_in_index(
39713971
# THEN assert the dataframe is equal to the expected dataframe
39723972
pd.testing.assert_frame_equal(df, expected_df)
39733973

3974+
@pytest.mark.parametrize(
    "list_column_types",
    [
        pytest.param({"empty_list": "INTEGER_LIST"}, id="INTEGER_LIST"),
        pytest.param({"empty_list": "BOOLEAN_LIST"}, id="BOOLEAN_LIST"),
        pytest.param({"empty_list": "STRING_LIST"}, id="STRING_LIST"),
        pytest.param({"empty_list": "USERID_LIST"}, id="USERID_LIST"),
        pytest.param({"empty_list": "ENTITYID_LIST"}, id="ENTITYID_LIST"),
        pytest.param(None, id="no_types"),
    ],
)
def test_csv_to_pandas_df_all_na_list_column(self, list_column_types):
    """Regression test: a list column with no values at all parses to empty lists.

    Previously, querying a table whose list column was NA in every returned
    row failed with
    TypeError: Invalid value '[]' for dtype 'Int64'.

    The all-empty column is inferred as the nullable Int64 dtype when the CSV
    is read, and the old fillna({col: '[]'}) step could not write a string
    into such a column.
    """
    # GIVEN a CSV in which the list column is empty on every row
    csv_rows = ["name,empty_list", "Alice,", "Bob,", "Charlie,"]
    csv_file = BytesIO("\n".join(csv_rows).encode("utf-8"))

    # WHEN the CSV is converted with that column declared as a list column
    result = csv_to_pandas_df(
        filepath=csv_file,
        list_columns=["empty_list"],
        list_column_types=list_column_types,
    )

    # THEN every cell of the list column is an empty list, and the regular
    # column is parsed as usual
    expected_columns = {
        "name": ["Alice", "Bob", "Charlie"],
        "empty_list": [[], [], []],
    }
    for column, expected_values in expected_columns.items():
        assert list(result[column]) == expected_values
4016+
4017+
def test_csv_to_pandas_df_mixed_all_na_and_populated_list_columns(self):
    """An all-NA list column must not disturb a populated list column.

    Both columns are declared as INTEGER_LIST; only one of them carries
    values in the CSV.
    """
    # GIVEN a CSV where `populated_list` has values and `empty_list` has none
    csv_rows = [
        "name,populated_list,empty_list",
        'Alice,"[1, 2, 3]",',
        'Bob,"[4, 5]",',
        'Charlie,"[6]",',
    ]
    csv_file = BytesIO("\n".join(csv_rows).encode("utf-8"))

    # WHEN the CSV is converted with both columns declared as list columns
    result = csv_to_pandas_df(
        filepath=csv_file,
        list_columns=["populated_list", "empty_list"],
        list_column_types={
            "populated_list": "INTEGER_LIST",
            "empty_list": "INTEGER_LIST",
        },
    )

    # THEN each column holds the expected lists
    expected_columns = {
        "populated_list": [[1, 2, 3], [4, 5], [6]],
        "empty_list": [[], [], []],
    }
    for column, expected_values in expected_columns.items():
        assert list(result[column]) == expected_values
4042+
39744043

39754044
class TestConvertDtypesToJsonSerializable:
39764045
"""Tests for convert_dtypes_to_json_serializable function"""

0 commit comments

Comments
 (0)