Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion superset/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,22 @@ def _convert_big_integers(val: Any) -> Any:
return str(val) if isinstance(val, int) and abs(val) > JS_MAX_INTEGER else val


def _is_na(val: Any) -> bool:
"""
Check if a value is NA/NaN for scalar values only.

pd.isna() raises ValueError for arrays/lists, so we catch that case.

:param val: the value to check
:returns: True if the value is NA/NaN, False otherwise
"""
try:
return bool(pd.isna(val))
except ValueError:
# pd.isna raises ValueError for arrays (e.g., lists, dicts from JSON)
return False


def df_to_records(dframe: pd.DataFrame) -> list[dict[str, Any]]:
"""
Convert a DataFrame to a set of records.
Expand All @@ -56,7 +72,7 @@ def df_to_records(dframe: pd.DataFrame) -> list[dict[str, Any]]:
for record in records:
for key in record:
record[key] = (
None if pd.isna(record[key]) else _convert_big_integers(record[key])
None if _is_na(record[key]) else _convert_big_integers(record[key])
)

return records
29 changes: 25 additions & 4 deletions superset/result_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ def __init__( # pylint: disable=too-many-locals # noqa: C901
deduped_cursor_desc: list[tuple[Any, ...]] = []
numpy_dtype: list[tuple[str, ...]] = []
stringified_arr: NDArray[Any]
# Track columns with nested/JSON data to preserve them as objects
self._nested_columns: dict[str, list[Any]] = {}

if cursor_description:
# get deduped list of column names
Expand Down Expand Up @@ -147,16 +149,29 @@ def __init__( # pylint: disable=too-many-locals # noqa: C901
TypeError, # this is super hackey,
# https://issues.apache.org/jira/browse/ARROW-7855
):
# Check if original data has nested types (lists/dicts)
# before stringifying, since stringification removes
# the nested structure that the second loop relies on
# to detect via pa.types.is_nested().
original_values = array[column].tolist()
if any(
isinstance(v, (list, dict))
for v in original_values
if v is not None
):
self._nested_columns[column] = original_values
# attempt serialization of values as strings
stringified_arr = stringify_values(array[column])
pa_data.append(pa.array(stringified_arr.tolist()))

if pa_data: # pylint: disable=too-many-nested-blocks
for i, column in enumerate(column_names):
if pa.types.is_nested(pa_data[i].type):
# TODO: revisit nested column serialization once nested types
# are added as a natively supported column type in Superset
# (superset.utils.core.GenericDataType).
# Preserve nested/JSON data as Python objects for use in
# templates like Handlebars. Store original values before
# stringifying for PyArrow compatibility.
# See: https://github.com/apache/superset/issues/25125
self._nested_columns[column] = array[column].tolist()
stringified_arr = stringify_values(array[column])
pa_data[i] = pa.array(stringified_arr.tolist())

Expand Down Expand Up @@ -247,7 +262,13 @@ def data_type(self, col_name: str, pa_dtype: pa.DataType) -> Optional[str]:
return None

def to_pandas_df(self) -> pd.DataFrame:
return self.convert_table_to_df(self.table)
df = self.convert_table_to_df(self.table)
# Restore nested/JSON columns as Python objects instead of strings
# This allows JSON data to be used directly in templates like Handlebars
for column, values in self._nested_columns.items():
if column in df.columns:
df[column] = values
return df

@property
def pa_table(self) -> pa.Table:
Expand Down
34 changes: 26 additions & 8 deletions tests/integration_tests/result_set_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,18 +226,19 @@ def test_nested_types(self):
assert results.columns[3]["type"] == "STRING"
assert results.columns[3]["type_generic"] == GenericDataType.STRING
df = results.to_pandas_df()
# JSON/JSONB data is preserved as objects instead of being stringified
assert df_to_records(df) == [
{
"id": 4,
"dict_arr": '[{"table_name": "unicode_test", "database_id": 1}]',
"num_arr": "[1, 2, 3]",
"map_col": "{'chart_name': 'scatter'}",
"dict_arr": [{"table_name": "unicode_test", "database_id": 1}],
"num_arr": [1, 2, 3],
"map_col": {"chart_name": "scatter"},
},
{
"id": 3,
"dict_arr": '[{"table_name": "birth_names", "database_id": 1}]',
"num_arr": "[4, 5, 6]",
"map_col": "{'chart_name': 'plot'}",
"dict_arr": [{"table_name": "birth_names", "database_id": 1}],
"num_arr": [4, 5, 6],
"map_col": {"chart_name": "plot"},
},
]

Expand Down Expand Up @@ -267,9 +268,25 @@ def test_single_column_multidim_nested_types(self):
assert results.columns[0]["type"] == "STRING"
assert results.columns[0]["type_generic"] == GenericDataType.STRING
df = results.to_pandas_df()
# JSON/JSONB data is preserved as objects instead of being stringified
assert df_to_records(df) == [
{
"metadata": '["test", [["foo", 123456, [[["test"], 3432546, 7657658766], [["fake"], 656756765, 324324324324]]]], ["test2", 43, 765765765], null, null]' # noqa: E501
"metadata": [
"test",
[
[
"foo",
123456,
[
[["test"], 3432546, 7657658766],
[["fake"], 656756765, 324324324324],
],
]
],
["test2", 43, 765765765],
None,
None,
]
}
]

Expand All @@ -280,7 +297,8 @@ def test_nested_list_types(self):
assert results.columns[0]["type"] == "STRING"
assert results.columns[0]["type_generic"] == GenericDataType.STRING
df = results.to_pandas_df()
assert df_to_records(df) == [{"metadata": '[{"TestKey": [123456, "foo"]}]'}]
# JSON/JSONB data is preserved as objects instead of being stringified
assert df_to_records(df) == [{"metadata": [{"TestKey": [123456, "foo"]}]}]

def test_empty_datetime(self):
data = [(None,)]
Expand Down
88 changes: 88 additions & 0 deletions tests/unit_tests/result_set_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,3 +185,91 @@ def test_get_column_description_from_empty_data_using_cursor_description(
)
assert any(col.get("column_name") == "__time" for col in result_set.columns)
logger.exception.assert_not_called()


def test_json_data_type_preserved_as_objects() -> None:
    """
    Verify that JSON/JSONB values survive the result-set round trip as
    Python objects (dicts/lists) rather than serialized strings.

    Handlebars templates and other consumers need to traverse JSON
    payloads directly, which is impossible once they are stringified.

    See: https://github.com/apache/superset/issues/25125
    """
    # psycopg2 hands JSONB columns back as Python dicts, so mimic that here
    rows = [
        (1, {"key": "value1", "nested": {"a": 1}}, "text1"),
        (2, {"key": "value2", "items": [1, 2, 3]}, "text2"),
        (3, None, "text3"),
        (4, {"mixed": "string"}, "text4"),
    ]
    cursor_description = [
        ("id", 23, None, None, None, None, None),  # INT
        ("json_col", 3802, None, None, None, None, None),  # JSONB
        ("text_col", 1043, None, None, None, None, None),  # VARCHAR
    ]
    rs = SupersetResultSet(rows, cursor_description, BaseEngineSpec)  # type: ignore
    frame = rs.to_pandas_df()

    # Each JSON cell comes back as the original Python object, not a string
    json_cells = frame["json_col"]
    assert isinstance(json_cells.iloc[0], dict)
    assert json_cells.iloc[0] == {"key": "value1", "nested": {"a": 1}}
    assert json_cells.iloc[1] == {"key": "value2", "items": [1, 2, 3]}
    assert json_cells.iloc[2] is None
    assert json_cells.iloc[3] == {"mixed": "string"}

    # The objects must still round-trip through JSON for API responses
    from superset.utils import json as superset_json

    round_tripped = superset_json.loads(
        superset_json.dumps(frame.to_dict(orient="records"))
    )
    assert round_tripped[0]["json_col"]["key"] == "value1"
    assert round_tripped[0]["json_col"]["nested"]["a"] == 1
    assert round_tripped[1]["json_col"]["items"] == [1, 2, 3]


def test_json_data_with_homogeneous_structure() -> None:
    """
    JSON columns whose rows all share one structure are preserved as
    Python objects too, not only heterogeneous ones.
    """
    # Every row carries the same JSON shape
    rows = [
        (1, {"name": "Alice", "age": 30}),
        (2, {"name": "Bob", "age": 25}),
        (3, {"name": "Charlie", "age": 35}),
    ]
    cursor_description = [
        ("id", 23, None, None, None, None, None),
        ("data", 3802, None, None, None, None, None),
    ]
    rs = SupersetResultSet(rows, cursor_description, BaseEngineSpec)  # type: ignore
    frame = rs.to_pandas_df()

    # Rows stay dicts with their original fields intact
    first = frame["data"].iloc[0]
    assert isinstance(first, dict)
    assert first["name"] == "Alice"
    assert frame["data"].iloc[1]["age"] == 25


def test_array_data_type_preserved() -> None:
    """
    SQL array values are likewise preserved as Python lists.
    """
    rows = [
        (1, [1, 2, 3]),
        (2, [4, 5, 6]),
        (3, None),
    ]
    cursor_description = [
        ("id", 23, None, None, None, None, None),
        ("arr", 1007, None, None, None, None, None),  # INT ARRAY
    ]
    rs = SupersetResultSet(rows, cursor_description, BaseEngineSpec)  # type: ignore
    frame = rs.to_pandas_df()

    # List cells are genuine lists; NULL rows stay None
    first_cell = frame["arr"].iloc[0]
    assert isinstance(first_cell, list)
    assert first_cell == [1, 2, 3]
    assert frame["arr"].iloc[2] is None
Loading