Commit ff96a56

Merge c322d3d into 7e6cd2a
2 parents 7e6cd2a + c322d3d commit ff96a56

File tree

2 files changed: 341 additions, 0 deletions


tests/integration/test_trino.py

Lines changed: 131 additions & 0 deletions
@@ -225,3 +225,134 @@ def test_execute_sql_with_autodetection(self, trino_credentials):
        assert len(result) == 1
        assert "detected" in result.columns
        assert result["detected"].iloc[0] == test_value

    def test_execute_sql_with_struct_types(self, trino_toolkit_connection):
        """
        Test Trino STRUCT/ROW types don't break rendering (BLU-5140 regression).

        Verifies both analyze_columns() for stats and to_records() for cell values.
        """
        from deepnote_toolkit.ocelots import DataFrame
        from deepnote_toolkit.ocelots.pandas.analyze import analyze_columns

        query = """
            SELECT id, simple_struct FROM (
                SELECT
                    t.id,
                    CAST(
                        ROW(
                            'item_' || CAST(t.id AS VARCHAR),
                            'value_' || CAST(t.id * 10 AS VARCHAR)
                        )
                        AS ROW(a VARCHAR, b VARCHAR)
                    ) AS simple_struct
                FROM
                    UNNEST(SEQUENCE(1, 100)) AS t (id)
            )
        """

        result = execute_sql(
            template=query,
            sql_alchemy_json_env_var=trino_toolkit_connection,
        )

        assert isinstance(result, pd.DataFrame)
        assert len(result) == 100
        assert "id" in result.columns
        assert "simple_struct" in result.columns

        # Verify NamedRowTuple structure
        first_struct = result["simple_struct"].iloc[0]
        assert isinstance(first_struct, tuple)
        assert len(first_struct) == 2
        assert first_struct[0] == "item_1"
        assert first_struct[1] == "value_10"
        assert first_struct.a == "item_1"
        assert first_struct.b == "value_10"

        # Verify analyze_columns() works without crashing
        analysis_result = analyze_columns(result)
        assert len(analysis_result) == 2

        struct_col = next(col for col in analysis_result if col.name == "simple_struct")
        assert struct_col.stats is not None
        assert struct_col.stats.categories is not None
        assert len(struct_col.stats.categories) > 0

        # Verify to_records() produces stringified values
        oc_df = DataFrame.from_native(result)
        records = oc_df.to_records(mode="json")

        assert len(records) == 100
        cell_value = records[0]["simple_struct"]
        assert isinstance(cell_value, str)
        assert "item_1" in cell_value
        assert "value_10" in cell_value

    def test_execute_sql_with_array_types(self, trino_toolkit_connection):
        """
        Test Trino ARRAY types don't break rendering (BLU-5140 regression).

        Verifies both analyze_columns() for stats and to_records() for cell values.
        """
        from deepnote_toolkit.ocelots import DataFrame
        from deepnote_toolkit.ocelots.pandas.analyze import analyze_columns

        query = """
            SELECT
                id,
                tags,
                nested_array
            FROM (
                SELECT
                    t.id,
                    ARRAY['tag_' || CAST(t.id AS VARCHAR), 'item', 'test'] AS tags,
                    ARRAY[ARRAY[t.id, t.id * 2], ARRAY[t.id * 3, t.id * 4]] AS nested_array
                FROM
                    UNNEST(SEQUENCE(1, 50)) AS t (id)
            )
        """

        result = execute_sql(
            template=query,
            sql_alchemy_json_env_var=trino_toolkit_connection,
        )

        assert isinstance(result, pd.DataFrame)
        assert len(result) == 50
        assert "id" in result.columns
        assert "tags" in result.columns
        assert "nested_array" in result.columns

        # Verify array data
        first_tags = result["tags"].iloc[0]
        assert isinstance(first_tags, list)
        assert len(first_tags) == 3
        assert first_tags == ["tag_1", "item", "test"]

        first_nested = result["nested_array"].iloc[0]
        assert isinstance(first_nested, list)
        assert len(first_nested) == 2
        assert first_nested == [[1, 2], [3, 4]]

        # Verify analyze_columns() works without crashing
        analysis_result = analyze_columns(result)
        assert len(analysis_result) == 3

        for col_name in ["tags", "nested_array"]:
            col = next(c for c in analysis_result if c.name == col_name)
            assert col.stats is not None
            assert col.stats.categories is not None

        # Verify to_records() produces stringified values
        oc_df = DataFrame.from_native(result)
        records = oc_df.to_records(mode="json")

        assert len(records) == 50
        tags_value = records[0]["tags"]
        nested_value = records[0]["nested_array"]

        assert isinstance(tags_value, str)
        assert isinstance(nested_value, str)
        assert "tag_1" in tags_value
        assert "item" in tags_value
Lines changed: 210 additions & 0 deletions
@@ -0,0 +1,210 @@
"""
Unit tests for DataFrame rendering with structured types.

These tests simulate the complete rendering flow that happens when the frontend
displays a DataFrame, ensuring both column analysis and data serialization work correctly.

This is a regression test suite for BLU-5140 where Trino STRUCT/ROW types caused
analyze_columns() to crash, resulting in fallback to plain DataFrame view instead of
the Deepnote native DataFrame view.
"""

import numpy as np
import pandas as pd
from trino.types import NamedRowTuple

from deepnote_toolkit.ocelots import DataFrame
from deepnote_toolkit.ocelots.pandas.analyze import analyze_columns


def _test_rendering_flow(df, expected_columns):
    """
    Simulate the complete rendering flow:
    1. analyze_columns() - for native view with stats
    2. to_records(mode="json") - for cell values

    Both paths must work for the Deepnote native DataFrame view to display correctly.
    """
    # 1. column stats (native view)
    analysis_result = analyze_columns(df)

    assert len(analysis_result) == len(expected_columns)

    for col_name in expected_columns:
        col = next(c for c in analysis_result if c.name == col_name)
        assert col.stats is not None, f"analyze_columns() failed for {col_name}"
        # Object columns should have categories for display
        if df[col_name].dtype == object:
            assert (
                col.stats.categories is not None
            ), f"No categories for object column {col_name}"

    # 2. cell values
    oc_df = DataFrame.from_native(df)
    records = oc_df.to_records(mode="json")

    assert len(records) == len(df)
    # all values are JSON-serializable (strings, numbers, None)
    for record in records:
        for col_name in expected_columns:
            value = record[col_name]
            assert isinstance(
                value, (str, int, float, type(None))
            ), f"Value for {col_name} is not JSON-serializable: {type(value)}"


def test_rendering_with_dict_objects():
    """Test rendering DataFrame with dict objects (simulates database ROW types)."""
    df = pd.DataFrame(
        {
            "id": [1, 2, 3],
            "struct_col": [
                {"a": "item_1", "b": "value_10"},
                {"a": "item_2", "b": "value_20"},
                {"a": "item_3", "b": "value_30"},
            ],
        }
    )

    _test_rendering_flow(df, ["id", "struct_col"])


def test_rendering_with_list_objects():
    """Test rendering DataFrame with list objects (simulates database ARRAY types)."""
    df = pd.DataFrame(
        {
            "id": [1, 2, 3],
            "array_col": [
                ["tag_1", "item", "test"],
                ["tag_2", "item", "test"],
                ["tag_3", "item", "test"],
            ],
        }
    )

    _test_rendering_flow(df, ["id", "array_col"])


def test_rendering_with_tuple_objects():
    """Test rendering DataFrame with tuple objects."""
    df = pd.DataFrame(
        {
            "id": [1, 2, 3],
            "tuple_col": [
                ("item_1", "value_10"),
                ("item_2", "value_20"),
                ("item_3", "value_30"),
            ],
        }
    )

    _test_rendering_flow(df, ["id", "tuple_col"])


def test_rendering_with_trino_namedrowtuple():
    """
    Test rendering DataFrame with Trino NamedRowTuple objects.

    This is the exact scenario from BLU-5140 that caused the crash.
    Before the fix, pd.Series(np_array.tolist()) would fail because
    NamedRowTuple has a broken __array_struct__ attribute.
    """
    # Create NamedRowTuple array using np.empty + assignment pattern.
    # This avoids pandas conversion issues during DataFrame creation.
    # Using [NamedRowTuple(...), ...] would trigger __array_struct__ bug.
    np_array = np.empty(3, dtype=object)
    np_array[0] = NamedRowTuple(["item_1", "value_10"], ["a", "b"], [None, None])
    np_array[1] = NamedRowTuple(["item_2", "value_20"], ["a", "b"], [None, None])
    np_array[2] = NamedRowTuple(["item_3", "value_30"], ["a", "b"], [None, None])

    df = pd.DataFrame({"id": [1, 2, 3], "struct_col": np_array})

    _test_rendering_flow(df, ["id", "struct_col"])

    # stringified values should preserve structure
    oc_df = DataFrame.from_native(df)
    records = oc_df.to_records(mode="json")

    struct_value = records[0]["struct_col"]
    assert isinstance(struct_value, str)
    assert "item_1" in struct_value
    assert "value_10" in struct_value


def test_rendering_with_nested_structures():
    """Test rendering DataFrame with nested dicts/lists."""
    df = pd.DataFrame(
        {
            "id": [1, 2, 3],
            "nested_col": [
                {"outer": ["inner_1", "inner_2"]},
                {"outer": ["inner_3", "inner_4"]},
                {"outer": ["inner_5", "inner_6"]},
            ],
        }
    )

    _test_rendering_flow(df, ["id", "nested_col"])


def test_rendering_with_mixed_types():
    """Test rendering DataFrame with multiple structured type columns."""
    df = pd.DataFrame(
        {
            "id": [1, 2, 3],
            "dict_col": [{"a": 1}, {"b": 2}, {"c": 3}],
            "list_col": [[1, 2], [3, 4], [5, 6]],
            "tuple_col": [(1, 2), (3, 4), (5, 6)],
        }
    )

    _test_rendering_flow(df, ["id", "dict_col", "list_col", "tuple_col"])


def test_rendering_with_namedrowtuple_and_missing_values():
    """Test rendering with NamedRowTuple including None values."""
    # Create NamedRowTuple array using np.empty + assignment pattern.
    # Using [NamedRowTuple(...), ...] would trigger __array_struct__ bug.
    np_array = np.empty(4, dtype=object)
    np_array[0] = NamedRowTuple(["item_1", "value_10"], ["a", "b"], [None, None])
    np_array[1] = None
    np_array[2] = NamedRowTuple(["item_2", "value_20"], ["a", "b"], [None, None])
    np_array[3] = NamedRowTuple(["item_1", "value_10"], ["a", "b"], [None, None])

    df = pd.DataFrame({"id": [1, 2, 3, 4], "struct_col": np_array})

    # Should not crash with None values
    analysis_result = analyze_columns(df)

    struct_col = next(col for col in analysis_result if col.name == "struct_col")
    assert struct_col.stats is not None
    assert struct_col.stats.categories is not None

    # Should have "Missing" category
    category_names = [cat["name"] for cat in struct_col.stats.categories]
    assert "Missing" in category_names


def test_rendering_preserves_field_names_in_str_representation():
    """
    Test that NamedRowTuple field names are preserved in stringification.
    """
    # Create NamedRowTuple array using np.empty + assignment pattern.
    # Using [NamedRowTuple(...), ...] would trigger __array_struct__ bug.
    np_array = np.empty(1, dtype=object)
    np_array[0] = NamedRowTuple(
        ["value_a", "value_b"], ["field_a", "field_b"], [None, None]
    )

    df = pd.DataFrame({"struct_col": np_array})

    # Get the stringified representation
    oc_df = DataFrame.from_native(df)
    records = oc_df.to_records(mode="json")

    stringified = records[0]["struct_col"]

    # str(NamedRowTuple) produces something like: (field_a: 'value_a', field_b: 'value_b')
    # This preserves field name information for better display
    assert "field_a: 'value_a'" in stringified
    assert "field_b: 'value_b'" in stringified
