|
| 1 | +""" |
| 2 | +Unit tests for DataFrame rendering with structured types. |
| 3 | +
|
| 4 | +These tests simulate the complete rendering flow that happens when the frontend |
| 5 | +displays a DataFrame, ensuring both column analysis and data serialization work correctly. |
| 6 | +
|
| 7 | +This is a regression test suite for BLU-5140 where Trino STRUCT/ROW types caused |
| 8 | +analyze_columns() to crash, resulting in fallback to plain DataFrame view instead of |
| 9 | +the Deepnote native DataFrame view. |
| 10 | +""" |
| 11 | + |
| 12 | +import numpy as np |
| 13 | +import pandas as pd |
| 14 | +from trino.types import NamedRowTuple |
| 15 | + |
| 16 | +from deepnote_toolkit.ocelots import DataFrame |
| 17 | +from deepnote_toolkit.ocelots.pandas.analyze import analyze_columns |
| 18 | + |
| 19 | + |
def _test_rendering_flow(df, expected_columns):
    """
    Simulate the complete rendering flow:
    1. analyze_columns() - for native view with stats
    2. to_records(mode="json") - for cell values

    Both paths must work for the Deepnote native DataFrame view to display correctly.

    Args:
        df: pandas DataFrame under test.
        expected_columns: names of the columns that must be present in the
            analysis result and in every serialized record.

    Raises:
        AssertionError: if any of the checks below fails.
    """
    # 1. column stats (native view)
    analysis_result = analyze_columns(df)

    assert len(analysis_result) == len(expected_columns)

    for col_name in expected_columns:
        # Pass a default to next() so a missing column fails with a readable
        # assertion message instead of a bare StopIteration.
        col = next((c for c in analysis_result if c.name == col_name), None)
        assert (
            col is not None
        ), f"analyze_columns() returned no entry for {col_name}"
        assert col.stats is not None, f"analyze_columns() failed for {col_name}"
        # Object columns should have categories for display
        if df[col_name].dtype == object:
            assert (
                col.stats.categories is not None
            ), f"No categories for object column {col_name}"

    # 2. cell values
    oc_df = DataFrame.from_native(df)
    records = oc_df.to_records(mode="json")

    assert len(records) == len(df)
    # all values are JSON-serializable (strings, numbers, None)
    for record in records:
        for col_name in expected_columns:
            value = record[col_name]
            assert isinstance(
                value, (str, int, float, type(None))
            ), f"Value for {col_name} is not JSON-serializable: {type(value)}"
| 54 | + |
| 55 | + |
def test_rendering_with_dict_objects():
    """Render a DataFrame whose cells are dicts (simulates database ROW types)."""
    struct_values = [
        {"a": "item_1", "b": "value_10"},
        {"a": "item_2", "b": "value_20"},
        {"a": "item_3", "b": "value_30"},
    ]
    frame = pd.DataFrame({"id": [1, 2, 3], "struct_col": struct_values})

    _test_rendering_flow(frame, ["id", "struct_col"])
| 70 | + |
| 71 | + |
def test_rendering_with_list_objects():
    """Render a DataFrame whose cells are lists (simulates database ARRAY types)."""
    array_values = [
        ["tag_1", "item", "test"],
        ["tag_2", "item", "test"],
        ["tag_3", "item", "test"],
    ]
    frame = pd.DataFrame({"id": [1, 2, 3], "array_col": array_values})

    _test_rendering_flow(frame, ["id", "array_col"])
| 86 | + |
| 87 | + |
def test_rendering_with_tuple_objects():
    """Render a DataFrame whose cells are plain tuples."""
    tuple_values = [
        ("item_1", "value_10"),
        ("item_2", "value_20"),
        ("item_3", "value_30"),
    ]
    frame = pd.DataFrame({"id": [1, 2, 3], "tuple_col": tuple_values})

    _test_rendering_flow(frame, ["id", "tuple_col"])
| 102 | + |
| 103 | + |
def test_rendering_with_trino_namedrowtuple():
    """
    Render a DataFrame whose cells are Trino NamedRowTuple objects.

    This is the exact scenario from BLU-5140 that caused the crash.
    Before the fix, pd.Series(np_array.tolist()) would fail because
    NamedRowTuple has a broken __array_struct__ attribute.
    """
    row_values = [
        ["item_1", "value_10"],
        ["item_2", "value_20"],
        ["item_3", "value_30"],
    ]

    # Fill a pre-allocated object array slot by slot. This avoids pandas
    # conversion issues during DataFrame creation; passing a plain
    # [NamedRowTuple(...), ...] list would trigger the __array_struct__ bug.
    struct_cells = np.empty(len(row_values), dtype=object)
    for position, values in enumerate(row_values):
        struct_cells[position] = NamedRowTuple(values, ["a", "b"], [None, None])

    frame = pd.DataFrame({"id": [1, 2, 3], "struct_col": struct_cells})

    _test_rendering_flow(frame, ["id", "struct_col"])

    # stringified values should preserve structure
    json_records = DataFrame.from_native(frame).to_records(mode="json")
    first_struct = json_records[0]["struct_col"]

    assert isinstance(first_struct, str)
    assert "item_1" in first_struct
    assert "value_10" in first_struct
| 132 | + |
| 133 | + |
def test_rendering_with_nested_structures():
    """Render a DataFrame whose cells are dicts containing lists."""
    nested_values = [
        {"outer": ["inner_1", "inner_2"]},
        {"outer": ["inner_3", "inner_4"]},
        {"outer": ["inner_5", "inner_6"]},
    ]
    frame = pd.DataFrame({"id": [1, 2, 3], "nested_col": nested_values})

    _test_rendering_flow(frame, ["id", "nested_col"])
| 148 | + |
| 149 | + |
def test_rendering_with_mixed_types():
    """Render a DataFrame that has several structured-type columns at once."""
    columns = {
        "id": [1, 2, 3],
        "dict_col": [{"a": 1}, {"b": 2}, {"c": 3}],
        "list_col": [[1, 2], [3, 4], [5, 6]],
        "tuple_col": [(1, 2), (3, 4), (5, 6)],
    }
    frame = pd.DataFrame(columns)

    _test_rendering_flow(frame, ["id", "dict_col", "list_col", "tuple_col"])
| 162 | + |
| 163 | + |
def test_rendering_with_namedrowtuple_and_missing_values():
    """Analyze a NamedRowTuple column that also contains a None cell."""
    # None marks the missing cell; the other entries become NamedRowTuple rows.
    cell_values = [
        ["item_1", "value_10"],
        None,
        ["item_2", "value_20"],
        ["item_1", "value_10"],
    ]

    # Fill a pre-allocated object array slot by slot; passing a plain
    # [NamedRowTuple(...), ...] list would trigger the __array_struct__ bug.
    struct_cells = np.empty(len(cell_values), dtype=object)
    for position, values in enumerate(cell_values):
        if values is None:
            struct_cells[position] = None
        else:
            struct_cells[position] = NamedRowTuple(
                values, ["a", "b"], [None, None]
            )

    frame = pd.DataFrame({"id": [1, 2, 3, 4], "struct_col": struct_cells})

    # Should not crash with None values
    analysis = analyze_columns(frame)

    struct_analysis = next(
        entry for entry in analysis if entry.name == "struct_col"
    )
    assert struct_analysis.stats is not None
    assert struct_analysis.stats.categories is not None

    # Should have "Missing" category
    seen_names = tuple(
        category["name"] for category in struct_analysis.stats.categories
    )
    assert "Missing" in seen_names
| 186 | + |
| 187 | + |
def test_rendering_preserves_field_names_in_str_representation():
    """
    Check that NamedRowTuple field names survive stringification.
    """
    # Fill a pre-allocated object array by assignment; passing a plain
    # [NamedRowTuple(...), ...] list would trigger the __array_struct__ bug.
    struct_cells = np.empty(1, dtype=object)
    struct_cells[0] = NamedRowTuple(
        ["value_a", "value_b"], ["field_a", "field_b"], [None, None]
    )

    frame = pd.DataFrame({"struct_col": struct_cells})

    # Get the stringified representation
    json_records = DataFrame.from_native(frame).to_records(mode="json")
    rendered = json_records[0]["struct_col"]

    # str(NamedRowTuple) produces something like: (field_a: 'value_a', field_b: 'value_b')
    # This preserves field name information for better display
    assert "field_a: 'value_a'" in rendered
    assert "field_b: 'value_b'" in rendered
0 commit comments