32 changes: 31 additions & 1 deletion superset/result_set.py
@@ -99,6 +99,31 @@ def convert_to_string(value: Any) -> str:
    return str(value)


def normalize_column_name(value: Any, index: int) -> str:
"""
Normalize a column name from the cursor description.

Some databases (e.g., MSSQL) return empty strings for unnamed columns
(e.g., SELECT COUNT(*) without an alias). This function ensures every
column has a valid, non-empty name by generating a positional fallback
name when needed.

:param value: The column name from cursor.description (can be str, bytes, None, etc.)
:param index: The 0-based column position, used to generate fallback names
:return: A non-empty string column name
"""
if value is None:
return f"_col{index}"

name = convert_to_string(value)

Suggestion: Decoding column names can raise UnicodeDecodeError when convert_to_string tries to decode non-UTF-8 bytes; catch decoding errors and fall back to a safe decode (errors='replace') or a plain str conversion so the normalization step never raises. [possible bug]

Severity Level: Major ⚠️
- ❌ SupersetResultSet fails on non-UTF8 column names.
- ⚠️ Query display can error for weird driver encodings.
- ⚠️ Downstream callers may see exceptions during result parsing.
Suggested change
    name = convert_to_string(value)
    try:
        name = convert_to_string(value)
    except UnicodeDecodeError:
        # Fallback for bytes that are not valid UTF-8: replace invalid bytes
        if isinstance(value, (bytes, bytearray)):
            name = value.decode("utf-8", errors="replace")
        else:
            name = str(value)
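
As a quick aside (not part of the suggested change), the two fallback paths in the suggestion behave differently on the non-UTF-8 byte string used in the reproduction steps below:

raw = b"\xff"  # not valid UTF-8

try:
    raw.decode("utf-8")  # strict decoding raises
except UnicodeDecodeError as exc:
    print(exc)  # "'utf-8' codec can't decode byte 0xff in position 0: invalid start byte"

print(raw.decode("utf-8", errors="replace"))  # "\ufffd" (replacement character)
print(str(raw))                               # "b'\\xff'"

Either result is a usable, non-empty column name, so `normalize_column_name` would not need its positional fallback in this case.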
Steps of Reproduction ✅
1. Open `superset/result_set.py` and find `convert_to_string` at
`superset/result_set.py:89-99` and `normalize_column_name` at
`superset/result_set.py:102-124`.

2. Create a cursor description entry containing non-UTF8 bytes, e.g. `cursor_description =
[(b"\xff", None, None, None, None, None, None)]`.

3. Instantiate `SupersetResultSet` in a test or REPL:

   `SupersetResultSet(data=[], cursor_description=cursor_description,
   db_engine_spec=BaseEngineSpec)` (`__init__` begins at `superset/result_set.py:128`).

4. During name normalization the code calls `convert_to_string` (invoked from
`normalize_column_name` at `superset/result_set.py:118`), which executes the bytes decode
at `superset/result_set.py:96-97` and raises `UnicodeDecodeError` for invalid UTF-8 bytes,
causing `SupersetResultSet.__init__` to error and abort result construction (a minimal sketch of these steps follows below).
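
A minimal pytest-style sketch of these reproduction steps; the test name is hypothetical and not part of this PR, and it assumes the constructor call style used by the existing unit tests and that `convert_to_string` decodes bytes strictly, as the comment claims:

import pytest

from superset.db_engine_specs.base import BaseEngineSpec
from superset.result_set import SupersetResultSet


def test_non_utf8_column_name_raises() -> None:
    # b"\xff" is not valid UTF-8, so strict decoding of the column name fails
    cursor_description = [(b"\xff", None, None, None, None, None, None)]
    with pytest.raises(UnicodeDecodeError):
        SupersetResultSet([], cursor_description, BaseEngineSpec)  # type: ignore

If the suggested try/except fallback is applied, this construction would instead succeed and the column name would come from the replacement-decoded string.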
Prompt for AI Agent 🤖
This is a comment left during a code review.

**Path:** superset/result_set.py
**Line:** 118:118
**Comment:**
	*Possible Bug: Decoding column names can raise UnicodeDecodeError when `convert_to_string` tries to decode non-UTF-8 bytes; catch decoding errors and fall back to a safe decode (errors='replace') or a plain str conversion so the normalization step never raises.

Validate the correctness of the flagged issue. If correct, How can I resolve this? If you propose a fix, implement it and please make it concise.


    # Handle empty or whitespace-only names
    if not name or not name.strip():
        return f"_col{index}"

    return name


class SupersetResultSet:
    def __init__( # pylint: disable=too-many-locals # noqa: C901
        self,
@@ -116,8 +141,13 @@ def __init__( # pylint: disable=too-many-locals # noqa: C901

        if cursor_description:
            # get deduped list of column names
            # Use normalize_column_name to handle None/empty names from databases
            # like MSSQL that return empty strings for unnamed columns
            column_names = dedup(
                [convert_to_string(col[0]) for col in cursor_description]
                [
                    normalize_column_name(col[0], idx)
                    for idx, col in enumerate(cursor_description)
                ]
            )
Comment on lines 146 to 151

Suggestion: Accessing col[0] without validating that col is a sequence (or non-None) can raise TypeError/IndexError if a cursor description entry is None or not indexable; iterate and safely extract the 0th element before calling normalize_column_name. [possible bug]

Severity Level: Major ⚠️
- ❌ SupersetResultSet construction fails on malformed cursor descriptions.
- ⚠️ Query rendering flows may error when cursor.description is malformed.
- ⚠️ Alerts/dimension creation pipelines using result set fail.
Suggested change
            column_names = dedup(
                [convert_to_string(col[0]) for col in cursor_description]
                [
                    normalize_column_name(col[0], idx)
                    for idx, col in enumerate(cursor_description)
                ]
            )
            column_names_values: list[str] = []
            for idx, col in enumerate(cursor_description):
                try:
                    name_value = col[0]
                except Exception:
                    # description entry missing or not indexable; treat as unnamed
                    name_value = None
                column_names_values.append(normalize_column_name(name_value, idx))
            column_names = dedup(column_names_values)
Steps of Reproduction ✅
1. In a Python REPL or unit test import the class: open `superset/result_set.py` and
locate `SupersetResultSet.__init__` at `superset/result_set.py:128` (constructor start).

2. Construct a malformed cursor description and invoke the constructor directly:

   - Create `cursor_description = [None]` or `cursor_description = [123]` in the test.

   - Call `SupersetResultSet(data=[], cursor_description=cursor_description,
   db_engine_spec=BaseEngineSpec)` (the call site is the `__init__` at
   `superset/result_set.py:128`).

3. When the list comprehension at `superset/result_set.py:146-150` executes, it tries to
evaluate `col[0]` (the call to `normalize_column_name(col[0], idx)` is at
`superset/result_set.py:148`), which raises:

   - `TypeError: 'NoneType' object is not subscriptable` for `None`, or

   - `TypeError: 'int' object is not subscriptable` for `123`.

4. Observe the exception bubbles out of `SupersetResultSet.__init__` and prevents
result-set construction; this reproduces the failure caused by unguarded `col[0]` access.

   - Note: This is reproducible without other parts of Superset by directly calling the
   constructor described above. A minimal sketch of this reproduction follows below.
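
A minimal sketch of the reproduction; the test name is hypothetical, and the constructor call style mirrors the existing unit tests:

import pytest

from superset.db_engine_specs.base import BaseEngineSpec
from superset.result_set import SupersetResultSet


def test_malformed_cursor_description_entry() -> None:
    # None is not indexable, so evaluating col[0] in the column-name
    # list comprehension raises TypeError
    with pytest.raises(TypeError):
        SupersetResultSet([], [None], BaseEngineSpec)  # type: ignore

With the suggested guarded extraction, the entry would instead be treated as unnamed and fall back to `_col0`.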
Prompt for AI Agent 🤖
This is a comment left during a code review.

**Path:** superset/result_set.py
**Line:** 146:151
**Comment:**
	*Possible Bug: Accessing `col[0]` without validating that `col` is a sequence (or non-None) can raise TypeError/IndexError if a cursor description entry is None or not indexable; iterate and safely extract the 0th element before calling `normalize_column_name`.

Validate the correctness of the flagged issue. If correct, How can I resolve this? If you propose a fix, implement it and please make it concise.


            # fix cursor descriptor with the deduped names
204 changes: 203 additions & 1 deletion tests/unit_tests/result_set_test.py
@@ -25,10 +25,40 @@
from pytest_mock import MockerFixture

from superset.db_engine_specs.base import BaseEngineSpec
from superset.result_set import stringify_values, SupersetResultSet
from superset.result_set import normalize_column_name, stringify_values, SupersetResultSet
from superset.superset_typing import DbapiResult


def test_normalize_column_name_with_valid_string() -> None:
    """Test that valid string column names are preserved."""
    assert normalize_column_name("user_id", 0) == "user_id"
    assert normalize_column_name("COUNT(*)", 1) == "COUNT(*)"
    assert normalize_column_name("my column", 2) == "my column"


def test_normalize_column_name_with_bytes() -> None:
    """Test that byte column names are decoded."""
    assert normalize_column_name(b"column_name", 0) == "column_name"


def test_normalize_column_name_with_none() -> None:
    """Test that None column names get positional fallback."""
    assert normalize_column_name(None, 0) == "_col0"
    assert normalize_column_name(None, 5) == "_col5"


def test_normalize_column_name_with_empty_string() -> None:
    """Test that empty string column names get positional fallback."""
    assert normalize_column_name("", 0) == "_col0"
    assert normalize_column_name("", 3) == "_col3"


def test_normalize_column_name_with_whitespace() -> None:
    """Test that whitespace-only column names get positional fallback."""
    assert normalize_column_name(" ", 0) == "_col0"
    assert normalize_column_name("\t\n", 1) == "_col1"


def test_column_names_as_bytes() -> None:
"""
Test that we can handle column names as bytes.
@@ -185,3 +215,175 @@ def test_get_column_description_from_empty_data_using_cursor_description(
    )
    assert any(col.get("column_name") == "__time" for col in result_set.columns)
    logger.exception.assert_not_called()


def test_unnamed_columns_get_fallback_names() -> None:
    """
    Test that unnamed columns (empty string or None) get deterministic fallback names.

    This addresses issue #23848 where MSSQL returns empty strings for unnamed columns
    (e.g., SELECT COUNT(*) without an alias), causing failures in SQL Lab, alerts,
    and dimension creation.
    """
    # Simulate MSSQL behavior: empty string for unnamed column
    data = [(42,), (100,)]
    description = [("", "int", None, None, None, None, None)]  # Empty column name

    result_set = SupersetResultSet(data, description, BaseEngineSpec)  # type: ignore

    # Should have generated a fallback name
    assert result_set.columns[0]["column_name"] == "_col0"
    assert result_set.columns[0]["name"] == "_col0"

    # Data should be accessible
    df = result_set.to_pandas_df()
    assert df.columns.tolist() == ["_col0"]
    assert df["_col0"].tolist() == [42, 100]


def test_none_column_name_gets_fallback() -> None:
    """
    Test that None column names get deterministic fallback names.
    """
    data = [("value1",), ("value2",)]
    description = [(None, "varchar", None, None, None, None, None)]

    result_set = SupersetResultSet(data, description, BaseEngineSpec)  # type: ignore

    assert result_set.columns[0]["column_name"] == "_col0"
    df = result_set.to_pandas_df()
    assert df["_col0"].tolist() == ["value1", "value2"]


def test_whitespace_only_column_name_gets_fallback() -> None:
    """
    Test that whitespace-only column names get deterministic fallback names.
    """
    data = [(1,), (2,)]
    description = [(" ", "int", None, None, None, None, None)]

    result_set = SupersetResultSet(data, description, BaseEngineSpec)  # type: ignore

    assert result_set.columns[0]["column_name"] == "_col0"


def test_mixed_named_and_unnamed_columns() -> None:
    """
    Test that named columns are preserved while unnamed columns get fallback names.

    Simulates: SELECT id, COUNT(*), name FROM table
    where COUNT(*) has no alias on MSSQL.
    """
    data = [(1, 42, "Alice"), (2, 17, "Bob")]
    description = [
        ("id", "int", None, None, None, None, None),
        ("", "int", None, None, None, None, None),  # COUNT(*) without alias
        ("name", "varchar", None, None, None, None, None),
    ]

    result_set = SupersetResultSet(data, description, BaseEngineSpec)  # type: ignore

    # Named columns should be preserved exactly
    assert result_set.columns[0]["column_name"] == "id"
    assert result_set.columns[2]["column_name"] == "name"

    # Unnamed column should get positional fallback
    assert result_set.columns[1]["column_name"] == "_col1"

    # Verify data integrity
    df = result_set.to_pandas_df()
    assert df.columns.tolist() == ["id", "_col1", "name"]
    assert df["id"].tolist() == [1, 2]
    assert df["_col1"].tolist() == [42, 17]
    assert df["name"].tolist() == ["Alice", "Bob"]


def test_multiple_unnamed_columns() -> None:
    """
    Test that multiple unnamed columns each get unique fallback names.

    Simulates: SELECT COUNT(*), SUM(x), AVG(y) FROM table
    without any aliases on MSSQL.
    """
    data = [(10, 100, 5.5)]
    description = [
        ("", "int", None, None, None, None, None),
        ("", "int", None, None, None, None, None),
        ("", "float", None, None, None, None, None),
    ]

    result_set = SupersetResultSet(data, description, BaseEngineSpec)  # type: ignore

    # Each unnamed column should get a unique positional name
    # Note: dedup() will handle any collisions, but since we use position-based
    # names, there shouldn't be collisions
    column_names = [col["column_name"] for col in result_set.columns]
    assert column_names == ["_col0", "_col1", "_col2"]

    # Verify data is accessible
    df = result_set.to_pandas_df()
    assert df["_col0"].tolist() == [10]
    assert df["_col1"].tolist() == [100]
    assert df["_col2"].tolist() == [5.5]


def test_named_columns_not_modified() -> None:
    """
    Test that explicitly named columns are never modified.

    This ensures the fix doesn't accidentally change behavior for well-formed queries.
    """
    data = [(1, "test", 3.14)]
    description = [
        ("user_id", "int", None, None, None, None, None),
        ("username", "varchar", None, None, None, None, None),
        ("score", "float", None, None, None, None, None),
    ]

    result_set = SupersetResultSet(data, description, BaseEngineSpec)  # type: ignore

    column_names = [col["column_name"] for col in result_set.columns]
    assert column_names == ["user_id", "username", "score"]


def test_empty_result_with_unnamed_columns() -> None:
    """
    Test that empty results with unnamed columns still work correctly.

    This is important for SQL Lab's display of column headers even when
    the query returns no rows.
    """
    data: DbapiResult = []
    description = [
        ("", "int", None, None, None, None, None),
        ("named_col", "varchar", None, None, None, None, None),
    ]

    result_set = SupersetResultSet(data, description, BaseEngineSpec)  # type: ignore

    column_names = [col["column_name"] for col in result_set.columns]
    assert column_names == ["_col0", "named_col"]


def test_aliased_expression_preserved() -> None:
    """
    Test that explicitly aliased expressions (e.g., SELECT COUNT(*) AS total)
    preserve the alias name exactly.

    This verifies the fix for #23848 doesn't accidentally affect properly
    aliased columns.
    """
    # Simulates: SELECT COUNT(*) AS total FROM table
    # The database returns "total" as the column name
    data = [(42,)]
    description = [("total", "int", None, None, None, None, None)]

    result_set = SupersetResultSet(data, description, BaseEngineSpec)  # type: ignore

    # Alias must be preserved exactly
    assert result_set.columns[0]["column_name"] == "total"
    assert result_set.columns[0]["name"] == "total"

    df = result_set.to_pandas_df()
    assert df.columns.tolist() == ["total"]
    assert df["total"].tolist() == [42]