Skip to content

Commit 5e96035

Browse files
SNOW-1989239 - prevent silent failures on nano-arrow conversion (#2227)
Co-authored-by: Adam Ling <adam.ling@snowflake.com>
1 parent 253c47a commit 5e96035

File tree

9 files changed

+75
-9
lines changed

9 files changed

+75
-9
lines changed

DESCRIPTION.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ Source code is also available at: https://github.com/snowflakedb/snowflake-conne
1313
- Improved error message for client-side query cancellations due to timeouts.
1414
- Added support of GCS regional endpoints.
1515
- Added `gcs_use_virtual_endpoints` connection property that forces the usage of the virtual GCS usage. See more: https://cloud.google.com/storage/docs/request-endpoints#xml-api
16+
- Fixed a bug that caused driver to fail silently on `TO_DATE` arrow to python conversion when invalid date was followed by the correct one.
17+
- Added `check_arrow_conversion_error_on_every_column` connection property that can be set to `False` to restore previous behaviour in which driver will ignore errors until it occurs in the last column. This flag's purpose is to unblock workflows that may be impacted by the bugfix and will be removed in later releases.
1618

1719
- v3.14.0(March 03, 2025)
1820
- Bumped pyOpenSSL dependency upper boundary from <25.0.0 to <26.0.0.

src/snowflake/connector/connection.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,10 @@ def _get_private_bytes_from_file(
317317
False,
318318
bool,
319319
), # use https://{bucket}.storage.googleapis.com instead of https://storage.googleapis.com/{bucket}
320+
"check_arrow_conversion_error_on_every_column": (
321+
True,
322+
bool,
323+
), # SNOW-XXXXX: remove the check_arrow_conversion_error_on_every_column flag
320324
}
321325

322326
APPLICATION_RE = re.compile(r"[\w\d_]+")
@@ -397,6 +401,7 @@ class SnowflakeConnection:
397401
token_file_path: The file path of the token file. If both token and token_file_path are provided, the token in token_file_path will be used.
398402
unsafe_file_write: When true, files downloaded by GET will be saved with 644 permissions. Otherwise, files will be saved with safe - owner-only permissions: 600.
399403
gcs_use_virtual_endpoints: When true, the virtual endpoint url is used, see: https://cloud.google.com/storage/docs/request-endpoints#xml-api
404+
check_arrow_conversion_error_on_every_column: When true, the error check after the conversion from arrow to python types will happen for every column in the row. This is a new behaviour which fixes the bug that caused the type errors to trigger silently when occurring at any place other than last column in a row. To revert the previous (faulty) behaviour, please set this flag to false.
400405
"""
401406

402407
OCSP_ENV_LOCK = Lock()
@@ -797,6 +802,14 @@ def gcs_use_virtual_endpoints(self) -> bool:
797802
def gcs_use_virtual_endpoints(self, value: bool) -> None:
798803
self._gcs_use_virtual_endpoints = value
799804

805+
@property
806+
def check_arrow_conversion_error_on_every_column(self) -> bool:
807+
return self._check_arrow_conversion_error_on_every_column
808+
809+
@check_arrow_conversion_error_on_every_column.setter
810+
def check_arrow_conversion_error_on_every_column(self, value: bool) -> bool:
811+
self._check_arrow_conversion_error_on_every_column = value
812+
800813
def connect(self, **kwargs) -> None:
801814
"""Establishes connection to Snowflake."""
802815
logger.debug("connect")

src/snowflake/connector/nanoarrow_cpp/ArrowIterator/CArrowChunkIterator.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ namespace sf {
2727

2828
CArrowChunkIterator::CArrowChunkIterator(PyObject* context, char* arrow_bytes,
2929
int64_t arrow_bytes_size,
30-
PyObject* use_numpy)
30+
PyObject* use_numpy,
31+
PyObject* check_error_on_every_column)
3132
: CArrowIterator(arrow_bytes, arrow_bytes_size),
3233
m_latestReturnedRow(nullptr),
3334
m_context(context) {
@@ -39,6 +40,7 @@ CArrowChunkIterator::CArrowChunkIterator(PyObject* context, char* arrow_bytes,
3940
m_rowCountInBatch = 0;
4041
m_latestReturnedRow.reset();
4142
m_useNumpy = PyObject_IsTrue(use_numpy);
43+
m_checkErrorOnEveryColumn = PyObject_IsTrue(check_error_on_every_column);
4244

4345
m_batchCount = m_ipcArrowArrayVec.size();
4446
m_columnCount = m_batchCount > 0 ? m_ipcArrowSchema->n_children : 0;
@@ -92,6 +94,9 @@ void CArrowChunkIterator::createRowPyObject() {
9294
PyTuple_SET_ITEM(
9395
m_latestReturnedRow.get(), i,
9496
m_currentBatchConverters[i]->toPyObject(m_rowIndexInBatch));
97+
if (m_checkErrorOnEveryColumn && py::checkPyError()) {
98+
return;
99+
}
95100
}
96101
return;
97102
}
@@ -505,7 +510,8 @@ DictCArrowChunkIterator::DictCArrowChunkIterator(PyObject* context,
505510
char* arrow_bytes,
506511
int64_t arrow_bytes_size,
507512
PyObject* use_numpy)
508-
: CArrowChunkIterator(context, arrow_bytes, arrow_bytes_size, use_numpy) {}
513+
: CArrowChunkIterator(context, arrow_bytes, arrow_bytes_size, use_numpy,
514+
Py_False) {}
509515

510516
void DictCArrowChunkIterator::createRowPyObject() {
511517
m_latestReturnedRow.reset(PyDict_New());

src/snowflake/connector/nanoarrow_cpp/ArrowIterator/CArrowChunkIterator.hpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ class CArrowChunkIterator : public CArrowIterator {
3333
* Constructor
3434
*/
3535
CArrowChunkIterator(PyObject* context, char* arrow_bytes,
36-
int64_t arrow_bytes_size, PyObject* use_numpy);
36+
int64_t arrow_bytes_size, PyObject* use_numpy,
37+
PyObject* check_error_on_every_column);
3738

3839
/**
3940
* Destructor
@@ -78,6 +79,10 @@ class CArrowChunkIterator : public CArrowIterator {
7879
/** true if return numpy int64 float64 datetime*/
7980
bool m_useNumpy;
8081

82+
/** a flag that ensures running py::checkPyError after each column processing
83+
* in order to fail early on first python processing error */
84+
bool m_checkErrorOnEveryColumn;
85+
8186
void initColumnConverters();
8287
};
8388

src/snowflake/connector/nanoarrow_cpp/ArrowIterator/nanoarrow_arrow_iterator.pyx

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ cdef extern from "CArrowChunkIterator.hpp" namespace "sf":
5050
char* arrow_bytes,
5151
int64_t arrow_bytes_size,
5252
PyObject* use_numpy,
53+
PyObject* check_error_on_every_column,
5354
) except +
5455

5556
cdef cppclass DictCArrowChunkIterator(CArrowChunkIterator):
@@ -100,6 +101,7 @@ cdef class PyArrowIterator(EmptyPyArrowIterator):
100101
# still be converted into native python types.
101102
# https://docs.snowflake.com/en/user-guide/sqlalchemy.html#numpy-data-type-support
102103
cdef object use_numpy
104+
cdef object check_error_on_every_column
103105
cdef object number_to_decimal
104106
cdef object pyarrow_table
105107

@@ -111,12 +113,14 @@ cdef class PyArrowIterator(EmptyPyArrowIterator):
111113
object use_dict_result,
112114
object numpy,
113115
object number_to_decimal,
116+
object check_error_on_every_column
114117
):
115118
self.context = arrow_context
116119
self.cIterator = NULL
117120
self.use_dict_result = use_dict_result
118121
self.cursor = cursor
119122
self.use_numpy = numpy
123+
self.check_error_on_every_column = check_error_on_every_column
120124
self.number_to_decimal = number_to_decimal
121125
self.pyarrow_table = None
122126
self.table_returned = False
@@ -139,8 +143,9 @@ cdef class PyArrowRowIterator(PyArrowIterator):
139143
object use_dict_result,
140144
object numpy,
141145
object number_to_decimal,
146+
object check_error_on_every_column,
142147
):
143-
super().__init__(cursor, py_inputstream, arrow_context, use_dict_result, numpy, number_to_decimal)
148+
super().__init__(cursor, py_inputstream, arrow_context, use_dict_result, numpy, number_to_decimal, check_error_on_every_column)
144149
if self.cIterator is not NULL:
145150
return
146151

@@ -155,7 +160,8 @@ cdef class PyArrowRowIterator(PyArrowIterator):
155160
<PyObject *> self.context,
156161
self.arrow_bytes,
157162
self.arrow_bytes_size,
158-
<PyObject *> self.use_numpy
163+
<PyObject *> self.use_numpy,
164+
<PyObject *> self.check_error_on_every_column
159165
)
160166
cdef ReturnVal cret = self.cIterator.checkInitializationStatus()
161167
if cret.exception:
@@ -200,8 +206,9 @@ cdef class PyArrowTableIterator(PyArrowIterator):
200206
object use_dict_result,
201207
object numpy,
202208
object number_to_decimal,
209+
object check_error_on_every_column
203210
):
204-
super().__init__(cursor, py_inputstream, arrow_context, use_dict_result, numpy, number_to_decimal)
211+
super().__init__(cursor, py_inputstream, arrow_context, use_dict_result, numpy, number_to_decimal, check_error_on_every_column)
205212
if not INSTALLED_PYARROW:
206213
raise Error.errorhandler_make_exception(
207214
ProgrammingError,

src/snowflake/connector/result_batch.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ def _create_nanoarrow_iterator(
6262
numpy: bool,
6363
number_to_decimal: bool,
6464
row_unit: IterUnit,
65+
check_error_on_every_column: bool = True,
6566
):
6667
from .nanoarrow_arrow_iterator import PyArrowRowIterator, PyArrowTableIterator
6768

@@ -74,6 +75,7 @@ def _create_nanoarrow_iterator(
7475
use_dict_result,
7576
numpy,
7677
number_to_decimal,
78+
check_error_on_every_column,
7779
)
7880
if row_unit == IterUnit.ROW_UNIT
7981
else PyArrowTableIterator(
@@ -83,6 +85,7 @@ def _create_nanoarrow_iterator(
8385
use_dict_result,
8486
numpy,
8587
number_to_decimal,
88+
check_error_on_every_column,
8689
)
8790
)
8891

@@ -614,7 +617,7 @@ def _load(
614617
)
615618

616619
def _from_data(
617-
self, data: str, iter_unit: IterUnit
620+
self, data: str, iter_unit: IterUnit, check_error_on_every_column: bool = True
618621
) -> Iterator[dict | Exception] | Iterator[tuple | Exception]:
619622
"""Creates a ``PyArrowIterator`` files from a str.
620623
@@ -631,6 +634,7 @@ def _from_data(
631634
self._numpy,
632635
self._number_to_decimal,
633636
iter_unit,
637+
check_error_on_every_column,
634638
)
635639

636640
@classmethod
@@ -665,7 +669,15 @@ def _create_iter(
665669
"""Create an iterator for the ResultBatch. Used by get_arrow_iter."""
666670
if self._local:
667671
try:
668-
return self._from_data(self._data, iter_unit)
672+
return self._from_data(
673+
self._data,
674+
iter_unit,
675+
(
676+
connection.check_arrow_conversion_error_on_every_column
677+
if connection
678+
else None
679+
),
680+
)
669681
except Exception:
670682
if connection and getattr(connection, "_debug_arrow_chunk", False):
671683
logger.debug(f"arrow data can not be parsed: {self._data}")

test/helpers.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ def create_nanoarrow_pyarrow_iterator(input_data, use_table_iterator):
124124
False,
125125
False,
126126
False,
127+
True,
127128
)
128129
if not use_table_iterator
129130
else NanoarrowPyArrowTableIterator(
@@ -135,6 +136,7 @@ def create_nanoarrow_pyarrow_iterator(input_data, use_table_iterator):
135136
False,
136137
False,
137138
False,
139+
False,
138140
)
139141
)
140142

test/integ/pandas/test_unit_arrow_chunk_iterator.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -430,7 +430,9 @@ def iterate_over_test_chunk(
430430
stream.seek(0)
431431
context = ArrowConverterContext()
432432

433-
it = NanoarrowPyArrowRowIterator(None, stream.read(), context, False, False, False)
433+
it = NanoarrowPyArrowRowIterator(
434+
None, stream.read(), context, False, False, False, True
435+
)
434436

435437
count = 0
436438
while True:

test/integ/test_cursor.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1790,6 +1790,23 @@ def test_out_of_range_year(conn_cnx, result_format, cursor_type, fetch_method):
17901790
fetch_next_fn()
17911791

17921792

1793+
@pytest.mark.parametrize("result_format", ("json", "arrow"))
1794+
def test_out_of_range_year_followed_by_correct_year(conn_cnx, result_format):
1795+
"""Tests whether the year 10000 is out of range exception is raised as expected."""
1796+
with conn_cnx(
1797+
session_parameters={
1798+
PARAMETER_PYTHON_CONNECTOR_QUERY_RESULT_FORMAT: result_format
1799+
}
1800+
) as con:
1801+
with con.cursor() as cur:
1802+
cur.execute("select TO_DATE('10000-01-01'), TO_DATE('9999-01-01')")
1803+
with pytest.raises(
1804+
InterfaceError,
1805+
match="out of range",
1806+
):
1807+
cur.fetchall()
1808+
1809+
17931810
@pytest.mark.skipolddriver
17941811
def test_describe(conn_cnx):
17951812
with conn_cnx() as con:

0 commit comments

Comments
 (0)