Skip to content

Commit 5f22a62

Browse files
author
subrata-ms
committed
review comment
1 parent dcbe1a5 commit 5f22a62

File tree

2 files changed

+19
-8
lines changed

2 files changed

+19
-8
lines changed

mssql_python/pybind/ddbc_bindings.cpp

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2956,13 +2956,22 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
29562956
row.append(
29572957
FetchLobColumnData(hStmt, i, SQL_C_CHAR, false, false, charEncoding));
29582958
} else {
2959-
// Use columnSize * 4 + 1 to accommodate worst-case UTF-8 expansion.
2960-
// columnSize is in characters, but the ODBC driver may return UTF-8
2961-
// encoded bytes where each character can be up to 4 bytes. This
2962-
// applies on Linux/macOS (driver always returns UTF-8 for SQL_C_CHAR)
2963-
// and on Windows when the database uses a UTF-8 collation. Without
2964-
// this, data at exact column boundary with multi-byte chars (e.g.,
2965-
// CP1252 é in VARCHAR(10)) causes truncation and corruption.
2959+
// Allocate columnSize * 4 + 1 on ALL platforms (no #if guard).
2960+
//
2961+
// Why this differs from SQLBindCols / FetchBatchData:
2962+
// Those two functions use #if to apply *4 only on Linux/macOS,
2963+
// because on Windows with a non-UTF-8 collation (e.g. CP1252)
2964+
// each character occupies exactly 1 byte, so *1 suffices and
2965+
// saves memory across the entire batch (fetchSize × numCols
2966+
// buffers).
2967+
//
2968+
// SQLGetData_wrap allocates a single temporary buffer per
2969+
// column per row, so the over-allocation cost is negligible.
2970+
// Using *4 unconditionally here keeps the code simple and
2971+
// correct on every platform—including Windows with a UTF-8
2972+
// collation where multi-byte chars could otherwise cause
2973+
// truncation at the exact column boundary (e.g. CP1252 é in
2974+
// VARCHAR(10)).
29662975
uint64_t fetchBufferSize = columnSize * 4 + 1 /* null-termination */;
29672976
std::vector<SQLCHAR> dataBuffer(fetchBufferSize);
29682977
SQLLEN dataLen;
@@ -3697,6 +3706,7 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
36973706
columnInfosExt[col].fetchBufferSize = columnInfos[col].fetchBufferSize;
36983707
columnInfosExt[col].isLob = columnInfos[col].isLob;
36993708
columnInfosExt[col].charEncoding = effectiveCharEnc;
3709+
columnInfosExt[col].isUtf8 = (effectiveCharEnc == "utf-8");
37003710

37013711
// Map data type to processor function (switch executed once per column,
37023712
// not per cell)

mssql_python/pybind/ddbc_bindings.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -667,6 +667,7 @@ struct ColumnInfoExt {
667667
SQLULEN processedColumnSize;
668668
uint64_t fetchBufferSize;
669669
bool isLob;
670+
bool isUtf8; // Pre-computed from charEncoding (avoids string compare per cell)
670671
std::string charEncoding; // Effective decoding encoding for SQL_C_CHAR data
671672
};
672673

@@ -824,7 +825,7 @@ inline void ProcessChar(PyObject* row, ColumnBuffers& buffers, const void* colIn
824825
// For UTF-8, use the direct C API (PyUnicode_FromStringAndSize) which
825826
// bypasses the codec registry for maximum reliability. For non-UTF-8
826827
// encodings (e.g., CP1252), use PyUnicode_Decode with the codec registry.
827-
if (colInfo->charEncoding == "utf-8") {
828+
if (colInfo->isUtf8) {
828829
pyStr = PyUnicode_FromStringAndSize(dataPtr, numCharsInData);
829830
} else {
830831
pyStr =

0 commit comments

Comments
 (0)