Skip to content

Commit fba171c

Browse files
authored
FEAT: streaming support in fetchone for nvarcharmax data type (#220)
### Work Item / Issue Reference

<!--
IMPORTANT: Please follow the PR template guidelines below.
For mssql-python maintainers: Insert your ADO Work Item ID below (e.g. AB#37452)
For external contributors: Insert GitHub Issue number below (e.g. #149)
Only one reference is required - either GitHub issue OR ADO Work Item.
-->

<!-- mssql-python maintainers: ADO Work Item -->
> [AB#38110](https://sqlclientdrivers.visualstudio.com/c6d89619-62de-46a0-8b46-70b92a84d85e/_workitems/edit/38110) [AB#34162](https://sqlclientdrivers.visualstudio.com/c6d89619-62de-46a0-8b46-70b92a84d85e/_workitems/edit/34162)

<!-- External contributors: GitHub Issue -->
> GitHub Issue: #<ISSUE_NUMBER>

-------------------------------------------------------------------

### Summary

<!-- Insert your summary of changes below. Minimum 10 characters required. -->

This pull request improves NVARCHAR data handling in the SQL Server Python bindings and adds comprehensive tests for NVARCHAR(MAX) scenarios. The main changes include switching to streaming for large NVARCHAR values, optimizing direct fetch for smaller values, and adding tests for edge cases and boundaries to ensure correctness.

**NVARCHAR data handling improvements:**

* Updated the logic in `ddbc_bindings.cpp` to use streaming for large NVARCHAR/NCHAR columns (over 4000 characters or unknown size) and direct fetch for smaller values, optimizing performance and reliability.
* Refactored data conversion for NVARCHAR fetches, using `std::wstring` for conversion and simplifying platform-specific handling for both macOS/Linux and Windows.
* Improved handling of empty strings and NULLs for NVARCHAR columns, ensuring correct Python types are returned and logging is more descriptive.

**Testing enhancements:**

* Added new tests in `test_004_cursor.py` for NVARCHAR(MAX) covering short strings, boundary conditions (4000 chars), streaming (4100+ chars), large values (100,000 chars), empty strings, NULLs, and transaction rollback scenarios to verify correct behavior across all edge cases.

**VARCHAR/CHAR fetch improvements:**

* Improved direct fetch logic for small VARCHAR/CHAR columns and fixed string conversion to use the actual data length, preventing potential issues with null-termination and buffer size. [[1]](diffhunk://#diff-dde2297345718ec449a14e7dff91b7bb2342b008ecc071f562233646d71144a1R1825-R1830) [[2]](diffhunk://#diff-dde2297345718ec449a14e7dff91b7bb2342b008ecc071f562233646d71144a1L1841-L1850)

<!--
### PR Title Guide

> For feature requests
FEAT: (short-description)

> For non-feature requests like test case updates, config updates, dependency updates, etc.
CHORE: (short-description)

> For Fix requests
FIX: (short-description)

> For doc update requests
DOC: (short-description)

> For Formatting, indentation, or styling update
STYLE: (short-description)

> For Refactor, without any feature changes
REFACTOR: (short-description)

> For release related changes, without any feature changes
RELEASE: #<RELEASE_VERSION> (short-description)

### Contribution Guidelines

External contributors:
- Create a GitHub issue first: https://github.com/microsoft/mssql-python/issues/new
- Link the GitHub issue in the "GitHub Issue" section above
- Follow the PR title format and provide a meaningful summary

mssql-python maintainers:
- Create an ADO Work Item following internal processes
- Link the ADO Work Item in the "ADO Work Item" section above
- Follow the PR title format and provide a meaningful summary
-->
1 parent 7f67326 commit fba171c

File tree

2 files changed

+397
-425
lines changed

2 files changed

+397
-425
lines changed

mssql_python/pybind/ddbc_bindings.cpp

Lines changed: 125 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#define ARCHITECTURE "win64" // Default to win64 if not defined during compilation
3232
#endif
3333
#define DAE_CHUNK_SIZE 8192
34+
#define SQL_MAX_LOB_SIZE 8000
3435
//-------------------------------------------------------------------------------------------------
3536
// Class definitions
3637
//-------------------------------------------------------------------------------------------------
@@ -1747,8 +1748,13 @@ static py::object FetchLobColumnData(SQLHSTMT hStmt,
17471748
&actualRead);
17481749

17491750
if (ret == SQL_ERROR || !SQL_SUCCEEDED(ret) && ret != SQL_SUCCESS_WITH_INFO) {
1750-
LOG("Loop {}: Error fetching column {} with cType={}", loopCount, colIndex, cType);
1751-
ThrowStdException("Error fetching column data");
1751+
std::ostringstream oss;
1752+
oss << "Error fetching LOB for column " << colIndex
1753+
<< ", cType=" << cType
1754+
<< ", loop=" << loopCount
1755+
<< ", SQLGetData return=" << ret;
1756+
LOG(oss.str());
1757+
ThrowStdException(oss.str());
17521758
}
17531759
if (actualRead == SQL_NULL_DATA) {
17541760
LOG("Loop {}: Column {} is NULL", loopCount, colIndex);
@@ -1862,7 +1868,7 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
18621868
case SQL_CHAR:
18631869
case SQL_VARCHAR:
18641870
case SQL_LONGVARCHAR: {
1865-
if (columnSize == SQL_NO_TOTAL || columnSize == 0 || columnSize > 8000) {
1871+
if (columnSize == SQL_NO_TOTAL || columnSize == 0 || columnSize > SQL_MAX_LOB_SIZE) {
18661872
LOG("Streaming LOB for column {}", i);
18671873
row.append(FetchLobColumnData(hStmt, i, SQL_C_CHAR, false, false));
18681874
} else {
@@ -1884,6 +1890,10 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
18841890
#else
18851891
row.append(std::string(reinterpret_cast<char*>(dataBuffer.data())));
18861892
#endif
1893+
} else {
1894+
// Buffer too small, fallback to streaming
1895+
LOG("CHAR column {} data truncated, using streaming LOB", i);
1896+
row.append(FetchLobColumnData(hStmt, i, SQL_C_CHAR, false, false));
18871897
}
18881898
} else if (dataLen == SQL_NULL_DATA) {
18891899
LOG("Column {} is NULL (CHAR)", i);
@@ -1911,62 +1921,53 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
19111921
}
19121922
case SQL_WCHAR:
19131923
case SQL_WVARCHAR:
1914-
case SQL_WLONGVARCHAR: {
1915-
// TODO: revisit
1916-
HandleZeroColumnSizeAtFetch(columnSize);
1917-
uint64_t fetchBufferSize = columnSize + 1 /* null-termination */;
1918-
std::vector<SQLWCHAR> dataBuffer(fetchBufferSize);
1919-
SQLLEN dataLen;
1920-
ret = SQLGetData_ptr(hStmt, i, SQL_C_WCHAR, dataBuffer.data(),
1921-
dataBuffer.size() * sizeof(SQLWCHAR), &dataLen);
1922-
1923-
if (SQL_SUCCEEDED(ret)) {
1924-
// TODO: Refactor these if's across other switches to avoid code duplication
1925-
if (dataLen > 0) {
1926-
uint64_t numCharsInData = dataLen / sizeof(SQLWCHAR);
1927-
if (numCharsInData < dataBuffer.size()) {
1928-
// SQLGetData will null-terminate the data
1924+
case SQL_WLONGVARCHAR: {
1925+
if (columnSize == SQL_NO_TOTAL || columnSize == 0 || columnSize > 4000) {
1926+
LOG("Streaming LOB for column {} (NVARCHAR)", i);
1927+
row.append(FetchLobColumnData(hStmt, i, SQL_C_WCHAR, true, false));
1928+
} else {
1929+
uint64_t fetchBufferSize = (columnSize + 1) * sizeof(SQLWCHAR); // +1 for null terminator
1930+
std::vector<SQLWCHAR> dataBuffer(columnSize + 1);
1931+
SQLLEN dataLen;
1932+
ret = SQLGetData_ptr(hStmt, i, SQL_C_WCHAR, dataBuffer.data(), fetchBufferSize, &dataLen);
1933+
if (SQL_SUCCEEDED(ret)) {
1934+
if (dataLen > 0) {
1935+
uint64_t numCharsInData = dataLen / sizeof(SQLWCHAR);
1936+
if (numCharsInData < dataBuffer.size()) {
19291937
#if defined(__APPLE__) || defined(__linux__)
1930-
auto raw_bytes = reinterpret_cast<const char*>(dataBuffer.data());
1931-
size_t actualBufferSize = dataBuffer.size() * sizeof(SQLWCHAR);
1932-
if (dataLen < 0 || static_cast<size_t>(dataLen) > actualBufferSize) {
1933-
LOG("Error: py::bytes creation request exceeds buffer size. dataLen={} buffer={}",
1934-
dataLen, actualBufferSize);
1935-
ThrowStdException("Invalid buffer length for py::bytes");
1936-
}
1937-
py::bytes py_bytes(raw_bytes, dataLen);
1938-
py::str decoded = py_bytes.attr("decode")("utf-16-le");
1939-
row.append(decoded);
1938+
const SQLWCHAR* sqlwBuf = reinterpret_cast<const SQLWCHAR*>(dataBuffer.data());
1939+
std::wstring wstr = SQLWCHARToWString(sqlwBuf, numCharsInData);
1940+
std::string utf8str = WideToUTF8(wstr);
1941+
row.append(py::str(utf8str));
19401942
#else
1941-
row.append(std::wstring(dataBuffer.data()));
1943+
std::wstring wstr(reinterpret_cast<wchar_t*>(dataBuffer.data()));
1944+
row.append(py::cast(wstr));
19421945
#endif
1943-
} else {
1944-
// In this case, buffer size is smaller, and data to be retrieved is longer
1945-
// TODO: Revisit
1946-
std::ostringstream oss;
1947-
oss << "Buffer length for fetch (" << dataBuffer.size()-1 << ") is smaller, & data "
1948-
<< "to be retrieved is longer (" << numCharsInData << "). ColumnID - "
1949-
<< i << ", datatype - " << dataType;
1950-
ThrowStdException(oss.str());
1946+
LOG("Appended NVARCHAR string of length {} to result row", numCharsInData);
1947+
} else {
1948+
// Buffer too small, fallback to streaming
1949+
LOG("NVARCHAR column {} data truncated, using streaming LOB", i);
1950+
row.append(FetchLobColumnData(hStmt, i, SQL_C_WCHAR, true, false));
1951+
}
1952+
} else if (dataLen == SQL_NULL_DATA) {
1953+
LOG("Column {} is NULL (CHAR)", i);
1954+
row.append(py::none());
1955+
} else if (dataLen == 0) {
1956+
row.append(py::str(""));
1957+
} else if (dataLen == SQL_NO_TOTAL) {
1958+
LOG("SQLGetData couldn't determine the length of the NVARCHAR data. Returning NULL. Column ID - {}", i);
1959+
row.append(py::none());
1960+
} else if (dataLen < 0) {
1961+
LOG("SQLGetData returned an unexpected negative data length. "
1962+
"Raising exception. Column ID - {}, Data Type - {}, Data Length - {}",
1963+
i, dataType, dataLen);
1964+
ThrowStdException("SQLGetData returned an unexpected negative data length");
19511965
}
1952-
} else if (dataLen == SQL_NULL_DATA) {
1953-
row.append(py::none());
1954-
} else if (dataLen == 0) {
1955-
// Handle zero-length (non-NULL) data
1956-
row.append(py::str(""));
1957-
} else if (dataLen < 0) {
1958-
// This is unexpected
1959-
LOG("SQLGetData returned an unexpected negative data length. "
1960-
"Raising exception. Column ID - {}, Data Type - {}, Data Length - {}",
1961-
i, dataType, dataLen);
1962-
ThrowStdException("SQLGetData returned an unexpected negative data length");
1966+
} else {
1967+
LOG("Error retrieving data for column {} (NVARCHAR), SQLGetData return code {}", i, ret);
1968+
row.append(py::none());
19631969
}
1964-
} else {
1965-
LOG("Error retrieving data for column - {}, data type - {}, SQLGetData return "
1966-
"code - {}. Returning NULL value instead",
1967-
i, dataType, ret);
1968-
row.append(py::none());
1969-
}
1970+
}
19701971
break;
19711972
}
19721973
case SQL_INTEGER: {
@@ -2411,7 +2412,7 @@ SQLRETURN SQLBindColums(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& column
24112412
// Fetch rows in batches
24122413
// TODO: Move to anonymous namespace, since it is not used outside this file
24132414
SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& columnNames,
2414-
py::list& rows, SQLUSMALLINT numCols, SQLULEN& numRowsFetched) {
2415+
py::list& rows, SQLUSMALLINT numCols, SQLULEN& numRowsFetched, const std::vector<SQLUSMALLINT>& lobColumns) {
24152416
LOG("Fetching data in batches");
24162417
SQLRETURN ret = SQLFetchScroll_ptr(hStmt, SQL_FETCH_NEXT, 0);
24172418
if (ret == SQL_NO_DATA) {
@@ -2471,25 +2472,19 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
24712472
case SQL_CHAR:
24722473
case SQL_VARCHAR:
24732474
case SQL_LONGVARCHAR: {
2474-
// TODO: variable length data needs special handling, this logic wont suffice
24752475
SQLULEN columnSize = columnMeta["ColumnSize"].cast<SQLULEN>();
24762476
HandleZeroColumnSizeAtFetch(columnSize);
24772477
uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/;
24782478
uint64_t numCharsInData = dataLen / sizeof(SQLCHAR);
2479+
bool isLob = std::find(lobColumns.begin(), lobColumns.end(), col) != lobColumns.end();
24792480
// fetchBufferSize includes null-terminator, numCharsInData doesn't. Hence '<'
2480-
if (numCharsInData < fetchBufferSize) {
2481+
if (!isLob && numCharsInData < fetchBufferSize) {
24812482
// SQLFetch will nullterminate the data
24822483
row.append(std::string(
24832484
reinterpret_cast<char*>(&buffers.charBuffers[col - 1][i * fetchBufferSize]),
24842485
numCharsInData));
24852486
} else {
2486-
// In this case, buffer size is smaller, and data to be retrieved is longer
2487-
// TODO: Revisit
2488-
std::ostringstream oss;
2489-
oss << "Buffer length for fetch (" << columnSize << ") is smaller, & data "
2490-
<< "to be retrieved is longer (" << numCharsInData << "). ColumnID - "
2491-
<< col << ", datatype - " << dataType;
2492-
ThrowStdException(oss.str());
2487+
row.append(FetchLobColumnData(hStmt, col, SQL_C_CHAR, false, false));
24932488
}
24942489
break;
24952490
}
@@ -2501,8 +2496,9 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
25012496
HandleZeroColumnSizeAtFetch(columnSize);
25022497
uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/;
25032498
uint64_t numCharsInData = dataLen / sizeof(SQLWCHAR);
2499+
bool isLob = std::find(lobColumns.begin(), lobColumns.end(), col) != lobColumns.end();
25042500
// fetchBufferSize includes null-terminator, numCharsInData doesn't. Hence '<'
2505-
if (numCharsInData < fetchBufferSize) {
2501+
if (!isLob && numCharsInData < fetchBufferSize) {
25062502
// SQLFetch will nullterminate the data
25072503
#if defined(__APPLE__) || defined(__linux__)
25082504
// Use unix-specific conversion to handle the wchar_t/SQLWCHAR size difference
@@ -2516,13 +2512,7 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
25162512
numCharsInData));
25172513
#endif
25182514
} else {
2519-
// In this case, buffer size is smaller, and data to be retrieved is longer
2520-
// TODO: Revisit
2521-
std::ostringstream oss;
2522-
oss << "Buffer length for fetch (" << columnSize << ") is smaller, & data "
2523-
<< "to be retrieved is longer (" << numCharsInData << "). ColumnID - "
2524-
<< col << ", datatype - " << dataType;
2525-
ThrowStdException(oss.str());
2515+
row.append(FetchLobColumnData(hStmt, col, SQL_C_WCHAR, true, false));
25262516
}
25272517
break;
25282518
}
@@ -2608,21 +2598,15 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
26082598
case SQL_BINARY:
26092599
case SQL_VARBINARY:
26102600
case SQL_LONGVARBINARY: {
2611-
// TODO: variable length data needs special handling, this logic wont suffice
26122601
SQLULEN columnSize = columnMeta["ColumnSize"].cast<SQLULEN>();
26132602
HandleZeroColumnSizeAtFetch(columnSize);
2614-
if (static_cast<size_t>(dataLen) <= columnSize) {
2603+
bool isLob = std::find(lobColumns.begin(), lobColumns.end(), col) != lobColumns.end();
2604+
if (!isLob && static_cast<size_t>(dataLen) <= columnSize) {
26152605
row.append(py::bytes(reinterpret_cast<const char*>(
26162606
&buffers.charBuffers[col - 1][i * columnSize]),
26172607
dataLen));
26182608
} else {
2619-
// In this case, buffer size is smaller, and data to be retrieved is longer
2620-
// TODO: Revisit
2621-
std::ostringstream oss;
2622-
oss << "Buffer length for fetch (" << columnSize << ") is smaller, & data "
2623-
<< "to be retrieved is longer (" << dataLen << "). ColumnID - "
2624-
<< col << ", datatype - " << dataType;
2625-
ThrowStdException(oss.str());
2609+
row.append(FetchLobColumnData(hStmt, col, SQL_C_BINARY, false, true));
26262610
}
26272611
break;
26282612
}
@@ -2751,6 +2735,35 @@ SQLRETURN FetchMany_wrap(SqlHandlePtr StatementHandle, py::list& rows, int fetch
27512735
return ret;
27522736
}
27532737

2738+
std::vector<SQLUSMALLINT> lobColumns;
2739+
for (SQLSMALLINT i = 0; i < numCols; i++) {
2740+
auto colMeta = columnNames[i].cast<py::dict>();
2741+
SQLSMALLINT dataType = colMeta["DataType"].cast<SQLSMALLINT>();
2742+
SQLULEN columnSize = colMeta["ColumnSize"].cast<SQLULEN>();
2743+
2744+
if ((dataType == SQL_WVARCHAR || dataType == SQL_WLONGVARCHAR ||
2745+
dataType == SQL_VARCHAR || dataType == SQL_LONGVARCHAR ||
2746+
dataType == SQL_VARBINARY || dataType == SQL_LONGVARBINARY) &&
2747+
(columnSize == 0 || columnSize == SQL_NO_TOTAL || columnSize > SQL_MAX_LOB_SIZE)) {
2748+
lobColumns.push_back(i + 1); // 1-based
2749+
}
2750+
}
2751+
2752+
// If we have LOBs → fall back to row-by-row fetch + SQLGetData_wrap
2753+
if (!lobColumns.empty()) {
2754+
LOG("LOB columns detected → using per-row SQLGetData path");
2755+
while (true) {
2756+
ret = SQLFetch_ptr(hStmt);
2757+
if (ret == SQL_NO_DATA) break;
2758+
if (!SQL_SUCCEEDED(ret)) return ret;
2759+
2760+
py::list row;
2761+
SQLGetData_wrap(StatementHandle, numCols, row); // <-- streams LOBs correctly
2762+
rows.append(row);
2763+
}
2764+
return SQL_SUCCESS;
2765+
}
2766+
27542767
// Initialize column buffers
27552768
ColumnBuffers buffers(numCols, fetchSize);
27562769

@@ -2765,7 +2778,7 @@ SQLRETURN FetchMany_wrap(SqlHandlePtr StatementHandle, py::list& rows, int fetch
27652778
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROW_ARRAY_SIZE, (SQLPOINTER)(intptr_t)fetchSize, 0);
27662779
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROWS_FETCHED_PTR, &numRowsFetched, 0);
27672780

2768-
ret = FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched);
2781+
ret = FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched, lobColumns);
27692782
if (!SQL_SUCCEEDED(ret) && ret != SQL_NO_DATA) {
27702783
LOG("Error when fetching data");
27712784
return ret;
@@ -2844,6 +2857,35 @@ SQLRETURN FetchAll_wrap(SqlHandlePtr StatementHandle, py::list& rows) {
28442857
}
28452858
LOG("Fetching data in batch sizes of {}", fetchSize);
28462859

2860+
std::vector<SQLUSMALLINT> lobColumns;
2861+
for (SQLSMALLINT i = 0; i < numCols; i++) {
2862+
auto colMeta = columnNames[i].cast<py::dict>();
2863+
SQLSMALLINT dataType = colMeta["DataType"].cast<SQLSMALLINT>();
2864+
SQLULEN columnSize = colMeta["ColumnSize"].cast<SQLULEN>();
2865+
2866+
if ((dataType == SQL_WVARCHAR || dataType == SQL_WLONGVARCHAR ||
2867+
dataType == SQL_VARCHAR || dataType == SQL_LONGVARCHAR ||
2868+
dataType == SQL_VARBINARY || dataType == SQL_LONGVARBINARY) &&
2869+
(columnSize == 0 || columnSize == SQL_NO_TOTAL || columnSize > SQL_MAX_LOB_SIZE)) {
2870+
lobColumns.push_back(i + 1); // 1-based
2871+
}
2872+
}
2873+
2874+
// If we have LOBs → fall back to row-by-row fetch + SQLGetData_wrap
2875+
if (!lobColumns.empty()) {
2876+
LOG("LOB columns detected → using per-row SQLGetData path");
2877+
while (true) {
2878+
ret = SQLFetch_ptr(hStmt);
2879+
if (ret == SQL_NO_DATA) break;
2880+
if (!SQL_SUCCEEDED(ret)) return ret;
2881+
2882+
py::list row;
2883+
SQLGetData_wrap(StatementHandle, numCols, row); // <-- streams LOBs correctly
2884+
rows.append(row);
2885+
}
2886+
return SQL_SUCCESS;
2887+
}
2888+
28472889
ColumnBuffers buffers(numCols, fetchSize);
28482890

28492891
// Bind columns
@@ -2858,7 +2900,7 @@ SQLRETURN FetchAll_wrap(SqlHandlePtr StatementHandle, py::list& rows) {
28582900
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROWS_FETCHED_PTR, &numRowsFetched, 0);
28592901

28602902
while (ret != SQL_NO_DATA) {
2861-
ret = FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched);
2903+
ret = FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched, lobColumns);
28622904
if (!SQL_SUCCEEDED(ret) && ret != SQL_NO_DATA) {
28632905
LOG("Error when fetching data");
28642906
return ret;

0 commit comments

Comments
 (0)