resolved review comments

gargsaumya · gargsaumya · commit 056c810291be · 2025-08-28T19:03:20.000+05:30
diff --git a/mssql_python/cursor.py b/mssql_python/cursor.py
@@ -216,6 +216,14 @@ def _get_numeric_data(self, param):
         numeric_data.val = val
         return numeric_data
 
+    def _calculate_utf16_length(self, param: str) -> int:
+        """Return UTF-16 code unit length of a Python string."""
+        try:
+            return len(param.encode("utf-16-le")) // 2
+        except UnicodeEncodeError as e:
+            log('warning', "UTF-16 encoding failed for %r: %s. Falling back to len().", param, e)
+            return len(param)
+
     def _map_sql_type(self, param, parameters_list, i):
         """
         Map a Python data type to the corresponding SQL type, 
@@ -332,7 +340,7 @@ def _map_sql_type(self, param, parameters_list, i):
             # TODO: revisit
             if len(param) > 4000:  # Long strings
                 if is_unicode:
-                    utf16_len = len(param.encode("utf-16-le")) // 2
+                    utf16_len = self._calculate_utf16_length(param)
                     return (
                         ddbc_sql_const.SQL_WLONGVARCHAR.value,
                         ddbc_sql_const.SQL_C_WCHAR.value,
@@ -346,7 +354,7 @@ def _map_sql_type(self, param, parameters_list, i):
                     0,
                 )
             if is_unicode:  # Short Unicode strings
-                utf16_len = len(param.encode("utf-16-le")) // 2
+                utf16_len = self._calculate_utf16_length(param)
                 return (
                     ddbc_sql_const.SQL_WVARCHAR.value,
                     ddbc_sql_const.SQL_C_WCHAR.value,
diff --git a/mssql_python/pybind/ddbc_bindings.cpp b/mssql_python/pybind/ddbc_bindings.cpp
@@ -275,10 +275,16 @@ SQLRETURN BindParameters(SQLHANDLE hStmt, const py::list& params,
                     AllocateParamBuffer<std::vector<SQLWCHAR>>(paramBuffers);
 
                 // Reserve space and convert from wstring to SQLWCHAR array
-                sqlwcharBuffer->resize(strParam->size() + 1, 0); // +1 for null terminator
                 std::vector<SQLWCHAR> utf16 = WStringToSQLWCHAR(*strParam);
-                sqlwcharBuffer->assign(utf16.begin(), utf16.end());
-
+                if (utf16.size() < strParam->size()) {
+                    LOG("Warning: UTF-16 encoding shrank string? input={} output={}",
+                        strParam->size(), utf16.size());
+                }
+                if (utf16.size() > strParam->size() * 2 + 1) {
+                    LOG("Warning: UTF-16 expansion unusually large: input={} output={}",
+                        strParam->size(), utf16.size());
+                }
+                *sqlwcharBuffer = std::move(utf16);
                 // Use the SQLWCHAR buffer instead of the wstring directly
                 dataPtr = sqlwcharBuffer->data();
                 bufferLength = sqlwcharBuffer->size() * sizeof(SQLWCHAR);
@@ -1704,6 +1710,12 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
                             // SQLGetData will null-terminate the data
 #if defined(__APPLE__) || defined(__linux__)
                             auto raw_bytes = reinterpret_cast<const char*>(dataBuffer.data());
+                            size_t actualBufferSize = dataBuffer.size() * sizeof(SQLWCHAR);
+                            if (dataLen < 0 || static_cast<size_t>(dataLen) > actualBufferSize) {
+                                LOG("Error: py::bytes creation request exceeds buffer size. dataLen={} buffer={}",
+                                    dataLen, actualBufferSize);
+                                ThrowStdException("Invalid buffer length for py::bytes");
+                            }
                             py::bytes py_bytes(raw_bytes, dataLen);
                             py::str decoded = py_bytes.attr("decode")("utf-16-le");
                             row.append(decoded);
diff --git a/mssql_python/pybind/ddbc_bindings.h b/mssql_python/pybind/ddbc_bindings.h
@@ -33,49 +33,107 @@ using namespace pybind11::literals;
 #include <sqlext.h>
 
 #if defined(__APPLE__) || defined(__linux__)
-    // macOS-specific headers
-    #include <dlfcn.h>
+#include <dlfcn.h>
+
+// Unicode constants for surrogate ranges and max scalar value
+constexpr uint32_t UNICODE_SURROGATE_HIGH_START = 0xD800;
+constexpr uint32_t UNICODE_SURROGATE_HIGH_END   = 0xDBFF;
+constexpr uint32_t UNICODE_SURROGATE_LOW_START  = 0xDC00;
+constexpr uint32_t UNICODE_SURROGATE_LOW_END    = 0xDFFF;
+constexpr uint32_t UNICODE_MAX_CODEPOINT        = 0x10FFFF;
+constexpr uint32_t UNICODE_REPLACEMENT_CHAR     = 0xFFFD;
+
+// Validate whether a code point is a legal Unicode scalar value
+// (excludes surrogate halves and values beyond U+10FFFF)
+inline bool IsValidUnicodeScalar(uint32_t cp) {
+    return cp <= UNICODE_MAX_CODEPOINT &&
+           !(cp >= UNICODE_SURROGATE_HIGH_START && cp <= UNICODE_SURROGATE_LOW_END);
+}
 
-    inline std::wstring SQLWCHARToWString(const SQLWCHAR* sqlwStr, size_t length = SQL_NTS) {
-        if (!sqlwStr) return std::wstring();
+inline std::wstring SQLWCHARToWString(const SQLWCHAR* sqlwStr, size_t length = SQL_NTS) {
+    if (!sqlwStr) return std::wstring();
 
-        if (length == SQL_NTS) {
-            size_t i = 0;
-            while (sqlwStr[i] != 0) ++i;
-            length = i;
-        }
+    if (length == SQL_NTS) {
+        size_t i = 0;
+        while (sqlwStr[i] != 0) ++i;
+        length = i;
+    }
+    std::wstring result;
+    result.reserve(length);
 
-        std::wstring result;
-        result.reserve(length);
+    if constexpr (sizeof(SQLWCHAR) == 2) {
+        // Decode UTF-16 to UTF-32 (with surrogate pair handling)
+        for (size_t i = 0; i < length; ++i) {
+            uint16_t wc = static_cast<uint16_t>(sqlwStr[i]);
+            // Check if this is a high surrogate (U+D800–U+DBFF)
+            if (wc >= UNICODE_SURROGATE_HIGH_START && wc <= UNICODE_SURROGATE_HIGH_END && i + 1 < length) {
+                uint16_t low = static_cast<uint16_t>(sqlwStr[i + 1]);
+                // Check if the next code unit is a low surrogate (U+DC00–U+DFFF)
+                if (low >= UNICODE_SURROGATE_LOW_START && low <= UNICODE_SURROGATE_LOW_END) {
+                    // Combine surrogate pair into a single code point
+                    uint32_t cp = (((wc - UNICODE_SURROGATE_HIGH_START) << 10) | (low - UNICODE_SURROGATE_LOW_START)) + 0x10000;
+                    result.push_back(static_cast<wchar_t>(cp));
+                    ++i; // Skip the low surrogate
+                    continue;
+                }
+            }
+            // If valid scalar then append, else append replacement char (U+FFFD)
+            if (IsValidUnicodeScalar(wc)) {
+                result.push_back(static_cast<wchar_t>(wc));
+            } else {
+                result.push_back(static_cast<wchar_t>(UNICODE_REPLACEMENT_CHAR));
+            }
+        }
+    } else {
+        // SQLWCHAR is not 2 bytes wide (treated as UTF-32): copy each unit, substituting U+FFFD for invalid scalars
         for (size_t i = 0; i < length; ++i) {
-            result.push_back(static_cast<wchar_t>(sqlwStr[i]));
+            uint32_t cp = static_cast<uint32_t>(sqlwStr[i]);
+            if (IsValidUnicodeScalar(cp)) {
+                result.push_back(static_cast<wchar_t>(cp));
+            } else {
+                result.push_back(static_cast<wchar_t>(UNICODE_REPLACEMENT_CHAR));
+            }
         }
-        return result;
     }
+    return result;
+}
 
-    inline std::vector<SQLWCHAR> WStringToSQLWCHAR(const std::wstring& str) {
+inline std::vector<SQLWCHAR> WStringToSQLWCHAR(const std::wstring& str) {
     std::vector<SQLWCHAR> result;
-
-    for (wchar_t wc : str) {
-        uint32_t codePoint = static_cast<uint32_t>(wc);
-        if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
-            // Skip invalid lone surrogates (shouldn't occur in well-formed wchar_t strings)
-            continue;
-        } else if (codePoint <= 0xFFFF) {
-            result.push_back(static_cast<SQLWCHAR>(codePoint));
-        } else if (codePoint <= 0x10FFFF) {
-            // Encode as surrogate pair
-            codePoint -= 0x10000;
-            SQLWCHAR highSurrogate = static_cast<SQLWCHAR>((codePoint >> 10) + 0xD800);
-            SQLWCHAR lowSurrogate  = static_cast<SQLWCHAR>((codePoint & 0x3FF) + 0xDC00);
-            result.push_back(highSurrogate);
-            result.push_back(lowSurrogate);
+    result.reserve(str.size() + 2);
+    if constexpr (sizeof(SQLWCHAR) == 2) {
+        // Encode UTF-32 to UTF-16
+        for (wchar_t wc : str) {
+            uint32_t cp = static_cast<uint32_t>(wc);
+            if (!IsValidUnicodeScalar(cp)) {
+                cp = UNICODE_REPLACEMENT_CHAR;
+            }
+            if (cp <= 0xFFFF) {
+                // Fits in a single UTF-16 code unit
+                result.push_back(static_cast<SQLWCHAR>(cp));
+            } else {
+                // Encode as surrogate pair
+                cp -= 0x10000;
+                SQLWCHAR high = static_cast<SQLWCHAR>((cp >> 10) + UNICODE_SURROGATE_HIGH_START);
+                SQLWCHAR low  = static_cast<SQLWCHAR>((cp & 0x3FF) + UNICODE_SURROGATE_LOW_START);
+                result.push_back(high);
+                result.push_back(low);
+            }
+        }
+    } else {
+        // SQLWCHAR is not 2 bytes wide: pass code points through unchanged, substituting U+FFFD for invalid scalars
+        for (wchar_t wc : str) {
+            uint32_t cp = static_cast<uint32_t>(wc);
+            if (IsValidUnicodeScalar(cp)) {
+                result.push_back(static_cast<SQLWCHAR>(cp));
+            } else {
+                result.push_back(static_cast<SQLWCHAR>(UNICODE_REPLACEMENT_CHAR));
+            }
         }
     }
-    result.push_back(0); // Null terminator
+    result.push_back(0); // null terminator
     return result;
 }
-
 #endif
 
 #if defined(__APPLE__) || defined(__linux__)
diff --git a/tests/test_004_cursor.py b/tests/test_004_cursor.py
@@ -1315,22 +1315,24 @@ def test_row_column_mapping(cursor, db_connection):
         cursor.execute("DROP TABLE #pytest_row_test")
         db_connection.commit()
 
-test_inputs = [
-    "Hello 😄",
-    "Flags 🇮🇳🇺🇸",
-    "Family 👨‍👩‍👧‍👦",
-    "Skin tone 👍🏽",
-    "Brain 🧠",
-    "Ice 🧊",
-    "Melting face 🫠",
-    "Accented éüñç",
-    "Chinese: 中文",
-    "Japanese: 日本語",
-]
-
 def test_emoji_round_trip(cursor, db_connection):
     """Test round-trip of emoji and special characters"""
-    
+    test_inputs = [
+        "Hello 😄",
+        "Flags 🇮🇳🇺🇸",
+        "Family 👨‍👩‍👧‍👦",
+        "Skin tone 👍🏽",
+        "Brain 🧠",
+        "Ice 🧊",
+        "Melting face 🫠",
+        "Accented éüñç",
+        "Chinese: 中文",
+        "Japanese: 日本語",
+        "Hello 🚀 World",
+        "admin🔒user",
+        "1🚀' OR '1'='1",
+    ]
+
     cursor.execute("""
         CREATE TABLE #pytest_emoji_test (
             id INT IDENTITY PRIMARY KEY,