@@ -33,49 +33,107 @@ using namespace pybind11::literals;
3333#include < sqlext.h>
3434
3535#if defined(__APPLE__) || defined(__linux__)
36- // macOS-specific headers
37- #include < dlfcn.h>
36+ #include < dlfcn.h>
37+
// Unicode constants: the UTF-16 surrogate ranges, the largest code point, and
// the replacement character substituted for invalid input.
// Declared `inline constexpr` so that every translation unit including this
// code shares a single definition of each constant.
inline constexpr uint32_t UNICODE_SURROGATE_HIGH_START = 0xD800;
inline constexpr uint32_t UNICODE_SURROGATE_HIGH_END   = 0xDBFF;
inline constexpr uint32_t UNICODE_SURROGATE_LOW_START  = 0xDC00;
inline constexpr uint32_t UNICODE_SURROGATE_LOW_END    = 0xDFFF;
inline constexpr uint32_t UNICODE_MAX_CODEPOINT        = 0x10FFFF;
inline constexpr uint32_t UNICODE_REPLACEMENT_CHAR     = 0xFFFD;

/// Reports whether `cp` is a legal Unicode scalar value.
///
/// @param cp  Candidate code point.
/// @return `true` when `cp` is at most U+10FFFF and does not fall inside the
///         surrogate block U+D800..U+DFFF; `false` otherwise.
constexpr bool IsValidUnicodeScalar(uint32_t cp) noexcept {
    // The high and low surrogate ranges are adjacent, so a single range test
    // from the start of the high range to the end of the low range covers both.
    return cp <= UNICODE_MAX_CODEPOINT &&
           !(cp >= UNICODE_SURROGATE_HIGH_START && cp <= UNICODE_SURROGATE_LOW_END);
}
3852
39- inline std::wstring SQLWCHARToWString (const SQLWCHAR* sqlwStr, size_t length = SQL_NTS) {
40- if (!sqlwStr) return std::wstring ();
53+ inline std::wstring SQLWCHARToWString (const SQLWCHAR* sqlwStr, size_t length = SQL_NTS) {
54+ if (!sqlwStr) return std::wstring ();
4155
42- if (length == SQL_NTS) {
43- size_t i = 0 ;
44- while (sqlwStr[i] != 0 ) ++i;
45- length = i;
46- }
56+ if (length == SQL_NTS) {
57+ size_t i = 0 ;
58+ while (sqlwStr[i] != 0 ) ++i;
59+ length = i;
60+ }
61+ std::wstring result;
62+ result.reserve (length);
4763
48- std::wstring result;
49- result.reserve (length);
64+ if constexpr (sizeof (SQLWCHAR) == 2 ) {
65+ // Decode UTF-16 to UTF-32 (with surrogate pair handling)
66+ for (size_t i = 0 ; i < length; ++i) {
67+ uint16_t wc = static_cast <uint16_t >(sqlwStr[i]);
68+ // Check if this is a high surrogate (U+D800โU+DBFF)
69+ if (wc >= UNICODE_SURROGATE_HIGH_START && wc <= UNICODE_SURROGATE_HIGH_END && i + 1 < length) {
70+ uint16_t low = static_cast <uint16_t >(sqlwStr[i + 1 ]);
71+ // Check if the next code unit is a low surrogate (U+DC00โU+DFFF)
72+ if (low >= UNICODE_SURROGATE_LOW_START && low <= UNICODE_SURROGATE_LOW_END) {
73+ // Combine surrogate pair into a single code point
74+ uint32_t cp = (((wc - UNICODE_SURROGATE_HIGH_START) << 10 ) | (low - UNICODE_SURROGATE_LOW_START)) + 0x10000 ;
75+ result.push_back (static_cast <wchar_t >(cp));
76+ ++i; // Skip the low surrogate
77+ continue ;
78+ }
79+ }
80+ // If valid scalar then append, else append replacement char (U+FFFD)
81+ if (IsValidUnicodeScalar (wc)) {
82+ result.push_back (static_cast <wchar_t >(wc));
83+ } else {
84+ result.push_back (static_cast <wchar_t >(UNICODE_REPLACEMENT_CHAR));
85+ }
86+ }
87+ } else {
88+ // SQLWCHAR is UTF-32, so just copy with validation
5089 for (size_t i = 0 ; i < length; ++i) {
51- result.push_back (static_cast <wchar_t >(sqlwStr[i]));
90+ uint32_t cp = static_cast <uint32_t >(sqlwStr[i]);
91+ if (IsValidUnicodeScalar (cp)) {
92+ result.push_back (static_cast <wchar_t >(cp));
93+ } else {
94+ result.push_back (static_cast <wchar_t >(UNICODE_REPLACEMENT_CHAR));
95+ }
5296 }
53- return result;
5497 }
98+ return result;
99+ }
55100
56- inline std::vector<SQLWCHAR> WStringToSQLWCHAR (const std::wstring& str) {
101+ inline std::vector<SQLWCHAR> WStringToSQLWCHAR (const std::wstring& str) {
57102 std::vector<SQLWCHAR> result;
58-
59- for (wchar_t wc : str) {
60- uint32_t codePoint = static_cast <uint32_t >(wc);
61- if (codePoint >= 0xD800 && codePoint <= 0xDFFF ) {
62- // Skip invalid lone surrogates (shouldn't occur in well-formed wchar_t strings)
63- continue ;
64- } else if (codePoint <= 0xFFFF ) {
65- result.push_back (static_cast <SQLWCHAR>(codePoint));
66- } else if (codePoint <= 0x10FFFF ) {
67- // Encode as surrogate pair
68- codePoint -= 0x10000 ;
69- SQLWCHAR highSurrogate = static_cast <SQLWCHAR>((codePoint >> 10 ) + 0xD800 );
70- SQLWCHAR lowSurrogate = static_cast <SQLWCHAR>((codePoint & 0x3FF ) + 0xDC00 );
71- result.push_back (highSurrogate);
72- result.push_back (lowSurrogate);
103+ result.reserve (str.size () + 2 );
104+ if constexpr (sizeof (SQLWCHAR) == 2 ) {
105+ // Encode UTF-32 to UTF-16
106+ for (wchar_t wc : str) {
107+ uint32_t cp = static_cast <uint32_t >(wc);
108+ if (!IsValidUnicodeScalar (cp)) {
109+ cp = UNICODE_REPLACEMENT_CHAR;
110+ }
111+ if (cp <= 0xFFFF ) {
112+ // Fits in a single UTF-16 code unit
113+ result.push_back (static_cast <SQLWCHAR>(cp));
114+ } else {
115+ // Encode as surrogate pair
116+ cp -= 0x10000 ;
117+ SQLWCHAR high = static_cast <SQLWCHAR>((cp >> 10 ) + UNICODE_SURROGATE_HIGH_START);
118+ SQLWCHAR low = static_cast <SQLWCHAR>((cp & 0x3FF ) + UNICODE_SURROGATE_LOW_START);
119+ result.push_back (high);
120+ result.push_back (low);
121+ }
122+ }
123+ } else {
124+ // Encode UTF-32 directly
125+ for (wchar_t wc : str) {
126+ uint32_t cp = static_cast <uint32_t >(wc);
127+ if (IsValidUnicodeScalar (cp)) {
128+ result.push_back (static_cast <SQLWCHAR>(cp));
129+ } else {
130+ result.push_back (static_cast <SQLWCHAR>(UNICODE_REPLACEMENT_CHAR));
131+ }
73132 }
74133 }
75- result.push_back (0 ); // Null terminator
134+ result.push_back (0 ); // null terminator
76135 return result;
77136}
78-
79137#endif
80138
81139#if defined(__APPLE__) || defined(__linux__)
0 commit comments