-
Notifications
You must be signed in to change notification settings - Fork 46
FIX: Inconsistent retrieval of CP1252 encoded data in VARCHAR columns - Windows vs. Linux #468 #478
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
12ad019
d7e894b
b06ac2c
39d9aaf
a1675fb
0ac41f9
f5eb0e6
9b4a5f2
54e57cc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2914,6 +2914,10 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p | |
| // Note: wcharEncoding parameter is reserved for future use | ||
| // Currently WCHAR data always uses UTF-16LE for Windows compatibility | ||
| (void)wcharEncoding; // Suppress unused parameter warning | ||
| #if !defined(__APPLE__) && !defined(__linux__) | ||
| // On Windows, VARCHAR is fetched as SQL_C_WCHAR, so charEncoding is unused. | ||
| (void)charEncoding; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I consider this as a big behavioral (or potentially breaking) change, why? Because: Consider a user with a server using a rare or custom collation (say a legacy Thai or Japanese single-byte encoding). Before this PR, they could call What I get from this change is that the The user's explicit decoding configuration is silently discarded. The test
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am still thinking about what could be the possible solution for this to avoid this "potentially" big change.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's actually a cleaner fix that avoids the breaking change entirely.
This means default users get
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks @gargsaumya for the details. |
||
| #endif | ||
|
|
||
| LOG("SQLGetData: Getting data from %d columns for statement_handle=%p", colCount, | ||
| (void*)StatementHandle->get()); | ||
|
|
@@ -2949,6 +2953,8 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p | |
| case SQL_CHAR: | ||
| case SQL_VARCHAR: | ||
| case SQL_LONGVARCHAR: { | ||
| #if defined(__APPLE__) || defined(__linux__) | ||
| // On Linux/macOS, the ODBC driver returns UTF-8 for SQL_C_CHAR. | ||
| if (columnSize == SQL_NO_TOTAL || columnSize == 0 || | ||
| columnSize > SQL_MAX_LOB_SIZE) { | ||
| LOG("SQLGetData: Streaming LOB for column %d (SQL_C_CHAR) " | ||
|
|
@@ -2957,34 +2963,16 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p | |
| row.append( | ||
| FetchLobColumnData(hStmt, i, SQL_C_CHAR, false, false, charEncoding)); | ||
| } else { | ||
| // Allocate columnSize * 4 + 1 on ALL platforms (no #if guard). | ||
| // | ||
| // Why this differs from SQLBindColums / FetchBatchData: | ||
| // Those two functions use #if to apply *4 only on Linux/macOS, | ||
| // because on Windows with a non-UTF-8 collation (e.g. CP1252) | ||
| // each character occupies exactly 1 byte, so *1 suffices and | ||
| // saves memory across the entire batch (fetchSize × numCols | ||
| // buffers). | ||
| // | ||
| // SQLGetData_wrap allocates a single temporary buffer per | ||
| // column per row, so the over-allocation cost is negligible. | ||
| // Using *4 unconditionally here keeps the code simple and | ||
| // correct on every platform—including Windows with a UTF-8 | ||
| // collation where multi-byte chars could otherwise cause | ||
| // truncation at the exact column boundary (e.g. CP1252 é in | ||
| // VARCHAR(10)). | ||
| // Allocate columnSize * 4 + 1 to accommodate UTF-8 expansion. | ||
| uint64_t fetchBufferSize = columnSize * 4 + 1 /* null-termination */; | ||
| std::vector<SQLCHAR> dataBuffer(fetchBufferSize); | ||
| SQLLEN dataLen; | ||
| ret = SQLGetData_ptr(hStmt, i, SQL_C_CHAR, dataBuffer.data(), dataBuffer.size(), | ||
| &dataLen); | ||
| if (SQL_SUCCEEDED(ret)) { | ||
| // columnSize is in chars, dataLen is in bytes | ||
| if (dataLen > 0) { | ||
| uint64_t numCharsInData = dataLen / sizeof(SQLCHAR); | ||
| if (numCharsInData < dataBuffer.size()) { | ||
| // SQLGetData will null-terminate the data | ||
| // Use Python's codec system to decode bytes. | ||
| const std::string decodeEncoding = | ||
| GetEffectiveCharDecoding(charEncoding); | ||
| py::bytes raw_bytes(reinterpret_cast<char*>(dataBuffer.data()), | ||
|
|
@@ -3001,11 +2989,9 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p | |
| LOG_ERROR( | ||
| "SQLGetData: Failed to decode CHAR column %d with '%s': %s", | ||
| i, decodeEncoding.c_str(), e.what()); | ||
| // Return raw bytes as fallback | ||
| row.append(raw_bytes); | ||
| } | ||
| } else { | ||
| // Buffer too small, fallback to streaming | ||
| LOG("SQLGetData: CHAR column %d data truncated " | ||
| "(buffer_size=%zu), using streaming LOB", | ||
| i, dataBuffer.size()); | ||
|
|
@@ -3037,6 +3023,66 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p | |
| row.append(py::none()); | ||
| } | ||
| } | ||
| #else | ||
| // On Windows, request SQL_C_WCHAR so the ODBC driver converts | ||
| // from the server's native encoding (e.g. CP1252) to UTF-16. | ||
| // This avoids the need to guess the server's code page and | ||
| // eliminates the bytes-vs-str inconsistency. | ||
| if (columnSize == SQL_NO_TOTAL || columnSize == 0 || | ||
| columnSize > SQL_MAX_LOB_SIZE) { | ||
| LOG("SQLGetData: Streaming LOB for column %d (VARCHAR as SQL_C_WCHAR) " | ||
| "- columnSize=%lu", | ||
| i, (unsigned long)columnSize); | ||
| row.append(FetchLobColumnData(hStmt, i, SQL_C_WCHAR, true, false, "utf-16le")); | ||
| } else { | ||
| uint64_t fetchBufferSize = | ||
| (columnSize + 1) * sizeof(SQLWCHAR); // +1 for null terminator | ||
| std::vector<SQLWCHAR> dataBuffer(columnSize + 1); | ||
| SQLLEN dataLen; | ||
| ret = SQLGetData_ptr(hStmt, i, SQL_C_WCHAR, dataBuffer.data(), fetchBufferSize, | ||
| &dataLen); | ||
| if (SQL_SUCCEEDED(ret)) { | ||
| if (dataLen > 0) { | ||
| uint64_t numCharsInData = dataLen / sizeof(SQLWCHAR); | ||
| if (numCharsInData < dataBuffer.size()) { | ||
| std::wstring wstr(reinterpret_cast<wchar_t*>(dataBuffer.data())); | ||
|
subrata-ms marked this conversation as resolved.
|
||
| row.append(py::cast(wstr)); | ||
| LOG("SQLGetData: VARCHAR column %d decoded via SQL_C_WCHAR, " | ||
| "length=%lu", | ||
| i, (unsigned long)numCharsInData); | ||
| } else { | ||
| LOG("SQLGetData: VARCHAR column %d data truncated " | ||
| "(as WCHAR), using streaming LOB", | ||
| i); | ||
| row.append(FetchLobColumnData(hStmt, i, SQL_C_WCHAR, true, false, | ||
| "utf-16le")); | ||
| } | ||
| } else if (dataLen == SQL_NULL_DATA) { | ||
| LOG("SQLGetData: Column %d is NULL (VARCHAR via WCHAR)", i); | ||
| row.append(py::none()); | ||
| } else if (dataLen == 0) { | ||
| row.append(py::str("")); | ||
| } else if (dataLen == SQL_NO_TOTAL) { | ||
| LOG("SQLGetData: Cannot determine data length " | ||
| "(SQL_NO_TOTAL) for column %d (VARCHAR via WCHAR), " | ||
| "returning NULL", | ||
| i); | ||
| row.append(py::none()); | ||
|
subrata-ms marked this conversation as resolved.
Outdated
|
||
| } else if (dataLen < 0) { | ||
| LOG("SQLGetData: Unexpected negative data length " | ||
| "for column %d (VARCHAR via WCHAR) - dataLen=%ld", | ||
| i, (long)dataLen); | ||
| ThrowStdException("SQLGetData returned an unexpected negative " | ||
| "data length"); | ||
| } | ||
| } else { | ||
| LOG("SQLGetData: Error retrieving data for column %d " | ||
| "(VARCHAR via WCHAR) - SQLRETURN=%d, returning NULL", | ||
| i, ret); | ||
| row.append(py::none()); | ||
| } | ||
| } | ||
| #endif | ||
| break; | ||
| } | ||
| case SQL_SS_XML: { | ||
|
|
@@ -3487,29 +3533,26 @@ SQLRETURN SQLBindColums(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& column | |
| // TODO: handle variable length data correctly. This logic wont | ||
| // suffice | ||
| HandleZeroColumnSizeAtFetch(columnSize); | ||
| // Use columnSize * 4 + 1 on Linux/macOS to accommodate UTF-8 | ||
| // expansion. The ODBC driver returns UTF-8 for SQL_C_CHAR where | ||
| // each character can be up to 4 bytes. | ||
| #if defined(__APPLE__) || defined(__linux__) | ||
| // On Linux/macOS, the ODBC driver returns UTF-8 for SQL_C_CHAR | ||
| // where each character can be up to 4 bytes. | ||
| uint64_t fetchBufferSize = columnSize * 4 + 1 /*null-terminator*/; | ||
| #else | ||
| uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/; | ||
| #endif | ||
| // TODO: For LONGVARCHAR/BINARY types, columnSize is returned as | ||
| // 2GB-1 by SQLDescribeCol. So fetchBufferSize = 2GB. | ||
| // fetchSize=1 if columnSize>1GB. So we'll allocate a vector of | ||
| // size 2GB. If a query fetches multiple (say N) LONG... | ||
| // columns, we will have allocated multiple (N) 2GB sized | ||
| // vectors. This will make driver very slow. And if the N is | ||
| // high enough, we could hit the OS limit for heap memory that | ||
| // we can allocate, & hence get a std::bad_alloc. The process | ||
| // could also be killed by OS for consuming too much memory. | ||
| // Hence this will be revisited in beta to not allocate 2GB+ | ||
| // memory, & use streaming instead | ||
| buffers.charBuffers[col - 1].resize(fetchSize * fetchBufferSize); | ||
| ret = SQLBindCol_ptr(hStmt, col, SQL_C_CHAR, buffers.charBuffers[col - 1].data(), | ||
| fetchBufferSize * sizeof(SQLCHAR), | ||
| buffers.indicators[col - 1].data()); | ||
| #else | ||
| // On Windows, the ODBC driver returns bytes in the server's | ||
| // native encoding (e.g., CP1252). Rather than guessing the | ||
| // code page, we request SQL_C_WCHAR so the driver performs | ||
| // the conversion to UTF-16 — exactly matching how NVARCHAR | ||
| // columns are already handled. | ||
| uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/; | ||
| buffers.wcharBuffers[col - 1].resize(fetchSize * fetchBufferSize); | ||
| ret = SQLBindCol_ptr(hStmt, col, SQL_C_WCHAR, buffers.wcharBuffers[col - 1].data(), | ||
| fetchBufferSize * sizeof(SQLWCHAR), | ||
| buffers.indicators[col - 1].data()); | ||
| #endif | ||
| break; | ||
| } | ||
| case SQL_WCHAR: | ||
|
|
@@ -3675,9 +3718,9 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum | |
| HandleZeroColumnSizeAtFetch(columnInfos[col].processedColumnSize); | ||
| // On Linux/macOS, the ODBC driver returns UTF-8 for SQL_C_CHAR where | ||
| // each character can be up to 4 bytes. Must match SQLBindColums buffer. | ||
| #if defined(__APPLE__) || defined(__linux__) | ||
| SQLSMALLINT dt = columnInfos[col].dataType; | ||
| bool isCharType = (dt == SQL_CHAR || dt == SQL_VARCHAR || dt == SQL_LONGVARCHAR); | ||
| #if defined(__APPLE__) || defined(__linux__) | ||
| if (isCharType) { | ||
| columnInfos[col].fetchBufferSize = columnInfos[col].processedColumnSize * 4 + | ||
| 1; // *4 for UTF-8, +1 for null terminator | ||
|
|
@@ -3686,6 +3729,10 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum | |
| columnInfos[col].processedColumnSize + 1; // +1 for null terminator | ||
| } | ||
| #else | ||
| // On Windows, VARCHAR columns are fetched as SQL_C_WCHAR (see | ||
| // SQLBindColums). The fetchBufferSize is in SQLWCHAR elements, | ||
| // matching the wcharBuffers layout. | ||
| (void)isCharType; // same formula for all types on Windows | ||
| columnInfos[col].fetchBufferSize = | ||
| columnInfos[col].processedColumnSize + 1; // +1 for null terminator | ||
| #endif | ||
|
|
@@ -3740,7 +3787,14 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum | |
| case SQL_CHAR: | ||
| case SQL_VARCHAR: | ||
| case SQL_LONGVARCHAR: | ||
| #if defined(__APPLE__) || defined(__linux__) | ||
| columnProcessors[col] = ColumnProcessors::ProcessChar; | ||
| #else | ||
| // On Windows, VARCHAR columns are fetched as SQL_C_WCHAR | ||
| // (the driver converts from the server's native encoding to | ||
| // UTF-16), so we reuse the NVARCHAR processor. | ||
| columnProcessors[col] = ColumnProcessors::ProcessWChar; | ||
| #endif | ||
| break; | ||
| case SQL_WCHAR: | ||
| case SQL_WVARCHAR: | ||
|
|
@@ -4048,7 +4102,8 @@ size_t calculateRowSize(py::list& columnNames, SQLUSMALLINT numCols) { | |
| break; | ||
| case SQL_SS_UDT: | ||
| rowSize += (static_cast<SQLLEN>(columnSize) == SQL_NO_TOTAL || columnSize == 0) | ||
| ? SQL_MAX_LOB_SIZE : columnSize; | ||
| ? SQL_MAX_LOB_SIZE | ||
| : columnSize; | ||
|
subrata-ms marked this conversation as resolved.
|
||
| break; | ||
| case SQL_BINARY: | ||
| case SQL_VARBINARY: | ||
|
|
@@ -4112,8 +4167,7 @@ SQLRETURN FetchMany_wrap(SqlHandlePtr StatementHandle, py::list& rows, int fetch | |
|
|
||
| if ((dataType == SQL_WVARCHAR || dataType == SQL_WLONGVARCHAR || dataType == SQL_VARCHAR || | ||
| dataType == SQL_LONGVARCHAR || dataType == SQL_VARBINARY || | ||
| dataType == SQL_LONGVARBINARY || dataType == SQL_SS_XML || | ||
| dataType == SQL_SS_UDT) && | ||
| dataType == SQL_LONGVARBINARY || dataType == SQL_SS_XML || dataType == SQL_SS_UDT) && | ||
| (columnSize == 0 || columnSize == SQL_NO_TOTAL || columnSize > SQL_MAX_LOB_SIZE)) { | ||
| lobColumns.push_back(i + 1); // 1-based | ||
| } | ||
|
|
@@ -4252,8 +4306,7 @@ SQLRETURN FetchAll_wrap(SqlHandlePtr StatementHandle, py::list& rows, | |
|
|
||
| if ((dataType == SQL_WVARCHAR || dataType == SQL_WLONGVARCHAR || dataType == SQL_VARCHAR || | ||
| dataType == SQL_LONGVARCHAR || dataType == SQL_VARBINARY || | ||
| dataType == SQL_LONGVARBINARY || dataType == SQL_SS_XML || | ||
| dataType == SQL_SS_UDT) && | ||
| dataType == SQL_LONGVARBINARY || dataType == SQL_SS_XML || dataType == SQL_SS_UDT) && | ||
| (columnSize == 0 || columnSize == SQL_NO_TOTAL || columnSize > SQL_MAX_LOB_SIZE)) { | ||
| lobColumns.push_back(i + 1); // 1-based | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.