Fix decoding of Cursor.description column names.

Fixes #190 - The issue was a column alias containing an "i" with an acute accent (í), which caused an error. pyodbc now (1) uses the Unicode version of SQLDescribeCol (SQLDescribeColW) and (2) uses the configured SQL_C_WCHAR encoding. I've added the SQL Server test decode_meta for this. As part of this, I've moved much of the Unicode handling to textenc.h and textenc.cpp. I fully expect to replace SQLWChar in the future and move all handling to that file.
mkleehammer · Feb 11, 2017 · 5728869 · 5728869
1 parent 23db3d0
commit 5728869
Show file tree

Hide file tree

Showing 12 changed files with 328 additions and 225 deletions.
diff --git a/src/cnxninfo.cpp b/src/cnxninfo.cpp
@@ -6,6 +6,7 @@
 
 #include "pyodbc.h"
 #include "wrapper.h"
+#include "textenc.h"
 #include "cnxninfo.h"
 #include "connection.h"
 

diff --git a/src/connection.cpp b/src/connection.cpp
@@ -10,6 +10,7 @@
 
 #include "pyodbc.h"
 #include "wrapper.h"
+#include "textenc.h"
 #include "connection.h"
 #include "cursor.h"
 #include "pyodbcmodule.h"
@@ -211,6 +212,10 @@ PyObject* Connection_New(PyObject* pConnectString, bool fAutoCommit, bool fAnsi,
     cnxn->conv_types   = 0;
     cnxn->conv_funcs   = 0;
 
+    // This is an inefficient default, but should work all the time.  When we are offered
+    // single-byte text we don't actually know what the encoding is.  For example, with SQL
+    // Server the encoding is based on the database's collation.  We ask the driver / DB to
+    // convert to SQL_C_WCHAR and use the ODBC default of UTF-16LE.
     cnxn->sqlchar_enc.optenc = OPTENC_UTF16LE;
     cnxn->sqlchar_enc.name   = _strdup("utf-16le");
     cnxn->sqlchar_enc.ctype  = SQL_C_WCHAR;
@@ -219,6 +224,10 @@ PyObject* Connection_New(PyObject* pConnectString, bool fAutoCommit, bool fAnsi,
     cnxn->sqlwchar_enc.name   = _strdup("utf-16le");
     cnxn->sqlwchar_enc.ctype  = SQL_C_WCHAR;
 
+    cnxn->metadata_enc.optenc = OPTENC_UTF16LE;
+    cnxn->metadata_enc.name   = _strdup("utf-16le");
+    cnxn->metadata_enc.ctype  = SQL_C_WCHAR;
+
     // Note: I attempted to use UTF-8 here too since it can hold any type, but SQL Server fails
     // with a data truncation error if we send something encoded in 2 bytes to a column with 1
     // character.  I don't know if this is a bug in SQL Server's driver or if I'm missing
@@ -234,9 +243,10 @@ PyObject* Connection_New(PyObject* pConnectString, bool fAutoCommit, bool fAnsi,
 
     cnxn->sqlchar_enc.to  = TO_UNICODE;
     cnxn->sqlwchar_enc.to = TO_UNICODE;
+    cnxn->metadata_enc.to = TO_UNICODE;
 #endif
 
-    if (!cnxn->sqlchar_enc.name || !cnxn->sqlwchar_enc.name || !cnxn->unicode_enc.name
+    if (!cnxn->sqlchar_enc.name || !cnxn->sqlwchar_enc.name || !cnxn->metadata_enc.name || !cnxn->unicode_enc.name
 #if PY_MAJOR_VERSION < 3
         || !cnxn->str_enc.name
 #endif
@@ -396,6 +406,8 @@ static int Connection_clear(PyObject* self)
     cnxn->sqlchar_enc.name = 0;
     free((void*)cnxn->sqlwchar_enc.name);
     cnxn->sqlwchar_enc.name = 0;
+    free((void*)cnxn->metadata_enc.name);
+    cnxn->metadata_enc.name = 0;
     free((void*)cnxn->unicode_enc.name);
     cnxn->unicode_enc.name = 0;
 #if PY_MAJOR_VERSION < 3
@@ -1245,10 +1257,11 @@ static PyObject* Connection_setdecoding(PyObject* self, PyObject* args, PyObject
     allow_raw = (sqltype == SQL_CHAR && to != TO_UNICODE);
 #endif
 
-    if (sqltype != SQL_WCHAR && sqltype != SQL_CHAR)
-        return PyErr_Format(PyExc_ValueError, "Invalid sqltype %d.  Must be SQL_CHAR or SQL_WCHAR", sqltype);
+    if (sqltype != SQL_WCHAR && sqltype != SQL_CHAR && sqltype != SQL_WMETADATA)
+        return PyErr_Format(PyExc_ValueError, "Invalid sqltype %d.  Must be SQL_CHAR or SQL_WCHAR or SQL_WMETADATA", sqltype);
 
-    TextEnc& enc = (sqltype == SQL_CHAR) ? cnxn->sqlchar_enc : cnxn->sqlwchar_enc;
+    TextEnc& enc = (sqltype == SQL_CHAR) ? cnxn->sqlchar_enc :
+        ((sqltype == SQL_WMETADATA) ? cnxn->metadata_enc : cnxn->sqlwchar_enc);
 
     if (!SetTextEncCommon(enc, encoding, ctype, allow_raw))
         return 0;

diff --git a/src/connection.h b/src/connection.h
@@ -16,55 +16,7 @@ struct Cursor;
 
 extern PyTypeObject ConnectionType;
 
-enum {
-    BYTEORDER_LE = -1,
-    BYTEORDER_NATIVE = 0,
-    BYTEORDER_BE = 1,
-
-    OPTENC_NONE    = 0,         // No optimized encoding - use the named encoding
-    OPTENC_RAW     = 1,         // In Python 2, pass bytes directly to string - no decoder
-    OPTENC_UTF8    = 2,
-    OPTENC_UTF16   = 3,         // "Native", so check for BOM and default to BE
-    OPTENC_UTF16BE = 4,
-    OPTENC_UTF16LE = 5,
-    OPTENC_LATIN1  = 6,
-
-#if PY_MAJOR_VERSION < 3
-    TO_UNICODE = 1,
-    TO_STR     = 2
-#endif
-};
-
-
-struct TextEnc
-{
-    // Holds encoding information for reading or writing text.  Since some drivers / databases
-    // are not easy to configure efficiently, a separate instance of this structure is
-    // configured for:
-    //
-    // * reading SQL_CHAR
-    // * reading SQL_WCHAR
-    // * writing unicode strings
-    // * writing non-unicode strings (Python 2.7 only)
-
-#if PY_MAJOR_VERSION < 3
-    int to;
-    // The type of object to return if reading from the database: str or unicode.
-#endif
-
-    int optenc;
-    // Set to one of the OPTENC constants to indicate whether an optimized encoding is to be
-    // used or a custom one.  If OPTENC_NONE, no optimized encoding is set and `name` should be
-    // used.
-
-    const char* name;
-    // The name of the encoding.  This must be freed using `free`.
-
-    SQLSMALLINT ctype;
-    // The C type to use, SQL_C_CHAR or SQL_C_WCHAR.  Normally this matches the SQL type of the
-    // column (SQL_C_CHAR is used for SQL_CHAR, etc.).  At least one database reports it has
-    // SQL_WCHAR data even when configured for UTF-8 which is better suited for SQL_C_CHAR.
-};
+struct TextEnc;
 
 struct Connection
 {
@@ -100,6 +52,13 @@ struct Connection
     TextEnc str_enc;            // encoding used when writing non-unicode strings
 #endif
 
+    TextEnc metadata_enc;
+    // Used when reading column names for Cursor.description.  I originally thought I could use
+    // the TextEncs above based on whether I called SQLDescribeCol vs SQLDescribeColW.
+    // Unfortunately it looks like PostgreSQL and MySQL (and probably others) ignore the ODBC
+    // specification regarding encoding everywhere *except* in these functions - their drivers
+    // seem to always return UTF-16LE from SQLDescribeCol regardless of the connection settings.
+
     long maxwrite;
     // Used to override varchar_maxlength, etc.  Those are initialized from
     // SQLGetTypeInfo but some drivers (e.g. psqlodbc) return almost arbitrary

diff --git a/src/cursor.cpp b/src/cursor.cpp
@@ -14,6 +14,7 @@
 
 #include "pyodbc.h"
 #include "wrapper.h"
+#include "textenc.h"
 #include "cursor.h"
 #include "pyodbcmodule.h"
 #include "connection.h"
@@ -153,14 +154,15 @@ static bool create_name_map(Cursor* cur, SQLSMALLINT field_count, bool lower)
 
     for (int i = 0; i < field_count; i++)
     {
-        SQLCHAR name[300];
+        ODBCCHAR szName[300];
+        SQLSMALLINT cchName;
         SQLSMALLINT nDataType;
         SQLULEN nColSize;           // precision
         SQLSMALLINT cDecimalDigits; // scale
         SQLSMALLINT nullable;
 
         Py_BEGIN_ALLOW_THREADS
-        ret = SQLDescribeCol(cur->hstmt, (SQLUSMALLINT)(i + 1), name, _countof(name), 0, &nDataType, &nColSize, &cDecimalDigits, &nullable);
+        ret = SQLDescribeColW(cur->hstmt, (SQLUSMALLINT)(i + 1), (SQLWCHAR*)szName, _countof(szName), &cchName, &nDataType, &nColSize, &cDecimalDigits, &nullable);
         Py_END_ALLOW_THREADS
 
         if (cur->cnxn->hdbc == SQL_NULL_HANDLE)
@@ -178,10 +180,21 @@ static bool create_name_map(Cursor* cur, SQLSMALLINT field_count, bool lower)
 
         TRACE("Col %d: type=%s (%d) colsize=%d\n", (i+1), SqlTypeName(nDataType), (int)nDataType, (int)nColSize);
 
+        const TextEnc& enc = cur->cnxn->metadata_enc;
+        Object name(TextBufferToObject(enc, szName, (Py_ssize_t)(cchName * sizeof(ODBCCHAR))));
+
+        if (!name)
+            goto done;
+
         if (lower)
-            _strlwr((char*)name);
+        {
+            PyObject* l = PyObject_CallMethod(name, "lower", 0);
+            if (!l)
+                goto done;
+            name.Attach(l);
+        }
 
-        type = PythonTypeFromSqlType(cur, name, nDataType);
+        type = PythonTypeFromSqlType(cur, nDataType);
         if (!type)
             goto done;
 
@@ -220,8 +233,8 @@ static bool create_name_map(Cursor* cur, SQLSMALLINT field_count, bool lower)
             }
         }
 
-        colinfo = Py_BuildValue("(sOOiiiO)",
-                                (char*)name,
+        colinfo = Py_BuildValue("(OOOiiiO)",
+                                name.Get(),
                                 type,                // type_code
                                 Py_None,             // display size
                                 (int)nColSize,       // internal_size
@@ -231,14 +244,13 @@ static bool create_name_map(Cursor* cur, SQLSMALLINT field_count, bool lower)
         if (!colinfo)
             goto done;
 
-
         nullable_obj = 0;
 
         index = PyInt_FromLong(i);
         if (!index)
             goto done;
 
-        PyDict_SetItemString(colmap, (const char*)name, index);
+        PyDict_SetItem(colmap, name.Get(), index);
         Py_DECREF(index);       // SetItemString increments
         index = 0;
 
@@ -583,34 +595,33 @@ static PyObject* execute(Cursor* cur, PyObject* pSql, PyObject* params, bool ski
         cur->pPreparedSQL = 0;
 
         szLastFunction = "SQLExecDirect";
+
+        const TextEnc* penc = 0;
+
 #if PY_MAJOR_VERSION < 3
         if (PyString_Check(pSql))
         {
-            const TextEnc& enc = cur->cnxn->str_enc;
-            SQLWChar query(pSql, enc.ctype, enc.name);
-            if (!query)
-                return 0;
-            Py_BEGIN_ALLOW_THREADS
-            if (enc.ctype == SQL_C_WCHAR)
-                ret = SQLExecDirectW(cur->hstmt, (SQLWCHAR*)query.value(), (SQLINTEGER)query.charlen());
-            else
-                ret = SQLExecDirect(cur->hstmt, (SQLCHAR*)query.value(), (SQLINTEGER)query.charlen());
-            Py_END_ALLOW_THREADS
+            penc = &cur->cnxn->str_enc;
         }
         else
 #endif
         {
-            const TextEnc& enc = cur->cnxn->unicode_enc;
-            SQLWChar query(pSql, enc.ctype, enc.name);
-            if (!query)
-                return 0;
-            Py_BEGIN_ALLOW_THREADS
-            if (enc.ctype == SQL_C_WCHAR)
-                ret = SQLExecDirectW(cur->hstmt, (SQLWCHAR*)query.value(), (SQLINTEGER)query.charlen());
-            else
-                ret = SQLExecDirect(cur->hstmt, (SQLCHAR*)query.value(), (SQLINTEGER)query.charlen());
-            Py_END_ALLOW_THREADS
+            penc = &cur->cnxn->unicode_enc;
         }
+
+        Object query(penc->Encode(pSql));
+        if (!query)
+            return 0;
+
+        const char* pch = PyBytes_AS_STRING(query.Get());
+        SQLINTEGER  cch = (SQLINTEGER)PyBytes_GET_SIZE(query.Get());
+
+        Py_BEGIN_ALLOW_THREADS
+        if (penc->ctype == SQL_C_WCHAR)
+            ret = SQLExecDirectW(cur->hstmt, (SQLWCHAR*)pch, cch);
+        else
+            ret = SQLExecDirect(cur->hstmt, (SQLCHAR*)pch, cch);
+        Py_END_ALLOW_THREADS
     }
 
     if (cur->cnxn->hdbc == SQL_NULL_HANDLE)