Convert UTF-16 to UTF-8
I am current using VC++ 2008 MFC. Due to PostgreSQL doesn't support UTF-16 (Encoding used by Windows for Unicode), I need to convert string from UTF-16 to UTF-8, before store it.
Here is my code snippet.
// demo.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include "demo.h"
#include "Utils.h"
#include <iostream>
#ifdef _DEBUG
#define new DEBUG_NEW
#endif
// The one and only application object
CWinApp theApp;
using namespace std;
int _tmain(int argc, TCHAR* argv[], TCHAR* envp[])
{
int nRetCode = 0;
// initialize MFC and print and error on failure
if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0))
{
// TODO: change error code to suit your needs
_tprintf(_T("Fatal Error: MFC initialization failed\n"));
nRetCode = 1;
}
else
{
// TODO: code your application's behavior here.
}
CString utf16 = _T("Hello");
std::cout << utf16.GetLength() << std::endl;
CStringA utf8 = UTF8Util::ConvertUTF16ToUTF8(utf16);
std::cout << utf8.GetLength() << std::endl;
getchar();
return nRetCode;
}
and the conversion functions.
namespace UTF8Util
{
//----------------------------------------------------------------------------
// FUNCTION: ConvertUTF8ToUTF16
// DESC: Converts Unicode UTF-8 text to Unicode UTF-16 (Windows default).
//----------------------------------------------------------------------------
CStringW ConvertUTF8ToUTF16( __in const CHAR * pszTextUTF8 )
{
//
// Special case of NULL or empty input string
//
if ( (pszTextUTF8 == NULL) || (*pszTextUTF8 == '\0') )
{
// Return empty string
return L"";
}
//
// Consider CHAR's count corresponding to total input string length,
// including end-of-string (\0) character
//
const size_t cchUTF8Max = INT_MAX - 1;
size_t cchUTF8;
HRESULT hr = ::StringCchLengthA( pszTextUTF8, cchUTF8Max, &cchUTF8 );
if ( FAILED( hr ) )
{
AtlThrow( hr );
}
// Consider also terminating \0
++cchUTF8;
// Convert to 'int' for use with MultiByteToWideChar API
int cbUTF8 = static_cast<int>( cchUTF8 );
//
// Get size of destination UTF-16 buffer, in WCHAR's
//
int cchUTF16 = ::MultiByteToWideChar(
CP_UTF8, // convert from UTF-8
MB_ERR_INVALID_CHARS, // error on invalid chars
pszTextUTF8, // source UTF-8 string
cbUTF8, // total length of source UTF-8 string,
// in CHAR's (= bytes), including end-of-string \0
NULL, // unused - no conversion done in this step
0 // request size of destination buffer, in WCHAR's
);
ATLASSERT( cchUTF16 != 0 );
if ( cchUTF16 == 0 )
{
AtlThrowLastWin32();
}
//
// Allocate destination buffer to store UTF-16 string
//
CStringW strUTF16;
WCHAR * pszUTF16 = strUTF16.GetBuffer( cchUTF16 );
//
// Do the conversion from UTF-8 to UTF-16
//
int result = ::MultiByteToWideChar(
CP_UTF8, // convert from UTF-8
MB_ERR_INVALID_CHARS, // error on invalid chars
pszTextUTF8, // source UTF-8 string
cbUTF8, // total length of source UTF-8 string,
// in CHAR's (= bytes), including end-of-string \0
pszUTF16, // destination buffer
cchUTF16 // size of destination buffer, in WCHAR's
);
ATLASSERT( result != 0 );
if ( result == 0 )
{
AtlThrowLastWin32();
}
// Release internal CString buffer
strUTF16.ReleaseBuffer();
// Return resulting UTF16 string
return strUTF16;
}
//----------------------------------------------------------------------------
// FUNCTION: ConvertUTF16ToUTF8
// DESC: Converts Unicode UTF-16 (Win开发者_如何学JAVAdows default) text to Unicode UTF-8.
//----------------------------------------------------------------------------
CStringA ConvertUTF16ToUTF8( __in const WCHAR * pszTextUTF16 )
{
//
// Special case of NULL or empty input string
//
if ( (pszTextUTF16 == NULL) || (*pszTextUTF16 == L'\0') )
{
// Return empty string
return "";
}
//
// Consider WCHAR's count corresponding to total input string length,
// including end-of-string (L'\0') character.
//
const size_t cchUTF16Max = INT_MAX - 1;
size_t cchUTF16;
HRESULT hr = ::StringCchLengthW( pszTextUTF16, cchUTF16Max, &cchUTF16 );
if ( FAILED( hr ) )
{
AtlThrow( hr );
}
// Consider also terminating \0
++cchUTF16;
//
// WC_ERR_INVALID_CHARS flag is set to fail if invalid input character
// is encountered.
// This flag is supported on Windows Vista and later.
// Don't use it on Windows XP and previous.
//
#if (WINVER >= 0x0600)
DWORD dwConversionFlags = WC_ERR_INVALID_CHARS;
#else
DWORD dwConversionFlags = 0;
#endif
//
// Get size of destination UTF-8 buffer, in CHAR's (= bytes)
//
int cbUTF8 = ::WideCharToMultiByte(
CP_UTF8, // convert to UTF-8
dwConversionFlags, // specify conversion behavior
pszTextUTF16, // source UTF-16 string
static_cast<int>( cchUTF16 ), // total source string length, in WCHAR's,
// including end-of-string \0
NULL, // unused - no conversion required in this step
0, // request buffer size
NULL, NULL // unused
);
ATLASSERT( cbUTF8 != 0 );
if ( cbUTF8 == 0 )
{
AtlThrowLastWin32();
}
//
// Allocate destination buffer for UTF-8 string
//
CStringA strUTF8;
int cchUTF8 = cbUTF8; // sizeof(CHAR) = 1 byte
CHAR * pszUTF8 = strUTF8.GetBuffer( cchUTF8 );
//
// Do the conversion from UTF-16 to UTF-8
//
int result = ::WideCharToMultiByte(
CP_UTF8, // convert to UTF-8
dwConversionFlags, // specify conversion behavior
pszTextUTF16, // source UTF-16 string
static_cast<int>( cchUTF16 ), // total source string length, in WCHAR's,
// including end-of-string \0
pszUTF8, // destination buffer
cbUTF8, // destination buffer size, in bytes
NULL, NULL // unused
);
ATLASSERT( result != 0 );
if ( result == 0 )
{
AtlThrowLastWin32();
}
// Release internal CString buffer
strUTF8.ReleaseBuffer();
// Return resulting UTF-8 string
return strUTF8;
}
} // namespace UTF8Util
However, during runtime, I get the exception at
ATLASSERT( cbUTF8 != 0 );
while trying to get size of destination UTF-8 buffer
- What thing I had missed out?
- If I am testing using a Chinese characters, How can I verify the resultant UTF-8 string is correct?
You can also use the ATL String Conversion Macros - to convert from UTF-16 to UTF-8 use CW2A
and pass CP_UTF8
as the code page, e.g.:
CW2A utf8(buffer, CP_UTF8);
const char* data = utf8.m_psz;
The problem is you specified the WC_ERR_INVALID_CHARS
flag:
Windows Vista and later: Fail if an invalid input character is encountered. If this flag is not set, the function silently drops illegal code points. A call to GetLastError returns ERROR_NO_UNICODE_TRANSLATION. Note that this flag only applies when CodePage is specified as CP_UTF8 or 54936 (for Windows Vista and later). It cannot be used with other code page values.
Your conversion function seems quite long. How does this one work for you?
//----------------------------------------------------------------------------
// FUNCTION: ConvertUTF16ToUTF8
// DESC: Converts Unicode UTF-16 (Windows default) text to Unicode UTF-8.
//----------------------------------------------------------------------------
CStringA ConvertUTF16ToUTF8( __in LPCWSTR pszTextUTF16 ) {
if (pszTextUTF16 == NULL) return "";
int utf16len = wcslen(pszTextUTF16);
int utf8len = WideCharToMultiByte(CP_UTF8, 0, pszTextUTF16, utf16len,
NULL, 0, NULL, NULL );
CArray<CHAR> buffer;
buffer.SetSize(utf8len+1);
buffer.SetAt(utf8len, '\0');
WideCharToMultiByte(CP_UTF8, 0, pszTextUTF16, utf16len,
buffer.GetData(), utf8len, 0, 0 );
return buffer.GetData();
}
I see you use a function called StringCchLengthW
to get the required length of the output buffer. Most of the places I look recommend using the WideCharToMultiByte
function itself to tell you how many CHARs it wants.
Edit:
As Rob pointed out, you can use CW2A with the CP_UTF8 code page:
CStringA str = CW2A(wStr, CP_UTF8);
While I'm editing, I can answer your second question:
How can I verify the resultant UTF-8 string is correct?
Write it to a text file, then open it in Mozilla Firefox or an equivillant program. In the View menu, you can go to Character Encoding and switch manually to UTF-8 (assuming Firefox didn't guess it correctly to begin with). Compare it with a UTF-16 document with the same text and see if there are any differences.
精彩评论