C++ equivalent of mbsrtowcs and wcsrtombs using locales and streams
Is there a C++ equivalent of mbsrtowcs and wcsrtombs type functions using std::locale and C++ streams functionality?
I'm trying to figure out the best way to convert back and forth between std::string and std::wstring using the standard library. It seems std::locale can almost do this, but I'm a little iffy on some details, or on what limitations it might have.
Some specifics: I'm on Linux, which uses utf-8 as the native encoding. I'd like to go from utf-8 std::string to std::wstring and back without losing information.
I think there may be some limitations with locales on Windows, but I'm not particularly concerned about them. As long as the answer works on Linux and has no dependencies beyond libstdc++ (i.e. no Boost dependency), I'm happy.
Links to background information appreciated.
NOTE: There seems to be some confusion. More than one char can represent a single character in UTF-8, so functions that do not account for this when converting from wchar_t to char will not work.
`locale` is overkill for this task - UTF-8 and UTF-16 can be converted back and forth with simple bit manipulation. Here's some code, based on my answer to an earlier question.
// Converts a NUL-terminated UTF-16 wide string to a UTF-8 encoded std::string.
//
// in: pointer to the first code unit, or NULL (an empty string is returned).
// Returns the UTF-8 bytes of every code point decoded from `in`.
//
// A high surrogate (0xD800-0xDBFF) followed by a low surrogate (0xDC00-0xDFFF)
// is combined into a single supplementary code point. Unpaired surrogates,
// high or low, are dropped. Units above 0xFFFF, which can only occur when
// wchar_t is wider than 16 bits, are encoded as they are.
std::string UTF16to8(const wchar_t * in)
{
    std::string out;
    if (in == NULL)
        return out;
    unsigned int codepoint = 0;
    bool have_high = false;  // true while a high surrogate awaits its low half
    for (; *in != 0; ++in)
    {
        const unsigned int unit = static_cast<unsigned int>(*in);
        if (unit >= 0xd800 && unit <= 0xdbff)
        {
            // Start (or restart) a surrogate pair; nothing is emitted yet.
            codepoint = ((unit - 0xd800) << 10) + 0x10000;
            have_high = true;
            continue;
        }
        if (unit >= 0xdc00 && unit <= 0xdfff)
        {
            // A low surrogate is only meaningful right after a high one.
            // Without this check it would be encoded as U+0000..U+03FF.
            if (!have_high)
                continue;
            codepoint |= unit - 0xdc00;
        }
        else
        {
            // Ordinary unit; a pending high surrogate is discarded.
            codepoint = unit;
        }
        have_high = false;

        if (codepoint <= 0x7f)
        {
            out.append(1, static_cast<char>(codepoint));
        }
        else if (codepoint <= 0x7ff)
        {
            out.append(1, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
            out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
        }
        else if (codepoint <= 0xffff)
        {
            out.append(1, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
            out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
            out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
        }
        else
        {
            out.append(1, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
            out.append(1, static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f)));
            out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
            out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
        }
        codepoint = 0;
    }
    return out;
}
// Converts a NUL-terminated UTF-8 byte string to a UTF-16 encoded std::wstring.
//
// in: pointer to the first byte, or NULL (an empty string is returned).
// Returns one wchar_t per UTF-16 code unit; code points above 0xFFFF are
// written as a high/low surrogate pair.
//
// Malformed input is handled leniently: a continuation byte (0x80-0xBF) that
// does not belong to a sequence is skipped, and a sequence cut short by a new
// lead byte, an ASCII byte or the end of the string produces no output.
// Every byte >= 0xF0 is treated as the lead byte of a 4-byte sequence.
std::wstring UTF8to16(const char * in)
{
    std::wstring out;
    if (in == NULL)
        return out;
    unsigned int codepoint = 0;
    int following = 0;  // continuation bytes still expected for `codepoint`
    for (; *in != 0; ++in)
    {
        const unsigned char ch = static_cast<unsigned char>(*in);
        if (ch <= 0x7f)
        {
            codepoint = ch;
            following = 0;
        }
        else if (ch <= 0xbf)
        {
            // Stray continuation byte: skip it. Falling through would append
            // the zeroed accumulator, i.e. an embedded NUL, to the result.
            if (following == 0)
                continue;
            codepoint = (codepoint << 6) | (ch & 0x3f);
            --following;
        }
        else if (ch <= 0xdf)
        {
            codepoint = ch & 0x1f;
            following = 1;
        }
        else if (ch <= 0xef)
        {
            codepoint = ch & 0x0f;
            following = 2;
        }
        else
        {
            codepoint = ch & 0x07;
            following = 3;
        }
        if (following == 0)
        {
            if (codepoint > 0xffff)
            {
                // UTF-16 stores (codepoint - 0x10000) split into two 10-bit
                // halves; the subtraction must happen before the shift.
                const unsigned int offset = codepoint - 0x10000;
                out.append(1, static_cast<wchar_t>(0xd800 + (offset >> 10)));
                out.append(1, static_cast<wchar_t>(0xdc00 + (offset & 0x03ff)));
            }
            else
            {
                out.append(1, static_cast<wchar_t>(codepoint));
            }
            codepoint = 0;
        }
    }
    return out;
}
Here's a version (untested) to use if your wchar_t is 32 bits rather than 16 bits.
// Encodes a NUL-terminated wide string holding one code point per wchar_t
// as UTF-8.
//
// in: pointer to the first element, or NULL (an empty string is returned).
// Returns the UTF-8 bytes for each element of `in`, in order.
// Asserts that wchar_t is at least 4 bytes wide.
std::string UTF32to8(const wchar_t * in)
{
    assert(sizeof(wchar_t) >= 4);
    std::string utf8;
    if (in == NULL)
        return utf8;
    for (const wchar_t * cursor = in; *cursor != 0; ++cursor)
    {
        const unsigned int cp = *cursor;
        if (cp > 0xffff)
        {
            // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            utf8.push_back(static_cast<char>(0xf0 | ((cp >> 18) & 0x07)));
            utf8.push_back(static_cast<char>(0x80 | ((cp >> 12) & 0x3f)));
            utf8.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3f)));
            utf8.push_back(static_cast<char>(0x80 | (cp & 0x3f)));
        }
        else if (cp > 0x7ff)
        {
            // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
            utf8.push_back(static_cast<char>(0xe0 | ((cp >> 12) & 0x0f)));
            utf8.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3f)));
            utf8.push_back(static_cast<char>(0x80 | (cp & 0x3f)));
        }
        else if (cp > 0x7f)
        {
            // Two bytes: 110xxxxx 10xxxxxx
            utf8.push_back(static_cast<char>(0xc0 | ((cp >> 6) & 0x1f)));
            utf8.push_back(static_cast<char>(0x80 | (cp & 0x3f)));
        }
        else
        {
            // Single byte, value copied unchanged.
            utf8.push_back(static_cast<char>(cp));
        }
    }
    return utf8;
}
// Converts a NUL-terminated UTF-8 byte string to a std::wstring holding one
// code point per wchar_t.
//
// in: pointer to the first byte, or NULL (an empty string is returned).
// Returns the decoded code points, in order.
// Asserts that wchar_t is at least 4 bytes wide.
//
// Malformed input is handled leniently: a continuation byte (0x80-0xBF) that
// does not belong to a sequence is skipped, and a sequence cut short by a new
// lead byte, an ASCII byte or the end of the string produces no output.
// Every byte >= 0xF0 is treated as the lead byte of a 4-byte sequence.
std::wstring UTF8to32(const char * in)
{
    assert(sizeof(wchar_t) >= 4);
    std::wstring out;
    if (in == NULL)
        return out;
    // Accumulate in an unsigned type so the shifts never touch a sign bit.
    unsigned int codepoint = 0;
    int following = 0;  // continuation bytes still expected for `codepoint`
    for (; *in != 0; ++in)
    {
        const unsigned char ch = static_cast<unsigned char>(*in);
        if (ch <= 0x7f)
        {
            codepoint = ch;
            following = 0;
        }
        else if (ch <= 0xbf)
        {
            // Stray continuation byte: skip it. Falling through would append
            // the zeroed accumulator, i.e. an embedded NUL, to the result.
            if (following == 0)
                continue;
            codepoint = (codepoint << 6) | (ch & 0x3f);
            --following;
        }
        else if (ch <= 0xdf)
        {
            codepoint = ch & 0x1f;
            following = 1;
        }
        else if (ch <= 0xef)
        {
            codepoint = ch & 0x0f;
            following = 2;
        }
        else
        {
            codepoint = ch & 0x07;
            following = 3;
        }
        if (following == 0)
        {
            out.append(1, static_cast<wchar_t>(codepoint));
            codepoint = 0;
        }
    }
    return out;
}
Have you tried writing a simple function?
// Widens a byte string to a wide string, one char per wchar_t.
//
// src: the bytes to widen.
// Returns a wstring of the same length in which element i holds the value of
// byte i as a number in the range 0..255.
//
// No multi-byte decoding is performed: a character encoded as several bytes
// yields several wide characters.
std::wstring StringToWString(const std::string& src)
{
    std::wstring str;
    str.reserve(src.length());
    for (std::string::const_iterator it = src.begin(); it != src.end(); ++it)
    {
        // Go through unsigned char so bytes >= 0x80 are not sign-extended
        // to negative wchar_t values where plain char is signed.
        str.push_back(static_cast<wchar_t>(static_cast<unsigned char>(*it)));
    }
    return str;
}
// Narrows a wide string to a byte string, one wchar_t per char.
//
// src: the wide characters to narrow.
// Returns a string of the same length; each element is the corresponding
// wchar_t converted to char, so values that do not fit in a char are not
// preserved.
std::string WStringToString(const std::wstring& src)
{
    std::string narrow;
    narrow.reserve(src.size());
    for (std::wstring::size_type i = 0; i < src.size(); ++i)
    {
        narrow.push_back(static_cast<char>(src[i]));
    }
    return narrow;
}
void main()
{
string s1 = "Hello World!";
wstring s2 = StringToWString(s1);
s1 = WStringToString(s2);
}
精彩评论