开发者

Convert UTF-16 to UTF-8 under Windows and Linux, in C

I was wondering if there is a recommended cross-platform (Windows and Linux) method for converting strings from UTF-16LE to UTF-8, or whether one should use a different method for each environment?

I've managed to google a few references to 'iconv', but for some reason I can't find samples of basic conversions, such as converting a wchar_t UTF-16 string to UTF-8.

Can anybody recommend a method that would be cross-platform? If you know of references or a guide with samples, I would very much appreciate it.

Thanks, Doori Bar


Change encoding to UTF-8 with PowerShell:

Get-Content PATH\temp.txt -Encoding Unicode | Set-Content -Encoding UTF8 PATH2\temp.txt


The open source ICU library is very commonly used.


If you don't want to use ICU,

  1. Windows: WideCharToMultiByte
  2. Linux: iconv (Glibc)


If you have MSYS2 installed then the iconv package (which is installed by default) lets you use:

 iconv -f utf-16le -t utf-8 <input.txt >output.txt


#include <iconv.h>

wchar_t *src = ...; // or char16_t* on non-Windows platforms
int srclen = ...;
char *dst = ...;
int dstlen = ...;
iconv_t conv = iconv_open("UTF-8", "UTF-16");
iconv(conv, (char*)&src, &srclen, &dst, &dstlen);
iconv_close(conv);


I have run into this problem too, I solve it by using boost locale library

try
{
    // Step 1: re-encode the 16-bit code units held in wcontent as UTF-8.
    std::string utf8 = boost::locale::conv::utf_to_utf<char, short>(
                        (short*)wcontent.c_str(),
                        (short*)(wcontent.c_str() + wcontent.length()));
    // Step 2: convert the UTF-8 text to the single-byte target encoding.
    content = boost::locale::conv::from_utf(utf8, "ISO-8859-1");
}
// Catch by const reference: avoids copying the exception object.
catch (const boost::locale::conv::conversion_error &e)
{
    std::cout << "Fail to convert from UTF-8 to " << toEncoding << "!" << std::endl;
    break;
}

The boost::locale::conv::utf_to_utf function converts a buffer encoded as UTF-16LE to UTF-8, and the boost::locale::conv::from_utf function then converts that UTF-8 buffer to an ANSI encoding. Make sure the target encoding is right (here I use Latin-1, i.e. ISO-8859-1).

Another reminder: on Linux each std::wstring character (wchar_t) is 4 bytes wide, but on Windows it is 2 bytes wide, so you had better not use std::wstring to hold a UTF-16LE buffer.


There's also utfcpp, which is a header-only library.


Another portable C possibility to convert string between UTF-8, UTF-16, UTF-32, wchar - is mdz_unicode library.


Thanks guys, this is how I managed to solve the 'cross' windows and linux requirement:

  1. Downloaded and installed: MinGW , and MSYS
  2. Downloaded the libiconv source package
  3. Compiled libiconv via MSYS.

That's about it.


You can also roll your own, which has several benefits:

  1. Not subject to the somewhat restrictive license of iconv
  2. No restriction on statically linking to avoid ICU version hell or other dynamic-link headaches
  3. Avoid the need to link a very large library (such as icu or boost) (which, if statically linked, can add tens of MB of size to what might otherwise be a very small binary)

Note: the code below assumes you have installed utf8proc, which is very compact. However, if you prefer, you can simply use its header file and copy the single function this code uses, utf8proc_encode_char().

utf16le_to_utf8.h:

#ifndef UTF16LE_TO_UTF8_H
#define UTF16LE_TO_UTF8_H

#include <stddef.h> /* size_t -- keeps this header self-contained */

/* Result codes reported through the `status` out-parameter of utf16le_to_utf8(). */
enum utf816le_status {
  utf816le_status_ok = 0,                 /* conversion succeeded */
  utf816le_status_malformed_utf6le_input, /* input could not be decoded as UTF-16LE
                                             (identifier spelling kept for compatibility) */
  utf816le_status_memory,                 /* growing the output buffer failed */
  utf816le_status_unencodable,            /* a decoded code point could not be encoded */
  utf816le_status_buffsize                /* user-provided buffer is too small */
};

/*
 * Convert a UTF-16LE byte sequence to a NUL-terminated UTF-8 string.
 *
 * @return converted string, or NULL on error
 * @param str      input string (UTF-16LE bytes)
 * @param len      length (in bytes) of input string
 * @param buff     optional user-provided output buffer. if not provided, the returned
 *                 converted string must be freed by caller using free()
 * @param buffsize length of user-provided buffer, or 0 if no buffer provided
 * @param out_len  pointer to length of converted output, in bytes, not counting the
 *                 terminating NUL; set to 0 on error
 * @param status   pointer to status, set to non-zero in case of error
 */
unsigned char *utf16le_to_utf8(const unsigned char *str, size_t len,
                               unsigned char *buff, size_t buffsize,
                               size_t *out_len,
                               enum utf816le_status *status);

#endif

utf16le_to_utf8.c:

#include <stdlib.h>
#include <string.h>
#include <utf8proc.h>
#include "utf16le_to_utf8.h"

#if defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
#  include <sys/endian.h>
#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
      defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
#  include <machine/endian.h>
#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
#  if !defined( __MINGW32__ ) && !defined( _AIX )
#    include <endian.h>
#    if !defined( __BEOS__ )
#      include <byteswap.h>
#    endif
#  endif
#endif

/*
 * Interpret a 16-bit value that was loaded natively from little-endian data.
 *
 * `v` holds two bytes exactly as they appeared in memory; the result is the
 * number those bytes encode when read as little-endian (first byte = low 8
 * bits). On a little-endian host this is the identity; on a big-endian host
 * it swaps the two bytes.
 *
 * The bytes are inspected directly instead of testing __BYTE_ORDER: inside
 * #if an undefined macro evaluates to 0, so when no endian header has defined
 * __BYTE_ORDER and __LITTLE_ENDIAN the comparison `0 == 0` would silently
 * select the little-endian branch regardless of the actual host.
 */
static inline uint16_t little_endian_16(uint16_t v) {
  unsigned char bytes[sizeof v];
  memcpy(bytes, &v, sizeof v);
  return (uint16_t)(bytes[0] | (bytes[1] << 8));
}

static utf8proc_int32_t utf16le_next_codepoint(uint16_t *text, unsigned int max_bytes,
                                               unsigned int *bytes_read) {
  uint16_t c1 = little_endian_16(text[0]);
  if (c1 >= 0xd800 && c1 < 0xdc00) {
    if(max_bytes < 4) {
      *bytes_read = 0;
      return 0;
    }
    *bytes_read = 4;
    uint16_t c2 = text[1];
    return ((c1 & 0x3ff) << 10) + (c2 & 0x3ff) + 0x10000;
  }

  if(max_bytes < 2) {
    *bytes_read = 0;
    return 0;
  }
  *bytes_read = 2;
  return c1;
}

/*
 * Convert `len` bytes of UTF-16LE at `str` into a NUL-terminated UTF-8 string.
 *
 * If `buff` is non-NULL and `buffsize` is non-zero the output is written there
 * and `buff` is returned; the buffer is never grown, and the conversion fails
 * with utf816le_status_buffsize if it is too small. Otherwise the output is
 * heap-allocated and the caller must free() the returned pointer.
 *
 * On success *status is utf816le_status_ok and *out_len is the number of
 * UTF-8 bytes written, not counting the terminating NUL. On failure NULL is
 * returned, *out_len is 0 and *status identifies the error; any buffer
 * allocated by this function has been freed.
 */
unsigned char *utf16le_to_utf8(const unsigned char *str, size_t len,
                               unsigned char *buff, size_t buffsize,
                               size_t *out_len,
                               enum utf816le_status *status) {
  /* A NULL buffer and a zero size both mean "allocate the output for me". */
  if(!buffsize)
    buff = NULL;
  if(!buff)
    buffsize = 0;
  unsigned char *dst = buff;
  size_t sizeof_dst = buffsize;
  size_t written = 0;

  *status = utf816le_status_ok;
  unsigned char_len = 0;
  for(size_t i = 0; i < len; i += char_len) {
    /* The decoder only needs to know whether 2 or 4 bytes are available, so
       clamp the remaining length instead of narrowing a size_t to unsigned
       (which would misreport the remainder for inputs larger than UINT_MAX). */
    size_t remaining = len - i;
    unsigned int avail = remaining > 4 ? 4u : (unsigned int)remaining;

    utf8proc_int32_t codepoint = utf16le_next_codepoint((uint16_t *)(str + i), avail, &char_len);
    if(!char_len) { // error! bad utf
      *status = utf816le_status_malformed_utf6le_input;
      break;
    }
    // we need at least 4 bytes to encode to utf8. add 1 for terminal null and 1 for good measure
    if(sizeof_dst < written + 6) {
      if(buffsize > 0) { // user-provided buffer is too small
        *status = utf816le_status_buffsize;
        break;
      }
      size_t new_size = sizeof_dst == 0 ? 64 : sizeof_dst * 2;
      // keep the old pointer until realloc succeeds so it can still be freed on failure
      unsigned char *new_dst = realloc(dst, new_size);
      if(!new_dst) { // out of memory!
        *status = utf816le_status_memory;
        break;
      }
      dst = new_dst;
      sizeof_dst = new_size;
    }
    utf8proc_ssize_t want = utf8proc_encode_char(codepoint, dst + written);
    if(!want) { // error
      *status = utf816le_status_unencodable;
      break;
    }
    written += want;
  }

  /* Empty input with no caller buffer: the loop never allocated anything, so
     make room for the terminator instead of writing through a NULL pointer. */
  if(*status == utf816le_status_ok && !dst) {
    dst = malloc(1);
    if(!dst)
      *status = utf816le_status_memory;
  }

  if(*status == utf816le_status_ok) {
    *out_len = written;
    dst[written] = '\0';
    return dst;
  }
  *out_len = 0;
  if(dst != buff) // only release memory this function allocated, never the caller's buffer
    free(dst);
  return NULL;
}

which you can use like so:

    ...
    unsigned char *converted = utf16le_to_utf8(utf16buff, utf16byte_count, NULL, 0, &output_len, &status);
    if(!converted || !output_len)
      fprintf(stderr, "Error! %i\n", status);
    else
      fprintf(stdout, "Converted to utf8 with length %zu: %s\n", output_len, converted);
    free(converted);
  }
}
0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜