--- util.cpp.orig	2014-08-12 18:08:28.000000000 +0200
+++ util.cpp	2014-08-12 18:09:20.000000000 +0200
@@ -67,3 +67,177 @@
 
 	return pid + tm + (ts.tv_sec ^ ts.tv_nsec);
 }
+
+/* Converts the UTF-8 string buf, which is utf8_length bytes long, to
+   UTF-16 and stores the result in the buffer utf16, which has room for
+   utf16_max_length 16-bit encoding units.
+
+   Returns the number of UTF-16 encoding units written to utf16, or -1
+   on error.  After an error the output buffer holds an unusable partial
+   result, and nothing tells the caller which input byte was at fault or
+   how large the output buffer would have had to be.  A utf8_length of
+   zero or less converts nothing and returns 0.
+
+   Errors:
+
+   - The input contains an invalid byte sequence: a follower byte with
+     no leader, a leader without enough valid followers before the end
+     of the input, or a byte of 0xF8 or above.
+
+   - The input contains a four byte sequence whose scalar value is
+     above 0x10FFFF and therefore cannot be represented in UTF-16.
+
+   - The output buffer is too small to hold the entire result.
+
+   Notes:
+
+   - Neither the input nor the output is treated as '\0'-terminated.  A
+     zero byte in the input becomes a 16-bit zero in the output, and no
+     terminator is appended to the output.
+
+   - UTF-8 aliases (overlong forms) are *not* errors.  For example
+     0xC0 0xA0, 0xE0 0x80 0xA0 and 0xF0 0x80 0x80 0xA0 all yield the
+     single UTF-16 encoding unit 0x0020.
+
+   - Three byte sequences that encode a surrogate or another reserved
+     scalar value are *not* errors either.  They are copied through and
+     may yield invalid UTF-16 (e.g. unpaired surrogates).
+*/
+int Util::utf8ToUtf16(const char *buf, const int utf8_length, uint16_t *utf16, const int utf16_max_length) {
+
+    const unsigned char *utf8 = (const unsigned char *)buf;
+    int in = 0;     /* Index of the next unread input byte. */
+    int out = 0;    /* Number of encoding units written so far. */
+
+    while (in < utf8_length) {
+        const unsigned char c = utf8[in];
+        /* Bytes left in the input, counting c itself. */
+        const int left = utf8_length - in;
+
+        if (out >= utf16_max_length) {
+            /* No more output space. */
+            return -1;
+        }
+        if (c < 0x80) {
+            /* One byte ASCII. */
+            utf16[out++] = c;
+            in += 1;
+        } else if (c < 0xC0) {
+            /* Follower byte without a preceding leader byte. */
+            return -1;
+        } else if (c < 0xE0) {
+            /* Two byte sequence.  We need one follower byte (10xxxxxx). */
+            if (left < 2 || (utf8[in + 1] & 0xC0) != 0x80) {
+                return -1;
+            }
+            utf16[out++] = (uint16_t)(((c & 0x1F) << 6) | (utf8[in + 1] & 0x3F));
+            in += 2;
+        } else if (c < 0xF0) {
+            /* Three byte sequence.  We need two follower bytes. */
+            if (left < 3 || (utf8[in + 1] & 0xC0) != 0x80 || (utf8[in + 2] & 0xC0) != 0x80) {
+                return -1;
+            }
+            utf16[out++] = (uint16_t)(((c & 0x0F) << 12) | ((utf8[in + 1] & 0x3F) << 6) | (utf8[in + 2] & 0x3F));
+            in += 3;
+        } else if (c < 0xF8) {
+            /* Four byte sequence.  We need three follower bytes. */
+            if (left < 4 || (utf8[in + 1] & 0xC0) != 0x80 || (utf8[in + 2] & 0xC0) != 0x80 || (utf8[in + 3] & 0xC0) != 0x80) {
+                return -1;
+            }
+            /* Assemble the 21-bit scalar value from the payload bits.  Masking
+               each byte first matters: c is promoted to int before shifting,
+               so the leader's marker bits would otherwise end up in the plane
+               number and every four byte sequence would be rejected. */
+            const uint32_t scalar = ((uint32_t)(c & 0x07) << 18) | ((uint32_t)(utf8[in + 1] & 0x3F) << 12) | ((uint32_t)(utf8[in + 2] & 0x3F) << 6) | (uint32_t)(utf8[in + 3] & 0x3F);
+            const uint32_t plane = scalar >> 16;
+            if (plane == 0) {
+                /* This four byte sequence is an alias of a scalar value in
+                   the BMP.  It fits in one UTF-16 encoding unit. */
+                utf16[out++] = (uint16_t)scalar;
+            } else if (plane <= 16) {
+                /* A legal four byte sequence; it becomes a surrogate pair. */
+                if (out + 1 >= utf16_max_length) {
+                    /* Not enough space in the output buffer for the pair. */
+                    return -1;
+                }
+                utf16[out++] = (uint16_t)(0xD800 + ((scalar - 0x10000) >> 10));
+                utf16[out++] = (uint16_t)(0xDC00 + (scalar & 0x3FF));
+            } else {
+                /* This four byte sequence is outside the UTF-16 code space. */
+                return -1;
+            }
+            in += 4;
+        } else {
+            /* Longer sequence or unused byte. */
+            return -1;
+        }
+    }
+    return out;
+}
+
+/* Compares the '\0'-terminated string ascii with the utf16Len UTF-16
+   encoding units at utf16.  Returns true only if both have the same
+   length and every encoding unit equals the corresponding byte of
+   ascii.  Each byte is widened as an unsigned value, so the result for
+   bytes of 0x80 and above does not depend on whether char is signed. */
+bool Util::utf16EqualToAscii(const char *ascii, uint16_t *utf16, int utf16Len) {
+
+    while (*ascii != 0 && utf16Len > 0) {
+        if (*utf16++ != (uint16_t)(unsigned char)*ascii++) {
+            return false;
+        }
+        utf16Len--;
+    }
+    return *ascii == 0 && utf16Len == 0;
+}
+
+/* Converts at most utf16Len UTF-16 encoding units from utf16Buf to a
+   UTF-8 encoded std::string.  Conversion stops early at the first zero
+   encoding unit, which acts as a terminator and is not copied.
+
+   A high surrogate directly followed by a low surrogate is combined
+   into one four byte UTF-8 sequence.  An unpaired surrogate is written
+   as a three byte sequence; that is not valid UTF-8, but utf8ToUtf16
+   converts it back to the same encoding unit. */
+std::string Util::utf16BufToUtf8String(const uint16_t *utf16Buf, int utf16Len) {
+
+    std::string outStr;
+    if (utf16Len > 0) {
+        /* A starting guess only; the string grows as needed.  A negative
+           length must not reach reserve(), where it would turn into a
+           huge unsigned size. */
+        outStr.reserve((std::string::size_type)utf16Len * 2);
+    }
+
+    /* Test the remaining length first so that no encoding unit past the
+       end of the buffer is ever read. */
+    while (utf16Len > 0 && *utf16Buf != 0) {
+
+        const uint16_t c16 = *utf16Buf++;
+        utf16Len--;
+
+        if (c16 <= 0x007F) {
+            outStr.push_back((char)c16);
+        }
+        else if (c16 <= 0x07FF) {
+            outStr.push_back((char)(0xC0 | (c16 >> 6)));
+            outStr.push_back((char)(0x80 | (c16 & 0x3F)));
+        }
+        else if (c16 >= 0xD800 && c16 <= 0xDBFF && utf16Len > 0 && *utf16Buf >= 0xDC00 && *utf16Buf <= 0xDFFF) {
+            /* Surrogate pair: rebuild the scalar value above the BMP. */
+            const uint16_t low = *utf16Buf++;
+            utf16Len--;
+            const uint32_t scalar = 0x10000 + (((uint32_t)(c16 - 0xD800) << 10) | (uint32_t)(low - 0xDC00));
+            outStr.push_back((char)(0xF0 | (scalar >> 18)));
+            outStr.push_back((char)(0x80 | ((scalar >> 12) & 0x3F)));
+            outStr.push_back((char)(0x80 | ((scalar >> 6) & 0x3F)));
+            outStr.push_back((char)(0x80 | (scalar & 0x3F)));
+        }
+        else {
+            outStr.push_back((char)(0xE0 | (c16 >> 12)));
+            outStr.push_back((char)(0x80 | ((c16 >> 6) & 0x3F)));
+            outStr.push_back((char)(0x80 | (c16 & 0x3F)));
+        }
+    }
+    return outStr;
+}