FreeBSD Bugzilla – Attachment 145997 Details for
Bug 192783
x11/slim does not support UTF-8 characters input
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Help
|
New Account
|
Log In
Remember
[x]
|
Forgot Password
Login:
[x]
[patch]
util.cpp patch
patch-util.cpp (text/plain), 5.59 KB, created by
DaLynX
on 2014-08-18 18:48:42 UTC
(
hide
)
Description:
util.cpp patch
Filename:
MIME Type:
Creator:
DaLynX
Created:
2014-08-18 18:48:42 UTC
Size:
5.59 KB
patch
obsolete
/*
 * Provenance: this code was posted as a unified diff against util.cpp
 *   --- util.cpp.orig 2014-08-12 18:08:28.000000000 +0200
 *   +++ util.cpp      2014-08-12 18:09:20.000000000 +0200
 *   hunk "-67,3 +67,162"
 * The definitions below are appended after the existing line
 *   return pid + tm + (ts.tv_sec ^ ts.tv_nsec);
 * and the closing brace that follows it.
 *
 * NOTE(review): the functions are defined inside `namespace Util { ... }`,
 * which assumes Util is a namespace (the original patch spelled them as
 * `Util::name(...)` definitions) -- confirm against util.h.
 */

namespace {

/* True when b carries the 10xxxxxx bit pattern of a UTF-8 follower byte. */
inline bool isUtf8Follower(unsigned char b) {
    return (b & 0xC0) == 0x80;
}

/* Appends the value cp (at most 0x10FFFF) to out using the shortest
   1- to 4-byte UTF-8 form. Surrogate values are not filtered here. */
void appendUtf8Value(std::string &out, unsigned long cp) {
    if (cp <= 0x007FUL) {
        out.push_back(static_cast<char>(cp));
    } else if (cp <= 0x07FFUL) {
        out.push_back(static_cast<char>(0xC0 | (cp >> 6)));
        out.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
    } else if (cp <= 0xFFFFUL) {
        out.push_back(static_cast<char>(0xE0 | (cp >> 12)));
        out.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
        out.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
    } else {
        out.push_back(static_cast<char>(0xF0 | (cp >> 18)));
        out.push_back(static_cast<char>(0x80 | ((cp >> 12) & 0x3F)));
        out.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
        out.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
    }
}

} /* anonymous namespace */

namespace Util {

/* Converts the UTF-8 bytes buf[0 .. utf8_length) into UTF-16 code units
   stored in utf16[0 .. utf16_max_length).

   Returns the number of UTF-16 code units written, or -1 on error. After
   an error the output buffer holds unusable partial results, and there is
   no indication of where the failure happened or how large the complete
   result would have been; the caller must size the output buffer itself.
   A utf8_length of zero or less converts nothing and returns 0.

   Errors:

   - The input contains an invalid byte sequence: a follower byte without
     a leader, a leader without enough valid follower bytes, or a byte of
     0xF8 and above.

   - The input contains a four byte sequence whose value is above
     0x10FFFF and therefore cannot be represented in UTF-16.

   - The output buffer does not have enough space for the whole result.

   - buf or utf16 is a null pointer while utf8_length is positive.

   Please note:

   - Neither the input nor the output is assumed to be '\0'-terminated.
     A zero byte in the input is converted to a 16-bit zero like any other
     character, so the output ends with a zero unit only if the input
     ended with a zero byte.

   - UTF-8 aliases (overlong forms) are *not* treated as errors. For
     example 0xC0 0xA0, 0xE0 0x80 0xA0 and 0xF0 0x80 0x80 0xA0 are all
     converted to the single code unit 0x0020.

   - Three byte sequences that encode a surrogate or another reserved
     value are not treated as errors either, so the output may be invalid
     UTF-16 (for instance, it may contain unpaired surrogates). */
int utf8ToUtf16(const char *buf, const int utf8_length, uint16_t *utf16, const int utf16_max_length) {

    if (utf8_length <= 0) {
        /* Nothing to convert; the pointers are not touched at all. */
        return 0;
    }
    if (!buf || !utf16) {
        return -1;
    }

    const unsigned char *utf8 = reinterpret_cast<const unsigned char *>(buf);
    int in = 0;   /* index of the next input byte */
    int out = 0;  /* index of the next free output slot */

    while (in < utf8_length) {
        if (out >= utf16_max_length) {
            /* No more output space. */
            return -1;
        }

        const unsigned char c = utf8[in];
        /* Number of bytes available after the leader byte. */
        const int followers = utf8_length - in - 1;

        if (c < 0x80) {
            /* One byte ASCII. */
            utf16[out++] = c;
            in += 1;
        } else if (c < 0xC0) {
            /* Follower byte without a preceding leader byte. */
            return -1;
        } else if (c < 0xE0) {
            /* Two byte sequence: one follower byte is needed. */
            if (followers < 1 || !isUtf8Follower(utf8[in + 1])) {
                return -1;
            }
            utf16[out++] = static_cast<uint16_t>(((c & 0x1F) << 6) | (utf8[in + 1] & 0x3F));
            in += 2;
        } else if (c < 0xF0) {
            /* Three byte sequence: two follower bytes are needed. */
            if (followers < 2 || !isUtf8Follower(utf8[in + 1]) || !isUtf8Follower(utf8[in + 2])) {
                return -1;
            }
            utf16[out++] = static_cast<uint16_t>(((c & 0x0F) << 12)
                                                 | ((utf8[in + 1] & 0x3F) << 6)
                                                 | (utf8[in + 2] & 0x3F));
            in += 3;
        } else if (c < 0xF8) {
            /* Four byte sequence: three follower bytes are needed. */
            if (followers < 3 || !isUtf8Follower(utf8[in + 1]) || !isUtf8Follower(utf8[in + 2])
                || !isUtf8Follower(utf8[in + 3])) {
                return -1;
            }
            /* Decode with explicit masks. The previous plane computation
               relied on 8-bit wrap-around of (c << 2), which does not
               happen after integer promotion, so every four byte
               sequence was rejected. */
            const unsigned long cp = (static_cast<unsigned long>(c & 0x07) << 18)
                                     | (static_cast<unsigned long>(utf8[in + 1] & 0x3F) << 12)
                                     | (static_cast<unsigned long>(utf8[in + 2] & 0x3F) << 6)
                                     | static_cast<unsigned long>(utf8[in + 3] & 0x3F);
            if (cp < 0x10000UL) {
                /* Alias of a BMP value: fits in one UTF-16 code unit. */
                utf16[out++] = static_cast<uint16_t>(cp);
            } else if (cp <= 0x10FFFFUL) {
                /* Supplementary plane: needs a surrogate pair. */
                if (out + 1 >= utf16_max_length) {
                    /* Not enough space in the output buffer for the pair. */
                    return -1;
                }
                const unsigned long offset = cp - 0x10000UL;
                utf16[out++] = static_cast<uint16_t>(0xD800UL + (offset >> 10));
                utf16[out++] = static_cast<uint16_t>(0xDC00UL + (offset & 0x3FFUL));
            } else {
                /* Outside the UTF-16 code space. */
                return -1;
            }
            in += 4;
        } else {
            /* Longer sequence or unused byte. */
            return -1;
        }
    }
    return out;
}

/* Compares the '\0'-terminated string ascii with the first utf16Len code
   units of utf16. Returns true only when both have the same length and
   every unit equals the corresponding ASCII character.

   A byte above 0x7F in ascii never matches. Each char is read as
   unsigned, so the result does not depend on whether plain char is
   signed (a sign-extended 0xE9 used to compare equal to U+FFE9).

   Both pointers must be valid; utf16 is only read while utf16Len > 0. */
bool utf16EqualToAscii(const char *ascii, uint16_t *utf16, int utf16Len) {

    while (*ascii != 0 && utf16Len > 0) {
        const unsigned char ch = static_cast<unsigned char>(*ascii++);
        if (ch > 0x7F || *utf16++ != ch) {
            return false;
        }
        utf16Len--;
    }
    return *ascii == 0 && utf16Len == 0;
}

/* Converts up to utf16Len UTF-16 code units from utf16Buf into a UTF-8
   encoded std::string. Conversion stops early at the first zero unit.
   Returns an empty string when utf16Buf is null or utf16Len is zero or
   negative.

   A high surrogate immediately followed by a low surrogate is combined
   into one four byte UTF-8 sequence. An unpaired surrogate is passed
   through as a three byte sequence, as the previous implementation did
   for every surrogate. */
std::string utf16BufToUtf8String(const uint16_t *utf16Buf, int utf16Len) {

    std::string outStr;
    if (!utf16Buf || utf16Len <= 0) {
        /* Also keeps a negative length away from reserve(). */
        return outStr;
    }
    outStr.reserve(static_cast<std::string::size_type>(utf16Len) * 2);

    int i = 0;
    /* Check the bound before reading, so an unterminated buffer of
       exactly utf16Len units is never read past its end. */
    while (i < utf16Len && utf16Buf[i] != 0) {
        unsigned long cp = utf16Buf[i++];
        if (cp >= 0xD800UL && cp <= 0xDBFFUL && i < utf16Len) {
            const unsigned long low = utf16Buf[i];
            if (low >= 0xDC00UL && low <= 0xDFFFUL) {
                cp = 0x10000UL + ((cp - 0xD800UL) << 10) + (low - 0xDC00UL);
                ++i;
            }
        }
        appendUtf8Value(outStr, cp);
    }
    return outStr;
}

} /* namespace Util */
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 192783
:
145959
|
145990
|
145992
|
145993
|
145994
|
145995
|
145996
|
145997
|
145998
|
146000
|
146003
|
146476