Lines 67-69
Link Here
|
67 |
|
67 |
|
68 |
return pid + tm + (ts.tv_sec ^ ts.tv_nsec); |
68 |
return pid + tm + (ts.tv_sec ^ ts.tv_nsec); |
69 |
} |
69 |
} |
|
|
70 |
|
71 |
/* Given a UTF-8 encoded string pointed to by utf8 of length length in |
72 |
bytes, returns the corresponding UTF-16 encoded string in the |
73 |
buffer pointed to by utf16. The maximum number of UTF-16 encoding |
74 |
units (i.e., uint16_t values) allowed in the buffer is specified in |
75 |
utf16_max_length. The return value is the number of UTF-16 |
76 |
encoding units placed in the output buffer pointed to by utf16. |
77 |
|
78 |
In case of an error, -1 is returned, leaving some unusable partial |
79 |
results in the output buffer. |
80 |
|
81 |
The caller must estimate the size of utf16 buffer by itself before |
82 |
calling this function. Insufficient output buffer is considered as |
83 |
an error, and once an error has occurred, this function doesn't give any |
84 |
clue how large the result will be. |
85 |
|
86 |
The error cases include the following: |
87 |
|
88 |
- Invalid byte sequences were in the input UTF-8 bytes. The caller |
89 |
has no way to know what point in the input buffer was the |
90 |
erroneous byte. |
91 |
|
92 |
- The input contained a character (a valid UTF-8 byte sequence) |
93 |
whose scalar value exceeded the range that UTF-16 can represent |
94 |
(i.e., characters whose Unicode scalar value is above 0x10FFFF). |
95 |
|
96 |
- The output buffer does not have enough space to hold the entire UTF-16 data. |
97 |
|
98 |
Please note: |
99 |
|
100 |
- '\0'-termination is assumed neither on the input UTF-8 string |
101 |
nor on the output UTF-16 string; any legal zero byte in the input |
102 |
UTF-8 string will be converted to a 16-bit zero in output. As a |
103 |
side effect, the last UTF-16 encoding unit stored in the output |
104 |
buffer will have a non-zero value if the input UTF-8 was not |
105 |
'\0'-terminated. |
106 |
|
107 |
- UTF-8 aliases are *not* considered as an error. They are |
108 |
converted to UTF-16. For example, 0xC0 0xA0, 0xE0 0x80 0xA0, |
109 |
and 0xF0 0x80 0x80 0xA0 are all mapped to a single UTF-16 |
110 |
encoding unit 0x0020. |
111 |
|
112 |
- Three byte UTF-8 sequences whose value corresponds to a surrogate |
113 |
code or other reserved scalar value are not considered as an |
114 |
error either. They may produce invalid UTF-16 data (e.g., data |
115 |
containing unpaired surrogates). |
116 |
|
117 |
*/ |
118 |
int Util::utf8ToUtf16(const char *buf, const int utf8_length, uint16_t *utf16, const int utf16_max_length) {

    /* Converts the utf8_length bytes at buf into UTF-16 encoding units
       stored in utf16, which can hold utf16_max_length units. Returns the
       number of units written, or -1 if the input is malformed, encodes a
       scalar value beyond plane 16, or does not fit in the output buffer.

       The additive constants used below (0xCF80, 0xDF80, 0xE5B8, 0xDB80)
       cancel the UTF-8 tag bits of the bytes being combined. The sums are
       only meaningful modulo 2^16, which is what the (uint16_t) casts
       provide. */

    /* An empty input produces no output. Returning here also keeps the
       code from touching buf when there is nothing to read. */
    if (utf8_length <= 0) {
        return 0;
    }

    const unsigned char *const utf8 = (const unsigned char *)buf;
    int in_pos = 0;   /* index of the next input byte to decode */
    int out_pos = 0;  /* number of UTF-16 units written so far */

    while (in_pos < utf8_length) {
        const unsigned char *const seq = utf8 + in_pos;
        const unsigned char c = seq[0];
        /* Number of bytes that follow the leader byte in the input. */
        const int followers_available = utf8_length - in_pos - 1;

        if (out_pos >= utf16_max_length) {
            /* No more output space. */
            return -1;
        }

        if (c < 0x80) {
            /* One byte ASCII. */
            utf16[out_pos++] = c;
            in_pos += 1;
        } else if (c < 0xC0) {
            /* Follower byte without a preceding leader byte. */
            return -1;
        } else if (c < 0xE0) {
            /* Two byte sequence. We need one follower byte (10xxxxxx). */
            if (followers_available < 1 || ((seq[1] ^ 0x80) & 0xC0)) {
                return -1;
            }
            utf16[out_pos++] = (uint16_t)(0xCF80 + (c << 6) + seq[1]);
            in_pos += 2;
        } else if (c < 0xF0) {
            /* Three byte sequence. We need two follower bytes. */
            if (followers_available < 2 || (((seq[1] ^ 0x80) | (seq[2] ^ 0x80)) & 0xC0)) {
                return -1;
            }
            utf16[out_pos++] = (uint16_t)(0xDF80 + (c << 12) + (seq[1] << 6) + seq[2]);
            in_pos += 3;
        } else if (c < 0xF8) {
            /* Four byte sequence. We need three follower bytes. */
            if (followers_available < 3
                || (((seq[1] ^ 0x80) | (seq[2] ^ 0x80) | (seq[3] ^ 0x80)) & 0xC0)) {
                return -1;
            }
            /* Unicode plane = bits 16..20 of the scalar value: the three
               payload bits of the leader byte followed by the top two
               payload bits of the first follower. It is extracted with
               explicit masks because a biased sum evaluated as int never
               yields a value in 0..16. */
            const int plane = ((c & 0x07) << 2) | ((seq[1] >> 4) & 0x03);
            if (plane == 0) {
                /* This four byte sequence is an alias that corresponds to
                   a Unicode scalar value in the BMP. It fits in a single
                   UTF-16 encoding unit. */
                utf16[out_pos++] = (uint16_t)(0xDF80 + (seq[1] << 12) + (seq[2] << 6) + seq[3]);
            } else if (plane <= 16) {
                /* A legal supplementary-plane character: emit a surrogate pair. */
                if (out_pos + 1 >= utf16_max_length) {
                    /* Not enough space in the output buffer for the pair. */
                    return -1;
                }
                /* High surrogate, then low surrogate. */
                utf16[out_pos++] = (uint16_t)(0xE5B8 + (c << 8) + (seq[1] << 2) + (seq[2] >> 4));
                utf16[out_pos++] = (uint16_t)(0xDB80 + ((seq[2] & 0x0F) << 6) + seq[3]);
            } else {
                /* This four byte sequence is outside the UTF-16 code space. */
                return -1;
            }
            in_pos += 4;
        } else {
            /* Longer sequence or unused byte. */
            return -1;
        }
    }
    return out_pos;
}
187 |
|
188 |
/* Compare an ASCII string with an UTF-16 string */ |
189 |
bool Util::utf16EqualToAscii(const char *ascii, uint16_t *utf16, int utf16Len) { |
190 |
|
191 |
while(*ascii != 0 && utf16Len > 0) { |
192 |
if(*utf16++ != (uint16_t)*ascii++) { |
193 |
return false; |
194 |
} |
195 |
utf16Len--; |
196 |
} |
197 |
return *ascii == 0 && utf16Len == 0; |
198 |
} |
199 |
|
200 |
std::string Util::utf16BufToUtf8String(const uint16_t *utf16Buf, int utf16Len) {

    /* Converts up to utf16Len UTF-16 encoding units at utf16Buf into a
       UTF-8 std::string. Conversion stops early at the first zero unit,
       which is treated as a terminator and not copied. A high surrogate
       immediately followed by a low surrogate is emitted as one four byte
       sequence; an unpaired surrogate is emitted as a three byte sequence
       of its own value. Returns an empty string for a null buffer or a
       non-positive length. */

    std::string outStr;
    if (!utf16Buf || utf16Len <= 0) {
        return outStr;
    }

    /* Capacity hint only; the string still grows if the text needs more. */
    outStr.reserve((std::string::size_type)utf16Len * 2);

    int pos = 0;
    /* The bound is tested before the unit is read so that the buffer is
       never accessed at index utf16Len. */
    while (pos < utf16Len && utf16Buf[pos] != 0) {

        const uint32_t unit = utf16Buf[pos++];

        if (unit <= 0x007F) {
            /* One byte: 0xxxxxxx */
            outStr.push_back((char)unit);
        }
        else if (unit <= 0x07FF) {
            /* Two bytes: 110xxxxx 10xxxxxx */
            outStr.push_back((char)(0xC0 | (unit >> 6)));
            outStr.push_back((char)(0x80 | (unit & 0x3F)));
        }
        else if (unit >= 0xD800 && unit <= 0xDBFF
                 && pos < utf16Len
                 && utf16Buf[pos] >= 0xDC00 && utf16Buf[pos] <= 0xDFFF) {
            /* Surrogate pair: rebuild the supplementary scalar value and
               write it as 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. */
            const uint32_t low = utf16Buf[pos++];
            const uint32_t scalar = 0x10000 + ((unit - 0xD800) << 10) + (low - 0xDC00);
            outStr.push_back((char)(0xF0 | (scalar >> 18)));
            outStr.push_back((char)(0x80 | ((scalar >> 12) & 0x3F)));
            outStr.push_back((char)(0x80 | ((scalar >> 6) & 0x3F)));
            outStr.push_back((char)(0x80 | (scalar & 0x3F)));
        }
        else {
            /* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
            outStr.push_back((char)(0xE0 | (unit >> 12)));
            outStr.push_back((char)(0x80 | ((unit >> 6) & 0x3F)));
            outStr.push_back((char)(0x80 | (unit & 0x3F)));
        }
    }
    return outStr;
}