View | Details | Raw Unified | Return to bug 192783 | Differences between
and this patch

Collapse All | Expand All

(-)util.cpp (+159 lines)
Lines 67-69 Link Here
67
67
68
	return pid + tm + (ts.tv_sec ^ ts.tv_nsec);
68
	return pid + tm + (ts.tv_sec ^ ts.tv_nsec);
69
}
69
}
70
71
/* Given a UTF-8 encoded string pointed to by utf8 of length length in
72
bytes, returns the corresponding UTF-16 encoded string in the
73
buffer pointed to by utf16.  The maximum number of UTF-16 encoding
74
units (i.e., Unit16s) allowed in the buffer is specified in
75
utf16_max_length.  The return value is the number of UTF-16
76
encoding units placed in the output buffer pointed to by utf16.
77
78
In case of an error, -1 is returned, leaving some unusable partial
79
results in the output buffer.
80
81
The caller must estimate the size of utf16 buffer by itself before
82
calling this function.  Insufficient output buffer is considered as
83
an error, and once an error occured, this function doesn't give any
84
clue how large the result will be.
85
86
The error cases include following:
87
88
- Invalid byte sequences were in the input UTF-8 bytes.  The caller
89
	has no way to know what point in the input buffer was the
90
	errornous byte.
91
92
- The input contained a character (a valid UTF-8 byte sequence)
93
	whose scalar value exceeded the range that UTF-16 can represent
94
	(i.e., characters whose Unicode scalar value above 0x110000).
95
96
- The output buffer has no enough space to hold entire utf16 data.
97
98
Please note:
99
100
- '\0'-termination is not assumed both on the input UTF-8 string
101
	and on the output UTF-16 string; any legal zero byte in the input
102
	UTF-8 string will be converted to a 16-bit zero in output.  As a
103
	side effect, the last UTF-16 encoding unit stored in the output
104
	buffer will have a non-zero value if the input UTF-8 was not
105
	'\0'-terminated.
106
107
- UTF-8 aliases are *not* considered as an error.  They are
108
	converted to UTF-16.  For example, 0xC0 0xA0, 0xE0 0x80 0xA0,
109
	and 0xF0 0x80 0x80 0xA0 are all mapped to a single UTF-16
110
	encoding unit 0x0020.
111
112
- Three byte UTF-8 sequences whose value corresponds to a surrogate
113
	code or other reserved scalar value are not considered as an
114
	error either.  They may cause an invalid UTF-16 data (e.g., those
115
	containing unpaired surrogates).
116
117
*/
118
int Util::utf8ToUtf16(const char *buf, const int utf8_length, uint16_t *utf16, const int utf16_max_length) {
119
120
	/* p moves over the output buffer.  max_ptr points to the next to the last slot of the buffer.  */
121
	uint16_t *p = utf16;
122
	const uint16_t *max_ptr = utf16 + utf16_max_length;
123
	const unsigned char *utf8 = (const unsigned char *)buf;
124
125
	/* end_of_input points to the last byte of input as opposed to the next to the last byte.  */
126
	unsigned char const *const end_of_input = utf8 + utf8_length - 1;
127
128
	while (utf8 <= end_of_input) {
129
		const unsigned char c = *utf8;
130
		if (p >= max_ptr) {
131
			/* No more output space.  */
132
			return -1;
133
		}
134
		if (c < 0x80) {
135
			/* One byte ASCII.  */
136
			*p++ = c;
137
			utf8 += 1;
138
		} else if (c < 0xC0) {
139
			/* Follower byte without preceeding leader bytes.  */
140
			return -1;
141
		} else if (c < 0xE0) {
142
			/* Two byte sequence.  We need one follower byte.  */
143
			if (end_of_input - utf8 < 1 || (((utf8[1] ^ 0x80)) & 0xC0)) {
144
				return -1;
145
			}
146
			*p++ = (uint16_t)(0xCF80 + (c << 6) + utf8[1]);
147
			utf8 += 2;
148
		} else if (c < 0xF0) {
149
			/* Three byte sequence.  We need two follower byte.  */
150
			if (end_of_input - utf8 < 2 || (((utf8[1] ^ 0x80) | (utf8[2] ^ 0x80)) & 0xC0)) {
151
				return -1;
152
			}
153
			*p++ = (uint16_t)(0xDF80 + (c << 12) + (utf8[1] << 6) + utf8[2]);
154
			utf8 += 3;
155
		} else if (c < 0xF8) {
156
			int plane;
157
			/* Four byte sequence.  We need three follower bytes.  */
158
			if (end_of_input - utf8 < 3 || (((utf8[1] ^ 0x80) | (utf8[2] ^0x80) | (utf8[3] ^ 0x80)) & 0xC0)) {
159
				return -1;
160
			}
161
			plane = (-0xC8 + (c << 2) + (utf8[1] >> 4));
162
			if (plane == 0) {
163
				/* This four byte sequence is an alias that
164
						corresponds to a Unicode scalar value in BMP.
165
				It fits in an UTF-16 encoding unit.  */
166
				*p++ = (uint16_t)(0xDF80 + (utf8[1] << 12) + (utf8[2] << 6) + utf8[3]);
167
			} else if (plane <= 16) {
168
				/* This is a legal four byte sequence that corresponds to a surrogate pair.  */
169
				if (p + 1 >= max_ptr) {
170
					/* No enough space on the output buffer for the pair.  */
171
					return -1;
172
				}
173
				*p++ = (uint16_t)(0xE5B8 + (c << 8) + (utf8[1] << 2) + (utf8[2] >> 4));
174
				*p++ = (uint16_t)(0xDB80 + ((utf8[2] & 0x0F) << 6) + utf8[3]);
175
			} else {
176
				/* This four byte sequence is out of UTF-16 code space.  */
177
				return -1;
178
			}
179
			utf8 += 4;
180
		} else {
181
			/* Longer sequence or unused byte.  */
182
			return -1;
183
		}
184
	}
185
	return p - utf16;
186
}
187
188
/* Compare an ASCII string with an UTF-16 string */
189
bool Util::utf16EqualToAscii(const char *ascii, uint16_t *utf16, int utf16Len) {
190
191
	while(*ascii != 0 && utf16Len > 0) {
192
		if(*utf16++ != (uint16_t)*ascii++) {
193
			return false;
194
		}
195
		utf16Len--;
196
	}
197
	return *ascii == 0 && utf16Len == 0;
198
}
199
200
/* Converts the UTF-16 encoding units in utf16Buf to a UTF-8 std::string.
   Conversion stops at the first zero unit or after utf16Len units,
   whichever comes first.  A null buffer or a non-positive utf16Len yields
   an empty string.

   A high surrogate immediately followed by a low surrogate is combined
   and written as a single four byte UTF-8 sequence.  An unpaired
   surrogate is written as a three byte sequence, like any other unit
   above 0x07FF.  */
std::string Util::utf16BufToUtf8String(const uint16_t *utf16Buf, int utf16Len) {

	std::string outStr;
	if (!utf16Buf || utf16Len <= 0) {
		/* Also keeps a negative length from turning into a huge reserve().  */
		return outStr;
	}
	outStr.reserve(static_cast<std::string::size_type>(utf16Len) * 2);

	int i = 0;
	/* The length is tested before the unit is read so that the buffer is
	   never accessed past utf16Len units.  */
	while (i < utf16Len && utf16Buf[i] != 0) {

		uint32_t scalar = utf16Buf[i++];

		/* Combine a surrogate pair into one supplementary scalar value.  */
		if (scalar >= 0xD800 && scalar <= 0xDBFF && i < utf16Len) {
			const uint16_t low = utf16Buf[i];
			if (low >= 0xDC00 && low <= 0xDFFF) {
				scalar = 0x10000 + ((scalar - 0xD800) << 10) + (low - 0xDC00);
				++i;
			}
		}

		if (scalar <= 0x007F) {
			outStr.push_back(static_cast<char>(scalar));
		}
		else if (scalar <= 0x07FF) {
			outStr.push_back(static_cast<char>(0xC0 | (scalar >> 6)));
			outStr.push_back(static_cast<char>(0x80 | (scalar & 0x3F)));
		}
		else if (scalar <= 0xFFFF) {
			outStr.push_back(static_cast<char>(0xE0 | (scalar >> 12)));
			outStr.push_back(static_cast<char>(0x80 | ((scalar >> 6) & 0x3F)));
			outStr.push_back(static_cast<char>(0x80 | (scalar & 0x3F)));
		}
		else {
			outStr.push_back(static_cast<char>(0xF0 | (scalar >> 18)));
			outStr.push_back(static_cast<char>(0x80 | ((scalar >> 12) & 0x3F)));
			outStr.push_back(static_cast<char>(0x80 | ((scalar >> 6) & 0x3F)));
			outStr.push_back(static_cast<char>(0x80 | (scalar & 0x3F)));
		}
	}
	return outStr;
}

Return to bug 192783