1 |
|
|
/* $OpenBSD: citrus_utf8.c,v 1.18 2016/09/07 17:15:06 schwarze Exp $ */ |
2 |
|
|
|
3 |
|
|
/*- |
4 |
|
|
* Copyright (c) 2002-2004 Tim J. Robbins |
5 |
|
|
* All rights reserved. |
6 |
|
|
* |
7 |
|
|
* Redistribution and use in source and binary forms, with or without |
8 |
|
|
* modification, are permitted provided that the following conditions |
9 |
|
|
* are met: |
10 |
|
|
* 1. Redistributions of source code must retain the above copyright |
11 |
|
|
* notice, this list of conditions and the following disclaimer. |
12 |
|
|
* 2. Redistributions in binary form must reproduce the above copyright |
13 |
|
|
* notice, this list of conditions and the following disclaimer in the |
14 |
|
|
* documentation and/or other materials provided with the distribution. |
15 |
|
|
* |
16 |
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
17 |
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 |
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 |
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
20 |
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
21 |
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
22 |
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
23 |
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
24 |
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
25 |
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
26 |
|
|
* SUCH DAMAGE. |
27 |
|
|
*/ |
28 |
|
|
|
29 |
|
|
#include <sys/types.h> |
30 |
|
|
|
31 |
|
|
#include <errno.h> |
32 |
|
|
#include <string.h> |
33 |
|
|
#include <wchar.h> |
34 |
|
|
|
35 |
|
|
#include "citrus_ctype.h" |
36 |
|
|
|
37 |
|
|
/*
 * Conversion state carried between calls, overlaid on the caller's
 * mbstate_t.  A state with want == 0 is the initial state.
 */
struct _utf8_state {
	wchar_t	ch;		/* bits decoded so far of a partial character */
	int	want;		/* continuation bytes still needed; 0 if none */
	wchar_t	lbound;		/* smallest value legal for this sequence length */
};
42 |
|
|
|
43 |
|
|
size_t |
44 |
|
|
_citrus_utf8_ctype_mbrtowc(wchar_t * __restrict pwc, |
45 |
|
|
const char * __restrict s, size_t n, mbstate_t * __restrict ps) |
46 |
|
|
{ |
47 |
|
|
struct _utf8_state *us; |
48 |
|
|
int ch, i, mask, want; |
49 |
|
|
wchar_t lbound, wch; |
50 |
|
|
|
51 |
|
|
us = (struct _utf8_state *)ps; |
52 |
|
|
|
53 |
|
|
if (us->want < 0 || us->want > _CITRUS_UTF8_MB_CUR_MAX) { |
54 |
|
|
errno = EINVAL; |
55 |
|
|
return -1; |
56 |
|
|
} |
57 |
|
|
|
58 |
|
|
if (s == NULL) { |
59 |
|
|
s = ""; |
60 |
|
|
n = 1; |
61 |
|
|
pwc = NULL; |
62 |
|
|
} |
63 |
|
|
|
64 |
|
|
if (n == 0) |
65 |
|
|
return -2; |
66 |
|
|
|
67 |
|
|
if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) { |
68 |
|
|
/* Fast path for plain ASCII characters. */ |
69 |
|
|
if (pwc != NULL) |
70 |
|
|
*pwc = ch; |
71 |
|
|
return ch != '\0' ? 1 : 0; |
72 |
|
|
} |
73 |
|
|
|
74 |
|
|
if (us->want == 0) { |
75 |
|
|
/* |
76 |
|
|
* Determine the number of bytes that make up this character |
77 |
|
|
* from the first byte, and a mask that extracts the |
78 |
|
|
* interesting bits of the first byte. We already know |
79 |
|
|
* the character is at least two bytes long. |
80 |
|
|
* |
81 |
|
|
* We also specify a lower bound for the character code to |
82 |
|
|
* detect redundant, non-"shortest form" encodings. For |
83 |
|
|
* example, the sequence C0 80 is _not_ a legal representation |
84 |
|
|
* of the null character. This enforces a 1-to-1 mapping |
85 |
|
|
* between character codes and their multibyte representations. |
86 |
|
|
*/ |
87 |
|
|
ch = (unsigned char)*s; |
88 |
|
|
if ((ch & 0x80) == 0) { |
89 |
|
|
mask = 0x7f; |
90 |
|
|
want = 1; |
91 |
|
|
lbound = 0; |
92 |
|
|
} else if ((ch & 0xe0) == 0xc0) { |
93 |
|
|
mask = 0x1f; |
94 |
|
|
want = 2; |
95 |
|
|
lbound = 0x80; |
96 |
|
|
} else if ((ch & 0xf0) == 0xe0) { |
97 |
|
|
mask = 0x0f; |
98 |
|
|
want = 3; |
99 |
|
|
lbound = 0x800; |
100 |
|
|
} else if ((ch & 0xf8) == 0xf0) { |
101 |
|
|
mask = 0x07; |
102 |
|
|
want = 4; |
103 |
|
|
lbound = 0x10000; |
104 |
|
|
} else { |
105 |
|
|
/* |
106 |
|
|
* Malformed input; input is not UTF-8. |
107 |
|
|
* See RFC 3629. |
108 |
|
|
*/ |
109 |
|
|
errno = EILSEQ; |
110 |
|
|
return -1; |
111 |
|
|
} |
112 |
|
|
} else { |
113 |
|
|
want = us->want; |
114 |
|
|
lbound = us->lbound; |
115 |
|
|
} |
116 |
|
|
|
117 |
|
|
/* |
118 |
|
|
* Decode the byte sequence representing the character in chunks |
119 |
|
|
* of 6 bits, most significant first. |
120 |
|
|
*/ |
121 |
|
|
if (us->want == 0) |
122 |
|
|
wch = (unsigned char)*s++ & mask; |
123 |
|
|
else |
124 |
|
|
wch = us->ch; |
125 |
|
|
for (i = (us->want == 0) ? 1 : 0; i < want && (size_t)i < n; i++) { |
126 |
|
|
if ((*s & 0xc0) != 0x80) { |
127 |
|
|
/* |
128 |
|
|
* Malformed input; bad byte in the middle |
129 |
|
|
* of a character. |
130 |
|
|
*/ |
131 |
|
|
errno = EILSEQ; |
132 |
|
|
return -1; |
133 |
|
|
} |
134 |
|
|
wch <<= 6; |
135 |
|
|
wch |= *s++ & 0x3f; |
136 |
|
|
} |
137 |
|
|
if (i < want) { |
138 |
|
|
/* Incomplete multibyte sequence. */ |
139 |
|
|
us->want = want - i; |
140 |
|
|
us->lbound = lbound; |
141 |
|
|
us->ch = wch; |
142 |
|
|
return -2; |
143 |
|
|
} |
144 |
|
|
if (wch < lbound) { |
145 |
|
|
/* |
146 |
|
|
* Malformed input; redundant encoding. |
147 |
|
|
*/ |
148 |
|
|
errno = EILSEQ; |
149 |
|
|
return -1; |
150 |
|
|
} |
151 |
|
|
if (wch >= 0xd800 && wch <= 0xdfff) { |
152 |
|
|
/* |
153 |
|
|
* Malformed input; invalid code points. |
154 |
|
|
*/ |
155 |
|
|
errno = EILSEQ; |
156 |
|
|
return -1; |
157 |
|
|
} |
158 |
|
|
if (wch > 0x10ffff) { |
159 |
|
|
/* |
160 |
|
|
* Malformed input; invalid code points. |
161 |
|
|
*/ |
162 |
|
|
errno = EILSEQ; |
163 |
|
|
return -1; |
164 |
|
|
} |
165 |
|
|
if (pwc != NULL) |
166 |
|
|
*pwc = wch; |
167 |
|
|
us->want = 0; |
168 |
|
|
return wch == L'\0' ? 0 : want; |
169 |
|
|
} |
170 |
|
|
|
171 |
|
|
int |
172 |
|
|
_citrus_utf8_ctype_mbsinit(const mbstate_t * __restrict ps) |
173 |
|
|
{ |
174 |
|
|
return ((const struct _utf8_state *)ps)->want == 0; |
175 |
|
|
} |
176 |
|
|
|
177 |
|
|
size_t |
178 |
|
|
_citrus_utf8_ctype_mbsnrtowcs(wchar_t * __restrict dst, |
179 |
|
|
const char ** __restrict src, size_t nmc, size_t len, |
180 |
|
|
mbstate_t * __restrict ps) |
181 |
|
|
{ |
182 |
|
|
struct _utf8_state *us; |
183 |
|
|
size_t i, o, r; |
184 |
|
|
|
185 |
|
|
us = (struct _utf8_state *)ps; |
186 |
|
|
|
187 |
|
|
if (dst == NULL) { |
188 |
|
|
/* |
189 |
|
|
* The fast path in the loop below is not safe if an ASCII |
190 |
|
|
* character appears as anything but the first byte of a |
191 |
|
|
* multibyte sequence. Check now to avoid doing it in the loop. |
192 |
|
|
*/ |
193 |
|
|
if (nmc > 0 && us->want > 0 && (unsigned char)(*src)[0] < 0x80) { |
194 |
|
|
errno = EILSEQ; |
195 |
|
|
return -1; |
196 |
|
|
} |
197 |
|
|
for (i = o = 0; i < nmc; i += r, o++) { |
198 |
|
|
if ((unsigned char)(*src)[i] < 0x80) { |
199 |
|
|
/* Fast path for plain ASCII characters. */ |
200 |
|
|
if ((*src)[i] == '\0') |
201 |
|
|
return o; |
202 |
|
|
r = 1; |
203 |
|
|
} else { |
204 |
|
|
r = _citrus_utf8_ctype_mbrtowc(NULL, *src + i, |
205 |
|
|
nmc - i, ps); |
206 |
|
|
if (r == (size_t)-1) |
207 |
|
|
return r; |
208 |
|
|
if (r == (size_t)-2) |
209 |
|
|
return o; |
210 |
|
|
if (r == 0) |
211 |
|
|
return o; |
212 |
|
|
} |
213 |
|
|
} |
214 |
|
|
return o; |
215 |
|
|
} |
216 |
|
|
|
217 |
|
|
/* |
218 |
|
|
* The fast path in the loop below is not safe if an ASCII |
219 |
|
|
* character appears as anything but the first byte of a |
220 |
|
|
* multibyte sequence. Check now to avoid doing it in the loop. |
221 |
|
|
*/ |
222 |
|
|
if (len > 0 && nmc > 0 && us->want > 0 && |
223 |
|
|
(unsigned char)(*src)[0] < 0x80) { |
224 |
|
|
errno = EILSEQ; |
225 |
|
|
return -1; |
226 |
|
|
} |
227 |
|
|
for (i = o = 0; i < nmc && o < len; i += r, o++) { |
228 |
|
|
if ((unsigned char)(*src)[i] < 0x80) { |
229 |
|
|
/* Fast path for plain ASCII characters. */ |
230 |
|
|
dst[o] = (wchar_t)(unsigned char)(*src)[i]; |
231 |
|
|
if ((*src)[i] == '\0') { |
232 |
|
|
*src = NULL; |
233 |
|
|
return o; |
234 |
|
|
} |
235 |
|
|
r = 1; |
236 |
|
|
} else { |
237 |
|
|
r = _citrus_utf8_ctype_mbrtowc(dst + o, *src + i, |
238 |
|
|
nmc - i, ps); |
239 |
|
|
if (r == (size_t)-1) { |
240 |
|
|
*src += i; |
241 |
|
|
return r; |
242 |
|
|
} |
243 |
|
|
if (r == (size_t)-2) { |
244 |
|
|
*src += nmc; |
245 |
|
|
return o; |
246 |
|
|
} |
247 |
|
|
if (r == 0) { |
248 |
|
|
*src = NULL; |
249 |
|
|
return o; |
250 |
|
|
} |
251 |
|
|
} |
252 |
|
|
} |
253 |
|
|
*src += i; |
254 |
|
|
return o; |
255 |
|
|
} |
256 |
|
|
|
257 |
|
|
size_t |
258 |
|
|
_citrus_utf8_ctype_wcrtomb(char * __restrict s, wchar_t wc, |
259 |
|
|
mbstate_t * __restrict ps) |
260 |
|
|
{ |
261 |
|
|
struct _utf8_state *us; |
262 |
|
|
unsigned char lead; |
263 |
|
|
int i, len; |
264 |
|
|
|
265 |
|
|
us = (struct _utf8_state *)ps; |
266 |
|
|
|
267 |
|
|
if (us->want != 0) { |
268 |
|
|
errno = EINVAL; |
269 |
|
|
return -1; |
270 |
|
|
} |
271 |
|
|
|
272 |
|
|
if (s == NULL) |
273 |
|
|
return 1; |
274 |
|
|
|
275 |
|
|
if (wc < 0 || (wc > 0xd7ff && wc < 0xe000) || wc > 0x10ffff) { |
276 |
|
|
errno = EILSEQ; |
277 |
|
|
return -1; |
278 |
|
|
} |
279 |
|
|
|
280 |
|
|
/* |
281 |
|
|
* Determine the number of bytes needed to represent this character. |
282 |
|
|
* We always output the shortest sequence possible. Also specify the |
283 |
|
|
* first few bits of the first byte, which contains the information |
284 |
|
|
* about the sequence length. |
285 |
|
|
*/ |
286 |
|
|
if (wc <= 0x7f) { |
287 |
|
|
/* Fast path for plain ASCII characters. */ |
288 |
|
|
*s = (char)wc; |
289 |
|
|
return 1; |
290 |
|
|
} else if (wc <= 0x7ff) { |
291 |
|
|
lead = 0xc0; |
292 |
|
|
len = 2; |
293 |
|
|
} else if (wc <= 0xffff) { |
294 |
|
|
lead = 0xe0; |
295 |
|
|
len = 3; |
296 |
|
|
} else { |
297 |
|
|
lead = 0xf0; |
298 |
|
|
len = 4; |
299 |
|
|
} |
300 |
|
|
|
301 |
|
|
/* |
302 |
|
|
* Output the bytes representing the character in chunks |
303 |
|
|
* of 6 bits, least significant last. The first byte is |
304 |
|
|
* a special case because it contains the sequence length |
305 |
|
|
* information. |
306 |
|
|
*/ |
307 |
|
|
for (i = len - 1; i > 0; i--) { |
308 |
|
|
s[i] = (wc & 0x3f) | 0x80; |
309 |
|
|
wc >>= 6; |
310 |
|
|
} |
311 |
|
|
*s = (wc & 0xff) | lead; |
312 |
|
|
|
313 |
|
|
return len; |
314 |
|
|
} |
315 |
|
|
|
316 |
|
|
size_t |
317 |
|
|
_citrus_utf8_ctype_wcsnrtombs(char * __restrict dst, |
318 |
|
|
const wchar_t ** __restrict src, size_t nwc, size_t len, |
319 |
|
|
mbstate_t * __restrict ps) |
320 |
|
|
{ |
321 |
|
|
struct _utf8_state *us; |
322 |
|
|
char buf[_CITRUS_UTF8_MB_CUR_MAX]; |
323 |
|
|
size_t i, o, r; |
324 |
|
|
|
325 |
|
|
us = (struct _utf8_state *)ps; |
326 |
|
|
|
327 |
|
|
if (us->want != 0) { |
328 |
|
|
errno = EINVAL; |
329 |
|
|
return -1; |
330 |
|
|
} |
331 |
|
|
|
332 |
|
|
if (dst == NULL) { |
333 |
|
|
for (i = o = 0; i < nwc; i++, o += r) { |
334 |
|
|
wchar_t wc = (*src)[i]; |
335 |
|
|
if (wc >= 0 && wc < 0x80) { |
336 |
|
|
/* Fast path for plain ASCII characters. */ |
337 |
|
|
if (wc == 0) |
338 |
|
|
return o; |
339 |
|
|
r = 1; |
340 |
|
|
} else { |
341 |
|
|
r = _citrus_utf8_ctype_wcrtomb(buf, wc, ps); |
342 |
|
|
if (r == (size_t)-1) |
343 |
|
|
return r; |
344 |
|
|
} |
345 |
|
|
} |
346 |
|
|
return o; |
347 |
|
|
} |
348 |
|
|
|
349 |
|
|
for (i = o = 0; i < nwc && o < len; i++, o += r) { |
350 |
|
|
wchar_t wc = (*src)[i]; |
351 |
|
|
if (wc >= 0 && wc < 0x80) { |
352 |
|
|
/* Fast path for plain ASCII characters. */ |
353 |
|
|
dst[o] = (wchar_t)wc; |
354 |
|
|
if (wc == 0) { |
355 |
|
|
*src = NULL; |
356 |
|
|
return o; |
357 |
|
|
} |
358 |
|
|
r = 1; |
359 |
|
|
} else if (len - o >= _CITRUS_UTF8_MB_CUR_MAX) { |
360 |
|
|
/* Enough space to translate in-place. */ |
361 |
|
|
r = _citrus_utf8_ctype_wcrtomb(dst + o, wc, ps); |
362 |
|
|
if (r == (size_t)-1) { |
363 |
|
|
*src += i; |
364 |
|
|
return r; |
365 |
|
|
} |
366 |
|
|
} else { |
367 |
|
|
/* May not be enough space; use temp buffer. */ |
368 |
|
|
r = _citrus_utf8_ctype_wcrtomb(buf, wc, ps); |
369 |
|
|
if (r == (size_t)-1) { |
370 |
|
|
*src += i; |
371 |
|
|
return r; |
372 |
|
|
} |
373 |
|
|
if (r > len - o) |
374 |
|
|
break; |
375 |
|
|
memcpy(dst + o, buf, r); |
376 |
|
|
} |
377 |
|
|
} |
378 |
|
|
*src += i; |
379 |
|
|
return o; |
380 |
|
|
} |