1 |
|
|
/* $OpenBSD: utf8.c,v 1.39 2017/06/04 09:02:57 nicm Exp $ */ |
2 |
|
|
|
3 |
|
|
/* |
4 |
|
|
* Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com> |
5 |
|
|
* |
6 |
|
|
* Permission to use, copy, modify, and distribute this software for any |
7 |
|
|
* purpose with or without fee is hereby granted, provided that the above |
8 |
|
|
* copyright notice and this permission notice appear in all copies. |
9 |
|
|
* |
10 |
|
|
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
11 |
|
|
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
12 |
|
|
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
13 |
|
|
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
14 |
|
|
* WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER |
15 |
|
|
* IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING |
16 |
|
|
* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
17 |
|
|
*/ |
18 |
|
|
|
19 |
|
|
#include <sys/types.h> |
20 |
|
|
|
21 |
|
|
#include <errno.h> |
22 |
|
|
#include <stdlib.h> |
23 |
|
|
#include <string.h> |
24 |
|
|
#include <vis.h> |
25 |
|
|
#include <wchar.h> |
26 |
|
|
|
27 |
|
|
#include "tmux.h" |
28 |
|
|
|
29 |
|
|
/* Forward declaration; the definition follows utf8_append() below. */
static int	utf8_width(wchar_t wc);
30 |
|
|
|
31 |
|
|
/* Set a single character. */ |
32 |
|
|
void |
33 |
|
|
utf8_set(struct utf8_data *ud, u_char ch) |
34 |
|
|
{ |
35 |
|
|
static const struct utf8_data empty = { { 0 }, 1, 1, 1 }; |
36 |
|
|
|
37 |
|
|
memcpy(ud, &empty, sizeof *ud); |
38 |
|
|
*ud->data = ch; |
39 |
|
|
} |
40 |
|
|
|
41 |
|
|
/* Copy UTF-8 character. */ |
42 |
|
|
void |
43 |
|
|
utf8_copy(struct utf8_data *to, const struct utf8_data *from) |
44 |
|
|
{ |
45 |
|
|
u_int i; |
46 |
|
|
|
47 |
|
|
memcpy(to, from, sizeof *to); |
48 |
|
|
|
49 |
|
|
for (i = to->size; i < sizeof to->data; i++) |
50 |
|
|
to->data[i] = '\0'; |
51 |
|
|
} |
52 |
|
|
|
53 |
|
|
/* |
54 |
|
|
* Open UTF-8 sequence. |
55 |
|
|
* |
56 |
|
|
* 11000010-11011111 C2-DF start of 2-byte sequence |
57 |
|
|
* 11100000-11101111 E0-EF start of 3-byte sequence |
58 |
|
|
* 11110000-11110100 F0-F4 start of 4-byte sequence |
59 |
|
|
*/ |
60 |
|
|
enum utf8_state |
61 |
|
|
utf8_open(struct utf8_data *ud, u_char ch) |
62 |
|
|
{ |
63 |
|
|
memset(ud, 0, sizeof *ud); |
64 |
|
|
if (ch >= 0xc2 && ch <= 0xdf) |
65 |
|
|
ud->size = 2; |
66 |
|
|
else if (ch >= 0xe0 && ch <= 0xef) |
67 |
|
|
ud->size = 3; |
68 |
|
|
else if (ch >= 0xf0 && ch <= 0xf4) |
69 |
|
|
ud->size = 4; |
70 |
|
|
else |
71 |
|
|
return (UTF8_ERROR); |
72 |
|
|
utf8_append(ud, ch); |
73 |
|
|
return (UTF8_MORE); |
74 |
|
|
} |
75 |
|
|
|
76 |
|
|
/* Append character to UTF-8, closing if finished. */ |
77 |
|
|
enum utf8_state |
78 |
|
|
utf8_append(struct utf8_data *ud, u_char ch) |
79 |
|
|
{ |
80 |
|
|
wchar_t wc; |
81 |
|
|
int width; |
82 |
|
|
|
83 |
|
|
if (ud->have >= ud->size) |
84 |
|
|
fatalx("UTF-8 character overflow"); |
85 |
|
|
if (ud->size > sizeof ud->data) |
86 |
|
|
fatalx("UTF-8 character size too large"); |
87 |
|
|
|
88 |
|
|
if (ud->have != 0 && (ch & 0xc0) != 0x80) |
89 |
|
|
ud->width = 0xff; |
90 |
|
|
|
91 |
|
|
ud->data[ud->have++] = ch; |
92 |
|
|
if (ud->have != ud->size) |
93 |
|
|
return (UTF8_MORE); |
94 |
|
|
|
95 |
|
|
if (ud->width == 0xff) |
96 |
|
|
return (UTF8_ERROR); |
97 |
|
|
|
98 |
|
|
if (utf8_combine(ud, &wc) != UTF8_DONE) |
99 |
|
|
return (UTF8_ERROR); |
100 |
|
|
if ((width = utf8_width(wc)) < 0) |
101 |
|
|
return (UTF8_ERROR); |
102 |
|
|
ud->width = width; |
103 |
|
|
|
104 |
|
|
return (UTF8_DONE); |
105 |
|
|
} |
106 |
|
|
|
107 |
|
|
/*
 * Return the width wcwidth() reports for wc, or -1 (after logging) if that
 * value is negative or larger than 0xff.
 */
static int
utf8_width(wchar_t wc)
{
	int	cols;

	cols = wcwidth(wc);
	if (cols >= 0 && cols <= 0xff)
		return (cols);

	log_debug("Unicode %04lx, wcwidth() %d", (long)wc, cols);
	return (-1);
}
120 |
|
|
|
121 |
|
|
/* Combine UTF-8 into Unicode. */ |
122 |
|
|
enum utf8_state |
123 |
|
|
utf8_combine(const struct utf8_data *ud, wchar_t *wc) |
124 |
|
|
{ |
125 |
|
|
switch (mbtowc(wc, ud->data, ud->size)) { |
126 |
|
|
case -1: |
127 |
|
|
log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data, |
128 |
|
|
errno); |
129 |
|
|
mbtowc(NULL, NULL, MB_CUR_MAX); |
130 |
|
|
return (UTF8_ERROR); |
131 |
|
|
case 0: |
132 |
|
|
return (UTF8_ERROR); |
133 |
|
|
default: |
134 |
|
|
return (UTF8_DONE); |
135 |
|
|
} |
136 |
|
|
} |
137 |
|
|
|
138 |
|
|
/* Split Unicode into UTF-8. */ |
139 |
|
|
enum utf8_state |
140 |
|
|
utf8_split(wchar_t wc, struct utf8_data *ud) |
141 |
|
|
{ |
142 |
|
|
char s[MB_LEN_MAX]; |
143 |
|
|
int slen; |
144 |
|
|
|
145 |
|
|
slen = wctomb(s, wc); |
146 |
|
|
if (slen <= 0 || slen > (int)sizeof ud->data) |
147 |
|
|
return (UTF8_ERROR); |
148 |
|
|
|
149 |
|
|
memcpy(ud->data, s, slen); |
150 |
|
|
ud->size = slen; |
151 |
|
|
|
152 |
|
|
ud->width = utf8_width(wc); |
153 |
|
|
return (UTF8_DONE); |
154 |
|
|
} |
155 |
|
|
|
156 |
|
|
/* |
157 |
|
|
* Encode len characters from src into dst, which is guaranteed to have four |
158 |
|
|
* bytes available for each character from src (for \abc or UTF-8) plus space |
159 |
|
|
* for \0. |
160 |
|
|
*/ |
161 |
|
|
int |
162 |
|
|
utf8_strvis(char *dst, const char *src, size_t len, int flag) |
163 |
|
|
{ |
164 |
|
|
struct utf8_data ud; |
165 |
|
|
const char *start, *end; |
166 |
|
|
enum utf8_state more; |
167 |
|
|
size_t i; |
168 |
|
|
|
169 |
|
|
start = dst; |
170 |
|
|
end = src + len; |
171 |
|
|
|
172 |
|
|
while (src < end) { |
173 |
|
|
if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { |
174 |
|
|
while (++src < end && more == UTF8_MORE) |
175 |
|
|
more = utf8_append(&ud, *src); |
176 |
|
|
if (more == UTF8_DONE) { |
177 |
|
|
/* UTF-8 character finished. */ |
178 |
|
|
for (i = 0; i < ud.size; i++) |
179 |
|
|
*dst++ = ud.data[i]; |
180 |
|
|
continue; |
181 |
|
|
} |
182 |
|
|
/* Not a complete, valid UTF-8 character. */ |
183 |
|
|
src -= ud.have; |
184 |
|
|
} |
185 |
|
|
if (src < end - 1) |
186 |
|
|
dst = vis(dst, src[0], flag, src[1]); |
187 |
|
|
else if (src < end) |
188 |
|
|
dst = vis(dst, src[0], flag, '\0'); |
189 |
|
|
src++; |
190 |
|
|
} |
191 |
|
|
|
192 |
|
|
*dst = '\0'; |
193 |
|
|
return (dst - start); |
194 |
|
|
} |
195 |
|
|
|
196 |
|
|
/*
 * Same as utf8_strvis() but the output buffer is allocated here: four bytes
 * per input byte (plus four for the terminator) are reserved, the string is
 * encoded, and the buffer is then resized to the encoded length plus one.
 *
 * The buffer is returned through *dst; the return value is the encoded
 * length.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen;
	char	*out;
	int	 outlen;

	srclen = strlen(src);

	out = xreallocarray(NULL, 4, srclen + 1);
	outlen = utf8_strvis(out, src, srclen, flag);

	*dst = xrealloc(out, outlen + 1);
	return (outlen);
}
209 |
|
|
|
210 |
|
|
/* Does this string contain anything that isn't valid UTF-8? */ |
211 |
|
|
int |
212 |
|
|
utf8_isvalid(const char *s) |
213 |
|
|
{ |
214 |
|
|
struct utf8_data ud; |
215 |
|
|
const char *end; |
216 |
|
|
enum utf8_state more; |
217 |
|
|
|
218 |
|
|
end = s + strlen(s); |
219 |
|
|
while (s < end) { |
220 |
|
|
if ((more = utf8_open(&ud, *s)) == UTF8_MORE) { |
221 |
|
|
while (++s < end && more == UTF8_MORE) |
222 |
|
|
more = utf8_append(&ud, *s); |
223 |
|
|
if (more == UTF8_DONE) |
224 |
|
|
continue; |
225 |
|
|
return (0); |
226 |
|
|
} |
227 |
|
|
if (*s < 0x20 || *s > 0x7e) |
228 |
|
|
return (0); |
229 |
|
|
s++; |
230 |
|
|
} |
231 |
|
|
return (1); |
232 |
|
|
} |
233 |
|
|
|
234 |
|
|
/* |
235 |
|
|
* Sanitize a string, changing any UTF-8 characters to '_'. Caller should free |
236 |
|
|
* the returned string. Anything not valid printable ASCII or UTF-8 is |
237 |
|
|
* stripped. |
238 |
|
|
*/ |
239 |
|
|
char * |
240 |
|
|
utf8_sanitize(const char *src) |
241 |
|
|
{ |
242 |
|
|
char *dst; |
243 |
|
|
size_t n; |
244 |
|
|
enum utf8_state more; |
245 |
|
|
struct utf8_data ud; |
246 |
|
|
u_int i; |
247 |
|
|
|
248 |
|
|
dst = NULL; |
249 |
|
|
|
250 |
|
|
n = 0; |
251 |
|
|
while (*src != '\0') { |
252 |
|
|
dst = xreallocarray(dst, n + 1, sizeof *dst); |
253 |
|
|
if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { |
254 |
|
|
while (*++src != '\0' && more == UTF8_MORE) |
255 |
|
|
more = utf8_append(&ud, *src); |
256 |
|
|
if (more == UTF8_DONE) { |
257 |
|
|
dst = xreallocarray(dst, n + ud.width, |
258 |
|
|
sizeof *dst); |
259 |
|
|
for (i = 0; i < ud.width; i++) |
260 |
|
|
dst[n++] = '_'; |
261 |
|
|
continue; |
262 |
|
|
} |
263 |
|
|
src -= ud.have; |
264 |
|
|
} |
265 |
|
|
if (*src > 0x1f && *src < 0x7f) |
266 |
|
|
dst[n++] = *src; |
267 |
|
|
else |
268 |
|
|
dst[n++] = '_'; |
269 |
|
|
src++; |
270 |
|
|
} |
271 |
|
|
|
272 |
|
|
dst = xreallocarray(dst, n + 1, sizeof *dst); |
273 |
|
|
dst[n] = '\0'; |
274 |
|
|
return (dst); |
275 |
|
|
} |
276 |
|
|
|
277 |
|
|
/* Get UTF-8 buffer length. */ |
278 |
|
|
size_t |
279 |
|
|
utf8_strlen(const struct utf8_data *s) |
280 |
|
|
{ |
281 |
|
|
size_t i; |
282 |
|
|
|
283 |
|
|
for (i = 0; s[i].size != 0; i++) |
284 |
|
|
/* nothing */; |
285 |
|
|
return (i); |
286 |
|
|
} |
287 |
|
|
|
288 |
|
|
/* Get UTF-8 string width. */ |
289 |
|
|
u_int |
290 |
|
|
utf8_strwidth(const struct utf8_data *s, ssize_t n) |
291 |
|
|
{ |
292 |
|
|
ssize_t i; |
293 |
|
|
u_int width; |
294 |
|
|
|
295 |
|
|
width = 0; |
296 |
|
|
for (i = 0; s[i].size != 0; i++) { |
297 |
|
|
if (n != -1 && n == i) |
298 |
|
|
break; |
299 |
|
|
width += s[i].width; |
300 |
|
|
} |
301 |
|
|
return (width); |
302 |
|
|
} |
303 |
|
|
|
304 |
|
|
/* |
305 |
|
|
* Convert a string into a buffer of UTF-8 characters. Terminated by size == 0. |
306 |
|
|
* Caller frees. |
307 |
|
|
*/ |
308 |
|
|
struct utf8_data * |
309 |
|
|
utf8_fromcstr(const char *src) |
310 |
|
|
{ |
311 |
|
|
struct utf8_data *dst; |
312 |
|
|
size_t n; |
313 |
|
|
enum utf8_state more; |
314 |
|
|
|
315 |
|
|
dst = NULL; |
316 |
|
|
|
317 |
|
|
n = 0; |
318 |
|
|
while (*src != '\0') { |
319 |
|
|
dst = xreallocarray(dst, n + 1, sizeof *dst); |
320 |
|
|
if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) { |
321 |
|
|
while (*++src != '\0' && more == UTF8_MORE) |
322 |
|
|
more = utf8_append(&dst[n], *src); |
323 |
|
|
if (more == UTF8_DONE) { |
324 |
|
|
n++; |
325 |
|
|
continue; |
326 |
|
|
} |
327 |
|
|
src -= dst[n].have; |
328 |
|
|
} |
329 |
|
|
utf8_set(&dst[n], *src); |
330 |
|
|
n++; |
331 |
|
|
src++; |
332 |
|
|
} |
333 |
|
|
|
334 |
|
|
dst = xreallocarray(dst, n + 1, sizeof *dst); |
335 |
|
|
dst[n].size = 0; |
336 |
|
|
return (dst); |
337 |
|
|
} |
338 |
|
|
|
339 |
|
|
/* Convert from a buffer of UTF-8 characters into a string. Caller frees. */ |
340 |
|
|
char * |
341 |
|
|
utf8_tocstr(struct utf8_data *src) |
342 |
|
|
{ |
343 |
|
|
char *dst; |
344 |
|
|
size_t n; |
345 |
|
|
|
346 |
|
|
dst = NULL; |
347 |
|
|
|
348 |
|
|
n = 0; |
349 |
|
|
for(; src->size != 0; src++) { |
350 |
|
|
dst = xreallocarray(dst, n + src->size, 1); |
351 |
|
|
memcpy(dst + n, src->data, src->size); |
352 |
|
|
n += src->size; |
353 |
|
|
} |
354 |
|
|
|
355 |
|
|
dst = xreallocarray(dst, n + 1, 1); |
356 |
|
|
dst[n] = '\0'; |
357 |
|
|
return (dst); |
358 |
|
|
} |
359 |
|
|
|
360 |
|
|
/* Get width of UTF-8 string. */ |
361 |
|
|
u_int |
362 |
|
|
utf8_cstrwidth(const char *s) |
363 |
|
|
{ |
364 |
|
|
struct utf8_data tmp; |
365 |
|
|
u_int width; |
366 |
|
|
enum utf8_state more; |
367 |
|
|
|
368 |
|
|
width = 0; |
369 |
|
|
while (*s != '\0') { |
370 |
|
|
if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) { |
371 |
|
|
while (*++s != '\0' && more == UTF8_MORE) |
372 |
|
|
more = utf8_append(&tmp, *s); |
373 |
|
|
if (more == UTF8_DONE) { |
374 |
|
|
width += tmp.width; |
375 |
|
|
continue; |
376 |
|
|
} |
377 |
|
|
s -= tmp.have; |
378 |
|
|
} |
379 |
|
|
if (*s > 0x1f && *s != 0x7f) |
380 |
|
|
width++; |
381 |
|
|
s++; |
382 |
|
|
} |
383 |
|
|
return (width); |
384 |
|
|
} |
385 |
|
|
|
386 |
|
|
/* Trim UTF-8 string to width. Caller frees. */ |
387 |
|
|
char * |
388 |
|
|
utf8_trimcstr(const char *s, u_int width) |
389 |
|
|
{ |
390 |
|
|
struct utf8_data *tmp, *next; |
391 |
|
|
char *out; |
392 |
|
|
u_int at; |
393 |
|
|
|
394 |
|
|
tmp = utf8_fromcstr(s); |
395 |
|
|
|
396 |
|
|
at = 0; |
397 |
|
|
for (next = tmp; next->size != 0; next++) { |
398 |
|
|
if (at + next->width > width) { |
399 |
|
|
next->size = 0; |
400 |
|
|
break; |
401 |
|
|
} |
402 |
|
|
at += next->width; |
403 |
|
|
} |
404 |
|
|
|
405 |
|
|
out = utf8_tocstr(tmp); |
406 |
|
|
free(tmp); |
407 |
|
|
return (out); |
408 |
|
|
} |
409 |
|
|
|
410 |
|
|
/* Trim UTF-8 string to width. Caller frees. */ |
411 |
|
|
char * |
412 |
|
|
utf8_rtrimcstr(const char *s, u_int width) |
413 |
|
|
{ |
414 |
|
|
struct utf8_data *tmp, *next, *end; |
415 |
|
|
char *out; |
416 |
|
|
u_int at; |
417 |
|
|
|
418 |
|
|
tmp = utf8_fromcstr(s); |
419 |
|
|
|
420 |
|
|
for (end = tmp; end->size != 0; end++) |
421 |
|
|
/* nothing */; |
422 |
|
|
if (end == tmp) { |
423 |
|
|
free(tmp); |
424 |
|
|
return (xstrdup("")); |
425 |
|
|
} |
426 |
|
|
next = end - 1; |
427 |
|
|
|
428 |
|
|
at = 0; |
429 |
|
|
for (;;) { |
430 |
|
|
if (at + next->width > width) { |
431 |
|
|
next++; |
432 |
|
|
break; |
433 |
|
|
} |
434 |
|
|
at += next->width; |
435 |
|
|
|
436 |
|
|
if (next == tmp) |
437 |
|
|
break; |
438 |
|
|
next--; |
439 |
|
|
} |
440 |
|
|
|
441 |
|
|
out = utf8_tocstr(next); |
442 |
|
|
free(tmp); |
443 |
|
|
return (out); |
444 |
|
|
} |
445 |
|
|
|
446 |
|
|
/* Pad UTF-8 string to width. Caller frees. */ |
447 |
|
|
char * |
448 |
|
|
utf8_padcstr(const char *s, u_int width) |
449 |
|
|
{ |
450 |
|
|
size_t slen; |
451 |
|
|
char *out; |
452 |
|
|
u_int n, i; |
453 |
|
|
|
454 |
|
|
n = utf8_cstrwidth(s); |
455 |
|
|
if (n >= width) |
456 |
|
|
return (xstrdup(s)); |
457 |
|
|
|
458 |
|
|
slen = strlen(s); |
459 |
|
|
out = xmalloc(slen + 1 + (width - n)); |
460 |
|
|
memcpy(out, s, slen); |
461 |
|
|
for (i = n; i < width; i++) |
462 |
|
|
out[slen++] = ' '; |
463 |
|
|
out[slen] = '\0'; |
464 |
|
|
return (out); |
465 |
|
|
} |