1 |
|
|
/* $OpenBSD: utf8.c,v 1.33 2016/05/27 22:57:27 nicm Exp $ */ |
2 |
|
|
|
3 |
|
|
/* |
4 |
|
|
* Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com> |
5 |
|
|
* |
6 |
|
|
* Permission to use, copy, modify, and distribute this software for any |
7 |
|
|
* purpose with or without fee is hereby granted, provided that the above |
8 |
|
|
* copyright notice and this permission notice appear in all copies. |
9 |
|
|
* |
10 |
|
|
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
11 |
|
|
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
12 |
|
|
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
13 |
|
|
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
14 |
|
|
* WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER |
15 |
|
|
* IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING |
16 |
|
|
* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
17 |
|
|
*/ |
18 |
|
|
|
19 |
|
|
#include <sys/types.h> |
20 |
|
|
|
21 |
|
|
#include <errno.h> |
22 |
|
|
#include <stdlib.h> |
23 |
|
|
#include <string.h> |
24 |
|
|
#include <vis.h> |
25 |
|
|
#include <wchar.h> |
26 |
|
|
|
27 |
|
|
#include "tmux.h" |
28 |
|
|
|
29 |
|
|
static int utf8_width(wchar_t); |
30 |
|
|
|
31 |
|
|
/* Set a single character. */ |
32 |
|
|
void |
33 |
|
|
utf8_set(struct utf8_data *ud, u_char ch) |
34 |
|
|
{ |
35 |
|
|
static const struct utf8_data empty = { { 0 }, 1, 1, 1 }; |
36 |
|
|
|
37 |
|
|
memcpy(ud, &empty, sizeof *ud); |
38 |
|
|
*ud->data = ch; |
39 |
|
|
} |
40 |
|
|
|
41 |
|
|
/* Copy UTF-8 character. */ |
42 |
|
|
void |
43 |
|
|
utf8_copy(struct utf8_data *to, const struct utf8_data *from) |
44 |
|
|
{ |
45 |
|
|
u_int i; |
46 |
|
|
|
47 |
|
|
memcpy(to, from, sizeof *to); |
48 |
|
|
|
49 |
|
|
for (i = to->size; i < sizeof to->data; i++) |
50 |
|
|
to->data[i] = '\0'; |
51 |
|
|
} |
52 |
|
|
|
53 |
|
|
/* |
54 |
|
|
* Open UTF-8 sequence. |
55 |
|
|
* |
56 |
|
|
* 11000010-11011111 C2-DF start of 2-byte sequence |
57 |
|
|
* 11100000-11101111 E0-EF start of 3-byte sequence |
58 |
|
|
* 11110000-11110100 F0-F4 start of 4-byte sequence |
59 |
|
|
*/ |
60 |
|
|
enum utf8_state |
61 |
|
|
utf8_open(struct utf8_data *ud, u_char ch) |
62 |
|
|
{ |
63 |
|
|
memset(ud, 0, sizeof *ud); |
64 |
|
|
if (ch >= 0xc2 && ch <= 0xdf) |
65 |
|
|
ud->size = 2; |
66 |
|
|
else if (ch >= 0xe0 && ch <= 0xef) |
67 |
|
|
ud->size = 3; |
68 |
|
|
else if (ch >= 0xf0 && ch <= 0xf4) |
69 |
|
|
ud->size = 4; |
70 |
|
|
else |
71 |
|
|
return (UTF8_ERROR); |
72 |
|
|
utf8_append(ud, ch); |
73 |
|
|
return (UTF8_MORE); |
74 |
|
|
} |
75 |
|
|
|
76 |
|
|
/* Append character to UTF-8, closing if finished. */ |
77 |
|
|
enum utf8_state |
78 |
|
|
utf8_append(struct utf8_data *ud, u_char ch) |
79 |
|
|
{ |
80 |
|
|
wchar_t wc; |
81 |
|
|
int width; |
82 |
|
|
|
83 |
|
|
if (ud->have >= ud->size) |
84 |
|
|
fatalx("UTF-8 character overflow"); |
85 |
|
|
if (ud->size > sizeof ud->data) |
86 |
|
|
fatalx("UTF-8 character size too large"); |
87 |
|
|
|
88 |
|
|
if (ud->have != 0 && (ch & 0xc0) != 0x80) |
89 |
|
|
ud->width = 0xff; |
90 |
|
|
|
91 |
|
|
ud->data[ud->have++] = ch; |
92 |
|
|
if (ud->have != ud->size) |
93 |
|
|
return (UTF8_MORE); |
94 |
|
|
|
95 |
|
|
if (ud->width == 0xff) |
96 |
|
|
return (UTF8_ERROR); |
97 |
|
|
|
98 |
|
|
if (utf8_combine(ud, &wc) != UTF8_DONE) |
99 |
|
|
return (UTF8_ERROR); |
100 |
|
|
if ((width = utf8_width(wc)) < 0) |
101 |
|
|
return (UTF8_ERROR); |
102 |
|
|
ud->width = width; |
103 |
|
|
|
104 |
|
|
return (UTF8_DONE); |
105 |
|
|
} |
106 |
|
|
|
107 |
|
|
/* Get width of Unicode character. */ |
108 |
|
|
static int |
109 |
|
|
utf8_width(wchar_t wc) |
110 |
|
|
{ |
111 |
|
|
int width; |
112 |
|
|
|
113 |
|
|
width = wcwidth(wc); |
114 |
|
|
if (width < 0 || width > 0xff) { |
115 |
|
|
log_debug("Unicode %04x, wcwidth() %d", wc, width); |
116 |
|
|
return (-1); |
117 |
|
|
} |
118 |
|
|
return (width); |
119 |
|
|
} |
120 |
|
|
|
121 |
|
|
/* Combine UTF-8 into Unicode. */ |
122 |
|
|
enum utf8_state |
123 |
|
|
utf8_combine(const struct utf8_data *ud, wchar_t *wc) |
124 |
|
|
{ |
125 |
|
|
switch (mbtowc(wc, ud->data, ud->size)) { |
126 |
|
|
case -1: |
127 |
|
|
log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data, |
128 |
|
|
errno); |
129 |
|
|
mbtowc(NULL, NULL, MB_CUR_MAX); |
130 |
|
|
return (UTF8_ERROR); |
131 |
|
|
case 0: |
132 |
|
|
return (UTF8_ERROR); |
133 |
|
|
default: |
134 |
|
|
return (UTF8_DONE); |
135 |
|
|
} |
136 |
|
|
} |
137 |
|
|
|
138 |
|
|
/* Split Unicode into UTF-8. */ |
139 |
|
|
enum utf8_state |
140 |
|
|
utf8_split(wchar_t wc, struct utf8_data *ud) |
141 |
|
|
{ |
142 |
|
|
char s[MB_LEN_MAX]; |
143 |
|
|
int slen; |
144 |
|
|
|
145 |
|
|
slen = wctomb(s, wc); |
146 |
|
|
if (slen <= 0 || slen > (int)sizeof ud->data) |
147 |
|
|
return (UTF8_ERROR); |
148 |
|
|
|
149 |
|
|
memcpy(ud->data, s, slen); |
150 |
|
|
ud->size = slen; |
151 |
|
|
|
152 |
|
|
ud->width = utf8_width(wc); |
153 |
|
|
return (UTF8_DONE); |
154 |
|
|
} |
155 |
|
|
|
156 |
|
|
/* |
157 |
|
|
* Encode len characters from src into dst, which is guaranteed to have four |
158 |
|
|
* bytes available for each character from src (for \abc or UTF-8) plus space |
159 |
|
|
* for \0. |
160 |
|
|
*/ |
161 |
|
|
int |
162 |
|
|
utf8_strvis(char *dst, const char *src, size_t len, int flag) |
163 |
|
|
{ |
164 |
|
|
struct utf8_data ud; |
165 |
|
|
const char *start, *end; |
166 |
|
|
enum utf8_state more; |
167 |
|
|
size_t i; |
168 |
|
|
|
169 |
|
|
start = dst; |
170 |
|
|
end = src + len; |
171 |
|
|
|
172 |
|
|
while (src < end) { |
173 |
|
|
if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { |
174 |
|
|
while (++src < end && more == UTF8_MORE) |
175 |
|
|
more = utf8_append(&ud, *src); |
176 |
|
|
if (more == UTF8_DONE) { |
177 |
|
|
/* UTF-8 character finished. */ |
178 |
|
|
for (i = 0; i < ud.size; i++) |
179 |
|
|
*dst++ = ud.data[i]; |
180 |
|
|
continue; |
181 |
|
|
} |
182 |
|
|
/* Not a complete, valid UTF-8 character. */ |
183 |
|
|
src -= ud.have; |
184 |
|
|
} |
185 |
|
|
if (src < end - 1) |
186 |
|
|
dst = vis(dst, src[0], flag, src[1]); |
187 |
|
|
else if (src < end) |
188 |
|
|
dst = vis(dst, src[0], flag, '\0'); |
189 |
|
|
src++; |
190 |
|
|
} |
191 |
|
|
|
192 |
|
|
*dst = '\0'; |
193 |
|
|
return (dst - start); |
194 |
|
|
} |
195 |
|
|
|
196 |
|
|
/* |
197 |
|
|
* Sanitize a string, changing any UTF-8 characters to '_'. Caller should free |
198 |
|
|
* the returned string. Anything not valid printable ASCII or UTF-8 is |
199 |
|
|
* stripped. |
200 |
|
|
*/ |
201 |
|
|
char * |
202 |
|
|
utf8_sanitize(const char *src) |
203 |
|
|
{ |
204 |
|
|
char *dst; |
205 |
|
|
size_t n; |
206 |
|
|
enum utf8_state more; |
207 |
|
|
struct utf8_data ud; |
208 |
|
|
u_int i; |
209 |
|
|
|
210 |
|
|
dst = NULL; |
211 |
|
|
|
212 |
|
|
n = 0; |
213 |
|
|
while (*src != '\0') { |
214 |
|
|
dst = xreallocarray(dst, n + 1, sizeof *dst); |
215 |
|
|
if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { |
216 |
|
|
while (*++src != '\0' && more == UTF8_MORE) |
217 |
|
|
more = utf8_append(&ud, *src); |
218 |
|
|
if (more == UTF8_DONE) { |
219 |
|
|
dst = xreallocarray(dst, n + ud.width, |
220 |
|
|
sizeof *dst); |
221 |
|
|
for (i = 0; i < ud.width; i++) |
222 |
|
|
dst[n++] = '_'; |
223 |
|
|
continue; |
224 |
|
|
} |
225 |
|
|
src -= ud.have; |
226 |
|
|
} |
227 |
|
|
if (*src > 0x1f && *src < 0x7f) |
228 |
|
|
dst[n++] = *src; |
229 |
|
|
else |
230 |
|
|
dst[n++] = '_'; |
231 |
|
|
src++; |
232 |
|
|
} |
233 |
|
|
|
234 |
|
|
dst = xreallocarray(dst, n + 1, sizeof *dst); |
235 |
|
|
dst[n] = '\0'; |
236 |
|
|
return (dst); |
237 |
|
|
} |
238 |
|
|
|
239 |
|
|
/* |
240 |
|
|
* Convert a string into a buffer of UTF-8 characters. Terminated by size == 0. |
241 |
|
|
* Caller frees. |
242 |
|
|
*/ |
243 |
|
|
struct utf8_data * |
244 |
|
|
utf8_fromcstr(const char *src) |
245 |
|
|
{ |
246 |
|
|
struct utf8_data *dst; |
247 |
|
|
size_t n; |
248 |
|
|
enum utf8_state more; |
249 |
|
|
|
250 |
|
|
dst = NULL; |
251 |
|
|
|
252 |
|
|
n = 0; |
253 |
|
|
while (*src != '\0') { |
254 |
|
|
dst = xreallocarray(dst, n + 1, sizeof *dst); |
255 |
|
|
if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) { |
256 |
|
|
while (*++src != '\0' && more == UTF8_MORE) |
257 |
|
|
more = utf8_append(&dst[n], *src); |
258 |
|
|
if (more == UTF8_DONE) { |
259 |
|
|
n++; |
260 |
|
|
continue; |
261 |
|
|
} |
262 |
|
|
src -= dst[n].have; |
263 |
|
|
} |
264 |
|
|
utf8_set(&dst[n], *src); |
265 |
|
|
n++; |
266 |
|
|
src++; |
267 |
|
|
} |
268 |
|
|
|
269 |
|
|
dst = xreallocarray(dst, n + 1, sizeof *dst); |
270 |
|
|
dst[n].size = 0; |
271 |
|
|
return (dst); |
272 |
|
|
} |
273 |
|
|
|
274 |
|
|
/* Convert from a buffer of UTF-8 characters into a string. Caller frees. */ |
275 |
|
|
char * |
276 |
|
|
utf8_tocstr(struct utf8_data *src) |
277 |
|
|
{ |
278 |
|
|
char *dst; |
279 |
|
|
size_t n; |
280 |
|
|
|
281 |
|
|
dst = NULL; |
282 |
|
|
|
283 |
|
|
n = 0; |
284 |
|
|
for(; src->size != 0; src++) { |
285 |
|
|
dst = xreallocarray(dst, n + src->size, 1); |
286 |
|
|
memcpy(dst + n, src->data, src->size); |
287 |
|
|
n += src->size; |
288 |
|
|
} |
289 |
|
|
|
290 |
|
|
dst = xreallocarray(dst, n + 1, 1); |
291 |
|
|
dst[n] = '\0'; |
292 |
|
|
return (dst); |
293 |
|
|
} |
294 |
|
|
|
295 |
|
|
/* Get width of UTF-8 string. */ |
296 |
|
|
u_int |
297 |
|
|
utf8_cstrwidth(const char *s) |
298 |
|
|
{ |
299 |
|
|
struct utf8_data tmp; |
300 |
|
|
u_int width; |
301 |
|
|
enum utf8_state more; |
302 |
|
|
|
303 |
|
|
width = 0; |
304 |
|
|
while (*s != '\0') { |
305 |
|
|
if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) { |
306 |
|
|
while (*++s != '\0' && more == UTF8_MORE) |
307 |
|
|
more = utf8_append(&tmp, *s); |
308 |
|
|
if (more == UTF8_DONE) { |
309 |
|
|
width += tmp.width; |
310 |
|
|
continue; |
311 |
|
|
} |
312 |
|
|
s -= tmp.have; |
313 |
|
|
} |
314 |
|
|
if (*s > 0x1f && *s != 0x7f) |
315 |
|
|
width++; |
316 |
|
|
s++; |
317 |
|
|
} |
318 |
|
|
return (width); |
319 |
|
|
} |
320 |
|
|
|
321 |
|
|
/* Trim UTF-8 string to width. Caller frees. */ |
322 |
|
|
char * |
323 |
|
|
utf8_trimcstr(const char *s, u_int width) |
324 |
|
|
{ |
325 |
|
|
struct utf8_data *tmp, *next; |
326 |
|
|
char *out; |
327 |
|
|
u_int at; |
328 |
|
|
|
329 |
|
|
tmp = utf8_fromcstr(s); |
330 |
|
|
|
331 |
|
|
at = 0; |
332 |
|
|
for (next = tmp; next->size != 0; next++) { |
333 |
|
|
if (at + next->width > width) { |
334 |
|
|
next->size = 0; |
335 |
|
|
break; |
336 |
|
|
} |
337 |
|
|
at += next->width; |
338 |
|
|
} |
339 |
|
|
|
340 |
|
|
out = utf8_tocstr(tmp); |
341 |
|
|
free(tmp); |
342 |
|
|
return (out); |
343 |
|
|
} |
344 |
|
|
|
345 |
|
|
/* Trim UTF-8 string to width. Caller frees. */ |
346 |
|
|
char * |
347 |
|
|
utf8_rtrimcstr(const char *s, u_int width) |
348 |
|
|
{ |
349 |
|
|
struct utf8_data *tmp, *next, *end; |
350 |
|
|
char *out; |
351 |
|
|
u_int at; |
352 |
|
|
|
353 |
|
|
tmp = utf8_fromcstr(s); |
354 |
|
|
|
355 |
|
|
for (end = tmp; end->size != 0; end++) |
356 |
|
|
/* nothing */; |
357 |
|
|
if (end == tmp) { |
358 |
|
|
free(tmp); |
359 |
|
|
return (xstrdup("")); |
360 |
|
|
} |
361 |
|
|
next = end - 1; |
362 |
|
|
|
363 |
|
|
at = 0; |
364 |
|
|
for (;;) |
365 |
|
|
{ |
366 |
|
|
if (at + next->width > width) { |
367 |
|
|
next++; |
368 |
|
|
break; |
369 |
|
|
} |
370 |
|
|
at += next->width; |
371 |
|
|
|
372 |
|
|
if (next == tmp) |
373 |
|
|
break; |
374 |
|
|
next--; |
375 |
|
|
} |
376 |
|
|
|
377 |
|
|
out = utf8_tocstr(next); |
378 |
|
|
free(tmp); |
379 |
|
|
return (out); |
380 |
|
|
} |
381 |
|
|
|
382 |
|
|
/* Pad UTF-8 string to width. Caller frees. */ |
383 |
|
|
char * |
384 |
|
|
utf8_padcstr(const char *s, u_int width) |
385 |
|
|
{ |
386 |
|
|
size_t slen; |
387 |
|
|
char *out; |
388 |
|
|
u_int n, i; |
389 |
|
|
|
390 |
|
|
n = utf8_cstrwidth(s); |
391 |
|
|
if (n >= width) |
392 |
|
|
return (xstrdup(s)); |
393 |
|
|
|
394 |
|
|
slen = strlen(s); |
395 |
|
|
out = xmalloc(slen + 1 + (width - n)); |
396 |
|
|
memcpy(out, s, slen); |
397 |
|
|
for (i = n; i < width; i++) |
398 |
|
|
out[slen++] = ' '; |
399 |
|
|
out[slen] = '\0'; |
400 |
|
|
return (out); |
401 |
|
|
} |