GCC Code Coverage Report
Directory: ./ Exec Total Coverage
File: usr.bin/tmux/utf8.c Lines: 0 173 0.0 %
Date: 2017-11-07 Branches: 0 123 0.0 %

Line Branch Exec Source
1
/* $OpenBSD: utf8.c,v 1.39 2017/06/04 09:02:57 nicm Exp $ */
2
3
/*
4
 * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
5
 *
6
 * Permission to use, copy, modify, and distribute this software for any
7
 * purpose with or without fee is hereby granted, provided that the above
8
 * copyright notice and this permission notice appear in all copies.
9
 *
10
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14
 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15
 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17
 */
18
19
#include <sys/types.h>
20
21
#include <errno.h>
22
#include <stdlib.h>
23
#include <string.h>
24
#include <vis.h>
25
#include <wchar.h>
26
27
#include "tmux.h"
28
29
static int	utf8_width(wchar_t);
30
31
/* Set a single character. */
32
void
33
utf8_set(struct utf8_data *ud, u_char ch)
34
{
35
	static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
36
37
	memcpy(ud, &empty, sizeof *ud);
38
	*ud->data = ch;
39
}
40
41
/* Copy UTF-8 character. */
42
void
43
utf8_copy(struct utf8_data *to, const struct utf8_data *from)
44
{
45
	u_int	i;
46
47
	memcpy(to, from, sizeof *to);
48
49
	for (i = to->size; i < sizeof to->data; i++)
50
		to->data[i] = '\0';
51
}
52
53
/*
54
 * Open UTF-8 sequence.
55
 *
56
 * 11000010-11011111 C2-DF start of 2-byte sequence
57
 * 11100000-11101111 E0-EF start of 3-byte sequence
58
 * 11110000-11110100 F0-F4 start of 4-byte sequence
59
 */
60
enum utf8_state
61
utf8_open(struct utf8_data *ud, u_char ch)
62
{
63
	memset(ud, 0, sizeof *ud);
64
	if (ch >= 0xc2 && ch <= 0xdf)
65
		ud->size = 2;
66
	else if (ch >= 0xe0 && ch <= 0xef)
67
		ud->size = 3;
68
	else if (ch >= 0xf0 && ch <= 0xf4)
69
		ud->size = 4;
70
	else
71
		return (UTF8_ERROR);
72
	utf8_append(ud, ch);
73
	return (UTF8_MORE);
74
}
75
76
/* Append character to UTF-8, closing if finished. */
77
enum utf8_state
78
utf8_append(struct utf8_data *ud, u_char ch)
79
{
80
	wchar_t	wc;
81
	int	width;
82
83
	if (ud->have >= ud->size)
84
		fatalx("UTF-8 character overflow");
85
	if (ud->size > sizeof ud->data)
86
		fatalx("UTF-8 character size too large");
87
88
	if (ud->have != 0 && (ch & 0xc0) != 0x80)
89
		ud->width = 0xff;
90
91
	ud->data[ud->have++] = ch;
92
	if (ud->have != ud->size)
93
		return (UTF8_MORE);
94
95
	if (ud->width == 0xff)
96
		return (UTF8_ERROR);
97
98
	if (utf8_combine(ud, &wc) != UTF8_DONE)
99
		return (UTF8_ERROR);
100
	if ((width = utf8_width(wc)) < 0)
101
		return (UTF8_ERROR);
102
	ud->width = width;
103
104
	return (UTF8_DONE);
105
}
106
107
/*
 * Return the value wcwidth() gives for wc. If that value is negative or
 * greater than 0xff it is logged with log_debug() and -1 is returned instead.
 */
static int
utf8_width(wchar_t wc)
{
	int	 cells;

	cells = wcwidth(wc);
	if (cells >= 0 && cells <= 0xff)
		return (cells);

	log_debug("Unicode %04lx, wcwidth() %d", (long)wc, cells);
	return (-1);
}
120
121
/* Combine UTF-8 into Unicode. */
122
enum utf8_state
123
utf8_combine(const struct utf8_data *ud, wchar_t *wc)
124
{
125
	switch (mbtowc(wc, ud->data, ud->size)) {
126
	case -1:
127
		log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
128
		    errno);
129
		mbtowc(NULL, NULL, MB_CUR_MAX);
130
		return (UTF8_ERROR);
131
	case 0:
132
		return (UTF8_ERROR);
133
	default:
134
		return (UTF8_DONE);
135
	}
136
}
137
138
/* Split Unicode into UTF-8. */
139
enum utf8_state
140
utf8_split(wchar_t wc, struct utf8_data *ud)
141
{
142
	char	s[MB_LEN_MAX];
143
	int	slen;
144
145
	slen = wctomb(s, wc);
146
	if (slen <= 0 || slen > (int)sizeof ud->data)
147
		return (UTF8_ERROR);
148
149
	memcpy(ud->data, s, slen);
150
	ud->size = slen;
151
152
	ud->width = utf8_width(wc);
153
	return (UTF8_DONE);
154
}
155
156
/*
157
 * Encode len characters from src into dst, which is guaranteed to have four
158
 * bytes available for each character from src (for \abc or UTF-8) plus space
159
 * for \0.
160
 */
161
int
162
utf8_strvis(char *dst, const char *src, size_t len, int flag)
163
{
164
	struct utf8_data	 ud;
165
	const char		*start, *end;
166
	enum utf8_state		 more;
167
	size_t			 i;
168
169
	start = dst;
170
	end = src + len;
171
172
	while (src < end) {
173
		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
174
			while (++src < end && more == UTF8_MORE)
175
				more = utf8_append(&ud, *src);
176
			if (more == UTF8_DONE) {
177
				/* UTF-8 character finished. */
178
				for (i = 0; i < ud.size; i++)
179
					*dst++ = ud.data[i];
180
				continue;
181
			}
182
			/* Not a complete, valid UTF-8 character. */
183
			src -= ud.have;
184
		}
185
		if (src < end - 1)
186
			dst = vis(dst, src[0], flag, src[1]);
187
		else if (src < end)
188
			dst = vis(dst, src[0], flag, '\0');
189
		src++;
190
	}
191
192
	*dst = '\0';
193
	return (dst - start);
194
}
195
196
/*
 * As utf8_strvis() but the output buffer is allocated here: four bytes per
 * byte of src (including its terminator) are reserved, src is encoded, and
 * the buffer is then resized to the encoded length plus one. The buffer is
 * stored in *dst and the encoded length is returned.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen;
	char	*out;
	int	 outlen;

	srclen = strlen(src);

	out = xreallocarray(NULL, 4, srclen + 1);
	outlen = utf8_strvis(out, src, srclen, flag);

	*dst = xrealloc(out, outlen + 1);
	return (outlen);
}
209
210
/* Does this string contain anything that isn't valid UTF-8? */
211
int
212
utf8_isvalid(const char *s)
213
{
214
	struct utf8_data	 ud;
215
	const char		*end;
216
	enum utf8_state		 more;
217
218
	end = s + strlen(s);
219
	while (s < end) {
220
		if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
221
			while (++s < end && more == UTF8_MORE)
222
				more = utf8_append(&ud, *s);
223
			if (more == UTF8_DONE)
224
				continue;
225
			return (0);
226
		}
227
		if (*s < 0x20 || *s > 0x7e)
228
			return (0);
229
		s++;
230
	}
231
	return (1);
232
}
233
234
/*
235
 * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
236
 * the returned string. Anything not valid printable ASCII or UTF-8 is
237
 * stripped.
238
 */
239
char *
240
utf8_sanitize(const char *src)
241
{
242
	char			*dst;
243
	size_t			 n;
244
	enum utf8_state		 more;
245
	struct utf8_data	 ud;
246
	u_int			 i;
247
248
	dst = NULL;
249
250
	n = 0;
251
	while (*src != '\0') {
252
		dst = xreallocarray(dst, n + 1, sizeof *dst);
253
		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
254
			while (*++src != '\0' && more == UTF8_MORE)
255
				more = utf8_append(&ud, *src);
256
			if (more == UTF8_DONE) {
257
				dst = xreallocarray(dst, n + ud.width,
258
				    sizeof *dst);
259
				for (i = 0; i < ud.width; i++)
260
					dst[n++] = '_';
261
				continue;
262
			}
263
			src -= ud.have;
264
		}
265
		if (*src > 0x1f && *src < 0x7f)
266
			dst[n++] = *src;
267
		else
268
			dst[n++] = '_';
269
		src++;
270
	}
271
272
	dst = xreallocarray(dst, n + 1, sizeof *dst);
273
	dst[n] = '\0';
274
	return (dst);
275
}
276
277
/* Get UTF-8 buffer length. */
278
size_t
279
utf8_strlen(const struct utf8_data *s)
280
{
281
	size_t	i;
282
283
	for (i = 0; s[i].size != 0; i++)
284
		/* nothing */;
285
	return (i);
286
}
287
288
/* Get UTF-8 string width. */
289
u_int
290
utf8_strwidth(const struct utf8_data *s, ssize_t n)
291
{
292
	ssize_t	i;
293
	u_int	width;
294
295
	width = 0;
296
	for (i = 0; s[i].size != 0; i++) {
297
		if (n != -1 && n == i)
298
			break;
299
		width += s[i].width;
300
	}
301
	return (width);
302
}
303
304
/*
305
 * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
306
 * Caller frees.
307
 */
308
struct utf8_data *
309
utf8_fromcstr(const char *src)
310
{
311
	struct utf8_data	*dst;
312
	size_t			 n;
313
	enum utf8_state		 more;
314
315
	dst = NULL;
316
317
	n = 0;
318
	while (*src != '\0') {
319
		dst = xreallocarray(dst, n + 1, sizeof *dst);
320
		if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
321
			while (*++src != '\0' && more == UTF8_MORE)
322
				more = utf8_append(&dst[n], *src);
323
			if (more == UTF8_DONE) {
324
				n++;
325
				continue;
326
			}
327
			src -= dst[n].have;
328
		}
329
		utf8_set(&dst[n], *src);
330
		n++;
331
		src++;
332
	}
333
334
	dst = xreallocarray(dst, n + 1, sizeof *dst);
335
	dst[n].size = 0;
336
	return (dst);
337
}
338
339
/* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
340
char *
341
utf8_tocstr(struct utf8_data *src)
342
{
343
	char	*dst;
344
	size_t	 n;
345
346
	dst = NULL;
347
348
	n = 0;
349
	for(; src->size != 0; src++) {
350
		dst = xreallocarray(dst, n + src->size, 1);
351
		memcpy(dst + n, src->data, src->size);
352
		n += src->size;
353
	}
354
355
	dst = xreallocarray(dst, n + 1, 1);
356
	dst[n] = '\0';
357
	return (dst);
358
}
359
360
/* Get width of UTF-8 string. */
361
u_int
362
utf8_cstrwidth(const char *s)
363
{
364
	struct utf8_data	tmp;
365
	u_int			width;
366
	enum utf8_state		more;
367
368
	width = 0;
369
	while (*s != '\0') {
370
		if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
371
			while (*++s != '\0' && more == UTF8_MORE)
372
				more = utf8_append(&tmp, *s);
373
			if (more == UTF8_DONE) {
374
				width += tmp.width;
375
				continue;
376
			}
377
			s -= tmp.have;
378
		}
379
		if (*s > 0x1f && *s != 0x7f)
380
			width++;
381
		s++;
382
	}
383
	return (width);
384
}
385
386
/* Trim UTF-8 string to width. Caller frees. */
387
char *
388
utf8_trimcstr(const char *s, u_int width)
389
{
390
	struct utf8_data	*tmp, *next;
391
	char			*out;
392
	u_int			 at;
393
394
	tmp = utf8_fromcstr(s);
395
396
	at = 0;
397
	for (next = tmp; next->size != 0; next++) {
398
		if (at + next->width > width) {
399
			next->size = 0;
400
			break;
401
		}
402
		at += next->width;
403
	}
404
405
	out = utf8_tocstr(tmp);
406
	free(tmp);
407
	return (out);
408
}
409
410
/* Trim UTF-8 string to width. Caller frees. */
411
char *
412
utf8_rtrimcstr(const char *s, u_int width)
413
{
414
	struct utf8_data	*tmp, *next, *end;
415
	char			*out;
416
	u_int			 at;
417
418
	tmp = utf8_fromcstr(s);
419
420
	for (end = tmp; end->size != 0; end++)
421
		/* nothing */;
422
	if (end == tmp) {
423
		free(tmp);
424
		return (xstrdup(""));
425
	}
426
	next = end - 1;
427
428
	at = 0;
429
	for (;;) {
430
		if (at + next->width > width) {
431
			next++;
432
			break;
433
		}
434
		at += next->width;
435
436
		if (next == tmp)
437
			break;
438
		next--;
439
	}
440
441
	out = utf8_tocstr(next);
442
	free(tmp);
443
	return (out);
444
}
445
446
/* Pad UTF-8 string to width. Caller frees. */
447
char *
448
utf8_padcstr(const char *s, u_int width)
449
{
450
	size_t	 slen;
451
	char	*out;
452
	u_int	  n, i;
453
454
	n = utf8_cstrwidth(s);
455
	if (n >= width)
456
		return (xstrdup(s));
457
458
	slen = strlen(s);
459
	out = xmalloc(slen + 1 + (width - n));
460
	memcpy(out, s, slen);
461
	for (i = n; i < width; i++)
462
		out[slen++] = ' ';
463
	out[slen] = '\0';
464
	return (out);
465
}