GCC Code Coverage Report
Directory: ./ Exec Total Coverage
File: usr.bin/tmux/utf8.c Lines: 0 162 0.0 %
Date: 2016-12-06 Branches: 0 101 0.0 %

Line Branch Exec Source
1
/* $OpenBSD: utf8.c,v 1.33 2016/05/27 22:57:27 nicm Exp $ */
2
3
/*
4
 * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
5
 *
6
 * Permission to use, copy, modify, and distribute this software for any
7
 * purpose with or without fee is hereby granted, provided that the above
8
 * copyright notice and this permission notice appear in all copies.
9
 *
10
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14
 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15
 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17
 */
18
19
#include <sys/types.h>
20
21
#include <errno.h>
22
#include <stdlib.h>
23
#include <string.h>
24
#include <vis.h>
25
#include <wchar.h>
26
27
#include "tmux.h"
28
29
static int	utf8_width(wchar_t);
30
31
/* Set a single character. */
32
void
33
utf8_set(struct utf8_data *ud, u_char ch)
34
{
35
	static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
36
37
	memcpy(ud, &empty, sizeof *ud);
38
	*ud->data = ch;
39
}
40
41
/* Copy UTF-8 character. */
42
void
43
utf8_copy(struct utf8_data *to, const struct utf8_data *from)
44
{
45
	u_int	i;
46
47
	memcpy(to, from, sizeof *to);
48
49
	for (i = to->size; i < sizeof to->data; i++)
50
		to->data[i] = '\0';
51
}
52
53
/*
54
 * Open UTF-8 sequence.
55
 *
56
 * 11000010-11011111 C2-DF start of 2-byte sequence
57
 * 11100000-11101111 E0-EF start of 3-byte sequence
58
 * 11110000-11110100 F0-F4 start of 4-byte sequence
59
 */
60
enum utf8_state
61
utf8_open(struct utf8_data *ud, u_char ch)
62
{
63
	memset(ud, 0, sizeof *ud);
64
	if (ch >= 0xc2 && ch <= 0xdf)
65
		ud->size = 2;
66
	else if (ch >= 0xe0 && ch <= 0xef)
67
		ud->size = 3;
68
	else if (ch >= 0xf0 && ch <= 0xf4)
69
		ud->size = 4;
70
	else
71
		return (UTF8_ERROR);
72
	utf8_append(ud, ch);
73
	return (UTF8_MORE);
74
}
75
76
/* Append character to UTF-8, closing if finished. */
77
enum utf8_state
78
utf8_append(struct utf8_data *ud, u_char ch)
79
{
80
	wchar_t	wc;
81
	int	width;
82
83
	if (ud->have >= ud->size)
84
		fatalx("UTF-8 character overflow");
85
	if (ud->size > sizeof ud->data)
86
		fatalx("UTF-8 character size too large");
87
88
	if (ud->have != 0 && (ch & 0xc0) != 0x80)
89
		ud->width = 0xff;
90
91
	ud->data[ud->have++] = ch;
92
	if (ud->have != ud->size)
93
		return (UTF8_MORE);
94
95
	if (ud->width == 0xff)
96
		return (UTF8_ERROR);
97
98
	if (utf8_combine(ud, &wc) != UTF8_DONE)
99
		return (UTF8_ERROR);
100
	if ((width = utf8_width(wc)) < 0)
101
		return (UTF8_ERROR);
102
	ud->width = width;
103
104
	return (UTF8_DONE);
105
}
106
107
/*
 * Get width of Unicode character.
 *
 * Returns the wcwidth() result for wc. If that result is negative or larger
 * than 0xff, a debug message is logged and -1 is returned instead.
 */
static int
utf8_width(wchar_t wc)
{
	int	width;

	width = wcwidth(wc);
	if (width < 0 || width > 0xff) {
		/*
		 * %x consumes an unsigned int; wchar_t is not guaranteed to
		 * be one, so convert explicitly.
		 */
		log_debug("Unicode %04x, wcwidth() %d", (unsigned int)wc,
		    width);
		return (-1);
	}
	return (width);
}
120
121
/* Combine UTF-8 into Unicode. */
122
enum utf8_state
123
utf8_combine(const struct utf8_data *ud, wchar_t *wc)
124
{
125
	switch (mbtowc(wc, ud->data, ud->size)) {
126
	case -1:
127
		log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
128
		    errno);
129
		mbtowc(NULL, NULL, MB_CUR_MAX);
130
		return (UTF8_ERROR);
131
	case 0:
132
		return (UTF8_ERROR);
133
	default:
134
		return (UTF8_DONE);
135
	}
136
}
137
138
/* Split Unicode into UTF-8. */
139
enum utf8_state
140
utf8_split(wchar_t wc, struct utf8_data *ud)
141
{
142
	char	s[MB_LEN_MAX];
143
	int	slen;
144
145
	slen = wctomb(s, wc);
146
	if (slen <= 0 || slen > (int)sizeof ud->data)
147
		return (UTF8_ERROR);
148
149
	memcpy(ud->data, s, slen);
150
	ud->size = slen;
151
152
	ud->width = utf8_width(wc);
153
	return (UTF8_DONE);
154
}
155
156
/*
157
 * Encode len characters from src into dst, which is guaranteed to have four
158
 * bytes available for each character from src (for \abc or UTF-8) plus space
159
 * for \0.
160
 */
161
int
162
utf8_strvis(char *dst, const char *src, size_t len, int flag)
163
{
164
	struct utf8_data	 ud;
165
	const char		*start, *end;
166
	enum utf8_state		 more;
167
	size_t			 i;
168
169
	start = dst;
170
	end = src + len;
171
172
	while (src < end) {
173
		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
174
			while (++src < end && more == UTF8_MORE)
175
				more = utf8_append(&ud, *src);
176
			if (more == UTF8_DONE) {
177
				/* UTF-8 character finished. */
178
				for (i = 0; i < ud.size; i++)
179
					*dst++ = ud.data[i];
180
				continue;
181
			}
182
			/* Not a complete, valid UTF-8 character. */
183
			src -= ud.have;
184
		}
185
		if (src < end - 1)
186
			dst = vis(dst, src[0], flag, src[1]);
187
		else if (src < end)
188
			dst = vis(dst, src[0], flag, '\0');
189
		src++;
190
	}
191
192
	*dst = '\0';
193
	return (dst - start);
194
}
195
196
/*
197
 * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
198
 * the returned string. Anything not valid printable ASCII or UTF-8 is
199
 * stripped.
200
 */
201
char *
202
utf8_sanitize(const char *src)
203
{
204
	char			*dst;
205
	size_t			 n;
206
	enum utf8_state		 more;
207
	struct utf8_data	 ud;
208
	u_int			 i;
209
210
	dst = NULL;
211
212
	n = 0;
213
	while (*src != '\0') {
214
		dst = xreallocarray(dst, n + 1, sizeof *dst);
215
		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
216
			while (*++src != '\0' && more == UTF8_MORE)
217
				more = utf8_append(&ud, *src);
218
			if (more == UTF8_DONE) {
219
				dst = xreallocarray(dst, n + ud.width,
220
				    sizeof *dst);
221
				for (i = 0; i < ud.width; i++)
222
					dst[n++] = '_';
223
				continue;
224
			}
225
			src -= ud.have;
226
		}
227
		if (*src > 0x1f && *src < 0x7f)
228
			dst[n++] = *src;
229
		else
230
			dst[n++] = '_';
231
		src++;
232
	}
233
234
	dst = xreallocarray(dst, n + 1, sizeof *dst);
235
	dst[n] = '\0';
236
	return (dst);
237
}
238
239
/*
240
 * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
241
 * Caller frees.
242
 */
243
struct utf8_data *
244
utf8_fromcstr(const char *src)
245
{
246
	struct utf8_data	*dst;
247
	size_t			 n;
248
	enum utf8_state		 more;
249
250
	dst = NULL;
251
252
	n = 0;
253
	while (*src != '\0') {
254
		dst = xreallocarray(dst, n + 1, sizeof *dst);
255
		if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
256
			while (*++src != '\0' && more == UTF8_MORE)
257
				more = utf8_append(&dst[n], *src);
258
			if (more == UTF8_DONE) {
259
				n++;
260
				continue;
261
			}
262
			src -= dst[n].have;
263
		}
264
		utf8_set(&dst[n], *src);
265
		n++;
266
		src++;
267
	}
268
269
	dst = xreallocarray(dst, n + 1, sizeof *dst);
270
	dst[n].size = 0;
271
	return (dst);
272
}
273
274
/* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
275
char *
276
utf8_tocstr(struct utf8_data *src)
277
{
278
	char	*dst;
279
	size_t	 n;
280
281
	dst = NULL;
282
283
	n = 0;
284
	for(; src->size != 0; src++) {
285
		dst = xreallocarray(dst, n + src->size, 1);
286
		memcpy(dst + n, src->data, src->size);
287
		n += src->size;
288
	}
289
290
	dst = xreallocarray(dst, n + 1, 1);
291
	dst[n] = '\0';
292
	return (dst);
293
}
294
295
/* Get width of UTF-8 string. */
296
u_int
297
utf8_cstrwidth(const char *s)
298
{
299
	struct utf8_data	tmp;
300
	u_int			width;
301
	enum utf8_state		more;
302
303
	width = 0;
304
	while (*s != '\0') {
305
		if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
306
			while (*++s != '\0' && more == UTF8_MORE)
307
				more = utf8_append(&tmp, *s);
308
			if (more == UTF8_DONE) {
309
				width += tmp.width;
310
				continue;
311
			}
312
			s -= tmp.have;
313
		}
314
		if (*s > 0x1f && *s != 0x7f)
315
			width++;
316
		s++;
317
	}
318
	return (width);
319
}
320
321
/* Trim UTF-8 string to width. Caller frees. */
322
char *
323
utf8_trimcstr(const char *s, u_int width)
324
{
325
	struct utf8_data	*tmp, *next;
326
	char			*out;
327
	u_int			 at;
328
329
	tmp = utf8_fromcstr(s);
330
331
	at = 0;
332
	for (next = tmp; next->size != 0; next++) {
333
		if (at + next->width > width) {
334
			next->size = 0;
335
			break;
336
		}
337
		at += next->width;
338
	}
339
340
	out = utf8_tocstr(tmp);
341
	free(tmp);
342
	return (out);
343
}
344
345
/* Trim UTF-8 string to width. Caller frees. */
346
char *
347
utf8_rtrimcstr(const char *s, u_int width)
348
{
349
	struct utf8_data	*tmp, *next, *end;
350
	char			*out;
351
	u_int			 at;
352
353
	tmp = utf8_fromcstr(s);
354
355
	for (end = tmp; end->size != 0; end++)
356
		/* nothing */;
357
	if (end == tmp) {
358
		free(tmp);
359
		return (xstrdup(""));
360
	}
361
	next = end - 1;
362
363
	at = 0;
364
	for (;;)
365
	{
366
		if (at + next->width > width) {
367
			next++;
368
			break;
369
		}
370
		at += next->width;
371
372
		if (next == tmp)
373
			break;
374
		next--;
375
	}
376
377
	out = utf8_tocstr(next);
378
	free(tmp);
379
	return (out);
380
}
381
382
/* Pad UTF-8 string to width. Caller frees. */
383
char *
384
utf8_padcstr(const char *s, u_int width)
385
{
386
	size_t	 slen;
387
	char	*out;
388
	u_int	  n, i;
389
390
	n = utf8_cstrwidth(s);
391
	if (n >= width)
392
		return (xstrdup(s));
393
394
	slen = strlen(s);
395
	out = xmalloc(slen + 1 + (width - n));
396
	memcpy(out, s, slen);
397
	for (i = n; i < width; i++)
398
		out[slen++] = ' ';
399
	out[slen] = '\0';
400
	return (out);
401
}