1 |
|
|
/* $OpenBSD: utf8.c,v 1.7 2017/05/31 09:15:42 deraadt Exp $ */ |
2 |
|
|
/* |
3 |
|
|
* Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org> |
4 |
|
|
* |
5 |
|
|
* Permission to use, copy, modify, and distribute this software for any |
6 |
|
|
* purpose with or without fee is hereby granted, provided that the above |
7 |
|
|
* copyright notice and this permission notice appear in all copies. |
8 |
|
|
* |
9 |
|
|
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
10 |
|
|
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
11 |
|
|
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
12 |
|
|
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
13 |
|
|
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
14 |
|
|
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
15 |
|
|
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
16 |
|
|
*/ |
17 |
|
|
|
18 |
|
|
/* |
19 |
|
|
* Utility functions for multibyte-character handling, |
20 |
|
|
* in particular to sanitize untrusted strings for terminal output. |
21 |
|
|
*/ |
22 |
|
|
|
23 |
|
|
#include <sys/types.h> |
24 |
|
|
#include <langinfo.h> |
25 |
|
|
#include <limits.h> |
26 |
|
|
#include <stdarg.h> |
27 |
|
|
#include <stdio.h> |
28 |
|
|
#include <stdlib.h> |
29 |
|
|
#include <string.h> |
30 |
|
|
#include <vis.h> |
31 |
|
|
#include <wchar.h> |
32 |
|
|
|
33 |
|
|
#include "utf8.h" |
34 |
|
|
|
35 |
|
|
static int dangerous_locale(void); |
36 |
|
|
static int grow_dst(char **, size_t *, size_t, char **, size_t); |
37 |
|
|
static int vasnmprintf(char **, size_t, int *, const char *, va_list); |
38 |
|
|
|
39 |
|
|
|
40 |
|
|
/*
 * Tell whether the character encoding of the current locale is one
 * that cannot be sanitized safely.
 *
 * For US-ASCII and UTF-8 encodings, we can safely recover from
 * encoding errors and from non-printable characters.  For any other
 * encoding, err on the side of caution and make the caller abort
 * parsing:  For state-dependent encodings, recovery is impossible.
 * For arbitrary encodings, replacement of non-printable characters
 * would be non-trivial and too fragile.
 *
 * Returns 0 if nl_langinfo(CODESET) reports one of the accepted
 * codeset names (including the empty string), 1 otherwise.
 */
static int
dangerous_locale(void)
{
	/* Codeset names for which recovery is considered safe. */
	static const char *const accepted[] = {
		"US-ASCII", "UTF-8", "ANSI_X3.4-1968", "646", ""
	};
	const char	*codeset;
	size_t		 i;

	codeset = nl_langinfo(CODESET);
	for (i = 0; i < sizeof(accepted) / sizeof(accepted[0]); i++) {
		if (strcmp(codeset, accepted[i]) == 0)
			return 0;
	}
	return 1;
}
58 |
|
|
|
59 |
|
|
/*
 * Make sure that more than "need" bytes are available in the buffer
 * *dst (of *sz bytes) beyond the write position *dp, i.e. room for
 * "need" bytes plus at least one spare byte.
 *
 * If there is not enough room, the buffer is resized with
 * recallocarray() to *sz + 128 bytes, capped at maxsz, and *dst, *sz,
 * and *dp are updated to describe the new buffer; *dp keeps its
 * offset from the start.  The function does not verify that the
 * resized buffer is actually large enough; callers check their
 * limits against maxsz before calling.
 *
 * Returns 0 on success (with or without resizing) and -1 if the
 * allocation fails, in which case *dst, *sz, and *dp are unchanged.
 */
static int
grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp, size_t need)
{
	char	*tp;
	size_t	 off;	/* Offset of the write position within *dst. */
	size_t	 tsz;

	/*
	 * Take the offset while the old buffer is still valid: once
	 * recallocarray() has moved the allocation, the old pointer
	 * values must not be used any more, not even for arithmetic.
	 */
	off = *dp - *dst;

	/*
	 * Same test as "*dp + need < *dst + *sz", but done on sizes so
	 * that no pointer beyond the end of the buffer is ever formed.
	 */
	if (off < *sz && need < *sz - off)
		return 0;

	tsz = *sz + 128;
	if (tsz > maxsz)
		tsz = maxsz;
	if ((tp = recallocarray(*dst, *sz, tsz, 1)) == NULL)
		return -1;
	*dp = tp + off;
	*dst = tp;
	*sz = tsz;
	return 0;
}
77 |
|
|
|
78 |
|
|
/* |
79 |
|
|
* The following two functions limit the number of bytes written, |
80 |
|
|
* including the terminating '\0', to sz. Unless wp is NULL, |
81 |
|
|
* they limit the number of display columns occupied to *wp. |
82 |
|
|
* Whichever is reached first terminates the output string. |
83 |
|
|
* To stay close to the standard interfaces, they return the number of |
84 |
|
|
* non-NUL bytes that would have been written if both were unlimited. |
85 |
|
|
* If wp is NULL, newline, carriage return, and tab are allowed; |
86 |
|
|
* otherwise, the actual number of columns occupied by what was |
87 |
|
|
* written is returned in *wp. |
88 |
|
|
*/ |
89 |
|
|
|
90 |
|
|
static int |
91 |
|
|
vasnmprintf(char **str, size_t maxsz, int *wp, const char *fmt, va_list ap) |
92 |
|
|
{ |
93 |
|
|
char *src; /* Source string returned from vasprintf. */ |
94 |
|
|
char *sp; /* Pointer into src. */ |
95 |
|
|
char *dst; /* Destination string to be returned. */ |
96 |
|
|
char *dp; /* Pointer into dst. */ |
97 |
|
|
char *tp; /* Temporary pointer for dst. */ |
98 |
|
|
size_t sz; /* Number of bytes allocated for dst. */ |
99 |
|
|
wchar_t wc; /* Wide character at sp. */ |
100 |
|
|
int len; /* Number of bytes in the character at sp. */ |
101 |
|
|
int ret; /* Number of bytes needed to format src. */ |
102 |
|
|
int width; /* Display width of the character wc. */ |
103 |
|
|
int total_width, max_width, print; |
104 |
|
|
|
105 |
|
|
src = NULL; |
106 |
|
|
if ((ret = vasprintf(&src, fmt, ap)) <= 0) |
107 |
|
|
goto fail; |
108 |
|
|
|
109 |
|
|
sz = strlen(src) + 1; |
110 |
|
|
if ((dst = malloc(sz)) == NULL) { |
111 |
|
|
free(src); |
112 |
|
|
ret = -1; |
113 |
|
|
goto fail; |
114 |
|
|
} |
115 |
|
|
|
116 |
|
|
if (maxsz > INT_MAX) |
117 |
|
|
maxsz = INT_MAX; |
118 |
|
|
|
119 |
|
|
sp = src; |
120 |
|
|
dp = dst; |
121 |
|
|
ret = 0; |
122 |
|
|
print = 1; |
123 |
|
|
total_width = 0; |
124 |
|
|
max_width = wp == NULL ? INT_MAX : *wp; |
125 |
|
|
while (*sp != '\0') { |
126 |
|
|
if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) { |
127 |
|
|
(void)mbtowc(NULL, NULL, MB_CUR_MAX); |
128 |
|
|
if (dangerous_locale()) { |
129 |
|
|
ret = -1; |
130 |
|
|
break; |
131 |
|
|
} |
132 |
|
|
len = 1; |
133 |
|
|
width = -1; |
134 |
|
|
} else if (wp == NULL && |
135 |
|
|
(wc == L'\n' || wc == L'\r' || wc == L'\t')) { |
136 |
|
|
/* |
137 |
|
|
* Don't use width uninitialized; the actual |
138 |
|
|
* value doesn't matter because total_width |
139 |
|
|
* is only returned for wp != NULL. |
140 |
|
|
*/ |
141 |
|
|
width = 0; |
142 |
|
|
} else if ((width = wcwidth(wc)) == -1 && |
143 |
|
|
dangerous_locale()) { |
144 |
|
|
ret = -1; |
145 |
|
|
break; |
146 |
|
|
} |
147 |
|
|
|
148 |
|
|
/* Valid, printable character. */ |
149 |
|
|
|
150 |
|
|
if (width >= 0) { |
151 |
|
|
if (print && (dp - dst >= (int)maxsz - len || |
152 |
|
|
total_width > max_width - width)) |
153 |
|
|
print = 0; |
154 |
|
|
if (print) { |
155 |
|
|
if (grow_dst(&dst, &sz, maxsz, |
156 |
|
|
&dp, len) == -1) { |
157 |
|
|
ret = -1; |
158 |
|
|
break; |
159 |
|
|
} |
160 |
|
|
total_width += width; |
161 |
|
|
memcpy(dp, sp, len); |
162 |
|
|
dp += len; |
163 |
|
|
} |
164 |
|
|
sp += len; |
165 |
|
|
if (ret >= 0) |
166 |
|
|
ret += len; |
167 |
|
|
continue; |
168 |
|
|
} |
169 |
|
|
|
170 |
|
|
/* Escaping required. */ |
171 |
|
|
|
172 |
|
|
while (len > 0) { |
173 |
|
|
if (print && (dp - dst >= (int)maxsz - 4 || |
174 |
|
|
total_width > max_width - 4)) |
175 |
|
|
print = 0; |
176 |
|
|
if (print) { |
177 |
|
|
if (grow_dst(&dst, &sz, maxsz, |
178 |
|
|
&dp, 4) == -1) { |
179 |
|
|
ret = -1; |
180 |
|
|
break; |
181 |
|
|
} |
182 |
|
|
tp = vis(dp, *sp, VIS_OCTAL | VIS_ALL, 0); |
183 |
|
|
width = tp - dp; |
184 |
|
|
total_width += width; |
185 |
|
|
dp = tp; |
186 |
|
|
} else |
187 |
|
|
width = 4; |
188 |
|
|
len--; |
189 |
|
|
sp++; |
190 |
|
|
if (ret >= 0) |
191 |
|
|
ret += width; |
192 |
|
|
} |
193 |
|
|
if (len > 0) |
194 |
|
|
break; |
195 |
|
|
} |
196 |
|
|
free(src); |
197 |
|
|
*dp = '\0'; |
198 |
|
|
*str = dst; |
199 |
|
|
if (wp != NULL) |
200 |
|
|
*wp = total_width; |
201 |
|
|
|
202 |
|
|
/* |
203 |
|
|
* If the string was truncated by the width limit but |
204 |
|
|
* would have fit into the size limit, the only sane way |
205 |
|
|
* to report the problem is using the return value, such |
206 |
|
|
* that the usual idiom "if (ret < 0 || ret >= sz) error" |
207 |
|
|
* works as expected. |
208 |
|
|
*/ |
209 |
|
|
|
210 |
|
|
if (ret < (int)maxsz && !print) |
211 |
|
|
ret = -1; |
212 |
|
|
return ret; |
213 |
|
|
|
214 |
|
|
fail: |
215 |
|
|
if (wp != NULL) |
216 |
|
|
*wp = 0; |
217 |
|
|
if (ret == 0) { |
218 |
|
|
*str = src; |
219 |
|
|
return 0; |
220 |
|
|
} else { |
221 |
|
|
*str = NULL; |
222 |
|
|
return -1; |
223 |
|
|
} |
224 |
|
|
} |
225 |
|
|
|
226 |
|
|
/*
 * Format into the caller-supplied buffer str of sz bytes, sanitizing
 * the result with vasnmprintf().
 *
 * At most sz bytes, including the terminating '\0', are stored in str.
 * Unless wp is NULL, *wp limits the number of display columns on entry
 * and is set by vasnmprintf() on return.  If vasnmprintf() provides no
 * string, str is set to the empty string.
 *
 * Nothing is written to str when sz is 0.
 *
 * Returns the value returned by vasnmprintf().
 */
int
snmprintf(char *str, size_t sz, int *wp, const char *fmt, ...)
{
	va_list	 ap;
	char	*cp = NULL;
	int	 ret;

	va_start(ap, fmt);
	ret = vasnmprintf(&cp, sz, wp, fmt, ap);
	va_end(ap);
	if (cp != NULL) {
		/* Truncation, if any, is reported through ret. */
		(void)strlcpy(str, cp, sz);
		free(cp);
	} else if (sz > 0) {
		/* A zero-sized buffer has no room even for the '\0'. */
		*str = '\0';
	}
	return ret;
}
243 |
|
|
|
244 |
|
|
/*
 * To stay close to the standard interfaces, vfmprintf(), fmprintf(),
 * and mprintf() return the number of non-NUL bytes written.
 *
 * vfmprintf() formats fmt/ap through vasnmprintf() with no column
 * limit and a size limit of INT_MAX, then writes the sanitized string
 * to stream with fputs().
 *
 * Returns the byte count reported by vasnmprintf(), or -1 if
 * vasnmprintf() fails or fputs() returns EOF.
 */
int
vfmprintf(FILE *stream, const char *fmt, va_list ap)
{
	char	*str = NULL;
	int	 ret;

	if ((ret = vasnmprintf(&str, INT_MAX, NULL, fmt, ap)) < 0) {
		/*
		 * vasnmprintf() may hand back an allocated, partially
		 * sanitized string even when it fails; don't leak it.
		 */
		free(str);
		return -1;
	}
	if (fputs(str, stream) == EOF)
		ret = -1;
	free(str);
	return ret;
}
262 |
|
|
|
263 |
|
|
/*
 * Variadic front end to vfmprintf(): format the arguments according
 * to fmt and write the sanitized result to stream.
 *
 * Returns whatever vfmprintf() returns.
 */
int
fmprintf(FILE *stream, const char *fmt, ...)
{
	va_list	 args;
	int	 nbytes;

	va_start(args, fmt);
	nbytes = vfmprintf(stream, fmt, args);
	va_end(args);

	return nbytes;
}
274 |
|
|
|
275 |
|
|
/*
 * Variadic front end to vfmprintf() that writes to stdout: format the
 * arguments according to fmt and print the sanitized result.
 *
 * Returns whatever vfmprintf() returns.
 */
int
mprintf(const char *fmt, ...)
{
	va_list	 args;
	int	 nbytes;

	va_start(args, fmt);
	nbytes = vfmprintf(stdout, fmt, args);
	va_end(args);

	return nbytes;
}