1 |
|
|
/* $OpenBSD: preconv.c,v 1.8 2017/02/18 13:43:34 schwarze Exp $ */ |
2 |
|
|
/* |
3 |
|
|
* Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv> |
4 |
|
|
* Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org> |
5 |
|
|
* |
6 |
|
|
* Permission to use, copy, modify, and distribute this software for any |
7 |
|
|
* purpose with or without fee is hereby granted, provided that the above |
8 |
|
|
* copyright notice and this permission notice appear in all copies. |
9 |
|
|
* |
10 |
|
|
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
11 |
|
|
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
12 |
|
|
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
13 |
|
|
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
14 |
|
|
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
15 |
|
|
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
16 |
|
|
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
17 |
|
|
*/ |
18 |
|
|
#include <sys/types.h> |
19 |
|
|
|
20 |
|
|
#include <assert.h> |
21 |
|
|
#include <stdio.h> |
22 |
|
|
#include <string.h> |
23 |
|
|
#include "mandoc.h" |
24 |
|
|
#include "libmandoc.h" |
25 |
|
|
|
26 |
|
|
int |
27 |
|
|
preconv_encode(const struct buf *ib, size_t *ii, struct buf *ob, size_t *oi, |
28 |
|
|
int *filenc) |
29 |
|
|
{ |
30 |
|
|
const unsigned char *cu; |
31 |
|
|
int nby; |
32 |
|
|
unsigned int accum; |
33 |
|
|
|
34 |
|
1958 |
cu = (const unsigned char *)ib->buf + *ii; |
35 |
✗✓ |
979 |
assert(*cu & 0x80); |
36 |
|
|
|
37 |
✓✗ |
979 |
if ( ! (*filenc & MPARSE_UTF8)) |
38 |
|
|
goto latin; |
39 |
|
|
|
40 |
|
|
nby = 1; |
41 |
✓✓✓✓
|
7733 |
while (nby < 5 && *cu & (1 << (7 - nby))) |
42 |
|
957 |
nby++; |
43 |
|
|
|
44 |
✓✓✓✓
|
1034 |
switch (nby) { |
45 |
|
|
case 2: |
46 |
|
99 |
accum = *cu & 0x1f; |
47 |
✓✓ |
99 |
if (accum < 0x02) /* Obfuscated ASCII. */ |
48 |
|
|
goto latin; |
49 |
|
|
break; |
50 |
|
|
case 3: |
51 |
|
143 |
accum = *cu & 0x0f; |
52 |
|
143 |
break; |
53 |
|
|
case 4: |
54 |
|
176 |
accum = *cu & 0x07; |
55 |
✓✓ |
176 |
if (accum > 0x04) /* Beyond Unicode. */ |
56 |
|
|
goto latin; |
57 |
|
|
break; |
58 |
|
|
default: /* Bad sequence header. */ |
59 |
|
|
goto latin; |
60 |
|
|
} |
61 |
|
|
|
62 |
|
517 |
cu++; |
63 |
✓✓✓ |
517 |
switch (nby) { |
64 |
|
|
case 3: |
65 |
✓✓✓✓ ✓✓ |
242 |
if ((accum == 0x00 && ! (*cu & 0x20)) || /* Use 2-byte. */ |
66 |
✓✓ |
154 |
(accum == 0x0d && *cu & 0x20)) /* Surrogates. */ |
67 |
|
|
goto latin; |
68 |
|
|
break; |
69 |
|
|
case 4: |
70 |
✓✓✓✓ ✓✓ |
286 |
if ((accum == 0x00 && ! (*cu & 0x30)) || /* Use 3-byte. */ |
71 |
✓✓ |
132 |
(accum == 0x04 && *cu & 0x30)) /* Beyond Unicode. */ |
72 |
|
|
goto latin; |
73 |
|
|
break; |
74 |
|
|
default: |
75 |
|
|
break; |
76 |
|
|
} |
77 |
|
|
|
78 |
✓✓ |
1276 |
while (--nby) { |
79 |
✓✓ |
440 |
if ((*cu & 0xc0) != 0x80) /* Invalid continuation. */ |
80 |
|
|
goto latin; |
81 |
|
418 |
accum <<= 6; |
82 |
|
418 |
accum += *cu & 0x3f; |
83 |
|
418 |
cu++; |
84 |
|
|
} |
85 |
|
|
|
86 |
✗✓ |
198 |
assert(accum > 0x7f); |
87 |
✗✓ |
198 |
assert(accum < 0x110000); |
88 |
✗✓ |
198 |
assert(accum < 0xd800 || accum > 0xdfff); |
89 |
|
|
|
90 |
|
198 |
*oi += snprintf(ob->buf + *oi, 11, "\\[u%.4X]", accum); |
91 |
|
198 |
*ii = (const char *)cu - ib->buf; |
92 |
|
198 |
*filenc &= ~MPARSE_LATIN1; |
93 |
|
198 |
return 1; |
94 |
|
|
|
95 |
|
|
latin: |
96 |
✓✗ |
781 |
if ( ! (*filenc & MPARSE_LATIN1)) |
97 |
|
781 |
return 0; |
98 |
|
|
|
99 |
|
|
*oi += snprintf(ob->buf + *oi, 11, |
100 |
|
|
"\\[u%.4X]", (unsigned char)ib->buf[(*ii)++]); |
101 |
|
|
|
102 |
|
|
*filenc &= ~MPARSE_UTF8; |
103 |
|
|
return 1; |
104 |
|
979 |
} |
105 |
|
|
|
106 |
|
|
int |
107 |
|
|
preconv_cue(const struct buf *b, size_t offset) |
108 |
|
|
{ |
109 |
|
|
const char *ln, *eoln, *eoph; |
110 |
|
|
size_t sz, phsz; |
111 |
|
|
|
112 |
|
16784 |
ln = b->buf + offset; |
113 |
|
8392 |
sz = b->sz - offset; |
114 |
|
|
|
115 |
|
|
/* Look for the end-of-line. */ |
116 |
|
|
|
117 |
✗✓ |
8392 |
if (NULL == (eoln = memchr(ln, '\n', sz))) |
118 |
|
|
eoln = ln + sz; |
119 |
|
|
|
120 |
|
|
/* Check if we have the correct header/trailer. */ |
121 |
|
|
|
122 |
✓✓✗✗
|
8392 |
if ((sz = (size_t)(eoln - ln)) < 10 || |
123 |
✗✓ |
7737 |
memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3)) |
124 |
|
8392 |
return MPARSE_UTF8 | MPARSE_LATIN1; |
125 |
|
|
|
126 |
|
|
/* Move after the header and adjust for the trailer. */ |
127 |
|
|
|
128 |
|
|
ln += 7; |
129 |
|
|
sz -= 10; |
130 |
|
|
|
131 |
|
|
while (sz > 0) { |
132 |
|
|
while (sz > 0 && ' ' == *ln) { |
133 |
|
|
ln++; |
134 |
|
|
sz--; |
135 |
|
|
} |
136 |
|
|
if (0 == sz) |
137 |
|
|
break; |
138 |
|
|
|
139 |
|
|
/* Find the end-of-phrase marker (or eoln). */ |
140 |
|
|
|
141 |
|
|
if (NULL == (eoph = memchr(ln, ';', sz))) |
142 |
|
|
eoph = eoln - 3; |
143 |
|
|
else |
144 |
|
|
eoph++; |
145 |
|
|
|
146 |
|
|
/* Only account for the "coding" phrase. */ |
147 |
|
|
|
148 |
|
|
if ((phsz = eoph - ln) < 7 || |
149 |
|
|
strncasecmp(ln, "coding:", 7)) { |
150 |
|
|
sz -= phsz; |
151 |
|
|
ln += phsz; |
152 |
|
|
continue; |
153 |
|
|
} |
154 |
|
|
|
155 |
|
|
sz -= 7; |
156 |
|
|
ln += 7; |
157 |
|
|
|
158 |
|
|
while (sz > 0 && ' ' == *ln) { |
159 |
|
|
ln++; |
160 |
|
|
sz--; |
161 |
|
|
} |
162 |
|
|
if (0 == sz) |
163 |
|
|
return 0; |
164 |
|
|
|
165 |
|
|
/* Check us against known encodings. */ |
166 |
|
|
|
167 |
|
|
if (phsz > 4 && !strncasecmp(ln, "utf-8", 5)) |
168 |
|
|
return MPARSE_UTF8; |
169 |
|
|
if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11)) |
170 |
|
|
return MPARSE_LATIN1; |
171 |
|
|
return 0; |
172 |
|
|
} |
173 |
|
|
return MPARSE_UTF8 | MPARSE_LATIN1; |
174 |
|
8392 |
} |