GCC Code Coverage Report
Directory: ./ Exec Total Coverage
File: usr.bin/mandoc/mandoc.c Lines: 209 223 93.7 %
Date: 2017-11-07 Branches: 182 240 75.8 %

Line Branch Exec Source
1
/*	$OpenBSD: mandoc.c,v 1.71 2017/07/03 13:40:00 schwarze Exp $ */
2
/*
3
 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4
 * Copyright (c) 2011-2015, 2017 Ingo Schwarze <schwarze@openbsd.org>
5
 *
6
 * Permission to use, copy, modify, and distribute this software for any
7
 * purpose with or without fee is hereby granted, provided that the above
8
 * copyright notice and this permission notice appear in all copies.
9
 *
10
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17
 */
18
#include <sys/types.h>
19
20
#include <assert.h>
21
#include <ctype.h>
22
#include <errno.h>
23
#include <limits.h>
24
#include <stdlib.h>
25
#include <stdio.h>
26
#include <string.h>
27
#include <time.h>
28
29
#include "mandoc_aux.h"
30
#include "mandoc.h"
31
#include "roff.h"
32
#include "libmandoc.h"
33
34
static	int	 a2time(time_t *, const char *, const char *);
35
static	char	*time2a(time_t);
36
37
38
enum mandoc_esc
39
mandoc_escape(const char **end, const char **start, int *sz)
40
{
41
8581550
	const char	*local_start;
42
4290775
	int		 local_sz;
43
	char		 term;
44
	enum mandoc_esc	 gly;
45
46
	/*
47
	 * When the caller doesn't provide return storage,
48
	 * use local storage.
49
	 */
50
51
4290775
	if (NULL == start)
52
2435997
		start = &local_start;
53
4290775
	if (NULL == sz)
54
2435997
		sz = &local_sz;
55
56
	/*
57
	 * Beyond the backslash, at least one input character
58
	 * is part of the escape sequence.  With one exception
59
	 * (see below), that character won't be returned.
60
	 */
61
62
	gly = ESCAPE_ERROR;
63
4291063
	*start = ++*end;
64
4291063
	*sz = 0;
65
	term = '\0';
66
67








4291063
	switch ((*start)[-1]) {
68
	/*
69
	 * First the glyphs.  There are several different forms of
70
	 * these, but each eventually returns a substring of the glyph
71
	 * name.
72
	 */
73
	case '(':
74
		gly = ESCAPE_SPECIAL;
75
267396
		*sz = 2;
76
267396
		break;
77
	case '[':
78
		gly = ESCAPE_SPECIAL;
79
		term = ']';
80
36231
		break;
81
	case 'C':
82
486
		if ('\'' != **start)
83
			return ESCAPE_ERROR;
84
486
		*start = ++*end;
85
		gly = ESCAPE_SPECIAL;
86
		term = '\'';
87
486
		break;
88
89
	/*
90
	 * Escapes taking no arguments at all.
91
	 */
92
	case 'd':
93
	case 'u':
94
	case ',':
95
	case '/':
96
192
		return ESCAPE_IGNORE;
97
	case 'p':
98
108
		return ESCAPE_BREAK;
99
100
	/*
101
	 * The \z escape is supposed to output the following
102
	 * character without advancing the cursor position.
103
	 * Since we are mostly dealing with terminal mode,
104
	 * let us just skip the next character.
105
	 */
106
	case 'z':
107
303
		return ESCAPE_SKIPCHAR;
108
109
	/*
110
	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
111
	 * 'X' is the trigger.  These have opaque sub-strings.
112
	 */
113
	case 'F':
114
	case 'g':
115
	case 'k':
116
	case 'M':
117
	case 'm':
118
	case 'n':
119
	case 'V':
120
	case 'Y':
121
135
		gly = ESCAPE_IGNORE;
122
		/* FALLTHROUGH */
123
	case 'f':
124
1982001
		if (ESCAPE_ERROR == gly)
125
1981866
			gly = ESCAPE_FONT;
126
1982001
		switch (**start) {
127
		case '(':
128
578137
			*start = ++*end;
129
578137
			*sz = 2;
130
578137
			break;
131
		case '[':
132
180
			*start = ++*end;
133
			term = ']';
134
180
			break;
135
		default:
136
1403684
			*sz = 1;
137
1403684
			break;
138
		}
139
		break;
140
141
	/*
142
	 * These escapes are of the form \X'Y', where 'X' is the trigger
143
	 * and 'Y' is any string.  These have opaque sub-strings.
144
	 * The \B and \w escapes are handled in roff.c, roff_res().
145
	 */
146
	case 'A':
147
	case 'b':
148
	case 'D':
149
	case 'R':
150
	case 'X':
151
	case 'Z':
152
153
		gly = ESCAPE_IGNORE;
153
		/* FALLTHROUGH */
154
	case 'o':
155
366
		if (**start == '\0')
156
			return ESCAPE_ERROR;
157
366
		if (gly == ESCAPE_ERROR)
158
213
			gly = ESCAPE_OVERSTRIKE;
159
366
		term = **start;
160
366
		*start = ++*end;
161
366
		break;
162
163
	/*
164
	 * These escapes are of the form \X'N', where 'X' is the trigger
165
	 * and 'N' resolves to a numerical expression.
166
	 */
167
	case 'h':
168
	case 'H':
169
	case 'L':
170
	case 'l':
171
	case 'S':
172
	case 'v':
173
	case 'x':
174
47894
		if (strchr(" %&()*+-./0123456789:<=>", **start)) {
175
90
			if ('\0' != **start)
176
90
				++*end;
177
90
			return ESCAPE_ERROR;
178
		}
179
47804
		switch ((*start)[-1]) {
180
		case 'h':
181
			gly = ESCAPE_HORIZ;
182
36425
			break;
183
		case 'l':
184
			gly = ESCAPE_HLINE;
185
315
			break;
186
		default:
187
			gly = ESCAPE_IGNORE;
188
11064
			break;
189
		}
190
47804
		term = **start;
191
47804
		*start = ++*end;
192
47804
		break;
193
194
	/*
195
	 * Special handling for the numbered character escape.
196
	 * XXX Do any other escapes need similar handling?
197
	 */
198
	case 'N':
199
3213
		if ('\0' == **start)
200
			return ESCAPE_ERROR;
201
3213
		(*end)++;
202
3213
		if (isdigit((unsigned char)**start)) {
203
27
			*sz = 1;
204
27
			return ESCAPE_IGNORE;
205
		}
206
		(*start)++;
207
13203
		while (isdigit((unsigned char)**end))
208
			(*end)++;
209
3186
		*sz = *end - *start;
210
3186
		if ('\0' != **end)
211
3186
			(*end)++;
212
3186
		return ESCAPE_NUMBERED;
213
214
	/*
215
	 * Sizes get a special category of their own.
216
	 */
217
	case 's':
218
		gly = ESCAPE_IGNORE;
219
220
		/* See +/- counts as a sign. */
221

1241745
		if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
222
248745
			*start = ++*end;
223
224

496880
		switch (**end) {
225
		case '(':
226
90
			*start = ++*end;
227
90
			*sz = 2;
228
90
			break;
229
		case '[':
230
90
			*start = ++*end;
231
			term = ']';
232
90
			break;
233
		case '\'':
234
180
			*start = ++*end;
235
			term = '\'';
236
180
			break;
237
		case '3':
238
		case '2':
239
		case '1':
240
496950
			*sz = (*end)[-1] == 's' &&
241
			    isdigit((unsigned char)(*end)[1]) ? 2 : 1;
242
248475
			break;
243
		default:
244
248045
			*sz = 1;
245
248045
			break;
246
		}
247
248
		break;
249
250
	/*
251
	 * Anything else is assumed to be a glyph.
252
	 * In this case, pass back the character after the backslash.
253
	 */
254
	default:
255
		gly = ESCAPE_SPECIAL;
256
1455705
		*start = --*end;
257
1455705
		*sz = 1;
258
1455705
		break;
259
	}
260
261
4286869
	assert(ESCAPE_ERROR != gly);
262
263
	/*
264
	 * Read up to the terminating character,
265
	 * paying attention to nested escapes.
266
	 */
267
268
4286869
	if ('\0' != term) {
269
448274
		while (**end != term) {
270
362937
			switch (**end) {
271
			case '\0':
272
				return ESCAPE_ERROR;
273
			case '\\':
274
459
				(*end)++;
275
459
				if (ESCAPE_ERROR ==
276
459
				    mandoc_escape(end, NULL, NULL))
277
					return ESCAPE_ERROR;
278
				break;
279
			default:
280
362478
				(*end)++;
281
362478
				break;
282
			}
283
		}
284
85337
		*sz = (*end)++ - *start;
285
85337
	} else {
286
4201532
		assert(*sz > 0);
287
4201532
		if ((size_t)*sz > strlen(*start))
288
48
			return ESCAPE_ERROR;
289
4201484
		*end += *sz;
290
	}
291
292
	/* Run post-processors. */
293
294
6050051
	switch (gly) {
295
	case ESCAPE_FONT:
296
1981863
		if (2 == *sz) {
297
578119
			if ('C' == **start) {
298
				/*
299
				 * Treat constant-width font modes
300
				 * just like regular font modes.
301
				 */
302
574707
				(*start)++;
303
574707
				(*sz)--;
304
			} else {
305

6824
				if ('B' == (*start)[0] && 'I' == (*start)[1])
306
3412
					gly = ESCAPE_FONTBI;
307
				break;
308
			}
309
1978451
		} else if (1 != *sz)
310
			break;
311
312


3403482
		switch (**start) {
313
		case '3':
314
		case 'B':
315
			gly = ESCAPE_FONTBOLD;
316
176730
			break;
317
		case '2':
318
		case 'I':
319
			gly = ESCAPE_FONTITALIC;
320
208190
			break;
321
		case 'P':
322
			gly = ESCAPE_FONTPREV;
323
29040
			break;
324
		case '1':
325
		case 'R':
326
			gly = ESCAPE_FONTROMAN;
327
1011071
			break;
328
		}
329
		break;
330
	case ESCAPE_SPECIAL:
331

3215523
		if (1 == *sz && 'c' == **start)
332
1253
			gly = ESCAPE_NOSPACE;
333
		/*
334
		 * Unicode escapes are defined in groff as \[u0000]
335
		 * to \[u10FFFF], where the contained value must be
336
		 * a valid Unicode codepoint.  Here, however, only
337
		 * check the length and range.
338
		 */
339

1825476
		if (**start != 'u' || *sz < 5 || *sz > 7)
340
			break;
341

34488
		if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
342
			break;
343

32643
		if (*sz == 6 && (*start)[1] == '0')
344
			break;
345

61191
		if (*sz == 5 && (*start)[1] == 'D' &&
346
486
		    strchr("89ABCDEF", (*start)[2]) != NULL)
347
			break;
348
62208
		if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
349
31104
		    + 1 == *sz)
350
31005
			gly = ESCAPE_UNICODE;
351
		break;
352
	default:
353
		break;
354
	}
355
356
4286821
	return gly;
357
4290775
}
358
359
/*
360
 * Parse a quoted or unquoted roff-style request or macro argument.
361
 * Return a pointer to the parsed argument, which is either the original
362
 * pointer or advanced by one byte in case the argument is quoted.
363
 * NUL-terminate the argument in place.
364
 * Collapse pairs of quotes inside quoted arguments.
365
 * Advance the argument pointer to the next argument,
366
 * or to the NUL byte terminating the argument line.
367
 */
368
char *
369
mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
370
{
371
	char	 *start, *cp;
372
	int	  quoted, pairs, white;
373
374
	/* Quoting can only start with a new word. */
375
4175334
	start = *cpp;
376
	quoted = 0;
377
2087667
	if ('"' == *start) {
378
		quoted = 1;
379
300756
		start++;
380
300756
	}
381
382
	pairs = 0;
383
	white = 0;
384
31233712
	for (cp = start; '\0' != *cp; cp++) {
385
386
		/*
387
		 * Move the following text left
388
		 * after quoted quotes and after "\\" and "\t".
389
		 */
390
14756804
		if (pairs)
391
283128
			cp[-pairs] = cp[0];
392
393
14756804
		if ('\\' == cp[0]) {
394
			/*
395
			 * In copy mode, translate double to single
396
			 * backslashes and backslash-t to literal tabs.
397
			 */
398

12261312
			switch (cp[1]) {
399
			case 't':
400
36
				cp[0] = '\t';
401
				/* FALLTHROUGH */
402
			case '\\':
403
2060
				pairs++;
404
2060
				cp++;
405
2060
				break;
406
			case ' ':
407
				/* Skip escaped blanks. */
408
1132
				if (0 == quoted)
409
1021
					cp++;
410
				break;
411
			default:
412
				break;
413
			}
414
14588639
		} else if (0 == quoted) {
415
9051603
			if (' ' == cp[0]) {
416
				/* Unescaped blanks end unquoted args. */
417
				white = 1;
418
927464
				break;
419
			}
420
5537036
		} else if ('"' == cp[0]) {
421
330429
			if ('"' == cp[1]) {
422
				/* Quoted quotes collapse. */
423
30278
				pairs++;
424
30278
				cp++;
425
			} else {
426
				/* Unquoted quotes end quoted args. */
427
				quoted = 2;
428
300151
				break;
429
			}
430
30278
		}
431
	}
432
433
	/* Quoted argument without a closing quote. */
434
2087667
	if (1 == quoted)
435
605
		mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL);
436
437
	/* NUL-terminate this argument and move to the next one. */
438
2087667
	if (pairs)
439
14823
		cp[-pairs] = '\0';
440
2087667
	if ('\0' != *cp) {
441
1227615
		*cp++ = '\0';
442
2742518
		while (' ' == *cp)
443
143644
			cp++;
444
	}
445
2087667
	*pos += (int)(cp - start) + (quoted ? 1 : 0);
446
2087667
	*cpp = cp;
447
448

4165369
	if ('\0' == *cp && (white || ' ' == cp[-1]))
449
735
		mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL);
450
451
2087667
	return start;
452
}
453
454
/*
 * Convert the string p to a time_t according to the strptime(3)
 * format fmt.  The whole string has to match the format;
 * trailing garbage is not accepted.
 * Return 1 and store the result of mktime(3) in *t on success.
 * Return 0 and leave *t untouched on failure.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 tm;
	char		*pp;

	/* Start from zero: strptime(3) only sets the fields it parses. */
	memset(&tm, 0, sizeof(tm));

	pp = strptime(p, fmt, &tm);
	if (NULL != pp && '\0' == *pp) {
		*t = mktime(&tm);
		return 1;
	}

	return 0;
}
470
471
/*
 * Format the time t in local time as "Month day, year",
 * with the day printed without padding.
 * Return a buffer allocated with mandoc_malloc() that the caller
 * has to free, or NULL if the time cannot be converted or formatted.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	tm = localtime(&t);
	if (tm == NULL)
		return NULL;

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */

	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	/* Each step below is bounded by its share of the buffer. */

	if ((ssz = strftime(p, 10 + 1, "%B ", tm)) == 0)
		goto fail;
	p += (int)ssz;

	/*
	 * The output format is just "%d" here, not "%2d" or "%02d".
	 * That's also the reason why we can't just format the
	 * date as a whole with "%B %e, %Y" or "%B %d, %Y".
	 * Besides, the present approach is less prone to buffer
	 * overflows, in case anybody should ever introduce the bug
	 * of looking at LC_TIME.
	 */

	if ((isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday)) == -1)
		goto fail;
	p += isz;

	if (strftime(p, 4 + 1, "%Y", tm) == 0)
		goto fail;
	return buf;

fail:
	free(buf);
	return NULL;
}
517
518
char *
519
mandoc_normdate(struct roff_man *man, char *in, int ln, int pos)
520
{
521
	char		*cp;
522
47942
	time_t		 t;
523
524
	/* No date specified: use today's date. */
525
526

71875
	if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) {
527
19
		mandoc_msg(MANDOCERR_DATE_MISSING, man->parse, ln, pos, NULL);
528
19
		return time2a(time(NULL));
529
	}
530
531
	/* Valid mdoc(7) date format. */
532
533

31009
	if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) ||
534
7057
	    a2time(&t, "%b %d, %Y", in)) {
535
21099
		cp = time2a(t);
536
21099
		if (t > time(NULL) + 86400)
537
			mandoc_msg(MANDOCERR_DATE_FUTURE, man->parse,
538
			    ln, pos, cp);
539
21099
		return cp;
540
	}
541
542
	/* In man(7), do not warn about the legacy format. */
543
544
2853
	if (a2time(&t, "%Y-%m-%d", in) == 0)
545
338
		mandoc_msg(MANDOCERR_DATE_BAD, man->parse, ln, pos, in);
546
2515
	else if (t > time(NULL) + 86400)
547
		mandoc_msg(MANDOCERR_DATE_FUTURE, man->parse, ln, pos, in);
548
2515
	else if (man->macroset == MACROSET_MDOC)
549
36
		mandoc_vmsg(MANDOCERR_DATE_LEGACY, man->parse,
550
		    ln, pos, "Dd %s", in);
551
552
	/* Use any non-mdoc(7) date verbatim. */
553
554
2853
	return mandoc_strdup(in);
555
23971
}
556
557
/*
 * Decide whether the sz bytes starting at p end a sentence.
 * Scan backwards from the last byte: closing quotes, brackets and
 * parentheses seen before any punctuation mark the text as enclosed,
 * and '.', '!' and '?' mark end-of-sentence punctuation as found.
 * The first byte of any other kind ends the scan.
 * Return 1 if the text ends a sentence, 0 otherwise.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t		 i;
	int		 enclosed, found;

	if (0 == sz)
		return 0;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	enclosed = found = 0;

	/*
	 * Index from the end rather than walking a pointer down,
	 * such that no pointer before the buffer is ever formed.
	 */
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case '\"':
		case '\'':
		case ']':
		case ')':
			if (0 == found)
				enclosed = 1;
			break;
		case '.':
		case '!':
		case '?':
			found = 1;
			break;
		default:
			/*
			 * Enclosed punctuation only counts when it
			 * directly follows a letter or digit.
			 */
			return found &&
			    (!enclosed || isalnum((unsigned char)p[i - 1]));
		}
	}

	/* Nothing but punctuation and closing delimiters. */
	return found && !enclosed;
}
595
596
/*
 * Convert the first sz bytes of the string p to an int,
 * interpreting them as a number in the given base.
 * Return -1 if sz exceeds 31 bytes, if the input is empty,
 * or if it contains trailing bytes that are not part of the number.
 * Values outside the range of int are clamped to INT_MIN or INT_MAX.
 * Note that valid negative input is returned as a negative number,
 * so -1 is not an unambiguous error indicator for signed input.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char		 buf[32];
	char		*ep;
	long		 v;

	/* Leave room for the terminating NUL byte. */
	if (sz > sizeof(buf) - 1)
		return -1;

	/* strtol(3) needs a NUL-terminated string, so make a copy. */
	memcpy(buf, p, sz);
	buf[sz] = '\0';

	/*
	 * On overflow, strtol(3) returns LONG_MAX or LONG_MIN,
	 * which the clamping below folds into the range of int.
	 */
	errno = 0;
	v = strtol(buf, &ep, base);

	if (buf[0] == '\0' || *ep != '\0')
		return -1;

	if (v > INT_MAX)
		v = INT_MAX;
	if (v < INT_MIN)
		v = INT_MIN;

	return (int)v;
}