GCC Code Coverage Report
Directory: ./ Exec Total Coverage
File: lib/libc/regex/regcomp.c Lines: 0 626 0.0 %
Date: 2017-11-07 Branches: 0 592 0.0 %

Line Branch Exec Source
1
/*	$OpenBSD: regcomp.c,v 1.31 2016/12/22 00:09:07 krw Exp $ */
2
/*-
3
 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
4
 * Copyright (c) 1992, 1993, 1994
5
 *	The Regents of the University of California.  All rights reserved.
6
 *
7
 * This code is derived from software contributed to Berkeley by
8
 * Henry Spencer.
9
 *
10
 * Redistribution and use in source and binary forms, with or without
11
 * modification, are permitted provided that the following conditions
12
 * are met:
13
 * 1. Redistributions of source code must retain the above copyright
14
 *    notice, this list of conditions and the following disclaimer.
15
 * 2. Redistributions in binary form must reproduce the above copyright
16
 *    notice, this list of conditions and the following disclaimer in the
17
 *    documentation and/or other materials provided with the distribution.
18
 * 3. Neither the name of the University nor the names of its contributors
19
 *    may be used to endorse or promote products derived from this software
20
 *    without specific prior written permission.
21
 *
22
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32
 * SUCH DAMAGE.
33
 *
34
 *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
35
 */
36
37
#include <sys/types.h>
38
#include <stdio.h>
39
#include <string.h>
40
#include <ctype.h>
41
#include <limits.h>
42
#include <stdlib.h>
43
#include <regex.h>
44
45
#include "utils.h"
46
#include "regex2.h"
47
48
#include "cclass.h"
49
#include "cname.h"
50
51
/*
52
 * parse structure, passed up and down to avoid global variables and
53
 * other clumsinesses
54
 */
55
struct parse {
56
	char *next;		/* next character in RE */
57
	char *end;		/* end of string (-> NUL normally) */
58
	int error;		/* has an error been seen? */
59
	sop *strip;		/* malloced strip */
60
	sopno ssize;		/* malloced strip size (allocated) */
61
	sopno slen;		/* malloced strip length (used) */
62
	int ncsalloc;		/* number of csets allocated */
63
	struct re_guts *g;
64
#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
65
	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
66
	sopno pend[NPAREN];	/* -> ) ([0] unused) */
67
};
68
69
static void p_ere(struct parse *, int);
70
static void p_ere_exp(struct parse *);
71
static void p_str(struct parse *);
72
static void p_bre(struct parse *, int, int);
73
static int p_simp_re(struct parse *, int);
74
static int p_count(struct parse *);
75
static void p_bracket(struct parse *);
76
static void p_b_term(struct parse *, cset *);
77
static void p_b_cclass(struct parse *, cset *);
78
static void p_b_eclass(struct parse *, cset *);
79
static char p_b_symbol(struct parse *);
80
static char p_b_coll_elem(struct parse *, int);
81
static char othercase(int);
82
static void bothcases(struct parse *, int);
83
static void ordinary(struct parse *, int);
84
static void backslash(struct parse *, int);
85
static void nonnewline(struct parse *);
86
static void repeat(struct parse *, sopno, int, int);
87
static int seterr(struct parse *, int);
88
static cset *allocset(struct parse *);
89
static void freeset(struct parse *, cset *);
90
static int freezeset(struct parse *, cset *);
91
static int firstch(struct parse *, cset *);
92
static int nch(struct parse *, cset *);
93
static void mcadd(struct parse *, cset *, char *);
94
static void mcinvert(struct parse *, cset *);
95
static void mccase(struct parse *, cset *);
96
static int isinsets(struct re_guts *, int);
97
static int samesets(struct re_guts *, int, int);
98
static void categorize(struct parse *, struct re_guts *);
99
static sopno dupl(struct parse *, sopno, sopno);
100
static void doemit(struct parse *, sop, size_t);
101
static void doinsert(struct parse *, sop, size_t, sopno);
102
static void dofwd(struct parse *, sopno, sop);
103
static int enlarge(struct parse *, sopno);
104
static void stripsnug(struct parse *, struct re_guts *);
105
static void findmust(struct parse *, struct re_guts *);
106
static sopno pluscount(struct parse *, struct re_guts *);
107
108
static char nuls[10];		/* place to point scanner in event of error */
109
110
/*
111
 * macros for use with parse structure
112
 * BEWARE:  these know that the parse structure is named `p' !!!
113
 */
114
#define	PEEK()	(*p->next)
115
#define	PEEK2()	(*(p->next+1))
116
#define	MORE()	(p->next < p->end)
117
#define	MORE2()	(p->next+1 < p->end)
118
#define	SEE(c)	(MORE() && PEEK() == (c))
119
#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
120
#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
121
#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
122
#define	NEXT()	(p->next++)
123
#define	NEXT2()	(p->next += 2)
124
#define	NEXTn(n)	(p->next += (n))
125
#define	GETNEXT()	(*p->next++)
126
#define	SETERROR(e)	seterr(p, (e))
127
#define	REQUIRE(co, e)	(void) ((co) || SETERROR(e))
128
#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
129
#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
130
#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
131
#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
132
#define	HERE()		(p->slen)
133
#define	THERE()		(p->slen - 1)
134
#define	THERETHERE()	(p->slen - 2)
135
#define	DROP(n)	(p->slen -= (n))
136
137
#ifndef NDEBUG
138
static int never = 0;		/* for use in asserts; shuts lint up */
139
#else
140
#define	never	0		/* some <assert.h>s have bugs too */
141
#endif
142
143
/*
144
 - regcomp - interface for parser and compilation
145
 */
146
int				/* 0 success, otherwise REG_something */
147
regcomp(regex_t *preg, const char *pattern, int cflags)
148
{
149
	struct parse pa;
150
	struct re_guts *g;
151
	struct parse *p = &pa;
152
	int i;
153
	size_t len;
154
#ifdef REDEBUG
155
#	define	GOODFLAGS(f)	(f)
156
#else
157
#	define	GOODFLAGS(f)	((f)&~REG_DUMP)
158
#endif
159
160
	cflags = GOODFLAGS(cflags);
161
	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
162
		return(REG_INVARG);
163
164
	if (cflags&REG_PEND) {
165
		if (preg->re_endp < pattern)
166
			return(REG_INVARG);
167
		len = preg->re_endp - pattern;
168
	} else
169
		len = strlen((char *)pattern);
170
171
	/* do the mallocs early so failure handling is easy */
172
	g = malloc(sizeof(struct re_guts));
173
	if (g == NULL)
174
		return(REG_ESPACE);
175
	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
176
	p->strip = reallocarray(NULL, p->ssize, sizeof(sop));
177
	p->slen = 0;
178
	if (p->strip == NULL) {
179
		free(g);
180
		return(REG_ESPACE);
181
	}
182
183
	/* set things up */
184
	p->g = g;
185
	p->next = (char *)pattern;	/* convenience; we do not modify it */
186
	p->end = p->next + len;
187
	p->error = 0;
188
	p->ncsalloc = 0;
189
	for (i = 0; i < NPAREN; i++) {
190
		p->pbegin[i] = 0;
191
		p->pend[i] = 0;
192
	}
193
	g->csetsize = NC;
194
	g->sets = NULL;
195
	g->setbits = NULL;
196
	g->ncsets = 0;
197
	g->cflags = cflags;
198
	g->iflags = 0;
199
	g->nbol = 0;
200
	g->neol = 0;
201
	g->must = NULL;
202
	g->mlen = 0;
203
	g->nsub = 0;
204
	g->ncategories = 1;	/* category 0 is "everything else" */
205
	g->categories = &g->catspace[-(CHAR_MIN)];
206
	memset(g->catspace, 0, sizeof(g->catspace));
207
	g->backrefs = 0;
208
209
	/* do it */
210
	EMIT(OEND, 0);
211
	g->firststate = THERE();
212
	if (cflags&REG_EXTENDED)
213
		p_ere(p, OUT);
214
	else if (cflags&REG_NOSPEC)
215
		p_str(p);
216
	else
217
		p_bre(p, OUT, OUT);
218
	EMIT(OEND, 0);
219
	g->laststate = THERE();
220
221
	/* tidy up loose ends and fill things in */
222
	categorize(p, g);
223
	stripsnug(p, g);
224
	findmust(p, g);
225
	g->nplus = pluscount(p, g);
226
	g->magic = MAGIC2;
227
	preg->re_nsub = g->nsub;
228
	preg->re_g = g;
229
	preg->re_magic = MAGIC1;
230
#ifndef REDEBUG
231
	/* not debugging, so can't rely on the assert() in regexec() */
232
	if (g->iflags&BAD)
233
		SETERROR(REG_ASSERT);
234
#endif
235
236
	/* win or lose, we're done */
237
	if (p->error != 0)	/* lose */
238
		regfree(preg);
239
	return(p->error);
240
}
241
242
/*
243
 - p_ere - ERE parser top level, concatenation and alternation
244
 */
245
static void
246
p_ere(struct parse *p, int stop)	/* character this ERE should end at */
247
{
248
	char c;
249
	sopno prevback;
250
	sopno prevfwd;
251
	sopno conc;
252
	int first = 1;		/* is this the first alternative? */
253
254
	for (;;) {
255
		/* do a bunch of concatenated expressions */
256
		conc = HERE();
257
		while (MORE() && (c = PEEK()) != '|' && c != stop)
258
			p_ere_exp(p);
259
		REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
260
261
		if (!EAT('|'))
262
			break;		/* NOTE BREAK OUT */
263
264
		if (first) {
265
			INSERT(OCH_, conc);	/* offset is wrong */
266
			prevfwd = conc;
267
			prevback = conc;
268
			first = 0;
269
		}
270
		ASTERN(OOR1, prevback);
271
		prevback = THERE();
272
		AHEAD(prevfwd);			/* fix previous offset */
273
		prevfwd = HERE();
274
		EMIT(OOR2, 0);			/* offset is very wrong */
275
	}
276
277
	if (!first) {		/* tail-end fixups */
278
		AHEAD(prevfwd);
279
		ASTERN(O_CH, prevback);
280
	}
281
282
	assert(!MORE() || SEE(stop));
283
}
284
285
/*
286
 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
287
 */
288
static void
289
p_ere_exp(struct parse *p)
290
{
291
	char c;
292
	sopno pos;
293
	int count;
294
	int count2;
295
	sopno subno;
296
	int wascaret = 0;
297
298
	assert(MORE());		/* caller should have ensured this */
299
	c = GETNEXT();
300
301
	pos = HERE();
302
	switch (c) {
303
	case '(':
304
		REQUIRE(MORE(), REG_EPAREN);
305
		p->g->nsub++;
306
		subno = p->g->nsub;
307
		if (subno < NPAREN)
308
			p->pbegin[subno] = HERE();
309
		EMIT(OLPAREN, subno);
310
		if (!SEE(')'))
311
			p_ere(p, ')');
312
		if (subno < NPAREN) {
313
			p->pend[subno] = HERE();
314
			assert(p->pend[subno] != 0);
315
		}
316
		EMIT(ORPAREN, subno);
317
		REQUIRE(MORE() && GETNEXT() == ')', REG_EPAREN);
318
		break;
319
	case '^':
320
		EMIT(OBOL, 0);
321
		p->g->iflags |= USEBOL;
322
		p->g->nbol++;
323
		wascaret = 1;
324
		break;
325
	case '$':
326
		EMIT(OEOL, 0);
327
		p->g->iflags |= USEEOL;
328
		p->g->neol++;
329
		break;
330
	case '|':
331
		SETERROR(REG_EMPTY);
332
		break;
333
	case '*':
334
	case '+':
335
	case '?':
336
		SETERROR(REG_BADRPT);
337
		break;
338
	case '.':
339
		if (p->g->cflags&REG_NEWLINE)
340
			nonnewline(p);
341
		else
342
			EMIT(OANY, 0);
343
		break;
344
	case '[':
345
		p_bracket(p);
346
		break;
347
	case '\\':
348
		REQUIRE(MORE(), REG_EESCAPE);
349
		c = GETNEXT();
350
		backslash(p, c);
351
		break;
352
	case '{':		/* okay as ordinary except if digit follows */
353
		REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
354
		/* FALLTHROUGH */
355
	default:
356
		ordinary(p, c);
357
		break;
358
	}
359
360
	if (!MORE())
361
		return;
362
	c = PEEK();
363
	/* we call { a repetition if followed by a digit */
364
	if (!( c == '*' || c == '+' || c == '?' ||
365
				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
366
		return;		/* no repetition, we're done */
367
	NEXT();
368
369
	REQUIRE(!wascaret, REG_BADRPT);
370
	switch (c) {
371
	case '*':	/* implemented as +? */
372
		/* this case does not require the (y|) trick, noKLUDGE */
373
		INSERT(OPLUS_, pos);
374
		ASTERN(O_PLUS, pos);
375
		INSERT(OQUEST_, pos);
376
		ASTERN(O_QUEST, pos);
377
		break;
378
	case '+':
379
		INSERT(OPLUS_, pos);
380
		ASTERN(O_PLUS, pos);
381
		break;
382
	case '?':
383
		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
384
		INSERT(OCH_, pos);		/* offset slightly wrong */
385
		ASTERN(OOR1, pos);		/* this one's right */
386
		AHEAD(pos);			/* fix the OCH_ */
387
		EMIT(OOR2, 0);			/* offset very wrong... */
388
		AHEAD(THERE());			/* ...so fix it */
389
		ASTERN(O_CH, THERETHERE());
390
		break;
391
	case '{':
392
		count = p_count(p);
393
		if (EAT(',')) {
394
			if (isdigit((uch)PEEK())) {
395
				count2 = p_count(p);
396
				REQUIRE(count <= count2, REG_BADBR);
397
			} else		/* single number with comma */
398
				count2 = INFINITY;
399
		} else		/* just a single number */
400
			count2 = count;
401
		repeat(p, pos, count, count2);
402
		if (!EAT('}')) {	/* error heuristics */
403
			while (MORE() && PEEK() != '}')
404
				NEXT();
405
			REQUIRE(MORE(), REG_EBRACE);
406
			SETERROR(REG_BADBR);
407
		}
408
		break;
409
	}
410
411
	if (!MORE())
412
		return;
413
	c = PEEK();
414
	if (!( c == '*' || c == '+' || c == '?' ||
415
				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
416
		return;
417
	SETERROR(REG_BADRPT);
418
}
419
420
/*
421
 - p_str - string (no metacharacters) "parser"
422
 */
423
static void
424
p_str(struct parse *p)
425
{
426
	REQUIRE(MORE(), REG_EMPTY);
427
	while (MORE())
428
		ordinary(p, GETNEXT());
429
}
430
431
/*
432
 - p_bre - BRE parser top level, anchoring and concatenation
433
 * Giving end1 as OUT essentially eliminates the end1/end2 check.
434
 *
435
 * This implementation is a bit of a kludge, in that a trailing $ is first
436
 * taken as an ordinary character and then revised to be an anchor.  The
437
 * only undesirable side effect is that '$' gets included as a character
438
 * category in such cases.  This is fairly harmless; not worth fixing.
439
 * The amount of lookahead needed to avoid this kludge is excessive.
440
 */
441
static void
442
p_bre(struct parse *p,
443
    int end1,		/* first terminating character */
444
    int end2)		/* second terminating character */
445
{
446
	sopno start = HERE();
447
	int first = 1;			/* first subexpression? */
448
	int wasdollar = 0;
449
450
	if (EAT('^')) {
451
		EMIT(OBOL, 0);
452
		p->g->iflags |= USEBOL;
453
		p->g->nbol++;
454
	}
455
	while (MORE() && !SEETWO(end1, end2)) {
456
		wasdollar = p_simp_re(p, first);
457
		first = 0;
458
	}
459
	if (wasdollar) {	/* oops, that was a trailing anchor */
460
		DROP(1);
461
		EMIT(OEOL, 0);
462
		p->g->iflags |= USEEOL;
463
		p->g->neol++;
464
	}
465
466
	REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
467
}
468
469
/*
470
 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
471
 */
472
static int			/* was the simple RE an unbackslashed $? */
473
p_simp_re(struct parse *p,
474
    int starordinary)		/* is a leading * an ordinary character? */
475
{
476
	int c;
477
	int count;
478
	int count2;
479
	sopno pos;
480
	int i;
481
	sopno subno;
482
#	define	BACKSL	(1<<CHAR_BIT)
483
484
	pos = HERE();		/* repetion op, if any, covers from here */
485
486
	assert(MORE());		/* caller should have ensured this */
487
	c = GETNEXT();
488
	if (c == '\\') {
489
		REQUIRE(MORE(), REG_EESCAPE);
490
		c = BACKSL | GETNEXT();
491
	}
492
	switch (c) {
493
	case '.':
494
		if (p->g->cflags&REG_NEWLINE)
495
			nonnewline(p);
496
		else
497
			EMIT(OANY, 0);
498
		break;
499
	case '[':
500
		p_bracket(p);
501
		break;
502
	case BACKSL|'<':
503
		EMIT(OBOW, 0);
504
		break;
505
	case BACKSL|'>':
506
		EMIT(OEOW, 0);
507
		break;
508
	case BACKSL|'{':
509
		SETERROR(REG_BADRPT);
510
		break;
511
	case BACKSL|'(':
512
		p->g->nsub++;
513
		subno = p->g->nsub;
514
		if (subno < NPAREN)
515
			p->pbegin[subno] = HERE();
516
		EMIT(OLPAREN, subno);
517
		/* the MORE here is an error heuristic */
518
		if (MORE() && !SEETWO('\\', ')'))
519
			p_bre(p, '\\', ')');
520
		if (subno < NPAREN) {
521
			p->pend[subno] = HERE();
522
			assert(p->pend[subno] != 0);
523
		}
524
		EMIT(ORPAREN, subno);
525
		REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
526
		break;
527
	case BACKSL|')':	/* should not get here -- must be user */
528
	case BACKSL|'}':
529
		SETERROR(REG_EPAREN);
530
		break;
531
	case BACKSL|'1':
532
	case BACKSL|'2':
533
	case BACKSL|'3':
534
	case BACKSL|'4':
535
	case BACKSL|'5':
536
	case BACKSL|'6':
537
	case BACKSL|'7':
538
	case BACKSL|'8':
539
	case BACKSL|'9':
540
		i = (c&~BACKSL) - '0';
541
		assert(i < NPAREN);
542
		if (p->pend[i] != 0) {
543
			assert(i <= p->g->nsub);
544
			EMIT(OBACK_, i);
545
			assert(p->pbegin[i] != 0);
546
			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
547
			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
548
			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
549
			EMIT(O_BACK, i);
550
		} else
551
			SETERROR(REG_ESUBREG);
552
		p->g->backrefs = 1;
553
		break;
554
	case '*':
555
		REQUIRE(starordinary, REG_BADRPT);
556
		/* FALLTHROUGH */
557
	default:
558
		ordinary(p, (char)c);
559
		break;
560
	}
561
562
	if (EAT('*')) {		/* implemented as +? */
563
		/* this case does not require the (y|) trick, noKLUDGE */
564
		INSERT(OPLUS_, pos);
565
		ASTERN(O_PLUS, pos);
566
		INSERT(OQUEST_, pos);
567
		ASTERN(O_QUEST, pos);
568
	} else if (EATTWO('\\', '{')) {
569
		count = p_count(p);
570
		if (EAT(',')) {
571
			if (MORE() && isdigit((uch)PEEK())) {
572
				count2 = p_count(p);
573
				REQUIRE(count <= count2, REG_BADBR);
574
			} else		/* single number with comma */
575
				count2 = INFINITY;
576
		} else		/* just a single number */
577
			count2 = count;
578
		repeat(p, pos, count, count2);
579
		if (!EATTWO('\\', '}')) {	/* error heuristics */
580
			while (MORE() && !SEETWO('\\', '}'))
581
				NEXT();
582
			REQUIRE(MORE(), REG_EBRACE);
583
			SETERROR(REG_BADBR);
584
		}
585
	} else if (c == '$')	/* $ (but not \$) ends it */
586
		return(1);
587
588
	return(0);
589
}
590
591
/*
592
 - p_count - parse a repetition count
593
 */
594
static int			/* the value */
595
p_count(struct parse *p)
596
{
597
	int count = 0;
598
	int ndigits = 0;
599
600
	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
601
		count = count*10 + (GETNEXT() - '0');
602
		ndigits++;
603
	}
604
605
	REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
606
	return(count);
607
}
608
609
/*
610
 - p_bracket - parse a bracketed character list
611
 *
612
 * Note a significant property of this code:  if the allocset() did SETERROR,
613
 * no set operations are done.
614
 */
615
static void
616
p_bracket(struct parse *p)
617
{
618
	cset *cs;
619
	int invert = 0;
620
621
	/* Dept of Truly Sickening Special-Case Kludges */
622
	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
623
		EMIT(OBOW, 0);
624
		NEXTn(6);
625
		return;
626
	}
627
	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
628
		EMIT(OEOW, 0);
629
		NEXTn(6);
630
		return;
631
	}
632
633
	if ((cs = allocset(p)) == NULL) {
634
		/* allocset did set error status in p */
635
		return;
636
	}
637
638
	if (EAT('^'))
639
		invert++;	/* make note to invert set at end */
640
	if (EAT(']'))
641
		CHadd(cs, ']');
642
	else if (EAT('-'))
643
		CHadd(cs, '-');
644
	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
645
		p_b_term(p, cs);
646
	if (EAT('-'))
647
		CHadd(cs, '-');
648
	REQUIRE(MORE() && GETNEXT() == ']', REG_EBRACK);
649
650
	if (p->error != 0) {	/* don't mess things up further */
651
		freeset(p, cs);
652
		return;
653
	}
654
655
	if (p->g->cflags&REG_ICASE) {
656
		int i;
657
		int ci;
658
659
		for (i = p->g->csetsize - 1; i >= 0; i--)
660
			if (CHIN(cs, i) && isalpha(i)) {
661
				ci = othercase(i);
662
				if (ci != i)
663
					CHadd(cs, ci);
664
			}
665
		if (cs->multis != NULL)
666
			mccase(p, cs);
667
	}
668
	if (invert) {
669
		int i;
670
671
		for (i = p->g->csetsize - 1; i >= 0; i--)
672
			if (CHIN(cs, i))
673
				CHsub(cs, i);
674
			else
675
				CHadd(cs, i);
676
		if (p->g->cflags&REG_NEWLINE)
677
			CHsub(cs, '\n');
678
		if (cs->multis != NULL)
679
			mcinvert(p, cs);
680
	}
681
682
	assert(cs->multis == NULL);		/* xxx */
683
684
	if (nch(p, cs) == 1) {		/* optimize singleton sets */
685
		ordinary(p, firstch(p, cs));
686
		freeset(p, cs);
687
	} else
688
		EMIT(OANYOF, freezeset(p, cs));
689
}
690
691
/*
692
 - p_b_term - parse one term of a bracketed character list
693
 */
694
static void
695
p_b_term(struct parse *p, cset *cs)
696
{
697
	char c;
698
	char start, finish;
699
	int i;
700
701
	/* classify what we've got */
702
	switch ((MORE()) ? PEEK() : '\0') {
703
	case '[':
704
		c = (MORE2()) ? PEEK2() : '\0';
705
		break;
706
	case '-':
707
		SETERROR(REG_ERANGE);
708
		return;			/* NOTE RETURN */
709
		break;
710
	default:
711
		c = '\0';
712
		break;
713
	}
714
715
	switch (c) {
716
	case ':':		/* character class */
717
		NEXT2();
718
		REQUIRE(MORE(), REG_EBRACK);
719
		c = PEEK();
720
		REQUIRE(c != '-' && c != ']', REG_ECTYPE);
721
		p_b_cclass(p, cs);
722
		REQUIRE(MORE(), REG_EBRACK);
723
		REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
724
		break;
725
	case '=':		/* equivalence class */
726
		NEXT2();
727
		REQUIRE(MORE(), REG_EBRACK);
728
		c = PEEK();
729
		REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
730
		p_b_eclass(p, cs);
731
		REQUIRE(MORE(), REG_EBRACK);
732
		REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
733
		break;
734
	default:		/* symbol, ordinary character, or range */
735
/* xxx revision needed for multichar stuff */
736
		start = p_b_symbol(p);
737
		if (SEE('-') && MORE2() && PEEK2() != ']') {
738
			/* range */
739
			NEXT();
740
			if (EAT('-'))
741
				finish = '-';
742
			else
743
				finish = p_b_symbol(p);
744
		} else
745
			finish = start;
746
/* xxx what about signed chars here... */
747
		REQUIRE(start <= finish, REG_ERANGE);
748
		for (i = start; i <= finish; i++)
749
			CHadd(cs, i);
750
		break;
751
	}
752
}
753
754
/*
755
 - p_b_cclass - parse a character-class name and deal with it
756
 */
757
static void
758
p_b_cclass(struct parse *p, cset *cs)
759
{
760
	char *sp = p->next;
761
	struct cclass *cp;
762
	size_t len;
763
	char *u;
764
	char c;
765
766
	while (MORE() && isalpha((uch)PEEK()))
767
		NEXT();
768
	len = p->next - sp;
769
	for (cp = cclasses; cp->name != NULL; cp++)
770
		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
771
			break;
772
	if (cp->name == NULL) {
773
		/* oops, didn't find it */
774
		SETERROR(REG_ECTYPE);
775
		return;
776
	}
777
778
	u = cp->chars;
779
	while ((c = *u++) != '\0')
780
		CHadd(cs, c);
781
	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
782
		MCadd(p, cs, u);
783
}
784
785
/*
786
 - p_b_eclass - parse an equivalence-class name and deal with it
787
 *
788
 * This implementation is incomplete. xxx
789
 */
790
static void
791
p_b_eclass(struct parse *p, cset *cs)
792
{
793
	char c;
794
795
	c = p_b_coll_elem(p, '=');
796
	CHadd(cs, c);
797
}
798
799
/*
800
 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
801
 */
802
static char			/* value of symbol */
803
p_b_symbol(struct parse *p)
804
{
805
	char value;
806
807
	REQUIRE(MORE(), REG_EBRACK);
808
	if (!EATTWO('[', '.'))
809
		return(GETNEXT());
810
811
	/* collating symbol */
812
	value = p_b_coll_elem(p, '.');
813
	REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
814
	return(value);
815
}
816
817
/*
818
 - p_b_coll_elem - parse a collating-element name and look it up
819
 */
820
static char			/* value of collating element */
821
p_b_coll_elem(struct parse *p,
822
    int endc)			/* name ended by endc,']' */
823
{
824
	char *sp = p->next;
825
	struct cname *cp;
826
	int len;
827
828
	while (MORE() && !SEETWO(endc, ']'))
829
		NEXT();
830
	if (!MORE()) {
831
		SETERROR(REG_EBRACK);
832
		return(0);
833
	}
834
	len = p->next - sp;
835
	for (cp = cnames; cp->name != NULL; cp++)
836
		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
837
			return(cp->code);	/* known name */
838
	if (len == 1)
839
		return(*sp);	/* single character */
840
	SETERROR(REG_ECOLLATE);			/* neither */
841
	return(0);
842
}
843
844
/*
845
 - othercase - return the case counterpart of an alphabetic
846
 */
847
static char			/* if no counterpart, return ch */
848
othercase(int ch)
849
{
850
	ch = (uch)ch;
851
	assert(isalpha(ch));
852
	if (isupper(ch))
853
		return ((uch)tolower(ch));
854
	else if (islower(ch))
855
		return ((uch)toupper(ch));
856
	else			/* peculiar, but could happen */
857
		return(ch);
858
}
859
860
/*
861
 - bothcases - emit a dualcase version of a two-case character
862
 *
863
 * Boy, is this implementation ever a kludge...
864
 */
865
static void
866
bothcases(struct parse *p, int ch)
867
{
868
	char *oldnext = p->next;
869
	char *oldend = p->end;
870
	char bracket[3];
871
872
	ch = (uch)ch;
873
	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
874
	p->next = bracket;
875
	p->end = bracket+2;
876
	bracket[0] = ch;
877
	bracket[1] = ']';
878
	bracket[2] = '\0';
879
	p_bracket(p);
880
	assert(p->next == bracket+2);
881
	p->next = oldnext;
882
	p->end = oldend;
883
}
884
885
/*
886
 - ordinary - emit an ordinary character
887
 */
888
static void
889
ordinary(struct parse *p, int ch)
890
{
891
	cat_t *cap = p->g->categories;
892
893
	if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
894
		bothcases(p, ch);
895
	else {
896
		EMIT(OCHAR, (uch)ch);
897
		if (cap[ch] == 0)
898
			cap[ch] = p->g->ncategories++;
899
	}
900
}
901
902
/*
903
 * do something magic with this character, but only if it's extra magic
904
 */
905
static void
906
backslash(struct parse *p, int ch)
907
{
908
	switch (ch) {
909
	case '<':
910
		EMIT(OBOW, 0);
911
		break;
912
	case '>':
913
		EMIT(OEOW, 0);
914
		break;
915
	default:
916
		ordinary(p, ch);
917
		break;
918
	}
919
}
920
921
/*
922
 - nonnewline - emit REG_NEWLINE version of OANY
923
 *
924
 * Boy, is this implementation ever a kludge...
925
 */
926
static void
927
nonnewline(struct parse *p)
928
{
929
	char *oldnext = p->next;
930
	char *oldend = p->end;
931
	char bracket[4];
932
933
	p->next = bracket;
934
	p->end = bracket+3;
935
	bracket[0] = '^';
936
	bracket[1] = '\n';
937
	bracket[2] = ']';
938
	bracket[3] = '\0';
939
	p_bracket(p);
940
	assert(p->next == bracket+3);
941
	p->next = oldnext;
942
	p->end = oldend;
943
}
944
945
/*
946
 - repeat - generate code for a bounded repetition, recursively if needed
947
 */
948
static void
949
repeat(struct parse *p,
950
    sopno start,		/* operand from here to end of strip */
951
    int from,			/* repeated from this number */
952
    int to)			/* to this number of times (maybe INFINITY) */
953
{
954
	sopno finish = HERE();
955
#	define	N	2
956
#	define	INF	3
957
#	define	REP(f, t)	((f)*8 + (t))
958
#	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
959
	sopno copy;
960
961
	if (p->error != 0)	/* head off possible runaway recursion */
962
		return;
963
964
	assert(from <= to);
965
966
	switch (REP(MAP(from), MAP(to))) {
967
	case REP(0, 0):			/* must be user doing this */
968
		DROP(finish-start);	/* drop the operand */
969
		break;
970
	case REP(0, 1):			/* as x{1,1}? */
971
	case REP(0, N):			/* as x{1,n}? */
972
	case REP(0, INF):		/* as x{1,}? */
973
		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
974
		INSERT(OCH_, start);		/* offset is wrong... */
975
		repeat(p, start+1, 1, to);
976
		ASTERN(OOR1, start);
977
		AHEAD(start);			/* ... fix it */
978
		EMIT(OOR2, 0);
979
		AHEAD(THERE());
980
		ASTERN(O_CH, THERETHERE());
981
		break;
982
	case REP(1, 1):			/* trivial case */
983
		/* done */
984
		break;
985
	case REP(1, N):			/* as x?x{1,n-1} */
986
		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
987
		INSERT(OCH_, start);
988
		ASTERN(OOR1, start);
989
		AHEAD(start);
990
		EMIT(OOR2, 0);			/* offset very wrong... */
991
		AHEAD(THERE());			/* ...so fix it */
992
		ASTERN(O_CH, THERETHERE());
993
		copy = dupl(p, start+1, finish+1);
994
		assert(copy == finish+4);
995
		repeat(p, copy, 1, to-1);
996
		break;
997
	case REP(1, INF):		/* as x+ */
998
		INSERT(OPLUS_, start);
999
		ASTERN(O_PLUS, start);
1000
		break;
1001
	case REP(N, N):			/* as xx{m-1,n-1} */
1002
		copy = dupl(p, start, finish);
1003
		repeat(p, copy, from-1, to-1);
1004
		break;
1005
	case REP(N, INF):		/* as xx{n-1,INF} */
1006
		copy = dupl(p, start, finish);
1007
		repeat(p, copy, from-1, to);
1008
		break;
1009
	default:			/* "can't happen" */
1010
		SETERROR(REG_ASSERT);	/* just in case */
1011
		break;
1012
	}
1013
}
1014
1015
/*
1016
 - seterr - set an error condition
1017
 */
1018
static int			/* useless but makes type checking happy */
1019
seterr(struct parse *p, int e)
1020
{
1021
	if (p->error == 0)	/* keep earliest error condition */
1022
		p->error = e;
1023
	p->next = nuls;		/* try to bring things to a halt */
1024
	p->end = nuls;
1025
	return(0);		/* make the return value well-defined */
1026
}
1027
1028
/*
1029
 - allocset - allocate a set of characters for []
1030
 */
1031
static cset *
1032
allocset(struct parse *p)
1033
{
1034
	int no = p->g->ncsets++;
1035
	size_t nc;
1036
	size_t nbytes;
1037
	cset *cs;
1038
	size_t css = (size_t)p->g->csetsize;
1039
	int i;
1040
1041
	if (no >= p->ncsalloc) {	/* need another column of space */
1042
		void *ptr;
1043
1044
		p->ncsalloc += CHAR_BIT;
1045
		nc = p->ncsalloc;
1046
		assert(nc % CHAR_BIT == 0);
1047
1048
		ptr = reallocarray(p->g->sets, nc, sizeof(cset));
1049
		if (ptr == NULL)
1050
			goto nomem;
1051
		p->g->sets = ptr;
1052
1053
		ptr = reallocarray(p->g->setbits, nc / CHAR_BIT, css);
1054
		if (ptr == NULL)
1055
			goto nomem;
1056
		nbytes = (nc / CHAR_BIT) * css;
1057
		p->g->setbits = ptr;
1058
1059
		for (i = 0; i < no; i++)
1060
			p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1061
1062
		(void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
1063
	}
1064
	/* XXX should not happen */
1065
	if (p->g->sets == NULL || p->g->setbits == NULL)
1066
		goto nomem;
1067
1068
	cs = &p->g->sets[no];
1069
	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1070
	cs->mask = 1 << ((no) % CHAR_BIT);
1071
	cs->hash = 0;
1072
	cs->smultis = 0;
1073
	cs->multis = NULL;
1074
1075
	return(cs);
1076
nomem:
1077
	free(p->g->sets);
1078
	p->g->sets = NULL;
1079
	free(p->g->setbits);
1080
	p->g->setbits = NULL;
1081
1082
	SETERROR(REG_ESPACE);
1083
	/* caller's responsibility not to do set ops */
1084
	return(NULL);
1085
}
1086
1087
/*
1088
 - freeset - free a now-unused set
1089
 */
1090
static void
1091
freeset(struct parse *p, cset *cs)
1092
{
1093
	int i;
1094
	cset *top = &p->g->sets[p->g->ncsets];
1095
	size_t css = (size_t)p->g->csetsize;
1096
1097
	for (i = 0; i < css; i++)
1098
		CHsub(cs, i);
1099
	if (cs == top-1)	/* recover only the easy case */
1100
		p->g->ncsets--;
1101
}
1102
1103
/*
1104
 - freezeset - final processing on a set of characters
1105
 *
1106
 * The main task here is merging identical sets.  This is usually a waste
1107
 * of time (although the hash code minimizes the overhead), but can win
1108
 * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1109
 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1110
 * the same value!
1111
 */
1112
static int			/* set number */
1113
freezeset(struct parse *p, cset *cs)
1114
{
1115
	uch h = cs->hash;
1116
	int i;
1117
	cset *top = &p->g->sets[p->g->ncsets];
1118
	cset *cs2;
1119
	size_t css = (size_t)p->g->csetsize;
1120
1121
	/* look for an earlier one which is the same */
1122
	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1123
		if (cs2->hash == h && cs2 != cs) {
1124
			/* maybe */
1125
			for (i = 0; i < css; i++)
1126
				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1127
					break;		/* no */
1128
			if (i == css)
1129
				break;			/* yes */
1130
		}
1131
1132
	if (cs2 < top) {	/* found one */
1133
		freeset(p, cs);
1134
		cs = cs2;
1135
	}
1136
1137
	return((int)(cs - p->g->sets));
1138
}
1139
1140
/*
1141
 - firstch - return first character in a set (which must have at least one)
1142
 */
1143
static int			/* character; there is no "none" value */
1144
firstch(struct parse *p, cset *cs)
1145
{
1146
	int i;
1147
	size_t css = (size_t)p->g->csetsize;
1148
1149
	for (i = 0; i < css; i++)
1150
		if (CHIN(cs, i))
1151
			return((char)i);
1152
	assert(never);
1153
	return(0);		/* arbitrary */
1154
}
1155
1156
/*
1157
 - nch - number of characters in a set
1158
 */
1159
static int
1160
nch(struct parse *p, cset *cs)
1161
{
1162
	int i;
1163
	size_t css = (size_t)p->g->csetsize;
1164
	int n = 0;
1165
1166
	for (i = 0; i < css; i++)
1167
		if (CHIN(cs, i))
1168
			n++;
1169
	return(n);
1170
}
1171
1172
/*
1173
 - mcadd - add a collating element to a cset
1174
 */
1175
static void
1176
mcadd( struct parse *p, cset *cs, char *cp)
1177
{
1178
	size_t oldend = cs->smultis;
1179
	void *np;
1180
1181
	cs->smultis += strlen(cp) + 1;
1182
	np = realloc(cs->multis, cs->smultis);
1183
	if (np == NULL) {
1184
		free(cs->multis);
1185
		cs->multis = NULL;
1186
		SETERROR(REG_ESPACE);
1187
		return;
1188
	}
1189
	cs->multis = np;
1190
1191
	strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
1192
}
1193
1194
/*
1195
 - mcinvert - invert the list of collating elements in a cset
1196
 *
1197
 * This would have to know the set of possibilities.  Implementation
1198
 * is deferred.
1199
 */
1200
static void
1201
mcinvert(struct parse *p, cset *cs)
1202
{
1203
	assert(cs->multis == NULL);	/* xxx */
1204
}
1205
1206
/*
1207
 - mccase - add case counterparts of the list of collating elements in a cset
1208
 *
1209
 * This would have to know the set of possibilities.  Implementation
1210
 * is deferred.
1211
 */
1212
static void
1213
mccase(struct parse *p, cset *cs)
1214
{
1215
	assert(cs->multis == NULL);	/* xxx */
1216
}
1217
1218
/*
1219
 - isinsets - is this character in any sets?
1220
 */
1221
static int			/* predicate */
1222
isinsets(struct re_guts *g, int c)
1223
{
1224
	uch *col;
1225
	int i;
1226
	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1227
	unsigned uc = (uch)c;
1228
1229
	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1230
		if (col[uc] != 0)
1231
			return(1);
1232
	return(0);
1233
}
1234
1235
/*
1236
 - samesets - are these two characters in exactly the same sets?
1237
 */
1238
static int			/* predicate */
1239
samesets(struct re_guts *g, int c1, int c2)
1240
{
1241
	uch *col;
1242
	int i;
1243
	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1244
	unsigned uc1 = (uch)c1;
1245
	unsigned uc2 = (uch)c2;
1246
1247
	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1248
		if (col[uc1] != col[uc2])
1249
			return(0);
1250
	return(1);
1251
}
1252
1253
/*
1254
 - categorize - sort out character categories
1255
 */
1256
static void
1257
categorize(struct parse *p, struct re_guts *g)
1258
{
1259
	cat_t *cats = g->categories;
1260
	int c;
1261
	int c2;
1262
	cat_t cat;
1263
1264
	/* avoid making error situations worse */
1265
	if (p->error != 0)
1266
		return;
1267
1268
	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1269
		if (cats[c] == 0 && isinsets(g, c)) {
1270
			cat = g->ncategories++;
1271
			cats[c] = cat;
1272
			for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1273
				if (cats[c2] == 0 && samesets(g, c, c2))
1274
					cats[c2] = cat;
1275
		}
1276
}
1277
1278
/*
1279
 - dupl - emit a duplicate of a bunch of sops
1280
 */
1281
static sopno			/* start of duplicate */
1282
dupl(struct parse *p,
1283
    sopno start,		/* from here */
1284
    sopno finish)		/* to this less one */
1285
{
1286
	sopno ret = HERE();
1287
	sopno len = finish - start;
1288
1289
	assert(finish >= start);
1290
	if (len == 0)
1291
		return(ret);
1292
	if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1293
		return(ret);
1294
	(void) memcpy(p->strip + p->slen, p->strip + start, len * sizeof(sop));
1295
	p->slen += len;
1296
	return(ret);
1297
}
1298
1299
/*
1300
 - doemit - emit a strip operator
1301
 *
1302
 * It might seem better to implement this as a macro with a function as
1303
 * hard-case backup, but it's just too big and messy unless there are
1304
 * some changes to the data structures.  Maybe later.
1305
 */
1306
static void
1307
doemit(struct parse *p, sop op, size_t opnd)
1308
{
1309
	/* avoid making error situations worse */
1310
	if (p->error != 0)
1311
		return;
1312
1313
	/* deal with oversize operands ("can't happen", more or less) */
1314
	assert(opnd < 1<<OPSHIFT);
1315
1316
	/* deal with undersized strip */
1317
	if (p->slen >= p->ssize)
1318
		if (!enlarge(p, (p->ssize+1) / 2 * 3))	/* +50% */
1319
			return;
1320
1321
	/* finally, it's all reduced to the easy case */
1322
	p->strip[p->slen++] = SOP(op, opnd);
1323
}
1324
1325
/*
1326
 - doinsert - insert a sop into the strip
1327
 */
1328
static void
1329
doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1330
{
1331
	sopno sn;
1332
	sop s;
1333
	int i;
1334
1335
	/* avoid making error situations worse */
1336
	if (p->error != 0)
1337
		return;
1338
1339
	sn = HERE();
1340
	EMIT(op, opnd);		/* do checks, ensure space */
1341
	assert(HERE() == sn+1);
1342
	s = p->strip[sn];
1343
1344
	/* adjust paren pointers */
1345
	assert(pos > 0);
1346
	for (i = 1; i < NPAREN; i++) {
1347
		if (p->pbegin[i] >= pos) {
1348
			p->pbegin[i]++;
1349
		}
1350
		if (p->pend[i] >= pos) {
1351
			p->pend[i]++;
1352
		}
1353
	}
1354
1355
	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1356
						(HERE()-pos-1)*sizeof(sop));
1357
	p->strip[pos] = s;
1358
}
1359
1360
/*
1361
 - dofwd - complete a forward reference
1362
 */
1363
static void
1364
dofwd(struct parse *p, sopno pos, sop value)
1365
{
1366
	/* avoid making error situations worse */
1367
	if (p->error != 0)
1368
		return;
1369
1370
	assert(value < 1<<OPSHIFT);
1371
	p->strip[pos] = OP(p->strip[pos]) | value;
1372
}
1373
1374
/*
1375
 - enlarge - enlarge the strip
1376
 */
1377
static int
1378
enlarge(struct parse *p, sopno size)
1379
{
1380
	sop *sp;
1381
1382
	if (p->ssize >= size)
1383
		return 1;
1384
1385
	sp = reallocarray(p->strip, size, sizeof(sop));
1386
	if (sp == NULL) {
1387
		SETERROR(REG_ESPACE);
1388
		return 0;
1389
	}
1390
	p->strip = sp;
1391
	p->ssize = size;
1392
	return 1;
1393
}
1394
1395
/*
1396
 - stripsnug - compact the strip
1397
 */
1398
static void
1399
stripsnug(struct parse *p, struct re_guts *g)
1400
{
1401
	g->nstates = p->slen;
1402
	g->strip = reallocarray(p->strip, p->slen, sizeof(sop));
1403
	if (g->strip == NULL) {
1404
		SETERROR(REG_ESPACE);
1405
		g->strip = p->strip;
1406
	}
1407
}
1408
1409
/*
1410
 - findmust - fill in must and mlen with longest mandatory literal string
1411
 *
1412
 * This algorithm could do fancy things like analyzing the operands of |
1413
 * for common subsequences.  Someday.  This code is simple and finds most
1414
 * of the interesting cases.
1415
 *
1416
 * Note that must and mlen got initialized during setup.
1417
 */
1418
static void
1419
findmust(struct parse *p, struct re_guts *g)
1420
{
1421
	sop *scan;
1422
	sop *start;    /* start initialized in the default case, after that */
1423
	sop *newstart; /* newstart was initialized in the OCHAR case */
1424
	sopno newlen;
1425
	sop s;
1426
	char *cp;
1427
	sopno i;
1428
1429
	/* avoid making error situations worse */
1430
	if (p->error != 0)
1431
		return;
1432
1433
	/* find the longest OCHAR sequence in strip */
1434
	newlen = 0;
1435
	scan = g->strip + 1;
1436
	do {
1437
		s = *scan++;
1438
		switch (OP(s)) {
1439
		case OCHAR:		/* sequence member */
1440
			if (newlen == 0)		/* new sequence */
1441
				newstart = scan - 1;
1442
			newlen++;
1443
			break;
1444
		case OPLUS_:		/* things that don't break one */
1445
		case OLPAREN:
1446
		case ORPAREN:
1447
			break;
1448
		case OQUEST_:		/* things that must be skipped */
1449
		case OCH_:
1450
			scan--;
1451
			do {
1452
				scan += OPND(s);
1453
				s = *scan;
1454
				/* assert() interferes w debug printouts */
1455
				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1456
							OP(s) != OOR2) {
1457
					g->iflags |= BAD;
1458
					return;
1459
				}
1460
			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1461
			/* fallthrough */
1462
		default:		/* things that break a sequence */
1463
			if (newlen > g->mlen) {		/* ends one */
1464
				start = newstart;
1465
				g->mlen = newlen;
1466
			}
1467
			newlen = 0;
1468
			break;
1469
		}
1470
	} while (OP(s) != OEND);
1471
1472
	if (g->mlen == 0)		/* there isn't one */
1473
		return;
1474
1475
	/* turn it into a character string */
1476
	g->must = malloc((size_t)g->mlen + 1);
1477
	if (g->must == NULL) {		/* argh; just forget it */
1478
		g->mlen = 0;
1479
		return;
1480
	}
1481
	cp = g->must;
1482
	scan = start;
1483
	for (i = g->mlen; i > 0; i--) {
1484
		while (OP(s = *scan++) != OCHAR)
1485
			continue;
1486
		assert(cp < g->must + g->mlen);
1487
		*cp++ = (char)OPND(s);
1488
	}
1489
	assert(cp == g->must + g->mlen);
1490
	*cp++ = '\0';		/* just on general principles */
1491
}
1492
1493
/*
1494
 - pluscount - count + nesting
1495
 */
1496
static sopno			/* nesting depth */
1497
pluscount(struct parse *p, struct re_guts *g)
1498
{
1499
	sop *scan;
1500
	sop s;
1501
	sopno plusnest = 0;
1502
	sopno maxnest = 0;
1503
1504
	if (p->error != 0)
1505
		return(0);	/* there may not be an OEND */
1506
1507
	scan = g->strip + 1;
1508
	do {
1509
		s = *scan++;
1510
		switch (OP(s)) {
1511
		case OPLUS_:
1512
			plusnest++;
1513
			break;
1514
		case O_PLUS:
1515
			if (plusnest > maxnest)
1516
				maxnest = plusnest;
1517
			plusnest--;
1518
			break;
1519
		}
1520
	} while (OP(s) != OEND);
1521
	if (plusnest != 0)
1522
		g->iflags |= BAD;
1523
	return(maxnest);
1524
}