1 |
|
|
/* $OpenBSD: regcomp.c,v 1.31 2016/12/22 00:09:07 krw Exp $ */ |
2 |
|
|
/*- |
3 |
|
|
* Copyright (c) 1992, 1993, 1994 Henry Spencer. |
4 |
|
|
* Copyright (c) 1992, 1993, 1994 |
5 |
|
|
* The Regents of the University of California. All rights reserved. |
6 |
|
|
* |
7 |
|
|
* This code is derived from software contributed to Berkeley by |
8 |
|
|
* Henry Spencer. |
9 |
|
|
* |
10 |
|
|
* Redistribution and use in source and binary forms, with or without |
11 |
|
|
* modification, are permitted provided that the following conditions |
12 |
|
|
* are met: |
13 |
|
|
* 1. Redistributions of source code must retain the above copyright |
14 |
|
|
* notice, this list of conditions and the following disclaimer. |
15 |
|
|
* 2. Redistributions in binary form must reproduce the above copyright |
16 |
|
|
* notice, this list of conditions and the following disclaimer in the |
17 |
|
|
* documentation and/or other materials provided with the distribution. |
18 |
|
|
* 3. Neither the name of the University nor the names of its contributors |
19 |
|
|
* may be used to endorse or promote products derived from this software |
20 |
|
|
* without specific prior written permission. |
21 |
|
|
* |
22 |
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
23 |
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
24 |
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
25 |
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
26 |
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
27 |
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
28 |
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
29 |
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
30 |
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
31 |
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
32 |
|
|
* SUCH DAMAGE. |
33 |
|
|
* |
34 |
|
|
* @(#)regcomp.c 8.5 (Berkeley) 3/20/94 |
35 |
|
|
*/ |
36 |
|
|
|
37 |
|
|
#include <sys/types.h> |
38 |
|
|
#include <stdio.h> |
39 |
|
|
#include <string.h> |
40 |
|
|
#include <ctype.h> |
41 |
|
|
#include <limits.h> |
42 |
|
|
#include <stdlib.h> |
43 |
|
|
#include <regex.h> |
44 |
|
|
|
45 |
|
|
#include "utils.h" |
46 |
|
|
#include "regex2.h" |
47 |
|
|
|
48 |
|
|
#include "cclass.h" |
49 |
|
|
#include "cname.h" |
50 |
|
|
|
51 |
|
|
/* |
52 |
|
|
* parse structure, passed up and down to avoid global variables and |
53 |
|
|
* other clumsinesses |
54 |
|
|
*/ |
55 |
|
|
struct parse { |
56 |
|
|
char *next; /* next character in RE */ |
57 |
|
|
char *end; /* end of string (-> NUL normally) */ |
58 |
|
|
int error; /* has an error been seen? */ |
59 |
|
|
sop *strip; /* malloced strip */ |
60 |
|
|
sopno ssize; /* malloced strip size (allocated) */ |
61 |
|
|
sopno slen; /* malloced strip length (used) */ |
62 |
|
|
int ncsalloc; /* number of csets allocated */ |
63 |
|
|
struct re_guts *g; |
64 |
|
|
# define NPAREN 10 /* we need to remember () 1-9 for back refs */ |
65 |
|
|
sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ |
66 |
|
|
sopno pend[NPAREN]; /* -> ) ([0] unused) */ |
67 |
|
|
}; |
68 |
|
|
|
69 |
|
|
static void p_ere(struct parse *, int); |
70 |
|
|
static void p_ere_exp(struct parse *); |
71 |
|
|
static void p_str(struct parse *); |
72 |
|
|
static void p_bre(struct parse *, int, int); |
73 |
|
|
static int p_simp_re(struct parse *, int); |
74 |
|
|
static int p_count(struct parse *); |
75 |
|
|
static void p_bracket(struct parse *); |
76 |
|
|
static void p_b_term(struct parse *, cset *); |
77 |
|
|
static void p_b_cclass(struct parse *, cset *); |
78 |
|
|
static void p_b_eclass(struct parse *, cset *); |
79 |
|
|
static char p_b_symbol(struct parse *); |
80 |
|
|
static char p_b_coll_elem(struct parse *, int); |
81 |
|
|
static char othercase(int); |
82 |
|
|
static void bothcases(struct parse *, int); |
83 |
|
|
static void ordinary(struct parse *, int); |
84 |
|
|
static void backslash(struct parse *, int); |
85 |
|
|
static void nonnewline(struct parse *); |
86 |
|
|
static void repeat(struct parse *, sopno, int, int); |
87 |
|
|
static int seterr(struct parse *, int); |
88 |
|
|
static cset *allocset(struct parse *); |
89 |
|
|
static void freeset(struct parse *, cset *); |
90 |
|
|
static int freezeset(struct parse *, cset *); |
91 |
|
|
static int firstch(struct parse *, cset *); |
92 |
|
|
static int nch(struct parse *, cset *); |
93 |
|
|
static void mcadd(struct parse *, cset *, char *); |
94 |
|
|
static void mcinvert(struct parse *, cset *); |
95 |
|
|
static void mccase(struct parse *, cset *); |
96 |
|
|
static int isinsets(struct re_guts *, int); |
97 |
|
|
static int samesets(struct re_guts *, int, int); |
98 |
|
|
static void categorize(struct parse *, struct re_guts *); |
99 |
|
|
static sopno dupl(struct parse *, sopno, sopno); |
100 |
|
|
static void doemit(struct parse *, sop, size_t); |
101 |
|
|
static void doinsert(struct parse *, sop, size_t, sopno); |
102 |
|
|
static void dofwd(struct parse *, sopno, sop); |
103 |
|
|
static int enlarge(struct parse *, sopno); |
104 |
|
|
static void stripsnug(struct parse *, struct re_guts *); |
105 |
|
|
static void findmust(struct parse *, struct re_guts *); |
106 |
|
|
static sopno pluscount(struct parse *, struct re_guts *); |
107 |
|
|
|
108 |
|
|
static char nuls[10]; /* place to point scanner in event of error */ |
109 |
|
|
|
110 |
|
|
/* |
111 |
|
|
* macros for use with parse structure |
112 |
|
|
* BEWARE: these know that the parse structure is named `p' !!! |
113 |
|
|
*/ |
114 |
|
|
#define PEEK() (*p->next) |
115 |
|
|
#define PEEK2() (*(p->next+1)) |
116 |
|
|
#define MORE() (p->next < p->end) |
117 |
|
|
#define MORE2() (p->next+1 < p->end) |
118 |
|
|
#define SEE(c) (MORE() && PEEK() == (c)) |
119 |
|
|
#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) |
120 |
|
|
#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) |
121 |
|
|
#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) |
122 |
|
|
#define NEXT() (p->next++) |
123 |
|
|
#define NEXT2() (p->next += 2) |
124 |
|
|
#define NEXTn(n) (p->next += (n)) |
125 |
|
|
#define GETNEXT() (*p->next++) |
126 |
|
|
#define SETERROR(e) seterr(p, (e)) |
127 |
|
|
#define REQUIRE(co, e) (void) ((co) || SETERROR(e)) |
128 |
|
|
#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) |
129 |
|
|
#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) |
130 |
|
|
#define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) |
131 |
|
|
#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) |
132 |
|
|
#define HERE() (p->slen) |
133 |
|
|
#define THERE() (p->slen - 1) |
134 |
|
|
#define THERETHERE() (p->slen - 2) |
135 |
|
|
#define DROP(n) (p->slen -= (n)) |
136 |
|
|
|
137 |
|
|
#ifndef NDEBUG |
138 |
|
|
static int never = 0; /* for use in asserts; shuts lint up */ |
139 |
|
|
#else |
140 |
|
|
#define never 0 /* some <assert.h>s have bugs too */ |
141 |
|
|
#endif |
142 |
|
|
|
143 |
|
|
/* |
144 |
|
|
- regcomp - interface for parser and compilation |
145 |
|
|
*/ |
146 |
|
|
int /* 0 success, otherwise REG_something */ |
147 |
|
|
regcomp(regex_t *preg, const char *pattern, int cflags) |
148 |
|
|
{ |
149 |
|
|
struct parse pa; |
150 |
|
|
struct re_guts *g; |
151 |
|
|
struct parse *p = &pa; |
152 |
|
|
int i; |
153 |
|
|
size_t len; |
154 |
|
|
#ifdef REDEBUG |
155 |
|
|
# define GOODFLAGS(f) (f) |
156 |
|
|
#else |
157 |
|
|
# define GOODFLAGS(f) ((f)&~REG_DUMP) |
158 |
|
|
#endif |
159 |
|
|
|
160 |
|
|
cflags = GOODFLAGS(cflags); |
161 |
|
|
if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) |
162 |
|
|
return(REG_INVARG); |
163 |
|
|
|
164 |
|
|
if (cflags®_PEND) { |
165 |
|
|
if (preg->re_endp < pattern) |
166 |
|
|
return(REG_INVARG); |
167 |
|
|
len = preg->re_endp - pattern; |
168 |
|
|
} else |
169 |
|
|
len = strlen((char *)pattern); |
170 |
|
|
|
171 |
|
|
/* do the mallocs early so failure handling is easy */ |
172 |
|
|
g = malloc(sizeof(struct re_guts)); |
173 |
|
|
if (g == NULL) |
174 |
|
|
return(REG_ESPACE); |
175 |
|
|
p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ |
176 |
|
|
p->strip = reallocarray(NULL, p->ssize, sizeof(sop)); |
177 |
|
|
p->slen = 0; |
178 |
|
|
if (p->strip == NULL) { |
179 |
|
|
free(g); |
180 |
|
|
return(REG_ESPACE); |
181 |
|
|
} |
182 |
|
|
|
183 |
|
|
/* set things up */ |
184 |
|
|
p->g = g; |
185 |
|
|
p->next = (char *)pattern; /* convenience; we do not modify it */ |
186 |
|
|
p->end = p->next + len; |
187 |
|
|
p->error = 0; |
188 |
|
|
p->ncsalloc = 0; |
189 |
|
|
for (i = 0; i < NPAREN; i++) { |
190 |
|
|
p->pbegin[i] = 0; |
191 |
|
|
p->pend[i] = 0; |
192 |
|
|
} |
193 |
|
|
g->csetsize = NC; |
194 |
|
|
g->sets = NULL; |
195 |
|
|
g->setbits = NULL; |
196 |
|
|
g->ncsets = 0; |
197 |
|
|
g->cflags = cflags; |
198 |
|
|
g->iflags = 0; |
199 |
|
|
g->nbol = 0; |
200 |
|
|
g->neol = 0; |
201 |
|
|
g->must = NULL; |
202 |
|
|
g->mlen = 0; |
203 |
|
|
g->nsub = 0; |
204 |
|
|
g->ncategories = 1; /* category 0 is "everything else" */ |
205 |
|
|
g->categories = &g->catspace[-(CHAR_MIN)]; |
206 |
|
|
memset(g->catspace, 0, sizeof(g->catspace)); |
207 |
|
|
g->backrefs = 0; |
208 |
|
|
|
209 |
|
|
/* do it */ |
210 |
|
|
EMIT(OEND, 0); |
211 |
|
|
g->firststate = THERE(); |
212 |
|
|
if (cflags®_EXTENDED) |
213 |
|
|
p_ere(p, OUT); |
214 |
|
|
else if (cflags®_NOSPEC) |
215 |
|
|
p_str(p); |
216 |
|
|
else |
217 |
|
|
p_bre(p, OUT, OUT); |
218 |
|
|
EMIT(OEND, 0); |
219 |
|
|
g->laststate = THERE(); |
220 |
|
|
|
221 |
|
|
/* tidy up loose ends and fill things in */ |
222 |
|
|
categorize(p, g); |
223 |
|
|
stripsnug(p, g); |
224 |
|
|
findmust(p, g); |
225 |
|
|
g->nplus = pluscount(p, g); |
226 |
|
|
g->magic = MAGIC2; |
227 |
|
|
preg->re_nsub = g->nsub; |
228 |
|
|
preg->re_g = g; |
229 |
|
|
preg->re_magic = MAGIC1; |
230 |
|
|
#ifndef REDEBUG |
231 |
|
|
/* not debugging, so can't rely on the assert() in regexec() */ |
232 |
|
|
if (g->iflags&BAD) |
233 |
|
|
SETERROR(REG_ASSERT); |
234 |
|
|
#endif |
235 |
|
|
|
236 |
|
|
/* win or lose, we're done */ |
237 |
|
|
if (p->error != 0) /* lose */ |
238 |
|
|
regfree(preg); |
239 |
|
|
return(p->error); |
240 |
|
|
} |
241 |
|
|
|
242 |
|
|
/* |
243 |
|
|
- p_ere - ERE parser top level, concatenation and alternation |
244 |
|
|
*/ |
245 |
|
|
static void |
246 |
|
|
p_ere(struct parse *p, int stop) /* character this ERE should end at */ |
247 |
|
|
{ |
248 |
|
|
char c; |
249 |
|
|
sopno prevback; |
250 |
|
|
sopno prevfwd; |
251 |
|
|
sopno conc; |
252 |
|
|
int first = 1; /* is this the first alternative? */ |
253 |
|
|
|
254 |
|
|
for (;;) { |
255 |
|
|
/* do a bunch of concatenated expressions */ |
256 |
|
|
conc = HERE(); |
257 |
|
|
while (MORE() && (c = PEEK()) != '|' && c != stop) |
258 |
|
|
p_ere_exp(p); |
259 |
|
|
REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ |
260 |
|
|
|
261 |
|
|
if (!EAT('|')) |
262 |
|
|
break; /* NOTE BREAK OUT */ |
263 |
|
|
|
264 |
|
|
if (first) { |
265 |
|
|
INSERT(OCH_, conc); /* offset is wrong */ |
266 |
|
|
prevfwd = conc; |
267 |
|
|
prevback = conc; |
268 |
|
|
first = 0; |
269 |
|
|
} |
270 |
|
|
ASTERN(OOR1, prevback); |
271 |
|
|
prevback = THERE(); |
272 |
|
|
AHEAD(prevfwd); /* fix previous offset */ |
273 |
|
|
prevfwd = HERE(); |
274 |
|
|
EMIT(OOR2, 0); /* offset is very wrong */ |
275 |
|
|
} |
276 |
|
|
|
277 |
|
|
if (!first) { /* tail-end fixups */ |
278 |
|
|
AHEAD(prevfwd); |
279 |
|
|
ASTERN(O_CH, prevback); |
280 |
|
|
} |
281 |
|
|
|
282 |
|
|
assert(!MORE() || SEE(stop)); |
283 |
|
|
} |
284 |
|
|
|
285 |
|
|
/* |
286 |
|
|
- p_ere_exp - parse one subERE, an atom possibly followed by a repetition op |
287 |
|
|
*/ |
288 |
|
|
static void |
289 |
|
|
p_ere_exp(struct parse *p) |
290 |
|
|
{ |
291 |
|
|
char c; |
292 |
|
|
sopno pos; |
293 |
|
|
int count; |
294 |
|
|
int count2; |
295 |
|
|
sopno subno; |
296 |
|
|
int wascaret = 0; |
297 |
|
|
|
298 |
|
|
assert(MORE()); /* caller should have ensured this */ |
299 |
|
|
c = GETNEXT(); |
300 |
|
|
|
301 |
|
|
pos = HERE(); |
302 |
|
|
switch (c) { |
303 |
|
|
case '(': |
304 |
|
|
REQUIRE(MORE(), REG_EPAREN); |
305 |
|
|
p->g->nsub++; |
306 |
|
|
subno = p->g->nsub; |
307 |
|
|
if (subno < NPAREN) |
308 |
|
|
p->pbegin[subno] = HERE(); |
309 |
|
|
EMIT(OLPAREN, subno); |
310 |
|
|
if (!SEE(')')) |
311 |
|
|
p_ere(p, ')'); |
312 |
|
|
if (subno < NPAREN) { |
313 |
|
|
p->pend[subno] = HERE(); |
314 |
|
|
assert(p->pend[subno] != 0); |
315 |
|
|
} |
316 |
|
|
EMIT(ORPAREN, subno); |
317 |
|
|
REQUIRE(MORE() && GETNEXT() == ')', REG_EPAREN); |
318 |
|
|
break; |
319 |
|
|
case '^': |
320 |
|
|
EMIT(OBOL, 0); |
321 |
|
|
p->g->iflags |= USEBOL; |
322 |
|
|
p->g->nbol++; |
323 |
|
|
wascaret = 1; |
324 |
|
|
break; |
325 |
|
|
case '$': |
326 |
|
|
EMIT(OEOL, 0); |
327 |
|
|
p->g->iflags |= USEEOL; |
328 |
|
|
p->g->neol++; |
329 |
|
|
break; |
330 |
|
|
case '|': |
331 |
|
|
SETERROR(REG_EMPTY); |
332 |
|
|
break; |
333 |
|
|
case '*': |
334 |
|
|
case '+': |
335 |
|
|
case '?': |
336 |
|
|
SETERROR(REG_BADRPT); |
337 |
|
|
break; |
338 |
|
|
case '.': |
339 |
|
|
if (p->g->cflags®_NEWLINE) |
340 |
|
|
nonnewline(p); |
341 |
|
|
else |
342 |
|
|
EMIT(OANY, 0); |
343 |
|
|
break; |
344 |
|
|
case '[': |
345 |
|
|
p_bracket(p); |
346 |
|
|
break; |
347 |
|
|
case '\\': |
348 |
|
|
REQUIRE(MORE(), REG_EESCAPE); |
349 |
|
|
c = GETNEXT(); |
350 |
|
|
backslash(p, c); |
351 |
|
|
break; |
352 |
|
|
case '{': /* okay as ordinary except if digit follows */ |
353 |
|
|
REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); |
354 |
|
|
/* FALLTHROUGH */ |
355 |
|
|
default: |
356 |
|
|
ordinary(p, c); |
357 |
|
|
break; |
358 |
|
|
} |
359 |
|
|
|
360 |
|
|
if (!MORE()) |
361 |
|
|
return; |
362 |
|
|
c = PEEK(); |
363 |
|
|
/* we call { a repetition if followed by a digit */ |
364 |
|
|
if (!( c == '*' || c == '+' || c == '?' || |
365 |
|
|
(c == '{' && MORE2() && isdigit((uch)PEEK2())) )) |
366 |
|
|
return; /* no repetition, we're done */ |
367 |
|
|
NEXT(); |
368 |
|
|
|
369 |
|
|
REQUIRE(!wascaret, REG_BADRPT); |
370 |
|
|
switch (c) { |
371 |
|
|
case '*': /* implemented as +? */ |
372 |
|
|
/* this case does not require the (y|) trick, noKLUDGE */ |
373 |
|
|
INSERT(OPLUS_, pos); |
374 |
|
|
ASTERN(O_PLUS, pos); |
375 |
|
|
INSERT(OQUEST_, pos); |
376 |
|
|
ASTERN(O_QUEST, pos); |
377 |
|
|
break; |
378 |
|
|
case '+': |
379 |
|
|
INSERT(OPLUS_, pos); |
380 |
|
|
ASTERN(O_PLUS, pos); |
381 |
|
|
break; |
382 |
|
|
case '?': |
383 |
|
|
/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ |
384 |
|
|
INSERT(OCH_, pos); /* offset slightly wrong */ |
385 |
|
|
ASTERN(OOR1, pos); /* this one's right */ |
386 |
|
|
AHEAD(pos); /* fix the OCH_ */ |
387 |
|
|
EMIT(OOR2, 0); /* offset very wrong... */ |
388 |
|
|
AHEAD(THERE()); /* ...so fix it */ |
389 |
|
|
ASTERN(O_CH, THERETHERE()); |
390 |
|
|
break; |
391 |
|
|
case '{': |
392 |
|
|
count = p_count(p); |
393 |
|
|
if (EAT(',')) { |
394 |
|
|
if (isdigit((uch)PEEK())) { |
395 |
|
|
count2 = p_count(p); |
396 |
|
|
REQUIRE(count <= count2, REG_BADBR); |
397 |
|
|
} else /* single number with comma */ |
398 |
|
|
count2 = INFINITY; |
399 |
|
|
} else /* just a single number */ |
400 |
|
|
count2 = count; |
401 |
|
|
repeat(p, pos, count, count2); |
402 |
|
|
if (!EAT('}')) { /* error heuristics */ |
403 |
|
|
while (MORE() && PEEK() != '}') |
404 |
|
|
NEXT(); |
405 |
|
|
REQUIRE(MORE(), REG_EBRACE); |
406 |
|
|
SETERROR(REG_BADBR); |
407 |
|
|
} |
408 |
|
|
break; |
409 |
|
|
} |
410 |
|
|
|
411 |
|
|
if (!MORE()) |
412 |
|
|
return; |
413 |
|
|
c = PEEK(); |
414 |
|
|
if (!( c == '*' || c == '+' || c == '?' || |
415 |
|
|
(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) |
416 |
|
|
return; |
417 |
|
|
SETERROR(REG_BADRPT); |
418 |
|
|
} |
419 |
|
|
|
420 |
|
|
/* |
421 |
|
|
- p_str - string (no metacharacters) "parser" |
422 |
|
|
*/ |
423 |
|
|
static void |
424 |
|
|
p_str(struct parse *p) |
425 |
|
|
{ |
426 |
|
|
REQUIRE(MORE(), REG_EMPTY); |
427 |
|
|
while (MORE()) |
428 |
|
|
ordinary(p, GETNEXT()); |
429 |
|
|
} |
430 |
|
|
|
431 |
|
|
/* |
432 |
|
|
- p_bre - BRE parser top level, anchoring and concatenation |
433 |
|
|
* Giving end1 as OUT essentially eliminates the end1/end2 check. |
434 |
|
|
* |
435 |
|
|
* This implementation is a bit of a kludge, in that a trailing $ is first |
436 |
|
|
* taken as an ordinary character and then revised to be an anchor. The |
437 |
|
|
* only undesirable side effect is that '$' gets included as a character |
438 |
|
|
* category in such cases. This is fairly harmless; not worth fixing. |
439 |
|
|
* The amount of lookahead needed to avoid this kludge is excessive. |
440 |
|
|
*/ |
441 |
|
|
static void |
442 |
|
|
p_bre(struct parse *p, |
443 |
|
|
int end1, /* first terminating character */ |
444 |
|
|
int end2) /* second terminating character */ |
445 |
|
|
{ |
446 |
|
|
sopno start = HERE(); |
447 |
|
|
int first = 1; /* first subexpression? */ |
448 |
|
|
int wasdollar = 0; |
449 |
|
|
|
450 |
|
|
if (EAT('^')) { |
451 |
|
|
EMIT(OBOL, 0); |
452 |
|
|
p->g->iflags |= USEBOL; |
453 |
|
|
p->g->nbol++; |
454 |
|
|
} |
455 |
|
|
while (MORE() && !SEETWO(end1, end2)) { |
456 |
|
|
wasdollar = p_simp_re(p, first); |
457 |
|
|
first = 0; |
458 |
|
|
} |
459 |
|
|
if (wasdollar) { /* oops, that was a trailing anchor */ |
460 |
|
|
DROP(1); |
461 |
|
|
EMIT(OEOL, 0); |
462 |
|
|
p->g->iflags |= USEEOL; |
463 |
|
|
p->g->neol++; |
464 |
|
|
} |
465 |
|
|
|
466 |
|
|
REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ |
467 |
|
|
} |
468 |
|
|
|
469 |
|
|
/* |
470 |
|
|
- p_simp_re - parse a simple RE, an atom possibly followed by a repetition |
471 |
|
|
*/ |
472 |
|
|
static int /* was the simple RE an unbackslashed $? */ |
473 |
|
|
p_simp_re(struct parse *p, |
474 |
|
|
int starordinary) /* is a leading * an ordinary character? */ |
475 |
|
|
{ |
476 |
|
|
int c; |
477 |
|
|
int count; |
478 |
|
|
int count2; |
479 |
|
|
sopno pos; |
480 |
|
|
int i; |
481 |
|
|
sopno subno; |
482 |
|
|
# define BACKSL (1<<CHAR_BIT) |
483 |
|
|
|
484 |
|
|
pos = HERE(); /* repetion op, if any, covers from here */ |
485 |
|
|
|
486 |
|
|
assert(MORE()); /* caller should have ensured this */ |
487 |
|
|
c = GETNEXT(); |
488 |
|
|
if (c == '\\') { |
489 |
|
|
REQUIRE(MORE(), REG_EESCAPE); |
490 |
|
|
c = BACKSL | GETNEXT(); |
491 |
|
|
} |
492 |
|
|
switch (c) { |
493 |
|
|
case '.': |
494 |
|
|
if (p->g->cflags®_NEWLINE) |
495 |
|
|
nonnewline(p); |
496 |
|
|
else |
497 |
|
|
EMIT(OANY, 0); |
498 |
|
|
break; |
499 |
|
|
case '[': |
500 |
|
|
p_bracket(p); |
501 |
|
|
break; |
502 |
|
|
case BACKSL|'<': |
503 |
|
|
EMIT(OBOW, 0); |
504 |
|
|
break; |
505 |
|
|
case BACKSL|'>': |
506 |
|
|
EMIT(OEOW, 0); |
507 |
|
|
break; |
508 |
|
|
case BACKSL|'{': |
509 |
|
|
SETERROR(REG_BADRPT); |
510 |
|
|
break; |
511 |
|
|
case BACKSL|'(': |
512 |
|
|
p->g->nsub++; |
513 |
|
|
subno = p->g->nsub; |
514 |
|
|
if (subno < NPAREN) |
515 |
|
|
p->pbegin[subno] = HERE(); |
516 |
|
|
EMIT(OLPAREN, subno); |
517 |
|
|
/* the MORE here is an error heuristic */ |
518 |
|
|
if (MORE() && !SEETWO('\\', ')')) |
519 |
|
|
p_bre(p, '\\', ')'); |
520 |
|
|
if (subno < NPAREN) { |
521 |
|
|
p->pend[subno] = HERE(); |
522 |
|
|
assert(p->pend[subno] != 0); |
523 |
|
|
} |
524 |
|
|
EMIT(ORPAREN, subno); |
525 |
|
|
REQUIRE(EATTWO('\\', ')'), REG_EPAREN); |
526 |
|
|
break; |
527 |
|
|
case BACKSL|')': /* should not get here -- must be user */ |
528 |
|
|
case BACKSL|'}': |
529 |
|
|
SETERROR(REG_EPAREN); |
530 |
|
|
break; |
531 |
|
|
case BACKSL|'1': |
532 |
|
|
case BACKSL|'2': |
533 |
|
|
case BACKSL|'3': |
534 |
|
|
case BACKSL|'4': |
535 |
|
|
case BACKSL|'5': |
536 |
|
|
case BACKSL|'6': |
537 |
|
|
case BACKSL|'7': |
538 |
|
|
case BACKSL|'8': |
539 |
|
|
case BACKSL|'9': |
540 |
|
|
i = (c&~BACKSL) - '0'; |
541 |
|
|
assert(i < NPAREN); |
542 |
|
|
if (p->pend[i] != 0) { |
543 |
|
|
assert(i <= p->g->nsub); |
544 |
|
|
EMIT(OBACK_, i); |
545 |
|
|
assert(p->pbegin[i] != 0); |
546 |
|
|
assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); |
547 |
|
|
assert(OP(p->strip[p->pend[i]]) == ORPAREN); |
548 |
|
|
(void) dupl(p, p->pbegin[i]+1, p->pend[i]); |
549 |
|
|
EMIT(O_BACK, i); |
550 |
|
|
} else |
551 |
|
|
SETERROR(REG_ESUBREG); |
552 |
|
|
p->g->backrefs = 1; |
553 |
|
|
break; |
554 |
|
|
case '*': |
555 |
|
|
REQUIRE(starordinary, REG_BADRPT); |
556 |
|
|
/* FALLTHROUGH */ |
557 |
|
|
default: |
558 |
|
|
ordinary(p, (char)c); |
559 |
|
|
break; |
560 |
|
|
} |
561 |
|
|
|
562 |
|
|
if (EAT('*')) { /* implemented as +? */ |
563 |
|
|
/* this case does not require the (y|) trick, noKLUDGE */ |
564 |
|
|
INSERT(OPLUS_, pos); |
565 |
|
|
ASTERN(O_PLUS, pos); |
566 |
|
|
INSERT(OQUEST_, pos); |
567 |
|
|
ASTERN(O_QUEST, pos); |
568 |
|
|
} else if (EATTWO('\\', '{')) { |
569 |
|
|
count = p_count(p); |
570 |
|
|
if (EAT(',')) { |
571 |
|
|
if (MORE() && isdigit((uch)PEEK())) { |
572 |
|
|
count2 = p_count(p); |
573 |
|
|
REQUIRE(count <= count2, REG_BADBR); |
574 |
|
|
} else /* single number with comma */ |
575 |
|
|
count2 = INFINITY; |
576 |
|
|
} else /* just a single number */ |
577 |
|
|
count2 = count; |
578 |
|
|
repeat(p, pos, count, count2); |
579 |
|
|
if (!EATTWO('\\', '}')) { /* error heuristics */ |
580 |
|
|
while (MORE() && !SEETWO('\\', '}')) |
581 |
|
|
NEXT(); |
582 |
|
|
REQUIRE(MORE(), REG_EBRACE); |
583 |
|
|
SETERROR(REG_BADBR); |
584 |
|
|
} |
585 |
|
|
} else if (c == '$') /* $ (but not \$) ends it */ |
586 |
|
|
return(1); |
587 |
|
|
|
588 |
|
|
return(0); |
589 |
|
|
} |
590 |
|
|
|
591 |
|
|
/* |
592 |
|
|
- p_count - parse a repetition count |
593 |
|
|
*/ |
594 |
|
|
static int /* the value */ |
595 |
|
|
p_count(struct parse *p) |
596 |
|
|
{ |
597 |
|
|
int count = 0; |
598 |
|
|
int ndigits = 0; |
599 |
|
|
|
600 |
|
|
while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { |
601 |
|
|
count = count*10 + (GETNEXT() - '0'); |
602 |
|
|
ndigits++; |
603 |
|
|
} |
604 |
|
|
|
605 |
|
|
REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); |
606 |
|
|
return(count); |
607 |
|
|
} |
608 |
|
|
|
609 |
|
|
/* |
610 |
|
|
- p_bracket - parse a bracketed character list |
611 |
|
|
* |
612 |
|
|
* Note a significant property of this code: if the allocset() did SETERROR, |
613 |
|
|
* no set operations are done. |
614 |
|
|
*/ |
615 |
|
|
static void |
616 |
|
|
p_bracket(struct parse *p) |
617 |
|
|
{ |
618 |
|
|
cset *cs; |
619 |
|
|
int invert = 0; |
620 |
|
|
|
621 |
|
|
/* Dept of Truly Sickening Special-Case Kludges */ |
622 |
|
|
if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { |
623 |
|
|
EMIT(OBOW, 0); |
624 |
|
|
NEXTn(6); |
625 |
|
|
return; |
626 |
|
|
} |
627 |
|
|
if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { |
628 |
|
|
EMIT(OEOW, 0); |
629 |
|
|
NEXTn(6); |
630 |
|
|
return; |
631 |
|
|
} |
632 |
|
|
|
633 |
|
|
if ((cs = allocset(p)) == NULL) { |
634 |
|
|
/* allocset did set error status in p */ |
635 |
|
|
return; |
636 |
|
|
} |
637 |
|
|
|
638 |
|
|
if (EAT('^')) |
639 |
|
|
invert++; /* make note to invert set at end */ |
640 |
|
|
if (EAT(']')) |
641 |
|
|
CHadd(cs, ']'); |
642 |
|
|
else if (EAT('-')) |
643 |
|
|
CHadd(cs, '-'); |
644 |
|
|
while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) |
645 |
|
|
p_b_term(p, cs); |
646 |
|
|
if (EAT('-')) |
647 |
|
|
CHadd(cs, '-'); |
648 |
|
|
REQUIRE(MORE() && GETNEXT() == ']', REG_EBRACK); |
649 |
|
|
|
650 |
|
|
if (p->error != 0) { /* don't mess things up further */ |
651 |
|
|
freeset(p, cs); |
652 |
|
|
return; |
653 |
|
|
} |
654 |
|
|
|
655 |
|
|
if (p->g->cflags®_ICASE) { |
656 |
|
|
int i; |
657 |
|
|
int ci; |
658 |
|
|
|
659 |
|
|
for (i = p->g->csetsize - 1; i >= 0; i--) |
660 |
|
|
if (CHIN(cs, i) && isalpha(i)) { |
661 |
|
|
ci = othercase(i); |
662 |
|
|
if (ci != i) |
663 |
|
|
CHadd(cs, ci); |
664 |
|
|
} |
665 |
|
|
if (cs->multis != NULL) |
666 |
|
|
mccase(p, cs); |
667 |
|
|
} |
668 |
|
|
if (invert) { |
669 |
|
|
int i; |
670 |
|
|
|
671 |
|
|
for (i = p->g->csetsize - 1; i >= 0; i--) |
672 |
|
|
if (CHIN(cs, i)) |
673 |
|
|
CHsub(cs, i); |
674 |
|
|
else |
675 |
|
|
CHadd(cs, i); |
676 |
|
|
if (p->g->cflags®_NEWLINE) |
677 |
|
|
CHsub(cs, '\n'); |
678 |
|
|
if (cs->multis != NULL) |
679 |
|
|
mcinvert(p, cs); |
680 |
|
|
} |
681 |
|
|
|
682 |
|
|
assert(cs->multis == NULL); /* xxx */ |
683 |
|
|
|
684 |
|
|
if (nch(p, cs) == 1) { /* optimize singleton sets */ |
685 |
|
|
ordinary(p, firstch(p, cs)); |
686 |
|
|
freeset(p, cs); |
687 |
|
|
} else |
688 |
|
|
EMIT(OANYOF, freezeset(p, cs)); |
689 |
|
|
} |
690 |
|
|
|
691 |
|
|
/* |
692 |
|
|
- p_b_term - parse one term of a bracketed character list |
693 |
|
|
*/ |
694 |
|
|
static void |
695 |
|
|
p_b_term(struct parse *p, cset *cs) |
696 |
|
|
{ |
697 |
|
|
char c; |
698 |
|
|
char start, finish; |
699 |
|
|
int i; |
700 |
|
|
|
701 |
|
|
/* classify what we've got */ |
702 |
|
|
switch ((MORE()) ? PEEK() : '\0') { |
703 |
|
|
case '[': |
704 |
|
|
c = (MORE2()) ? PEEK2() : '\0'; |
705 |
|
|
break; |
706 |
|
|
case '-': |
707 |
|
|
SETERROR(REG_ERANGE); |
708 |
|
|
return; /* NOTE RETURN */ |
709 |
|
|
break; |
710 |
|
|
default: |
711 |
|
|
c = '\0'; |
712 |
|
|
break; |
713 |
|
|
} |
714 |
|
|
|
715 |
|
|
switch (c) { |
716 |
|
|
case ':': /* character class */ |
717 |
|
|
NEXT2(); |
718 |
|
|
REQUIRE(MORE(), REG_EBRACK); |
719 |
|
|
c = PEEK(); |
720 |
|
|
REQUIRE(c != '-' && c != ']', REG_ECTYPE); |
721 |
|
|
p_b_cclass(p, cs); |
722 |
|
|
REQUIRE(MORE(), REG_EBRACK); |
723 |
|
|
REQUIRE(EATTWO(':', ']'), REG_ECTYPE); |
724 |
|
|
break; |
725 |
|
|
case '=': /* equivalence class */ |
726 |
|
|
NEXT2(); |
727 |
|
|
REQUIRE(MORE(), REG_EBRACK); |
728 |
|
|
c = PEEK(); |
729 |
|
|
REQUIRE(c != '-' && c != ']', REG_ECOLLATE); |
730 |
|
|
p_b_eclass(p, cs); |
731 |
|
|
REQUIRE(MORE(), REG_EBRACK); |
732 |
|
|
REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); |
733 |
|
|
break; |
734 |
|
|
default: /* symbol, ordinary character, or range */ |
735 |
|
|
/* xxx revision needed for multichar stuff */ |
736 |
|
|
start = p_b_symbol(p); |
737 |
|
|
if (SEE('-') && MORE2() && PEEK2() != ']') { |
738 |
|
|
/* range */ |
739 |
|
|
NEXT(); |
740 |
|
|
if (EAT('-')) |
741 |
|
|
finish = '-'; |
742 |
|
|
else |
743 |
|
|
finish = p_b_symbol(p); |
744 |
|
|
} else |
745 |
|
|
finish = start; |
746 |
|
|
/* xxx what about signed chars here... */ |
747 |
|
|
REQUIRE(start <= finish, REG_ERANGE); |
748 |
|
|
for (i = start; i <= finish; i++) |
749 |
|
|
CHadd(cs, i); |
750 |
|
|
break; |
751 |
|
|
} |
752 |
|
|
} |
753 |
|
|
|
754 |
|
|
/* |
755 |
|
|
- p_b_cclass - parse a character-class name and deal with it |
756 |
|
|
*/ |
757 |
|
|
static void |
758 |
|
|
p_b_cclass(struct parse *p, cset *cs) |
759 |
|
|
{ |
760 |
|
|
char *sp = p->next; |
761 |
|
|
struct cclass *cp; |
762 |
|
|
size_t len; |
763 |
|
|
char *u; |
764 |
|
|
char c; |
765 |
|
|
|
766 |
|
|
while (MORE() && isalpha((uch)PEEK())) |
767 |
|
|
NEXT(); |
768 |
|
|
len = p->next - sp; |
769 |
|
|
for (cp = cclasses; cp->name != NULL; cp++) |
770 |
|
|
if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') |
771 |
|
|
break; |
772 |
|
|
if (cp->name == NULL) { |
773 |
|
|
/* oops, didn't find it */ |
774 |
|
|
SETERROR(REG_ECTYPE); |
775 |
|
|
return; |
776 |
|
|
} |
777 |
|
|
|
778 |
|
|
u = cp->chars; |
779 |
|
|
while ((c = *u++) != '\0') |
780 |
|
|
CHadd(cs, c); |
781 |
|
|
for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) |
782 |
|
|
MCadd(p, cs, u); |
783 |
|
|
} |
784 |
|
|
|
785 |
|
|
/* |
786 |
|
|
- p_b_eclass - parse an equivalence-class name and deal with it |
787 |
|
|
* |
788 |
|
|
* This implementation is incomplete. xxx |
789 |
|
|
*/ |
790 |
|
|
static void |
791 |
|
|
p_b_eclass(struct parse *p, cset *cs) |
792 |
|
|
{ |
793 |
|
|
char c; |
794 |
|
|
|
795 |
|
|
c = p_b_coll_elem(p, '='); |
796 |
|
|
CHadd(cs, c); |
797 |
|
|
} |
798 |
|
|
|
799 |
|
|
/* |
800 |
|
|
- p_b_symbol - parse a character or [..]ed multicharacter collating symbol |
801 |
|
|
*/ |
802 |
|
|
static char /* value of symbol */ |
803 |
|
|
p_b_symbol(struct parse *p) |
804 |
|
|
{ |
805 |
|
|
char value; |
806 |
|
|
|
807 |
|
|
REQUIRE(MORE(), REG_EBRACK); |
808 |
|
|
if (!EATTWO('[', '.')) |
809 |
|
|
return(GETNEXT()); |
810 |
|
|
|
811 |
|
|
/* collating symbol */ |
812 |
|
|
value = p_b_coll_elem(p, '.'); |
813 |
|
|
REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); |
814 |
|
|
return(value); |
815 |
|
|
} |
816 |
|
|
|
817 |
|
|
/* |
818 |
|
|
- p_b_coll_elem - parse a collating-element name and look it up |
819 |
|
|
*/ |
820 |
|
|
static char /* value of collating element */ |
821 |
|
|
p_b_coll_elem(struct parse *p, |
822 |
|
|
int endc) /* name ended by endc,']' */ |
823 |
|
|
{ |
824 |
|
|
char *sp = p->next; |
825 |
|
|
struct cname *cp; |
826 |
|
|
int len; |
827 |
|
|
|
828 |
|
|
while (MORE() && !SEETWO(endc, ']')) |
829 |
|
|
NEXT(); |
830 |
|
|
if (!MORE()) { |
831 |
|
|
SETERROR(REG_EBRACK); |
832 |
|
|
return(0); |
833 |
|
|
} |
834 |
|
|
len = p->next - sp; |
835 |
|
|
for (cp = cnames; cp->name != NULL; cp++) |
836 |
|
|
if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') |
837 |
|
|
return(cp->code); /* known name */ |
838 |
|
|
if (len == 1) |
839 |
|
|
return(*sp); /* single character */ |
840 |
|
|
SETERROR(REG_ECOLLATE); /* neither */ |
841 |
|
|
return(0); |
842 |
|
|
} |
843 |
|
|
|
844 |
|
|
/* |
845 |
|
|
- othercase - return the case counterpart of an alphabetic |
846 |
|
|
*/ |
847 |
|
|
static char /* if no counterpart, return ch */ |
848 |
|
|
othercase(int ch) |
849 |
|
|
{ |
850 |
|
|
ch = (uch)ch; |
851 |
|
|
assert(isalpha(ch)); |
852 |
|
|
if (isupper(ch)) |
853 |
|
|
return ((uch)tolower(ch)); |
854 |
|
|
else if (islower(ch)) |
855 |
|
|
return ((uch)toupper(ch)); |
856 |
|
|
else /* peculiar, but could happen */ |
857 |
|
|
return(ch); |
858 |
|
|
} |
859 |
|
|
|
860 |
|
|
/* |
861 |
|
|
- bothcases - emit a dualcase version of a two-case character |
862 |
|
|
* |
863 |
|
|
* Boy, is this implementation ever a kludge... |
864 |
|
|
*/ |
865 |
|
|
static void |
866 |
|
|
bothcases(struct parse *p, int ch) |
867 |
|
|
{ |
868 |
|
|
char *oldnext = p->next; |
869 |
|
|
char *oldend = p->end; |
870 |
|
|
char bracket[3]; |
871 |
|
|
|
872 |
|
|
ch = (uch)ch; |
873 |
|
|
assert(othercase(ch) != ch); /* p_bracket() would recurse */ |
874 |
|
|
p->next = bracket; |
875 |
|
|
p->end = bracket+2; |
876 |
|
|
bracket[0] = ch; |
877 |
|
|
bracket[1] = ']'; |
878 |
|
|
bracket[2] = '\0'; |
879 |
|
|
p_bracket(p); |
880 |
|
|
assert(p->next == bracket+2); |
881 |
|
|
p->next = oldnext; |
882 |
|
|
p->end = oldend; |
883 |
|
|
} |
884 |
|
|
|
885 |
|
|
/* |
886 |
|
|
- ordinary - emit an ordinary character |
887 |
|
|
*/ |
888 |
|
|
static void |
889 |
|
|
ordinary(struct parse *p, int ch) |
890 |
|
|
{ |
891 |
|
|
cat_t *cap = p->g->categories; |
892 |
|
|
|
893 |
|
|
if ((p->g->cflags®_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) |
894 |
|
|
bothcases(p, ch); |
895 |
|
|
else { |
896 |
|
|
EMIT(OCHAR, (uch)ch); |
897 |
|
|
if (cap[ch] == 0) |
898 |
|
|
cap[ch] = p->g->ncategories++; |
899 |
|
|
} |
900 |
|
|
} |
901 |
|
|
|
902 |
|
|
/* |
903 |
|
|
* do something magic with this character, but only if it's extra magic |
904 |
|
|
*/ |
905 |
|
|
static void |
906 |
|
|
backslash(struct parse *p, int ch) |
907 |
|
|
{ |
908 |
|
|
switch (ch) { |
909 |
|
|
case '<': |
910 |
|
|
EMIT(OBOW, 0); |
911 |
|
|
break; |
912 |
|
|
case '>': |
913 |
|
|
EMIT(OEOW, 0); |
914 |
|
|
break; |
915 |
|
|
default: |
916 |
|
|
ordinary(p, ch); |
917 |
|
|
break; |
918 |
|
|
} |
919 |
|
|
} |
920 |
|
|
|
921 |
|
|
/* |
922 |
|
|
- nonnewline - emit REG_NEWLINE version of OANY |
923 |
|
|
* |
924 |
|
|
* Boy, is this implementation ever a kludge... |
925 |
|
|
*/ |
926 |
|
|
static void |
927 |
|
|
nonnewline(struct parse *p) |
928 |
|
|
{ |
929 |
|
|
char *oldnext = p->next; |
930 |
|
|
char *oldend = p->end; |
931 |
|
|
char bracket[4]; |
932 |
|
|
|
933 |
|
|
p->next = bracket; |
934 |
|
|
p->end = bracket+3; |
935 |
|
|
bracket[0] = '^'; |
936 |
|
|
bracket[1] = '\n'; |
937 |
|
|
bracket[2] = ']'; |
938 |
|
|
bracket[3] = '\0'; |
939 |
|
|
p_bracket(p); |
940 |
|
|
assert(p->next == bracket+3); |
941 |
|
|
p->next = oldnext; |
942 |
|
|
p->end = oldend; |
943 |
|
|
} |
944 |
|
|
|
945 |
|
|
/* |
946 |
|
|
- repeat - generate code for a bounded repetition, recursively if needed |
947 |
|
|
*/ |
948 |
|
|
static void |
949 |
|
|
repeat(struct parse *p, |
950 |
|
|
sopno start, /* operand from here to end of strip */ |
951 |
|
|
int from, /* repeated from this number */ |
952 |
|
|
int to) /* to this number of times (maybe INFINITY) */ |
953 |
|
|
{ |
954 |
|
|
sopno finish = HERE(); |
955 |
|
|
# define N 2 |
956 |
|
|
# define INF 3 |
957 |
|
|
# define REP(f, t) ((f)*8 + (t)) |
958 |
|
|
# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) |
959 |
|
|
sopno copy; |
960 |
|
|
|
961 |
|
|
if (p->error != 0) /* head off possible runaway recursion */ |
962 |
|
|
return; |
963 |
|
|
|
964 |
|
|
assert(from <= to); |
965 |
|
|
|
966 |
|
|
switch (REP(MAP(from), MAP(to))) { |
967 |
|
|
case REP(0, 0): /* must be user doing this */ |
968 |
|
|
DROP(finish-start); /* drop the operand */ |
969 |
|
|
break; |
970 |
|
|
case REP(0, 1): /* as x{1,1}? */ |
971 |
|
|
case REP(0, N): /* as x{1,n}? */ |
972 |
|
|
case REP(0, INF): /* as x{1,}? */ |
973 |
|
|
/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ |
974 |
|
|
INSERT(OCH_, start); /* offset is wrong... */ |
975 |
|
|
repeat(p, start+1, 1, to); |
976 |
|
|
ASTERN(OOR1, start); |
977 |
|
|
AHEAD(start); /* ... fix it */ |
978 |
|
|
EMIT(OOR2, 0); |
979 |
|
|
AHEAD(THERE()); |
980 |
|
|
ASTERN(O_CH, THERETHERE()); |
981 |
|
|
break; |
982 |
|
|
case REP(1, 1): /* trivial case */ |
983 |
|
|
/* done */ |
984 |
|
|
break; |
985 |
|
|
case REP(1, N): /* as x?x{1,n-1} */ |
986 |
|
|
/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ |
987 |
|
|
INSERT(OCH_, start); |
988 |
|
|
ASTERN(OOR1, start); |
989 |
|
|
AHEAD(start); |
990 |
|
|
EMIT(OOR2, 0); /* offset very wrong... */ |
991 |
|
|
AHEAD(THERE()); /* ...so fix it */ |
992 |
|
|
ASTERN(O_CH, THERETHERE()); |
993 |
|
|
copy = dupl(p, start+1, finish+1); |
994 |
|
|
assert(copy == finish+4); |
995 |
|
|
repeat(p, copy, 1, to-1); |
996 |
|
|
break; |
997 |
|
|
case REP(1, INF): /* as x+ */ |
998 |
|
|
INSERT(OPLUS_, start); |
999 |
|
|
ASTERN(O_PLUS, start); |
1000 |
|
|
break; |
1001 |
|
|
case REP(N, N): /* as xx{m-1,n-1} */ |
1002 |
|
|
copy = dupl(p, start, finish); |
1003 |
|
|
repeat(p, copy, from-1, to-1); |
1004 |
|
|
break; |
1005 |
|
|
case REP(N, INF): /* as xx{n-1,INF} */ |
1006 |
|
|
copy = dupl(p, start, finish); |
1007 |
|
|
repeat(p, copy, from-1, to); |
1008 |
|
|
break; |
1009 |
|
|
default: /* "can't happen" */ |
1010 |
|
|
SETERROR(REG_ASSERT); /* just in case */ |
1011 |
|
|
break; |
1012 |
|
|
} |
1013 |
|
|
} |
1014 |
|
|
|
1015 |
|
|
/* |
1016 |
|
|
- seterr - set an error condition |
1017 |
|
|
*/ |
1018 |
|
|
static int /* useless but makes type checking happy */ |
1019 |
|
|
seterr(struct parse *p, int e) |
1020 |
|
|
{ |
1021 |
|
|
if (p->error == 0) /* keep earliest error condition */ |
1022 |
|
|
p->error = e; |
1023 |
|
|
p->next = nuls; /* try to bring things to a halt */ |
1024 |
|
|
p->end = nuls; |
1025 |
|
|
return(0); /* make the return value well-defined */ |
1026 |
|
|
} |
1027 |
|
|
|
1028 |
|
|
/* |
1029 |
|
|
- allocset - allocate a set of characters for [] |
1030 |
|
|
*/ |
1031 |
|
|
static cset * |
1032 |
|
|
allocset(struct parse *p) |
1033 |
|
|
{ |
1034 |
|
|
int no = p->g->ncsets++; |
1035 |
|
|
size_t nc; |
1036 |
|
|
size_t nbytes; |
1037 |
|
|
cset *cs; |
1038 |
|
|
size_t css = (size_t)p->g->csetsize; |
1039 |
|
|
int i; |
1040 |
|
|
|
1041 |
|
|
if (no >= p->ncsalloc) { /* need another column of space */ |
1042 |
|
|
void *ptr; |
1043 |
|
|
|
1044 |
|
|
p->ncsalloc += CHAR_BIT; |
1045 |
|
|
nc = p->ncsalloc; |
1046 |
|
|
assert(nc % CHAR_BIT == 0); |
1047 |
|
|
|
1048 |
|
|
ptr = reallocarray(p->g->sets, nc, sizeof(cset)); |
1049 |
|
|
if (ptr == NULL) |
1050 |
|
|
goto nomem; |
1051 |
|
|
p->g->sets = ptr; |
1052 |
|
|
|
1053 |
|
|
ptr = reallocarray(p->g->setbits, nc / CHAR_BIT, css); |
1054 |
|
|
if (ptr == NULL) |
1055 |
|
|
goto nomem; |
1056 |
|
|
nbytes = (nc / CHAR_BIT) * css; |
1057 |
|
|
p->g->setbits = ptr; |
1058 |
|
|
|
1059 |
|
|
for (i = 0; i < no; i++) |
1060 |
|
|
p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); |
1061 |
|
|
|
1062 |
|
|
(void) memset((char *)p->g->setbits + (nbytes - css), 0, css); |
1063 |
|
|
} |
1064 |
|
|
/* XXX should not happen */ |
1065 |
|
|
if (p->g->sets == NULL || p->g->setbits == NULL) |
1066 |
|
|
goto nomem; |
1067 |
|
|
|
1068 |
|
|
cs = &p->g->sets[no]; |
1069 |
|
|
cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); |
1070 |
|
|
cs->mask = 1 << ((no) % CHAR_BIT); |
1071 |
|
|
cs->hash = 0; |
1072 |
|
|
cs->smultis = 0; |
1073 |
|
|
cs->multis = NULL; |
1074 |
|
|
|
1075 |
|
|
return(cs); |
1076 |
|
|
nomem: |
1077 |
|
|
free(p->g->sets); |
1078 |
|
|
p->g->sets = NULL; |
1079 |
|
|
free(p->g->setbits); |
1080 |
|
|
p->g->setbits = NULL; |
1081 |
|
|
|
1082 |
|
|
SETERROR(REG_ESPACE); |
1083 |
|
|
/* caller's responsibility not to do set ops */ |
1084 |
|
|
return(NULL); |
1085 |
|
|
} |
1086 |
|
|
|
1087 |
|
|
/* |
1088 |
|
|
- freeset - free a now-unused set |
1089 |
|
|
*/ |
1090 |
|
|
static void |
1091 |
|
|
freeset(struct parse *p, cset *cs) |
1092 |
|
|
{ |
1093 |
|
|
int i; |
1094 |
|
|
cset *top = &p->g->sets[p->g->ncsets]; |
1095 |
|
|
size_t css = (size_t)p->g->csetsize; |
1096 |
|
|
|
1097 |
|
|
for (i = 0; i < css; i++) |
1098 |
|
|
CHsub(cs, i); |
1099 |
|
|
if (cs == top-1) /* recover only the easy case */ |
1100 |
|
|
p->g->ncsets--; |
1101 |
|
|
} |
1102 |
|
|
|
1103 |
|
|
/* |
1104 |
|
|
- freezeset - final processing on a set of characters |
1105 |
|
|
* |
1106 |
|
|
* The main task here is merging identical sets. This is usually a waste |
1107 |
|
|
* of time (although the hash code minimizes the overhead), but can win |
1108 |
|
|
* big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash |
1109 |
|
|
* is done using addition rather than xor -- all ASCII [aA] sets xor to |
1110 |
|
|
* the same value! |
1111 |
|
|
*/ |
1112 |
|
|
static int /* set number */ |
1113 |
|
|
freezeset(struct parse *p, cset *cs) |
1114 |
|
|
{ |
1115 |
|
|
uch h = cs->hash; |
1116 |
|
|
int i; |
1117 |
|
|
cset *top = &p->g->sets[p->g->ncsets]; |
1118 |
|
|
cset *cs2; |
1119 |
|
|
size_t css = (size_t)p->g->csetsize; |
1120 |
|
|
|
1121 |
|
|
/* look for an earlier one which is the same */ |
1122 |
|
|
for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) |
1123 |
|
|
if (cs2->hash == h && cs2 != cs) { |
1124 |
|
|
/* maybe */ |
1125 |
|
|
for (i = 0; i < css; i++) |
1126 |
|
|
if (!!CHIN(cs2, i) != !!CHIN(cs, i)) |
1127 |
|
|
break; /* no */ |
1128 |
|
|
if (i == css) |
1129 |
|
|
break; /* yes */ |
1130 |
|
|
} |
1131 |
|
|
|
1132 |
|
|
if (cs2 < top) { /* found one */ |
1133 |
|
|
freeset(p, cs); |
1134 |
|
|
cs = cs2; |
1135 |
|
|
} |
1136 |
|
|
|
1137 |
|
|
return((int)(cs - p->g->sets)); |
1138 |
|
|
} |
1139 |
|
|
|
1140 |
|
|
/* |
1141 |
|
|
- firstch - return first character in a set (which must have at least one) |
1142 |
|
|
*/ |
1143 |
|
|
static int /* character; there is no "none" value */ |
1144 |
|
|
firstch(struct parse *p, cset *cs) |
1145 |
|
|
{ |
1146 |
|
|
int i; |
1147 |
|
|
size_t css = (size_t)p->g->csetsize; |
1148 |
|
|
|
1149 |
|
|
for (i = 0; i < css; i++) |
1150 |
|
|
if (CHIN(cs, i)) |
1151 |
|
|
return((char)i); |
1152 |
|
|
assert(never); |
1153 |
|
|
return(0); /* arbitrary */ |
1154 |
|
|
} |
1155 |
|
|
|
1156 |
|
|
/* |
1157 |
|
|
- nch - number of characters in a set |
1158 |
|
|
*/ |
1159 |
|
|
static int |
1160 |
|
|
nch(struct parse *p, cset *cs) |
1161 |
|
|
{ |
1162 |
|
|
int i; |
1163 |
|
|
size_t css = (size_t)p->g->csetsize; |
1164 |
|
|
int n = 0; |
1165 |
|
|
|
1166 |
|
|
for (i = 0; i < css; i++) |
1167 |
|
|
if (CHIN(cs, i)) |
1168 |
|
|
n++; |
1169 |
|
|
return(n); |
1170 |
|
|
} |
1171 |
|
|
|
1172 |
|
|
/* |
1173 |
|
|
- mcadd - add a collating element to a cset |
1174 |
|
|
*/ |
1175 |
|
|
static void |
1176 |
|
|
mcadd( struct parse *p, cset *cs, char *cp) |
1177 |
|
|
{ |
1178 |
|
|
size_t oldend = cs->smultis; |
1179 |
|
|
void *np; |
1180 |
|
|
|
1181 |
|
|
cs->smultis += strlen(cp) + 1; |
1182 |
|
|
np = realloc(cs->multis, cs->smultis); |
1183 |
|
|
if (np == NULL) { |
1184 |
|
|
free(cs->multis); |
1185 |
|
|
cs->multis = NULL; |
1186 |
|
|
SETERROR(REG_ESPACE); |
1187 |
|
|
return; |
1188 |
|
|
} |
1189 |
|
|
cs->multis = np; |
1190 |
|
|
|
1191 |
|
|
strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1); |
1192 |
|
|
} |
1193 |
|
|
|
1194 |
|
|
/* |
1195 |
|
|
- mcinvert - invert the list of collating elements in a cset |
1196 |
|
|
* |
1197 |
|
|
* This would have to know the set of possibilities. Implementation |
1198 |
|
|
* is deferred. |
1199 |
|
|
*/ |
1200 |
|
|
static void |
1201 |
|
|
mcinvert(struct parse *p, cset *cs) |
1202 |
|
|
{ |
1203 |
|
|
assert(cs->multis == NULL); /* xxx */ |
1204 |
|
|
} |
1205 |
|
|
|
1206 |
|
|
/* |
1207 |
|
|
- mccase - add case counterparts of the list of collating elements in a cset |
1208 |
|
|
* |
1209 |
|
|
* This would have to know the set of possibilities. Implementation |
1210 |
|
|
* is deferred. |
1211 |
|
|
*/ |
1212 |
|
|
static void |
1213 |
|
|
mccase(struct parse *p, cset *cs) |
1214 |
|
|
{ |
1215 |
|
|
assert(cs->multis == NULL); /* xxx */ |
1216 |
|
|
} |
1217 |
|
|
|
1218 |
|
|
/* |
1219 |
|
|
- isinsets - is this character in any sets? |
1220 |
|
|
*/ |
1221 |
|
|
static int /* predicate */ |
1222 |
|
|
isinsets(struct re_guts *g, int c) |
1223 |
|
|
{ |
1224 |
|
|
uch *col; |
1225 |
|
|
int i; |
1226 |
|
|
int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; |
1227 |
|
|
unsigned uc = (uch)c; |
1228 |
|
|
|
1229 |
|
|
for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) |
1230 |
|
|
if (col[uc] != 0) |
1231 |
|
|
return(1); |
1232 |
|
|
return(0); |
1233 |
|
|
} |
1234 |
|
|
|
1235 |
|
|
/* |
1236 |
|
|
- samesets - are these two characters in exactly the same sets? |
1237 |
|
|
*/ |
1238 |
|
|
static int /* predicate */ |
1239 |
|
|
samesets(struct re_guts *g, int c1, int c2) |
1240 |
|
|
{ |
1241 |
|
|
uch *col; |
1242 |
|
|
int i; |
1243 |
|
|
int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; |
1244 |
|
|
unsigned uc1 = (uch)c1; |
1245 |
|
|
unsigned uc2 = (uch)c2; |
1246 |
|
|
|
1247 |
|
|
for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) |
1248 |
|
|
if (col[uc1] != col[uc2]) |
1249 |
|
|
return(0); |
1250 |
|
|
return(1); |
1251 |
|
|
} |
1252 |
|
|
|
1253 |
|
|
/* |
1254 |
|
|
- categorize - sort out character categories |
1255 |
|
|
*/ |
1256 |
|
|
static void |
1257 |
|
|
categorize(struct parse *p, struct re_guts *g) |
1258 |
|
|
{ |
1259 |
|
|
cat_t *cats = g->categories; |
1260 |
|
|
int c; |
1261 |
|
|
int c2; |
1262 |
|
|
cat_t cat; |
1263 |
|
|
|
1264 |
|
|
/* avoid making error situations worse */ |
1265 |
|
|
if (p->error != 0) |
1266 |
|
|
return; |
1267 |
|
|
|
1268 |
|
|
for (c = CHAR_MIN; c <= CHAR_MAX; c++) |
1269 |
|
|
if (cats[c] == 0 && isinsets(g, c)) { |
1270 |
|
|
cat = g->ncategories++; |
1271 |
|
|
cats[c] = cat; |
1272 |
|
|
for (c2 = c+1; c2 <= CHAR_MAX; c2++) |
1273 |
|
|
if (cats[c2] == 0 && samesets(g, c, c2)) |
1274 |
|
|
cats[c2] = cat; |
1275 |
|
|
} |
1276 |
|
|
} |
1277 |
|
|
|
1278 |
|
|
/* |
1279 |
|
|
- dupl - emit a duplicate of a bunch of sops |
1280 |
|
|
*/ |
1281 |
|
|
static sopno /* start of duplicate */ |
1282 |
|
|
dupl(struct parse *p, |
1283 |
|
|
sopno start, /* from here */ |
1284 |
|
|
sopno finish) /* to this less one */ |
1285 |
|
|
{ |
1286 |
|
|
sopno ret = HERE(); |
1287 |
|
|
sopno len = finish - start; |
1288 |
|
|
|
1289 |
|
|
assert(finish >= start); |
1290 |
|
|
if (len == 0) |
1291 |
|
|
return(ret); |
1292 |
|
|
if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ |
1293 |
|
|
return(ret); |
1294 |
|
|
(void) memcpy(p->strip + p->slen, p->strip + start, len * sizeof(sop)); |
1295 |
|
|
p->slen += len; |
1296 |
|
|
return(ret); |
1297 |
|
|
} |
1298 |
|
|
|
1299 |
|
|
/* |
1300 |
|
|
- doemit - emit a strip operator |
1301 |
|
|
* |
1302 |
|
|
* It might seem better to implement this as a macro with a function as |
1303 |
|
|
* hard-case backup, but it's just too big and messy unless there are |
1304 |
|
|
* some changes to the data structures. Maybe later. |
1305 |
|
|
*/ |
1306 |
|
|
static void |
1307 |
|
|
doemit(struct parse *p, sop op, size_t opnd) |
1308 |
|
|
{ |
1309 |
|
|
/* avoid making error situations worse */ |
1310 |
|
|
if (p->error != 0) |
1311 |
|
|
return; |
1312 |
|
|
|
1313 |
|
|
/* deal with oversize operands ("can't happen", more or less) */ |
1314 |
|
|
assert(opnd < 1<<OPSHIFT); |
1315 |
|
|
|
1316 |
|
|
/* deal with undersized strip */ |
1317 |
|
|
if (p->slen >= p->ssize) |
1318 |
|
|
if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ |
1319 |
|
|
return; |
1320 |
|
|
|
1321 |
|
|
/* finally, it's all reduced to the easy case */ |
1322 |
|
|
p->strip[p->slen++] = SOP(op, opnd); |
1323 |
|
|
} |
1324 |
|
|
|
1325 |
|
|
/* |
1326 |
|
|
- doinsert - insert a sop into the strip |
1327 |
|
|
*/ |
1328 |
|
|
static void |
1329 |
|
|
doinsert(struct parse *p, sop op, size_t opnd, sopno pos) |
1330 |
|
|
{ |
1331 |
|
|
sopno sn; |
1332 |
|
|
sop s; |
1333 |
|
|
int i; |
1334 |
|
|
|
1335 |
|
|
/* avoid making error situations worse */ |
1336 |
|
|
if (p->error != 0) |
1337 |
|
|
return; |
1338 |
|
|
|
1339 |
|
|
sn = HERE(); |
1340 |
|
|
EMIT(op, opnd); /* do checks, ensure space */ |
1341 |
|
|
assert(HERE() == sn+1); |
1342 |
|
|
s = p->strip[sn]; |
1343 |
|
|
|
1344 |
|
|
/* adjust paren pointers */ |
1345 |
|
|
assert(pos > 0); |
1346 |
|
|
for (i = 1; i < NPAREN; i++) { |
1347 |
|
|
if (p->pbegin[i] >= pos) { |
1348 |
|
|
p->pbegin[i]++; |
1349 |
|
|
} |
1350 |
|
|
if (p->pend[i] >= pos) { |
1351 |
|
|
p->pend[i]++; |
1352 |
|
|
} |
1353 |
|
|
} |
1354 |
|
|
|
1355 |
|
|
memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], |
1356 |
|
|
(HERE()-pos-1)*sizeof(sop)); |
1357 |
|
|
p->strip[pos] = s; |
1358 |
|
|
} |
1359 |
|
|
|
1360 |
|
|
/* |
1361 |
|
|
- dofwd - complete a forward reference |
1362 |
|
|
*/ |
1363 |
|
|
static void |
1364 |
|
|
dofwd(struct parse *p, sopno pos, sop value) |
1365 |
|
|
{ |
1366 |
|
|
/* avoid making error situations worse */ |
1367 |
|
|
if (p->error != 0) |
1368 |
|
|
return; |
1369 |
|
|
|
1370 |
|
|
assert(value < 1<<OPSHIFT); |
1371 |
|
|
p->strip[pos] = OP(p->strip[pos]) | value; |
1372 |
|
|
} |
1373 |
|
|
|
1374 |
|
|
/* |
1375 |
|
|
- enlarge - enlarge the strip |
1376 |
|
|
*/ |
1377 |
|
|
static int |
1378 |
|
|
enlarge(struct parse *p, sopno size) |
1379 |
|
|
{ |
1380 |
|
|
sop *sp; |
1381 |
|
|
|
1382 |
|
|
if (p->ssize >= size) |
1383 |
|
|
return 1; |
1384 |
|
|
|
1385 |
|
|
sp = reallocarray(p->strip, size, sizeof(sop)); |
1386 |
|
|
if (sp == NULL) { |
1387 |
|
|
SETERROR(REG_ESPACE); |
1388 |
|
|
return 0; |
1389 |
|
|
} |
1390 |
|
|
p->strip = sp; |
1391 |
|
|
p->ssize = size; |
1392 |
|
|
return 1; |
1393 |
|
|
} |
1394 |
|
|
|
1395 |
|
|
/* |
1396 |
|
|
- stripsnug - compact the strip |
1397 |
|
|
*/ |
1398 |
|
|
static void |
1399 |
|
|
stripsnug(struct parse *p, struct re_guts *g) |
1400 |
|
|
{ |
1401 |
|
|
g->nstates = p->slen; |
1402 |
|
|
g->strip = reallocarray(p->strip, p->slen, sizeof(sop)); |
1403 |
|
|
if (g->strip == NULL) { |
1404 |
|
|
SETERROR(REG_ESPACE); |
1405 |
|
|
g->strip = p->strip; |
1406 |
|
|
} |
1407 |
|
|
} |
1408 |
|
|
|
1409 |
|
|
/* |
1410 |
|
|
- findmust - fill in must and mlen with longest mandatory literal string |
1411 |
|
|
* |
1412 |
|
|
* This algorithm could do fancy things like analyzing the operands of | |
1413 |
|
|
* for common subsequences. Someday. This code is simple and finds most |
1414 |
|
|
* of the interesting cases. |
1415 |
|
|
* |
1416 |
|
|
* Note that must and mlen got initialized during setup. |
1417 |
|
|
*/ |
1418 |
|
|
static void |
1419 |
|
|
findmust(struct parse *p, struct re_guts *g) |
1420 |
|
|
{ |
1421 |
|
|
sop *scan; |
1422 |
|
|
sop *start; /* start initialized in the default case, after that */ |
1423 |
|
|
sop *newstart; /* newstart was initialized in the OCHAR case */ |
1424 |
|
|
sopno newlen; |
1425 |
|
|
sop s; |
1426 |
|
|
char *cp; |
1427 |
|
|
sopno i; |
1428 |
|
|
|
1429 |
|
|
/* avoid making error situations worse */ |
1430 |
|
|
if (p->error != 0) |
1431 |
|
|
return; |
1432 |
|
|
|
1433 |
|
|
/* find the longest OCHAR sequence in strip */ |
1434 |
|
|
newlen = 0; |
1435 |
|
|
scan = g->strip + 1; |
1436 |
|
|
do { |
1437 |
|
|
s = *scan++; |
1438 |
|
|
switch (OP(s)) { |
1439 |
|
|
case OCHAR: /* sequence member */ |
1440 |
|
|
if (newlen == 0) /* new sequence */ |
1441 |
|
|
newstart = scan - 1; |
1442 |
|
|
newlen++; |
1443 |
|
|
break; |
1444 |
|
|
case OPLUS_: /* things that don't break one */ |
1445 |
|
|
case OLPAREN: |
1446 |
|
|
case ORPAREN: |
1447 |
|
|
break; |
1448 |
|
|
case OQUEST_: /* things that must be skipped */ |
1449 |
|
|
case OCH_: |
1450 |
|
|
scan--; |
1451 |
|
|
do { |
1452 |
|
|
scan += OPND(s); |
1453 |
|
|
s = *scan; |
1454 |
|
|
/* assert() interferes w debug printouts */ |
1455 |
|
|
if (OP(s) != O_QUEST && OP(s) != O_CH && |
1456 |
|
|
OP(s) != OOR2) { |
1457 |
|
|
g->iflags |= BAD; |
1458 |
|
|
return; |
1459 |
|
|
} |
1460 |
|
|
} while (OP(s) != O_QUEST && OP(s) != O_CH); |
1461 |
|
|
/* fallthrough */ |
1462 |
|
|
default: /* things that break a sequence */ |
1463 |
|
|
if (newlen > g->mlen) { /* ends one */ |
1464 |
|
|
start = newstart; |
1465 |
|
|
g->mlen = newlen; |
1466 |
|
|
} |
1467 |
|
|
newlen = 0; |
1468 |
|
|
break; |
1469 |
|
|
} |
1470 |
|
|
} while (OP(s) != OEND); |
1471 |
|
|
|
1472 |
|
|
if (g->mlen == 0) /* there isn't one */ |
1473 |
|
|
return; |
1474 |
|
|
|
1475 |
|
|
/* turn it into a character string */ |
1476 |
|
|
g->must = malloc((size_t)g->mlen + 1); |
1477 |
|
|
if (g->must == NULL) { /* argh; just forget it */ |
1478 |
|
|
g->mlen = 0; |
1479 |
|
|
return; |
1480 |
|
|
} |
1481 |
|
|
cp = g->must; |
1482 |
|
|
scan = start; |
1483 |
|
|
for (i = g->mlen; i > 0; i--) { |
1484 |
|
|
while (OP(s = *scan++) != OCHAR) |
1485 |
|
|
continue; |
1486 |
|
|
assert(cp < g->must + g->mlen); |
1487 |
|
|
*cp++ = (char)OPND(s); |
1488 |
|
|
} |
1489 |
|
|
assert(cp == g->must + g->mlen); |
1490 |
|
|
*cp++ = '\0'; /* just on general principles */ |
1491 |
|
|
} |
1492 |
|
|
|
1493 |
|
|
/* |
1494 |
|
|
- pluscount - count + nesting |
1495 |
|
|
*/ |
1496 |
|
|
static sopno /* nesting depth */ |
1497 |
|
|
pluscount(struct parse *p, struct re_guts *g) |
1498 |
|
|
{ |
1499 |
|
|
sop *scan; |
1500 |
|
|
sop s; |
1501 |
|
|
sopno plusnest = 0; |
1502 |
|
|
sopno maxnest = 0; |
1503 |
|
|
|
1504 |
|
|
if (p->error != 0) |
1505 |
|
|
return(0); /* there may not be an OEND */ |
1506 |
|
|
|
1507 |
|
|
scan = g->strip + 1; |
1508 |
|
|
do { |
1509 |
|
|
s = *scan++; |
1510 |
|
|
switch (OP(s)) { |
1511 |
|
|
case OPLUS_: |
1512 |
|
|
plusnest++; |
1513 |
|
|
break; |
1514 |
|
|
case O_PLUS: |
1515 |
|
|
if (plusnest > maxnest) |
1516 |
|
|
maxnest = plusnest; |
1517 |
|
|
plusnest--; |
1518 |
|
|
break; |
1519 |
|
|
} |
1520 |
|
|
} while (OP(s) != OEND); |
1521 |
|
|
if (plusnest != 0) |
1522 |
|
|
g->iflags |= BAD; |
1523 |
|
|
return(maxnest); |
1524 |
|
|
} |