1 |
|
|
/* $OpenBSD: ccl.c,v 1.8 2015/11/19 22:55:13 tedu Exp $ */ |
2 |
|
|
|
3 |
|
|
/* ccl - routines for character classes */ |
4 |
|
|
|
5 |
|
|
/* Copyright (c) 1990 The Regents of the University of California. */ |
6 |
|
|
/* All rights reserved. */ |
7 |
|
|
|
8 |
|
|
/* This code is derived from software contributed to Berkeley by */ |
9 |
|
|
/* Vern Paxson. */ |
10 |
|
|
|
11 |
|
|
/* The United States Government has rights in this work pursuant */ |
12 |
|
|
/* to contract no. DE-AC03-76SF00098 between the United States */ |
13 |
|
|
/* Department of Energy and the University of California. */ |
14 |
|
|
|
15 |
|
|
/* This file is part of flex. */ |
16 |
|
|
|
17 |
|
|
/* Redistribution and use in source and binary forms, with or without */ |
18 |
|
|
/* modification, are permitted provided that the following conditions */ |
19 |
|
|
/* are met: */ |
20 |
|
|
|
21 |
|
|
/* 1. Redistributions of source code must retain the above copyright */ |
22 |
|
|
/* notice, this list of conditions and the following disclaimer. */ |
23 |
|
|
/* 2. Redistributions in binary form must reproduce the above copyright */ |
24 |
|
|
/* notice, this list of conditions and the following disclaimer in the */ |
25 |
|
|
/* documentation and/or other materials provided with the distribution. */ |
26 |
|
|
|
27 |
|
|
/* Neither the name of the University nor the names of its contributors */ |
28 |
|
|
/* may be used to endorse or promote products derived from this software */ |
29 |
|
|
/* without specific prior written permission. */ |
30 |
|
|
|
31 |
|
|
/* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */ |
32 |
|
|
/* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */ |
33 |
|
|
/* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */ |
34 |
|
|
/* PURPOSE. */ |
35 |
|
|
|
36 |
|
|
#include "flexdef.h" |
37 |
|
|
|
38 |
|
|
/* return true if the chr is in the ccl. Takes negation into account. */ |
39 |
|
|
static bool |
40 |
|
|
ccl_contains(const int cclp, const int ch) |
41 |
|
|
{ |
42 |
|
|
int ind, len, i; |
43 |
|
|
|
44 |
|
|
len = ccllen[cclp]; |
45 |
|
|
ind = cclmap[cclp]; |
46 |
|
|
|
47 |
|
|
for (i = 0; i < len; ++i) |
48 |
|
|
if (ccltbl[ind + i] == ch) |
49 |
|
|
return !cclng[cclp]; |
50 |
|
|
|
51 |
|
|
return cclng[cclp]; |
52 |
|
|
} |
53 |
|
|
|
54 |
|
|
|
55 |
|
|
/* ccladd - add a single character to a ccl */ |
56 |
|
|
|
57 |
|
|
void |
58 |
|
|
ccladd(cclp, ch) |
59 |
|
|
int cclp; |
60 |
|
|
int ch; |
61 |
|
|
{ |
62 |
|
|
int ind, len, newpos, i; |
63 |
|
|
|
64 |
|
106024 |
check_char(ch); |
65 |
|
|
|
66 |
|
53012 |
len = ccllen[cclp]; |
67 |
|
53012 |
ind = cclmap[cclp]; |
68 |
|
|
|
69 |
|
|
/* check to see if the character is already in the ccl */ |
70 |
|
|
|
71 |
✓✓ |
1238080 |
for (i = 0; i < len; ++i) |
72 |
✓✓ |
566288 |
if (ccltbl[ind + i] == ch) |
73 |
|
260 |
return; |
74 |
|
|
|
75 |
|
|
/* mark newlines */ |
76 |
✓✓ |
52752 |
if (ch == nlch) |
77 |
|
39 |
ccl_has_nl[cclp] = true; |
78 |
|
|
|
79 |
|
52752 |
newpos = ind + len; |
80 |
|
|
|
81 |
✓✓ |
52752 |
if (newpos >= current_max_ccl_tbl_size) { |
82 |
|
206 |
current_max_ccl_tbl_size += MAX_CCL_TBL_SIZE_INCREMENT; |
83 |
|
|
|
84 |
|
206 |
++num_reallocs; |
85 |
|
|
|
86 |
|
206 |
ccltbl = reallocate_Character_array(ccltbl, |
87 |
|
|
current_max_ccl_tbl_size); |
88 |
|
206 |
} |
89 |
|
52752 |
ccllen[cclp] = len + 1; |
90 |
|
52752 |
ccltbl[newpos] = ch; |
91 |
|
105764 |
} |
92 |
|
|
|
93 |
|
|
/* dump_cclp - same thing as list_character_set, but for cclps. */ |
94 |
|
|
|
95 |
|
|
static void |
96 |
|
|
dump_cclp(FILE * file, int cclp) |
97 |
|
|
{ |
98 |
|
|
int i; |
99 |
|
|
|
100 |
|
|
putc('[', file); |
101 |
|
|
|
102 |
|
|
for (i = 0; i < csize; ++i) { |
103 |
|
|
if (ccl_contains(cclp, i)) { |
104 |
|
|
int start_char = i; |
105 |
|
|
|
106 |
|
|
putc(' ', file); |
107 |
|
|
|
108 |
|
|
fputs(readable_form(i), file); |
109 |
|
|
|
110 |
|
|
while (++i < csize && ccl_contains(cclp, i)); |
111 |
|
|
|
112 |
|
|
if (i - 1 > start_char) |
113 |
|
|
/* this was a run */ |
114 |
|
|
fprintf(file, "-%s", |
115 |
|
|
readable_form(i - 1)); |
116 |
|
|
|
117 |
|
|
putc(' ', file); |
118 |
|
|
} |
119 |
|
|
} |
120 |
|
|
|
121 |
|
|
putc(']', file); |
122 |
|
|
} |
123 |
|
|
|
124 |
|
|
|
125 |
|
|
|
126 |
|
|
/* ccl_set_diff - create a new ccl as the set difference of the two given ccls. */ |
127 |
|
|
int |
128 |
|
|
ccl_set_diff(int a, int b) |
129 |
|
|
{ |
130 |
|
|
int d, ch; |
131 |
|
|
|
132 |
|
|
/* create new class */ |
133 |
|
|
d = cclinit(); |
134 |
|
|
|
135 |
|
|
/* |
136 |
|
|
* In order to handle negation, we spin through all possible chars, |
137 |
|
|
* addding each char in a that is not in b. (This could be O(n^2), |
138 |
|
|
* but n is small and bounded.) |
139 |
|
|
*/ |
140 |
|
|
for (ch = 0; ch < csize; ++ch) |
141 |
|
|
if (ccl_contains(a, ch) && !ccl_contains(b, ch)) |
142 |
|
|
ccladd(d, ch); |
143 |
|
|
|
144 |
|
|
/* debug */ |
145 |
|
|
if (0) { |
146 |
|
|
fprintf(stderr, "ccl_set_diff ("); |
147 |
|
|
fprintf(stderr, "\n "); |
148 |
|
|
dump_cclp(stderr, a); |
149 |
|
|
fprintf(stderr, "\n "); |
150 |
|
|
dump_cclp(stderr, b); |
151 |
|
|
fprintf(stderr, "\n "); |
152 |
|
|
dump_cclp(stderr, d); |
153 |
|
|
fprintf(stderr, "\n)\n"); |
154 |
|
|
} |
155 |
|
|
return d; |
156 |
|
|
} |
157 |
|
|
|
158 |
|
|
/* ccl_set_union - create a new ccl as the set union of the two given ccls. */ |
159 |
|
|
int |
160 |
|
|
ccl_set_union(int a, int b) |
161 |
|
|
{ |
162 |
|
|
int d, i; |
163 |
|
|
|
164 |
|
|
/* create new class */ |
165 |
|
|
d = cclinit(); |
166 |
|
|
|
167 |
|
|
/* Add all of a */ |
168 |
|
|
for (i = 0; i < ccllen[a]; ++i) |
169 |
|
|
ccladd(d, ccltbl[cclmap[a] + i]); |
170 |
|
|
|
171 |
|
|
/* Add all of b */ |
172 |
|
|
for (i = 0; i < ccllen[b]; ++i) |
173 |
|
|
ccladd(d, ccltbl[cclmap[b] + i]); |
174 |
|
|
|
175 |
|
|
/* debug */ |
176 |
|
|
if (0) { |
177 |
|
|
fprintf(stderr, "ccl_set_union (%d + %d = %d", a, b, d); |
178 |
|
|
fprintf(stderr, "\n "); |
179 |
|
|
dump_cclp(stderr, a); |
180 |
|
|
fprintf(stderr, "\n "); |
181 |
|
|
dump_cclp(stderr, b); |
182 |
|
|
fprintf(stderr, "\n "); |
183 |
|
|
dump_cclp(stderr, d); |
184 |
|
|
fprintf(stderr, "\n)\n"); |
185 |
|
|
} |
186 |
|
|
return d; |
187 |
|
|
} |
188 |
|
|
|
189 |
|
|
|
190 |
|
|
/* cclinit - return an empty ccl */ |
191 |
|
|
|
192 |
|
|
int |
193 |
|
|
cclinit() |
194 |
|
|
{ |
195 |
✓✓ |
5088 |
if (++lastccl >= current_maxccls) { |
196 |
|
24 |
current_maxccls += MAX_CCLS_INCREMENT; |
197 |
|
|
|
198 |
|
24 |
++num_reallocs; |
199 |
|
|
|
200 |
|
24 |
cclmap = |
201 |
|
24 |
reallocate_integer_array(cclmap, current_maxccls); |
202 |
|
24 |
ccllen = |
203 |
|
24 |
reallocate_integer_array(ccllen, current_maxccls); |
204 |
|
24 |
cclng = reallocate_integer_array(cclng, current_maxccls); |
205 |
|
24 |
ccl_has_nl = |
206 |
|
24 |
reallocate_bool_array(ccl_has_nl, |
207 |
|
|
current_maxccls); |
208 |
|
24 |
} |
209 |
✓✓ |
2544 |
if (lastccl == 1) |
210 |
|
|
/* we're making the first ccl */ |
211 |
|
|
cclmap[lastccl] = 0; |
212 |
|
|
|
213 |
|
|
else |
214 |
|
|
/* |
215 |
|
|
* The new pointer is just past the end of the last ccl. |
216 |
|
|
* Since the cclmap points to the \first/ character of a ccl, |
217 |
|
|
* adding the length of the ccl to the cclmap pointer will |
218 |
|
|
* produce a cursor to the first free space. |
219 |
|
|
*/ |
220 |
|
|
cclmap[lastccl] = |
221 |
|
2534 |
cclmap[lastccl - 1] + ccllen[lastccl - 1]; |
222 |
|
|
|
223 |
|
2544 |
ccllen[lastccl] = 0; |
224 |
|
2544 |
cclng[lastccl] = 0; /* ccl's start out life un-negated */ |
225 |
|
2544 |
ccl_has_nl[lastccl] = false; |
226 |
|
|
|
227 |
|
2544 |
return lastccl; |
228 |
|
|
} |
229 |
|
|
|
230 |
|
|
|
231 |
|
|
/* cclnegate - negate the given ccl */ |
232 |
|
|
|
233 |
|
|
void |
234 |
|
|
cclnegate(cclp) |
235 |
|
|
int cclp; |
236 |
|
|
{ |
237 |
|
104 |
cclng[cclp] = 1; |
238 |
|
52 |
ccl_has_nl[cclp] = !ccl_has_nl[cclp]; |
239 |
|
52 |
} |
240 |
|
|
|
241 |
|
|
|
242 |
|
|
/* list_character_set - list the members of a set of characters in CCL form |
243 |
|
|
* |
244 |
|
|
* Writes to the given file a character-class representation of those |
245 |
|
|
* characters present in the given CCL. A character is present if it |
246 |
|
|
* has a non-zero value in the cset array. |
247 |
|
|
*/ |
248 |
|
|
|
249 |
|
|
void |
250 |
|
|
list_character_set(file, cset) |
251 |
|
|
FILE *file; |
252 |
|
|
int cset[]; |
253 |
|
|
{ |
254 |
|
|
int i; |
255 |
|
|
|
256 |
|
|
putc('[', file); |
257 |
|
|
|
258 |
|
|
for (i = 0; i < csize; ++i) { |
259 |
|
|
if (cset[i]) { |
260 |
|
|
int start_char = i; |
261 |
|
|
|
262 |
|
|
putc(' ', file); |
263 |
|
|
|
264 |
|
|
fputs(readable_form(i), file); |
265 |
|
|
|
266 |
|
|
while (++i < csize && cset[i]); |
267 |
|
|
|
268 |
|
|
if (i - 1 > start_char) |
269 |
|
|
/* this was a run */ |
270 |
|
|
fprintf(file, "-%s", |
271 |
|
|
readable_form(i - 1)); |
272 |
|
|
|
273 |
|
|
putc(' ', file); |
274 |
|
|
} |
275 |
|
|
} |
276 |
|
|
|
277 |
|
|
putc(']', file); |
278 |
|
|
} |
279 |
|
|
|
280 |
|
|
/** Determines if the range [c1-c2] is unambiguous in a case-insensitive |
281 |
|
|
* scanner. Specifically, if a lowercase or uppercase character, x, is in the |
282 |
|
|
* range [c1-c2], then we require that UPPERCASE(x) and LOWERCASE(x) must also |
283 |
|
|
* be in the range. If not, then this range is ambiguous, and the function |
284 |
|
|
* returns false. For example, [@-_] spans [a-z] but not [A-Z]. Beware that |
285 |
|
|
* [a-z] will be labeled ambiguous because it does not include [A-Z]. |
286 |
|
|
* |
287 |
|
|
* @param c1 the lower end of the range |
288 |
|
|
* @param c2 the upper end of the range |
289 |
|
|
* @return true if [c1-c2] is not ambiguous for a caseless scanner. |
290 |
|
|
*/ |
291 |
|
|
bool |
292 |
|
|
range_covers_case(int c1, int c2) |
293 |
|
|
{ |
294 |
|
|
int i, o; |
295 |
|
|
|
296 |
✓✓ |
318 |
for (i = c1; i <= c2; i++) { |
297 |
✗✓ |
138 |
if (has_case(i)) { |
298 |
|
|
o = reverse_case(i); |
299 |
|
|
if (o < c1 || c2 < o) |
300 |
|
|
return false; |
301 |
|
|
} |
302 |
|
|
} |
303 |
|
14 |
return true; |
304 |
|
14 |
} |
305 |
|
|
|
306 |
|
|
/** Reverse the case of a character, if possible. |
307 |
|
|
* @return c if case-reversal does not apply. |
308 |
|
|
*/ |
309 |
|
|
int |
310 |
|
|
reverse_case(int c) |
311 |
|
|
{ |
312 |
✓✓✓✗
|
4392 |
return isupper(c) ? tolower(c) : (islower(c) ? toupper(c) : c); |
313 |
|
|
} |
314 |
|
|
|
315 |
|
|
/** Return true if c is uppercase or lowercase. */ |
316 |
|
|
bool |
317 |
|
|
has_case(int c) |
318 |
|
|
{ |
319 |
✓✓ |
6086 |
return (isupper(c) || islower(c)) ? true : false; |
320 |
|
|
} |