1 |
|
|
/* $OpenBSD: tokenizer.c,v 1.21 2016/04/11 21:17:29 schwarze Exp $ */ |
2 |
|
|
/* $NetBSD: tokenizer.c,v 1.28 2016/04/11 18:56:31 christos Exp $ */ |
3 |
|
|
|
4 |
|
|
/*- |
5 |
|
|
* Copyright (c) 1992, 1993 |
6 |
|
|
* The Regents of the University of California. All rights reserved. |
7 |
|
|
* |
8 |
|
|
* This code is derived from software contributed to Berkeley by |
9 |
|
|
* Christos Zoulas of Cornell University. |
10 |
|
|
* |
11 |
|
|
* Redistribution and use in source and binary forms, with or without |
12 |
|
|
* modification, are permitted provided that the following conditions |
13 |
|
|
* are met: |
14 |
|
|
* 1. Redistributions of source code must retain the above copyright |
15 |
|
|
* notice, this list of conditions and the following disclaimer. |
16 |
|
|
* 2. Redistributions in binary form must reproduce the above copyright |
17 |
|
|
* notice, this list of conditions and the following disclaimer in the |
18 |
|
|
* documentation and/or other materials provided with the distribution. |
19 |
|
|
* 3. Neither the name of the University nor the names of its contributors |
20 |
|
|
* may be used to endorse or promote products derived from this software |
21 |
|
|
* without specific prior written permission. |
22 |
|
|
* |
23 |
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
24 |
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
25 |
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
26 |
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
27 |
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
28 |
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
29 |
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
30 |
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
31 |
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
32 |
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
33 |
|
|
* SUCH DAMAGE. |
34 |
|
|
*/ |
35 |
|
|
|
36 |
|
|
#include "config.h" |
37 |
|
|
|
38 |
|
|
/* We build this file twice, once as NARROW, once as WIDE. */ |
39 |
|
|
/*
 * tokenizer.c: Bourne shell like tokenizer
 */
42 |
|
|
#include <stdlib.h> |
43 |
|
|
#include <string.h> |
44 |
|
|
|
45 |
|
|
#include "histedit.h" |
46 |
|
|
|
47 |
|
|
/*
 * Quoting state of the tokenizer's character-by-character state machine.
 */
typedef enum {
	Q_none,		/* not inside any quoting construct */
	Q_single,	/* inside '...' */
	Q_double,	/* inside "..." */
	Q_one,		/* backslash seen while in Q_none */
	Q_doubleone	/* backslash seen while in Q_double */
} quote_t;
50 |
|
|
|
51 |
|
|
/* Bits kept in the tokenizer's flags field. */
#define	TOK_KEEP	1	/* emit the current word even if it is empty */
#define	TOK_EAT		2	/* an escaped newline was just swallowed */

#define	WINCR		20	/* word buffer size/growth step, in characters */
#define	AINCR		10	/* argument vector size/growth step, in entries */

/* Field separators used when FUN(tok,init)() is given a NULL ifs. */
#define	IFS		STR("\t \n")
58 |
|
|
|
59 |
|
|
/*
 * Character-width abstraction.  With NARROWCHAR defined everything maps
 * onto char and the str*() functions; otherwise onto wchar_t and the
 * wcs*() functions.
 *
 *	Char		character type of the build
 *	FUN(p, r)	function name: p_r (narrow) or p_wr (wide)
 *	TYPE(t)		type name: t (narrow) or tW (wide)
 *	STR(x)		string literal of the matching width
 *	Strchr		strchr()/wcschr()
 *	tok_strdup	strdup()/wcsdup()
 */
#ifdef NARROWCHAR
#define	Char			char
#define	FUN(prefix, rest)	prefix ## _ ## rest
#define	TYPE(type)		type
#define	STR(x)			x
#define	Strchr(s, c)		strchr(s, c)
#define	tok_strdup(s)		strdup(s)
#else
#define	Char			wchar_t
#define	FUN(prefix, rest)	prefix ## _w ## rest
#define	TYPE(type)		type ## W
#define	STR(x)			L ## x
#define	Strchr(s, c)		wcschr(s, c)
#define	tok_strdup(s)		wcsdup(s)
#endif
74 |
|
|
|
75 |
|
|
/*
 * Tokenizer state.  All words live back to back in one growable buffer
 * (wspace .. wmax); argv[] entries point into that buffer.
 */
struct TYPE(tokenizer) {
	Char	 *ifs;		/* Input field separator characters */
	int	  argc;		/* Current number of args */
	int	  amax;		/* Allocated number of argv slots */
	Char	**argv;		/* Argument list (NULL-terminated by finish) */
	Char	 *wptr;		/* Next free position in the word buffer */
	Char	 *wmax;		/* Limit (one past the end) of the word buffer */
	Char	 *wstart;	/* Beginning of the word being built */
	Char	 *wspace;	/* Base of the word buffer */
	quote_t	  quote;	/* Quoting state */
	int	  flags;	/* TOK_KEEP / TOK_EAT bits */
};
85 |
|
|
|
86 |
|
|
|
87 |
|
|
static void FUN(tok,finish)(TYPE(Tokenizer) *tok);


/* FUN(tok,finish)():
 *	Terminate the word under construction and, unless it is empty and
 *	TOK_KEEP is not set, append it to argv.  TOK_KEEP is always cleared.
 *	The caller must have left room for one more argv entry plus the
 *	NULL terminator, and for the terminating character in the word
 *	buffer.
 */
static void
FUN(tok,finish)(TYPE(Tokenizer) *tok)
{
	const int keep = (tok->flags & TOK_KEEP) != 0;
	const int empty = (tok->wptr == tok->wstart);

	*tok->wptr = '\0';
	tok->flags &= ~TOK_KEEP;

	/* An empty word only counts when quoting explicitly produced it. */
	if (!keep && empty)
		return;

	tok->argv[tok->argc] = tok->wstart;
	tok->argc += 1;
	tok->argv[tok->argc] = NULL;

	/* The next word begins right after the terminator just written. */
	tok->wptr += 1;
	tok->wstart = tok->wptr;
}
105 |
|
|
|
106 |
|
|
|
107 |
|
|
/* FUN(tok,init)(): |
108 |
|
|
* Initialize the tokenizer |
109 |
|
|
*/ |
110 |
|
|
TYPE(Tokenizer) * |
111 |
|
|
FUN(tok,init)(const Char *ifs) |
112 |
|
|
{ |
113 |
|
|
TYPE(Tokenizer) *tok = malloc(sizeof(TYPE(Tokenizer))); |
114 |
|
|
|
115 |
|
|
if (tok == NULL) |
116 |
|
|
return NULL; |
117 |
|
|
tok->ifs = tok_strdup(ifs ? ifs : IFS); |
118 |
|
|
if (tok->ifs == NULL) { |
119 |
|
|
free(tok); |
120 |
|
|
return NULL; |
121 |
|
|
} |
122 |
|
|
tok->argc = 0; |
123 |
|
|
tok->amax = AINCR; |
124 |
|
|
tok->argv = reallocarray(NULL, tok->amax, sizeof(*tok->argv)); |
125 |
|
|
if (tok->argv == NULL) { |
126 |
|
|
free(tok->ifs); |
127 |
|
|
free(tok); |
128 |
|
|
return NULL; |
129 |
|
|
} |
130 |
|
|
tok->argv[0] = NULL; |
131 |
|
|
tok->wspace = reallocarray(NULL, WINCR, sizeof(*tok->wspace)); |
132 |
|
|
if (tok->wspace == NULL) { |
133 |
|
|
free(tok->argv); |
134 |
|
|
free(tok->ifs); |
135 |
|
|
free(tok); |
136 |
|
|
return NULL; |
137 |
|
|
} |
138 |
|
|
tok->wmax = tok->wspace + WINCR; |
139 |
|
|
tok->wstart = tok->wspace; |
140 |
|
|
tok->wptr = tok->wspace; |
141 |
|
|
tok->flags = 0; |
142 |
|
|
tok->quote = Q_none; |
143 |
|
|
|
144 |
|
|
return tok; |
145 |
|
|
} |
146 |
|
|
|
147 |
|
|
|
148 |
|
|
/* FUN(tok,reset)(): |
149 |
|
|
* Reset the tokenizer |
150 |
|
|
*/ |
151 |
|
|
void |
152 |
|
|
FUN(tok,reset)(TYPE(Tokenizer) *tok) |
153 |
|
|
{ |
154 |
|
|
|
155 |
|
|
tok->argc = 0; |
156 |
|
|
tok->wstart = tok->wspace; |
157 |
|
|
tok->wptr = tok->wspace; |
158 |
|
|
tok->flags = 0; |
159 |
|
|
tok->quote = Q_none; |
160 |
|
|
} |
161 |
|
|
|
162 |
|
|
|
163 |
|
|
/* FUN(tok,end)(): |
164 |
|
|
* Clean up |
165 |
|
|
*/ |
166 |
|
|
void |
167 |
|
|
FUN(tok,end)(TYPE(Tokenizer) *tok) |
168 |
|
|
{ |
169 |
|
|
|
170 |
|
|
free(tok->ifs); |
171 |
|
|
free(tok->wspace); |
172 |
|
|
free(tok->argv); |
173 |
|
|
free(tok); |
174 |
|
|
} |
175 |
|
|
|
176 |
|
|
|
177 |
|
|
|
178 |
|
|
/* FUN(tok,line)(): |
179 |
|
|
* Bourne shell (sh(1)) like tokenizing |
180 |
|
|
* Arguments: |
181 |
|
|
* tok current tokenizer state (setup with FUN(tok,init)()) |
182 |
|
|
* line line to parse |
183 |
|
|
* Returns: |
184 |
|
|
* -1 Internal error |
185 |
|
|
* 3 Quoted return |
186 |
|
|
* 2 Unmatched double quote |
187 |
|
|
* 1 Unmatched single quote |
188 |
|
|
* 0 Ok |
189 |
|
|
* Modifies (if return value is 0): |
190 |
|
|
* argc number of arguments |
191 |
|
|
* argv argument array |
192 |
|
|
* cursorc if !NULL, argv element containing cursor |
193 |
|
|
* cursorv if !NULL, offset in argv[cursorc] of cursor |
194 |
|
|
*/ |
195 |
|
|
int |
196 |
|
|
FUN(tok,line)(TYPE(Tokenizer) *tok, const TYPE(LineInfo) *line, |
197 |
|
|
int *argc, const Char ***argv, int *cursorc, int *cursoro) |
198 |
|
|
{ |
199 |
|
|
const Char *ptr; |
200 |
|
|
int cc, co; |
201 |
|
|
|
202 |
|
|
cc = co = -1; |
203 |
|
|
ptr = line->buffer; |
204 |
|
|
for (ptr = line->buffer; ;ptr++) { |
205 |
|
|
if (ptr >= line->lastchar) |
206 |
|
|
ptr = STR(""); |
207 |
|
|
if (ptr == line->cursor) { |
208 |
|
|
cc = tok->argc; |
209 |
|
|
co = (int)(tok->wptr - tok->wstart); |
210 |
|
|
} |
211 |
|
|
switch (*ptr) { |
212 |
|
|
case '\'': |
213 |
|
|
tok->flags |= TOK_KEEP; |
214 |
|
|
tok->flags &= ~TOK_EAT; |
215 |
|
|
switch (tok->quote) { |
216 |
|
|
case Q_none: |
217 |
|
|
tok->quote = Q_single; /* Enter single quote |
218 |
|
|
* mode */ |
219 |
|
|
break; |
220 |
|
|
|
221 |
|
|
case Q_single: /* Exit single quote mode */ |
222 |
|
|
tok->quote = Q_none; |
223 |
|
|
break; |
224 |
|
|
|
225 |
|
|
case Q_one: /* Quote this ' */ |
226 |
|
|
tok->quote = Q_none; |
227 |
|
|
*tok->wptr++ = *ptr; |
228 |
|
|
break; |
229 |
|
|
|
230 |
|
|
case Q_double: /* Stay in double quote mode */ |
231 |
|
|
*tok->wptr++ = *ptr; |
232 |
|
|
break; |
233 |
|
|
|
234 |
|
|
case Q_doubleone: /* Quote this ' */ |
235 |
|
|
tok->quote = Q_double; |
236 |
|
|
*tok->wptr++ = *ptr; |
237 |
|
|
break; |
238 |
|
|
|
239 |
|
|
default: |
240 |
|
|
return -1; |
241 |
|
|
} |
242 |
|
|
break; |
243 |
|
|
|
244 |
|
|
case '"': |
245 |
|
|
tok->flags &= ~TOK_EAT; |
246 |
|
|
tok->flags |= TOK_KEEP; |
247 |
|
|
switch (tok->quote) { |
248 |
|
|
case Q_none: /* Enter double quote mode */ |
249 |
|
|
tok->quote = Q_double; |
250 |
|
|
break; |
251 |
|
|
|
252 |
|
|
case Q_double: /* Exit double quote mode */ |
253 |
|
|
tok->quote = Q_none; |
254 |
|
|
break; |
255 |
|
|
|
256 |
|
|
case Q_one: /* Quote this " */ |
257 |
|
|
tok->quote = Q_none; |
258 |
|
|
*tok->wptr++ = *ptr; |
259 |
|
|
break; |
260 |
|
|
|
261 |
|
|
case Q_single: /* Stay in single quote mode */ |
262 |
|
|
*tok->wptr++ = *ptr; |
263 |
|
|
break; |
264 |
|
|
|
265 |
|
|
case Q_doubleone: /* Quote this " */ |
266 |
|
|
tok->quote = Q_double; |
267 |
|
|
*tok->wptr++ = *ptr; |
268 |
|
|
break; |
269 |
|
|
|
270 |
|
|
default: |
271 |
|
|
return -1; |
272 |
|
|
} |
273 |
|
|
break; |
274 |
|
|
|
275 |
|
|
case '\\': |
276 |
|
|
tok->flags |= TOK_KEEP; |
277 |
|
|
tok->flags &= ~TOK_EAT; |
278 |
|
|
switch (tok->quote) { |
279 |
|
|
case Q_none: /* Quote next character */ |
280 |
|
|
tok->quote = Q_one; |
281 |
|
|
break; |
282 |
|
|
|
283 |
|
|
case Q_double: /* Quote next character */ |
284 |
|
|
tok->quote = Q_doubleone; |
285 |
|
|
break; |
286 |
|
|
|
287 |
|
|
case Q_one: /* Quote this, restore state */ |
288 |
|
|
*tok->wptr++ = *ptr; |
289 |
|
|
tok->quote = Q_none; |
290 |
|
|
break; |
291 |
|
|
|
292 |
|
|
case Q_single: /* Stay in single quote mode */ |
293 |
|
|
*tok->wptr++ = *ptr; |
294 |
|
|
break; |
295 |
|
|
|
296 |
|
|
case Q_doubleone: /* Quote this \ */ |
297 |
|
|
tok->quote = Q_double; |
298 |
|
|
*tok->wptr++ = *ptr; |
299 |
|
|
break; |
300 |
|
|
|
301 |
|
|
default: |
302 |
|
|
return -1; |
303 |
|
|
} |
304 |
|
|
break; |
305 |
|
|
|
306 |
|
|
case '\n': |
307 |
|
|
tok->flags &= ~TOK_EAT; |
308 |
|
|
switch (tok->quote) { |
309 |
|
|
case Q_none: |
310 |
|
|
goto tok_line_outok; |
311 |
|
|
|
312 |
|
|
case Q_single: |
313 |
|
|
case Q_double: |
314 |
|
|
*tok->wptr++ = *ptr; /* Add the return */ |
315 |
|
|
break; |
316 |
|
|
|
317 |
|
|
case Q_doubleone: /* Back to double, eat the '\n' */ |
318 |
|
|
tok->flags |= TOK_EAT; |
319 |
|
|
tok->quote = Q_double; |
320 |
|
|
break; |
321 |
|
|
|
322 |
|
|
case Q_one: /* No quote, more eat the '\n' */ |
323 |
|
|
tok->flags |= TOK_EAT; |
324 |
|
|
tok->quote = Q_none; |
325 |
|
|
break; |
326 |
|
|
|
327 |
|
|
default: |
328 |
|
|
return 0; |
329 |
|
|
} |
330 |
|
|
break; |
331 |
|
|
|
332 |
|
|
case '\0': |
333 |
|
|
switch (tok->quote) { |
334 |
|
|
case Q_none: |
335 |
|
|
/* Finish word and return */ |
336 |
|
|
if (tok->flags & TOK_EAT) { |
337 |
|
|
tok->flags &= ~TOK_EAT; |
338 |
|
|
return 3; |
339 |
|
|
} |
340 |
|
|
goto tok_line_outok; |
341 |
|
|
|
342 |
|
|
case Q_single: |
343 |
|
|
return 1; |
344 |
|
|
|
345 |
|
|
case Q_double: |
346 |
|
|
return 2; |
347 |
|
|
|
348 |
|
|
case Q_doubleone: |
349 |
|
|
tok->quote = Q_double; |
350 |
|
|
*tok->wptr++ = *ptr; |
351 |
|
|
break; |
352 |
|
|
|
353 |
|
|
case Q_one: |
354 |
|
|
tok->quote = Q_none; |
355 |
|
|
*tok->wptr++ = *ptr; |
356 |
|
|
break; |
357 |
|
|
|
358 |
|
|
default: |
359 |
|
|
return -1; |
360 |
|
|
} |
361 |
|
|
break; |
362 |
|
|
|
363 |
|
|
default: |
364 |
|
|
tok->flags &= ~TOK_EAT; |
365 |
|
|
switch (tok->quote) { |
366 |
|
|
case Q_none: |
367 |
|
|
if (Strchr(tok->ifs, *ptr) != NULL) |
368 |
|
|
FUN(tok,finish)(tok); |
369 |
|
|
else |
370 |
|
|
*tok->wptr++ = *ptr; |
371 |
|
|
break; |
372 |
|
|
|
373 |
|
|
case Q_single: |
374 |
|
|
case Q_double: |
375 |
|
|
*tok->wptr++ = *ptr; |
376 |
|
|
break; |
377 |
|
|
|
378 |
|
|
|
379 |
|
|
case Q_doubleone: |
380 |
|
|
*tok->wptr++ = '\\'; |
381 |
|
|
tok->quote = Q_double; |
382 |
|
|
*tok->wptr++ = *ptr; |
383 |
|
|
break; |
384 |
|
|
|
385 |
|
|
case Q_one: |
386 |
|
|
tok->quote = Q_none; |
387 |
|
|
*tok->wptr++ = *ptr; |
388 |
|
|
break; |
389 |
|
|
|
390 |
|
|
default: |
391 |
|
|
return -1; |
392 |
|
|
|
393 |
|
|
} |
394 |
|
|
break; |
395 |
|
|
} |
396 |
|
|
|
397 |
|
|
if (tok->wptr >= tok->wmax - 4) { |
398 |
|
|
size_t size = tok->wmax - tok->wspace + WINCR; |
399 |
|
|
Char *s = reallocarray(tok->wspace, size, sizeof(*s)); |
400 |
|
|
if (s == NULL) |
401 |
|
|
return -1; |
402 |
|
|
|
403 |
|
|
if (s != tok->wspace) { |
404 |
|
|
int i; |
405 |
|
|
for (i = 0; i < tok->argc; i++) { |
406 |
|
|
tok->argv[i] = |
407 |
|
|
(tok->argv[i] - tok->wspace) + s; |
408 |
|
|
} |
409 |
|
|
tok->wptr = (tok->wptr - tok->wspace) + s; |
410 |
|
|
tok->wstart = (tok->wstart - tok->wspace) + s; |
411 |
|
|
tok->wspace = s; |
412 |
|
|
} |
413 |
|
|
tok->wmax = s + size; |
414 |
|
|
} |
415 |
|
|
if (tok->argc >= tok->amax - 4) { |
416 |
|
|
Char **p; |
417 |
|
|
tok->amax += AINCR; |
418 |
|
|
p = reallocarray(tok->argv, tok->amax, sizeof(*p)); |
419 |
|
|
if (p == NULL) { |
420 |
|
|
tok->amax -= AINCR; |
421 |
|
|
return -1; |
422 |
|
|
} |
423 |
|
|
tok->argv = p; |
424 |
|
|
} |
425 |
|
|
} |
426 |
|
|
tok_line_outok: |
427 |
|
|
if (cc == -1 && co == -1) { |
428 |
|
|
cc = tok->argc; |
429 |
|
|
co = (int)(tok->wptr - tok->wstart); |
430 |
|
|
} |
431 |
|
|
if (cursorc != NULL) |
432 |
|
|
*cursorc = cc; |
433 |
|
|
if (cursoro != NULL) |
434 |
|
|
*cursoro = co; |
435 |
|
|
FUN(tok,finish)(tok); |
436 |
|
|
*argv = (const Char **)tok->argv; |
437 |
|
|
*argc = tok->argc; |
438 |
|
|
return 0; |
439 |
|
|
} |
440 |
|
|
|
441 |
|
|
/* FUN(tok,str)(): |
442 |
|
|
* Simpler version of tok_line, taking a NUL terminated line |
443 |
|
|
* and splitting into words, ignoring cursor state. |
444 |
|
|
*/ |
445 |
|
|
int |
446 |
|
|
FUN(tok,str)(TYPE(Tokenizer) *tok, const Char *line, int *argc, |
447 |
|
|
const Char ***argv) |
448 |
|
|
{ |
449 |
|
|
TYPE(LineInfo) li; |
450 |
|
|
|
451 |
|
|
memset(&li, 0, sizeof(li)); |
452 |
|
|
li.buffer = line; |
453 |
|
|
li.cursor = li.lastchar = Strchr(line, '\0'); |
454 |
|
|
return FUN(tok,line)(tok, &li, argc, argv, NULL, NULL); |
455 |
|
|
} |