1 |
|
|
/* $OpenBSD: csplit.c,v 1.9 2016/10/28 07:22:59 schwarze Exp $ */ |
2 |
|
|
/* $FreeBSD: src/usr.bin/csplit/csplit.c,v 1.9 2004/03/22 11:15:03 tjr Exp $ */ |
3 |
|
|
|
4 |
|
|
/*- |
5 |
|
|
* Copyright (c) 2002 Tim J. Robbins. |
6 |
|
|
* All rights reserved. |
7 |
|
|
* |
8 |
|
|
* Redistribution and use in source and binary forms, with or without |
9 |
|
|
* modification, are permitted provided that the following conditions |
10 |
|
|
* are met: |
11 |
|
|
* 1. Redistributions of source code must retain the above copyright |
12 |
|
|
* notice, this list of conditions and the following disclaimer. |
13 |
|
|
* 2. Redistributions in binary form must reproduce the above copyright |
14 |
|
|
* notice, this list of conditions and the following disclaimer in the |
15 |
|
|
* documentation and/or other materials provided with the distribution. |
16 |
|
|
* |
17 |
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
18 |
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
19 |
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
20 |
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
21 |
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
22 |
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
23 |
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
24 |
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
25 |
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
26 |
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
27 |
|
|
* SUCH DAMAGE. |
28 |
|
|
*/ |
29 |
|
|
|
30 |
|
|
/*
 * csplit -- split files based on context
 *
 * This utility splits its input into numbered output files by line number
 * or by a regular expression. A regular expression may carry an optional
 * offset, allowing the split to occur a specified number of lines before
 * or after the match.
 *
 * To handle negative offsets, we stop reading when the match occurs and
 * store the offset that the file should have been split at, then use
 * this output file as input until all the "overflowed" lines have been read.
 * The file is then closed and truncated to the correct length.
 *
 * We assume that the output files are seekable (i.e. they cannot be
 * symlinks to named pipes or character devices), but make no such
 * assumption about the input.
 */
47 |
|
|
|
48 |
|
|
#include <sys/types.h> |
49 |
|
|
|
50 |
|
|
#include <ctype.h> |
51 |
|
|
#include <err.h> |
52 |
|
|
#include <errno.h> |
53 |
|
|
#include <limits.h> |
54 |
|
|
#include <regex.h> |
55 |
|
|
#include <signal.h> |
56 |
|
|
#include <stdint.h> |
57 |
|
|
#include <stdio.h> |
58 |
|
|
#include <stdlib.h> |
59 |
|
|
#include <string.h> |
60 |
|
|
#include <unistd.h> |
61 |
|
|
|
62 |
|
|
void cleanup(void); |
63 |
|
|
void do_lineno(const char *); |
64 |
|
|
void do_rexp(const char *); |
65 |
|
|
char *get_line(void); |
66 |
|
|
void handlesig(int); |
67 |
|
|
FILE *newfile(void); |
68 |
|
|
void toomuch(FILE *, long); |
69 |
|
|
static void __dead usage(void); |
70 |
|
|
|
71 |
|
|
/* |
72 |
|
|
* Command line options |
73 |
|
|
*/ |
74 |
|
|
const char *prefix; /* File name prefix */ |
75 |
|
|
long sufflen; /* Number of decimal digits for suffix */ |
76 |
|
|
int sflag; /* Suppress output of file names */ |
77 |
|
|
int kflag; /* Keep output if error occurs */ |
78 |
|
|
|
79 |
|
|
/* |
80 |
|
|
* Other miscellaneous globals (XXX too many) |
81 |
|
|
*/ |
82 |
|
|
long lineno; /* Current line number in input file */ |
83 |
|
|
long reps; /* Number of repetitions for this pattern */ |
84 |
|
|
long nfiles; /* Number of files output so far */ |
85 |
|
|
long maxfiles; /* Maximum number of files we can create */ |
86 |
|
|
char currfile[PATH_MAX]; /* Current output file */ |
87 |
|
|
const char *infn; /* Name of the input file */ |
88 |
|
|
FILE *infile; /* Input file handle */ |
89 |
|
|
FILE *overfile; /* Overflow file for toomuch() */ |
90 |
|
|
off_t truncofs; /* Offset this file should be truncated at */ |
91 |
|
|
int doclean; /* Should cleanup() remove output? */ |
92 |
|
|
|
93 |
|
|
int |
94 |
|
|
main(int argc, char *argv[]) |
95 |
|
|
{ |
96 |
|
|
struct sigaction sa; |
97 |
|
|
long i; |
98 |
|
|
int ch; |
99 |
|
|
const char *expr; |
100 |
|
|
char *ep, *p; |
101 |
|
|
FILE *ofp; |
102 |
|
|
|
103 |
|
|
if (pledge("stdio rpath wpath cpath flock", NULL) == -1) |
104 |
|
|
err(1, "pledge"); |
105 |
|
|
|
106 |
|
|
kflag = sflag = 0; |
107 |
|
|
prefix = "xx"; |
108 |
|
|
sufflen = 2; |
109 |
|
|
while ((ch = getopt(argc, argv, "f:kn:s")) != -1) { |
110 |
|
|
switch (ch) { |
111 |
|
|
case 'f': |
112 |
|
|
prefix = optarg; |
113 |
|
|
break; |
114 |
|
|
case 'k': |
115 |
|
|
kflag = 1; |
116 |
|
|
break; |
117 |
|
|
case 'n': |
118 |
|
|
errno = 0; |
119 |
|
|
sufflen = strtol(optarg, &ep, 10); |
120 |
|
|
if (sufflen <= 0 || *ep != '\0' || errno != 0) |
121 |
|
|
errx(1, "%s: bad suffix length", optarg); |
122 |
|
|
break; |
123 |
|
|
case 's': |
124 |
|
|
sflag = 1; |
125 |
|
|
break; |
126 |
|
|
default: |
127 |
|
|
usage(); |
128 |
|
|
} |
129 |
|
|
} |
130 |
|
|
|
131 |
|
|
if (sufflen + strlen(prefix) >= PATH_MAX) |
132 |
|
|
errx(1, "name too long"); |
133 |
|
|
|
134 |
|
|
argc -= optind; |
135 |
|
|
argv += optind; |
136 |
|
|
|
137 |
|
|
if ((infn = *argv++) == NULL) |
138 |
|
|
usage(); |
139 |
|
|
if (strcmp(infn, "-") == 0) { |
140 |
|
|
infile = stdin; |
141 |
|
|
infn = "stdin"; |
142 |
|
|
} else if ((infile = fopen(infn, "r")) == NULL) |
143 |
|
|
err(1, "%s", infn); |
144 |
|
|
|
145 |
|
|
if (!kflag) { |
146 |
|
|
doclean = 1; |
147 |
|
|
atexit(cleanup); |
148 |
|
|
sa.sa_flags = 0; |
149 |
|
|
sa.sa_handler = handlesig; |
150 |
|
|
sigemptyset(&sa.sa_mask); |
151 |
|
|
sigaddset(&sa.sa_mask, SIGHUP); |
152 |
|
|
sigaddset(&sa.sa_mask, SIGINT); |
153 |
|
|
sigaddset(&sa.sa_mask, SIGTERM); |
154 |
|
|
sigaction(SIGHUP, &sa, NULL); |
155 |
|
|
sigaction(SIGINT, &sa, NULL); |
156 |
|
|
sigaction(SIGTERM, &sa, NULL); |
157 |
|
|
} |
158 |
|
|
|
159 |
|
|
lineno = 0; |
160 |
|
|
nfiles = 0; |
161 |
|
|
truncofs = 0; |
162 |
|
|
overfile = NULL; |
163 |
|
|
|
164 |
|
|
/* Ensure 10^sufflen < LONG_MAX. */ |
165 |
|
|
for (maxfiles = 1, i = 0; i < sufflen; i++) { |
166 |
|
|
if (maxfiles > LONG_MAX / 10) |
167 |
|
|
errx(1, "%ld: suffix too long (limit %ld)", |
168 |
|
|
sufflen, i); |
169 |
|
|
maxfiles *= 10; |
170 |
|
|
} |
171 |
|
|
|
172 |
|
|
/* Create files based on supplied patterns. */ |
173 |
|
|
while (nfiles < maxfiles - 1 && (expr = *argv++) != NULL) { |
174 |
|
|
/* Look ahead & see if this pattern has any repetitions. */ |
175 |
|
|
if (*argv != NULL && **argv == '{') { |
176 |
|
|
errno = 0; |
177 |
|
|
reps = strtol(*argv + 1, &ep, 10); |
178 |
|
|
if (reps < 0 || *ep != '}' || errno != 0) |
179 |
|
|
errx(1, "%s: bad repetition count", *argv + 1); |
180 |
|
|
argv++; |
181 |
|
|
} else |
182 |
|
|
reps = 0; |
183 |
|
|
|
184 |
|
|
if (*expr == '/' || *expr == '%') { |
185 |
|
|
do { |
186 |
|
|
do_rexp(expr); |
187 |
|
|
} while (reps-- != 0 && nfiles < maxfiles - 1); |
188 |
|
|
} else if (isdigit((unsigned char)*expr)) |
189 |
|
|
do_lineno(expr); |
190 |
|
|
else |
191 |
|
|
errx(1, "%s: unrecognised pattern", expr); |
192 |
|
|
} |
193 |
|
|
|
194 |
|
|
/* Copy the rest into a new file. */ |
195 |
|
|
if (!feof(infile)) { |
196 |
|
|
ofp = newfile(); |
197 |
|
|
while ((p = get_line()) != NULL && fputs(p, ofp) == 0) |
198 |
|
|
; |
199 |
|
|
if (!sflag) |
200 |
|
|
printf("%jd\n", (intmax_t)ftello(ofp)); |
201 |
|
|
if (fclose(ofp) != 0) |
202 |
|
|
err(1, "%s", currfile); |
203 |
|
|
} |
204 |
|
|
|
205 |
|
|
toomuch(NULL, 0); |
206 |
|
|
doclean = 0; |
207 |
|
|
|
208 |
|
|
return (0); |
209 |
|
|
} |
210 |
|
|
|
211 |
|
|
static void __dead |
212 |
|
|
usage(void) |
213 |
|
|
{ |
214 |
|
|
extern char *__progname; |
215 |
|
|
|
216 |
|
|
fprintf(stderr, |
217 |
|
|
"usage: %s [-ks] [-f prefix] [-n number] file args ...\n", |
218 |
|
|
__progname); |
219 |
|
|
exit(1); |
220 |
|
|
} |
221 |
|
|
|
222 |
|
|
/*
 * Signal handler: write a short notice to stderr, call cleanup(), then
 * leave immediately with status 2.  The signal number is not used.
 */
/* ARGSUSED */
void
handlesig(int sig)
{
	const char notice[] = "csplit: caught signal, cleaning up\n";
	const size_t noticelen = sizeof(notice) - 1;	/* omit the NUL */

	write(STDERR_FILENO, notice, noticelen);
	cleanup();
	_exit(2);
}
232 |
|
|
|
233 |
|
|
/* Create a new output file. */ |
234 |
|
|
FILE * |
235 |
|
|
newfile(void) |
236 |
|
|
{ |
237 |
|
|
FILE *fp; |
238 |
|
|
|
239 |
|
|
if ((size_t)snprintf(currfile, sizeof(currfile), "%s%0*ld", prefix, |
240 |
|
|
(int)sufflen, nfiles) >= sizeof(currfile)) |
241 |
|
|
errc(1, ENAMETOOLONG, "%s", currfile); |
242 |
|
|
if ((fp = fopen(currfile, "w+")) == NULL) |
243 |
|
|
err(1, "%s", currfile); |
244 |
|
|
nfiles++; |
245 |
|
|
|
246 |
|
|
return (fp); |
247 |
|
|
} |
248 |
|
|
|
249 |
|
|
/* Remove partial output, called before exiting. */ |
250 |
|
|
void |
251 |
|
|
cleanup(void) |
252 |
|
|
{ |
253 |
|
|
char fnbuf[PATH_MAX]; |
254 |
|
|
long i; |
255 |
|
|
|
256 |
|
|
if (!doclean) |
257 |
|
|
return; |
258 |
|
|
|
259 |
|
|
/* |
260 |
|
|
* NOTE: One cannot portably assume to be able to call snprintf() from |
261 |
|
|
* inside a signal handler. It is, however, safe to do on OpenBSD. |
262 |
|
|
*/ |
263 |
|
|
for (i = 0; i < nfiles; i++) { |
264 |
|
|
snprintf(fnbuf, sizeof(fnbuf), "%s%0*ld", prefix, |
265 |
|
|
(int)sufflen, i); |
266 |
|
|
unlink(fnbuf); |
267 |
|
|
} |
268 |
|
|
} |
269 |
|
|
|
270 |
|
|
/* Read a line from the input into a static buffer. */ |
271 |
|
|
char * |
272 |
|
|
get_line(void) |
273 |
|
|
{ |
274 |
|
|
static char lbuf[LINE_MAX]; |
275 |
|
|
FILE *src; |
276 |
|
|
|
277 |
|
|
src = overfile != NULL ? overfile : infile; |
278 |
|
|
|
279 |
|
|
again: if (fgets(lbuf, sizeof(lbuf), src) == NULL) { |
280 |
|
|
if (src == overfile) { |
281 |
|
|
src = infile; |
282 |
|
|
goto again; |
283 |
|
|
} |
284 |
|
|
return (NULL); |
285 |
|
|
} |
286 |
|
|
if (ferror(src)) |
287 |
|
|
err(1, "%s", infn); |
288 |
|
|
lineno++; |
289 |
|
|
|
290 |
|
|
return (lbuf); |
291 |
|
|
} |
292 |
|
|
|
293 |
|
|
/* Conceptually rewind the input (as obtained by get_line()) back `n' lines. */ |
294 |
|
|
void |
295 |
|
|
toomuch(FILE *ofp, long n) |
296 |
|
|
{ |
297 |
|
|
char buf[BUFSIZ]; |
298 |
|
|
size_t i, nread; |
299 |
|
|
|
300 |
|
|
if (overfile != NULL) { |
301 |
|
|
/* |
302 |
|
|
* Truncate the previous file we overflowed into back to |
303 |
|
|
* the correct length, close it. |
304 |
|
|
*/ |
305 |
|
|
if (fflush(overfile) != 0) |
306 |
|
|
err(1, "overflow"); |
307 |
|
|
if (ftruncate(fileno(overfile), truncofs) != 0) |
308 |
|
|
err(1, "overflow"); |
309 |
|
|
if (fclose(overfile) != 0) |
310 |
|
|
err(1, "overflow"); |
311 |
|
|
overfile = NULL; |
312 |
|
|
} |
313 |
|
|
|
314 |
|
|
if (n == 0) |
315 |
|
|
/* Just tidying up */ |
316 |
|
|
return; |
317 |
|
|
|
318 |
|
|
lineno -= n; |
319 |
|
|
|
320 |
|
|
/* |
321 |
|
|
* Wind the overflow file backwards to `n' lines before the |
322 |
|
|
* current one. |
323 |
|
|
*/ |
324 |
|
|
do { |
325 |
|
|
if (ftello(ofp) < (off_t)sizeof(buf)) |
326 |
|
|
rewind(ofp); |
327 |
|
|
else |
328 |
|
|
fseeko(ofp, -(off_t)sizeof(buf), SEEK_CUR); |
329 |
|
|
if (ferror(ofp)) |
330 |
|
|
errx(1, "%s: can't seek", currfile); |
331 |
|
|
if ((nread = fread(buf, 1, sizeof(buf), ofp)) == 0) |
332 |
|
|
errx(1, "can't read overflowed output"); |
333 |
|
|
if (fseeko(ofp, -(off_t)nread, SEEK_CUR) != 0) |
334 |
|
|
err(1, "%s", currfile); |
335 |
|
|
for (i = 1; i <= nread; i++) |
336 |
|
|
if (buf[nread - i] == '\n' && n-- == 0) |
337 |
|
|
break; |
338 |
|
|
if (ftello(ofp) == 0) |
339 |
|
|
break; |
340 |
|
|
} while (n > 0); |
341 |
|
|
if (fseeko(ofp, (off_t)(nread - i + 1), SEEK_CUR) != 0) |
342 |
|
|
err(1, "%s", currfile); |
343 |
|
|
|
344 |
|
|
/* |
345 |
|
|
* get_line() will read from here. Next call will truncate to |
346 |
|
|
* truncofs in this file. |
347 |
|
|
*/ |
348 |
|
|
overfile = ofp; |
349 |
|
|
truncofs = ftello(overfile); |
350 |
|
|
} |
351 |
|
|
|
352 |
|
|
/* Handle splits for /regexp/ and %regexp% patterns. */ |
353 |
|
|
void |
354 |
|
|
do_rexp(const char *expr) |
355 |
|
|
{ |
356 |
|
|
regex_t cre; |
357 |
|
|
intmax_t nwritten; |
358 |
|
|
long ofs; |
359 |
|
|
int first; |
360 |
|
|
char *ecopy, *ep, *p, *pofs, *re; |
361 |
|
|
FILE *ofp; |
362 |
|
|
|
363 |
|
|
if ((ecopy = strdup(expr)) == NULL) |
364 |
|
|
err(1, "strdup"); |
365 |
|
|
|
366 |
|
|
re = ecopy + 1; |
367 |
|
|
if ((pofs = strrchr(ecopy, *expr)) == NULL || pofs[-1] == '\\') |
368 |
|
|
errx(1, "%s: missing trailing %c", expr, *expr); |
369 |
|
|
*pofs++ = '\0'; |
370 |
|
|
|
371 |
|
|
if (*pofs != '\0') { |
372 |
|
|
errno = 0; |
373 |
|
|
ofs = strtol(pofs, &ep, 10); |
374 |
|
|
if (*ep != '\0' || errno != 0) |
375 |
|
|
errx(1, "%s: bad offset", pofs); |
376 |
|
|
} else |
377 |
|
|
ofs = 0; |
378 |
|
|
|
379 |
|
|
if (regcomp(&cre, re, REG_BASIC|REG_NOSUB) != 0) |
380 |
|
|
errx(1, "%s: bad regular expression", re); |
381 |
|
|
|
382 |
|
|
if (*expr == '/') |
383 |
|
|
/* /regexp/: Save results to a file. */ |
384 |
|
|
ofp = newfile(); |
385 |
|
|
else { |
386 |
|
|
/* %regexp%: Make a temporary file for overflow. */ |
387 |
|
|
if ((ofp = tmpfile()) == NULL) |
388 |
|
|
err(1, "tmpfile"); |
389 |
|
|
} |
390 |
|
|
|
391 |
|
|
/* Read and output lines until we get a match. */ |
392 |
|
|
first = 1; |
393 |
|
|
while ((p = get_line()) != NULL) { |
394 |
|
|
if (fputs(p, ofp) != 0) |
395 |
|
|
break; |
396 |
|
|
if (!first && regexec(&cre, p, 0, NULL, 0) == 0) |
397 |
|
|
break; |
398 |
|
|
first = 0; |
399 |
|
|
} |
400 |
|
|
|
401 |
|
|
if (p == NULL) |
402 |
|
|
errx(1, "%s: no match", re); |
403 |
|
|
|
404 |
|
|
if (ofs <= 0) { |
405 |
|
|
/* |
406 |
|
|
* Negative (or zero) offset: throw back any lines we should |
407 |
|
|
* not have read yet. |
408 |
|
|
*/ |
409 |
|
|
if (p != NULL) { |
410 |
|
|
toomuch(ofp, -ofs + 1); |
411 |
|
|
nwritten = (intmax_t)truncofs; |
412 |
|
|
} else |
413 |
|
|
nwritten = (intmax_t)ftello(ofp); |
414 |
|
|
} else { |
415 |
|
|
/* |
416 |
|
|
* Positive offset: copy the requested number of lines |
417 |
|
|
* after the match. |
418 |
|
|
*/ |
419 |
|
|
while (--ofs > 0 && (p = get_line()) != NULL) |
420 |
|
|
fputs(p, ofp); |
421 |
|
|
toomuch(NULL, 0); |
422 |
|
|
nwritten = (intmax_t)ftello(ofp); |
423 |
|
|
if (fclose(ofp) != 0) |
424 |
|
|
err(1, "%s", currfile); |
425 |
|
|
} |
426 |
|
|
|
427 |
|
|
if (!sflag && *expr == '/') |
428 |
|
|
printf("%jd\n", nwritten); |
429 |
|
|
|
430 |
|
|
regfree(&cre); |
431 |
|
|
free(ecopy); |
432 |
|
|
} |
433 |
|
|
|
434 |
|
|
/* Handle splits based on line number. */ |
435 |
|
|
void |
436 |
|
|
do_lineno(const char *expr) |
437 |
|
|
{ |
438 |
|
|
long lastline, tgtline; |
439 |
|
|
char *ep, *p; |
440 |
|
|
FILE *ofp; |
441 |
|
|
|
442 |
|
|
errno = 0; |
443 |
|
|
tgtline = strtol(expr, &ep, 10); |
444 |
|
|
if (tgtline <= 0 || errno != 0 || *ep != '\0') |
445 |
|
|
errx(1, "%s: bad line number", expr); |
446 |
|
|
lastline = tgtline; |
447 |
|
|
if (lastline <= lineno) |
448 |
|
|
errx(1, "%s: can't go backwards", expr); |
449 |
|
|
|
450 |
|
|
while (nfiles < maxfiles - 1) { |
451 |
|
|
ofp = newfile(); |
452 |
|
|
while (lineno + 1 != lastline) { |
453 |
|
|
if ((p = get_line()) == NULL) |
454 |
|
|
errx(1, "%ld: out of range", lastline); |
455 |
|
|
if (fputs(p, ofp) != 0) |
456 |
|
|
break; |
457 |
|
|
} |
458 |
|
|
if (!sflag) |
459 |
|
|
printf("%jd\n", (intmax_t)ftello(ofp)); |
460 |
|
|
if (fclose(ofp) != 0) |
461 |
|
|
err(1, "%s", currfile); |
462 |
|
|
if (reps-- == 0) |
463 |
|
|
break; |
464 |
|
|
lastline += tgtline; |
465 |
|
|
} |
466 |
|
|
} |