1 |
|
|
/* $OpenBSD: csplit.c,v 1.8 2015/10/11 17:43:03 semarie Exp $ */ |
2 |
|
|
/* $FreeBSD: src/usr.bin/csplit/csplit.c,v 1.9 2004/03/22 11:15:03 tjr Exp $ */ |
3 |
|
|
|
4 |
|
|
/*- |
5 |
|
|
* Copyright (c) 2002 Tim J. Robbins. |
6 |
|
|
* All rights reserved. |
7 |
|
|
* |
8 |
|
|
* Redistribution and use in source and binary forms, with or without |
9 |
|
|
* modification, are permitted provided that the following conditions |
10 |
|
|
* are met: |
11 |
|
|
* 1. Redistributions of source code must retain the above copyright |
12 |
|
|
* notice, this list of conditions and the following disclaimer. |
13 |
|
|
* 2. Redistributions in binary form must reproduce the above copyright |
14 |
|
|
* notice, this list of conditions and the following disclaimer in the |
15 |
|
|
* documentation and/or other materials provided with the distribution. |
16 |
|
|
* |
17 |
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
18 |
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
19 |
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
20 |
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
21 |
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
22 |
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
23 |
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
24 |
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
25 |
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
26 |
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
27 |
|
|
* SUCH DAMAGE. |
28 |
|
|
*/ |
29 |
|
|
|
30 |
|
|
/*
 * csplit -- split files based on context
 *
 * This utility splits its input into numbered output files by line number
 * or by a regular expression. Regular expression matches have an optional
 * offset with them, allowing the split to occur a specified number of
 * lines before or after the match.
 *
 * To handle negative offsets, we stop reading when the match occurs and
 * store the offset that the file should have been split at, then use
 * this output file as input until all the "overflowed" lines have been read.
 * The file is then closed and truncated to the correct length.
 *
 * We assume that the output files can be seeked upon (i.e. they cannot be
 * symlinks to named pipes or character devices), but make no such
 * assumption about the input.
 */
47 |
|
|
|
48 |
|
|
#include <sys/types.h> |
49 |
|
|
|
50 |
|
|
#include <ctype.h> |
51 |
|
|
#include <err.h> |
52 |
|
|
#include <errno.h> |
53 |
|
|
#include <limits.h> |
54 |
|
|
#include <locale.h> |
55 |
|
|
#include <regex.h> |
56 |
|
|
#include <signal.h> |
57 |
|
|
#include <stdint.h> |
58 |
|
|
#include <stdio.h> |
59 |
|
|
#include <stdlib.h> |
60 |
|
|
#include <string.h> |
61 |
|
|
#include <unistd.h> |
62 |
|
|
|
63 |
|
|
/* Forward declarations for the functions defined in this file. */
void	 cleanup(void);
void	 do_lineno(const char *expr);
void	 do_rexp(const char *expr);
char	*get_line(void);
void	 handlesig(int sig);
FILE	*newfile(void);
void	 toomuch(FILE *ofp, long n);
void	 usage(void);
71 |
|
|
|
72 |
|
|
/* |
73 |
|
|
* Command line options |
74 |
|
|
*/ |
75 |
|
|
const char *prefix; /* File name prefix */ |
76 |
|
|
long sufflen; /* Number of decimal digits for suffix */ |
77 |
|
|
int sflag; /* Suppress output of file names */ |
78 |
|
|
int kflag; /* Keep output if error occurs */ |
79 |
|
|
|
80 |
|
|
/* |
81 |
|
|
* Other miscellaneous globals (XXX too many) |
82 |
|
|
*/ |
83 |
|
|
long lineno; /* Current line number in input file */ |
84 |
|
|
long reps; /* Number of repetitions for this pattern */ |
85 |
|
|
long nfiles; /* Number of files output so far */ |
86 |
|
|
long maxfiles; /* Maximum number of files we can create */ |
87 |
|
|
char currfile[PATH_MAX]; /* Current output file */ |
88 |
|
|
const char *infn; /* Name of the input file */ |
89 |
|
|
FILE *infile; /* Input file handle */ |
90 |
|
|
FILE *overfile; /* Overflow file for toomuch() */ |
91 |
|
|
off_t truncofs; /* Offset this file should be truncated at */ |
92 |
|
|
int doclean; /* Should cleanup() remove output? */ |
93 |
|
|
|
94 |
|
|
int |
95 |
|
|
main(int argc, char *argv[]) |
96 |
|
|
{ |
97 |
|
|
struct sigaction sa; |
98 |
|
|
long i; |
99 |
|
|
int ch; |
100 |
|
|
const char *expr; |
101 |
|
|
char *ep, *p; |
102 |
|
|
FILE *ofp; |
103 |
|
|
|
104 |
|
|
setlocale(LC_ALL, ""); |
105 |
|
|
|
106 |
|
|
if (pledge("stdio rpath wpath cpath", NULL) == -1) |
107 |
|
|
err(1, "pledge"); |
108 |
|
|
|
109 |
|
|
kflag = sflag = 0; |
110 |
|
|
prefix = "xx"; |
111 |
|
|
sufflen = 2; |
112 |
|
|
while ((ch = getopt(argc, argv, "f:kn:s")) != -1) { |
113 |
|
|
switch (ch) { |
114 |
|
|
case 'f': |
115 |
|
|
prefix = optarg; |
116 |
|
|
break; |
117 |
|
|
case 'k': |
118 |
|
|
kflag = 1; |
119 |
|
|
break; |
120 |
|
|
case 'n': |
121 |
|
|
errno = 0; |
122 |
|
|
sufflen = strtol(optarg, &ep, 10); |
123 |
|
|
if (sufflen <= 0 || *ep != '\0' || errno != 0) |
124 |
|
|
errx(1, "%s: bad suffix length", optarg); |
125 |
|
|
break; |
126 |
|
|
case 's': |
127 |
|
|
sflag = 1; |
128 |
|
|
break; |
129 |
|
|
default: |
130 |
|
|
usage(); |
131 |
|
|
/*NOTREACHED*/ |
132 |
|
|
} |
133 |
|
|
} |
134 |
|
|
|
135 |
|
|
if (sufflen + strlen(prefix) >= PATH_MAX) |
136 |
|
|
errx(1, "name too long"); |
137 |
|
|
|
138 |
|
|
argc -= optind; |
139 |
|
|
argv += optind; |
140 |
|
|
|
141 |
|
|
if ((infn = *argv++) == NULL) |
142 |
|
|
usage(); |
143 |
|
|
if (strcmp(infn, "-") == 0) { |
144 |
|
|
infile = stdin; |
145 |
|
|
infn = "stdin"; |
146 |
|
|
} else if ((infile = fopen(infn, "r")) == NULL) |
147 |
|
|
err(1, "%s", infn); |
148 |
|
|
|
149 |
|
|
if (!kflag) { |
150 |
|
|
doclean = 1; |
151 |
|
|
atexit(cleanup); |
152 |
|
|
sa.sa_flags = 0; |
153 |
|
|
sa.sa_handler = handlesig; |
154 |
|
|
sigemptyset(&sa.sa_mask); |
155 |
|
|
sigaddset(&sa.sa_mask, SIGHUP); |
156 |
|
|
sigaddset(&sa.sa_mask, SIGINT); |
157 |
|
|
sigaddset(&sa.sa_mask, SIGTERM); |
158 |
|
|
sigaction(SIGHUP, &sa, NULL); |
159 |
|
|
sigaction(SIGINT, &sa, NULL); |
160 |
|
|
sigaction(SIGTERM, &sa, NULL); |
161 |
|
|
} |
162 |
|
|
|
163 |
|
|
lineno = 0; |
164 |
|
|
nfiles = 0; |
165 |
|
|
truncofs = 0; |
166 |
|
|
overfile = NULL; |
167 |
|
|
|
168 |
|
|
/* Ensure 10^sufflen < LONG_MAX. */ |
169 |
|
|
for (maxfiles = 1, i = 0; i < sufflen; i++) { |
170 |
|
|
if (maxfiles > LONG_MAX / 10) |
171 |
|
|
errx(1, "%ld: suffix too long (limit %ld)", |
172 |
|
|
sufflen, i); |
173 |
|
|
maxfiles *= 10; |
174 |
|
|
} |
175 |
|
|
|
176 |
|
|
/* Create files based on supplied patterns. */ |
177 |
|
|
while (nfiles < maxfiles - 1 && (expr = *argv++) != NULL) { |
178 |
|
|
/* Look ahead & see if this pattern has any repetitions. */ |
179 |
|
|
if (*argv != NULL && **argv == '{') { |
180 |
|
|
errno = 0; |
181 |
|
|
reps = strtol(*argv + 1, &ep, 10); |
182 |
|
|
if (reps < 0 || *ep != '}' || errno != 0) |
183 |
|
|
errx(1, "%s: bad repetition count", *argv + 1); |
184 |
|
|
argv++; |
185 |
|
|
} else |
186 |
|
|
reps = 0; |
187 |
|
|
|
188 |
|
|
if (*expr == '/' || *expr == '%') { |
189 |
|
|
do { |
190 |
|
|
do_rexp(expr); |
191 |
|
|
} while (reps-- != 0 && nfiles < maxfiles - 1); |
192 |
|
|
} else if (isdigit((unsigned char)*expr)) |
193 |
|
|
do_lineno(expr); |
194 |
|
|
else |
195 |
|
|
errx(1, "%s: unrecognised pattern", expr); |
196 |
|
|
} |
197 |
|
|
|
198 |
|
|
/* Copy the rest into a new file. */ |
199 |
|
|
if (!feof(infile)) { |
200 |
|
|
ofp = newfile(); |
201 |
|
|
while ((p = get_line()) != NULL && fputs(p, ofp) == 0) |
202 |
|
|
; |
203 |
|
|
if (!sflag) |
204 |
|
|
printf("%jd\n", (intmax_t)ftello(ofp)); |
205 |
|
|
if (fclose(ofp) != 0) |
206 |
|
|
err(1, "%s", currfile); |
207 |
|
|
} |
208 |
|
|
|
209 |
|
|
toomuch(NULL, 0); |
210 |
|
|
doclean = 0; |
211 |
|
|
|
212 |
|
|
return (0); |
213 |
|
|
} |
214 |
|
|
|
215 |
|
|
/* Print the command synopsis on stderr and exit with status 1. */
void
usage(void)
{
	extern char *__progname;

	(void)fprintf(stderr,
	    "usage: %s [-ks] [-f prefix] [-n number] file args ...\n",
	    __progname);
	exit(1);
}
225 |
|
|
|
226 |
|
|
/*
 * Signal handler installed by main() for SIGHUP, SIGINT and SIGTERM:
 * announce the signal on stderr, remove the output written so far via
 * cleanup() and terminate immediately with status 2.
 */
/* ARGSUSED */
void
handlesig(int sig)
{
	const char notice[] = "csplit: caught signal, cleaning up\n";
	const size_t len = sizeof(notice) - 1;	/* without the NUL */

	write(STDERR_FILENO, notice, len);
	cleanup();
	_exit(2);
}
236 |
|
|
|
237 |
|
|
/* Create a new output file. */ |
238 |
|
|
FILE * |
239 |
|
|
newfile(void) |
240 |
|
|
{ |
241 |
|
|
FILE *fp; |
242 |
|
|
|
243 |
|
|
if ((size_t)snprintf(currfile, sizeof(currfile), "%s%0*ld", prefix, |
244 |
|
|
(int)sufflen, nfiles) >= sizeof(currfile)) |
245 |
|
|
errc(1, ENAMETOOLONG, "%s", currfile); |
246 |
|
|
if ((fp = fopen(currfile, "w+")) == NULL) |
247 |
|
|
err(1, "%s", currfile); |
248 |
|
|
nfiles++; |
249 |
|
|
|
250 |
|
|
return (fp); |
251 |
|
|
} |
252 |
|
|
|
253 |
|
|
/* Remove partial output, called before exiting. */ |
254 |
|
|
void |
255 |
|
|
cleanup(void) |
256 |
|
|
{ |
257 |
|
|
char fnbuf[PATH_MAX]; |
258 |
|
|
long i; |
259 |
|
|
|
260 |
|
|
if (!doclean) |
261 |
|
|
return; |
262 |
|
|
|
263 |
|
|
/* |
264 |
|
|
* NOTE: One cannot portably assume to be able to call snprintf() from |
265 |
|
|
* inside a signal handler. It is, however, safe to do on OpenBSD. |
266 |
|
|
*/ |
267 |
|
|
for (i = 0; i < nfiles; i++) { |
268 |
|
|
snprintf(fnbuf, sizeof(fnbuf), "%s%0*ld", prefix, |
269 |
|
|
(int)sufflen, i); |
270 |
|
|
unlink(fnbuf); |
271 |
|
|
} |
272 |
|
|
} |
273 |
|
|
|
274 |
|
|
/* Read a line from the input into a static buffer. */ |
275 |
|
|
char * |
276 |
|
|
get_line(void) |
277 |
|
|
{ |
278 |
|
|
static char lbuf[LINE_MAX]; |
279 |
|
|
FILE *src; |
280 |
|
|
|
281 |
|
|
src = overfile != NULL ? overfile : infile; |
282 |
|
|
|
283 |
|
|
again: if (fgets(lbuf, sizeof(lbuf), src) == NULL) { |
284 |
|
|
if (src == overfile) { |
285 |
|
|
src = infile; |
286 |
|
|
goto again; |
287 |
|
|
} |
288 |
|
|
return (NULL); |
289 |
|
|
} |
290 |
|
|
if (ferror(src)) |
291 |
|
|
err(1, "%s", infn); |
292 |
|
|
lineno++; |
293 |
|
|
|
294 |
|
|
return (lbuf); |
295 |
|
|
} |
296 |
|
|
|
297 |
|
|
/* Conceptually rewind the input (as obtained by get_line()) back `n' lines. */ |
298 |
|
|
void |
299 |
|
|
toomuch(FILE *ofp, long n) |
300 |
|
|
{ |
301 |
|
|
char buf[BUFSIZ]; |
302 |
|
|
size_t i, nread; |
303 |
|
|
|
304 |
|
|
if (overfile != NULL) { |
305 |
|
|
/* |
306 |
|
|
* Truncate the previous file we overflowed into back to |
307 |
|
|
* the correct length, close it. |
308 |
|
|
*/ |
309 |
|
|
if (fflush(overfile) != 0) |
310 |
|
|
err(1, "overflow"); |
311 |
|
|
if (ftruncate(fileno(overfile), truncofs) != 0) |
312 |
|
|
err(1, "overflow"); |
313 |
|
|
if (fclose(overfile) != 0) |
314 |
|
|
err(1, "overflow"); |
315 |
|
|
overfile = NULL; |
316 |
|
|
} |
317 |
|
|
|
318 |
|
|
if (n == 0) |
319 |
|
|
/* Just tidying up */ |
320 |
|
|
return; |
321 |
|
|
|
322 |
|
|
lineno -= n; |
323 |
|
|
|
324 |
|
|
/* |
325 |
|
|
* Wind the overflow file backwards to `n' lines before the |
326 |
|
|
* current one. |
327 |
|
|
*/ |
328 |
|
|
do { |
329 |
|
|
if (ftello(ofp) < (off_t)sizeof(buf)) |
330 |
|
|
rewind(ofp); |
331 |
|
|
else |
332 |
|
|
fseeko(ofp, -(off_t)sizeof(buf), SEEK_CUR); |
333 |
|
|
if (ferror(ofp)) |
334 |
|
|
errx(1, "%s: can't seek", currfile); |
335 |
|
|
if ((nread = fread(buf, 1, sizeof(buf), ofp)) == 0) |
336 |
|
|
errx(1, "can't read overflowed output"); |
337 |
|
|
if (fseeko(ofp, -(off_t)nread, SEEK_CUR) != 0) |
338 |
|
|
err(1, "%s", currfile); |
339 |
|
|
for (i = 1; i <= nread; i++) |
340 |
|
|
if (buf[nread - i] == '\n' && n-- == 0) |
341 |
|
|
break; |
342 |
|
|
if (ftello(ofp) == 0) |
343 |
|
|
break; |
344 |
|
|
} while (n > 0); |
345 |
|
|
if (fseeko(ofp, (off_t)(nread - i + 1), SEEK_CUR) != 0) |
346 |
|
|
err(1, "%s", currfile); |
347 |
|
|
|
348 |
|
|
/* |
349 |
|
|
* get_line() will read from here. Next call will truncate to |
350 |
|
|
* truncofs in this file. |
351 |
|
|
*/ |
352 |
|
|
overfile = ofp; |
353 |
|
|
truncofs = ftello(overfile); |
354 |
|
|
} |
355 |
|
|
|
356 |
|
|
/* Handle splits for /regexp/ and %regexp% patterns. */ |
357 |
|
|
void |
358 |
|
|
do_rexp(const char *expr) |
359 |
|
|
{ |
360 |
|
|
regex_t cre; |
361 |
|
|
intmax_t nwritten; |
362 |
|
|
long ofs; |
363 |
|
|
int first; |
364 |
|
|
char *ecopy, *ep, *p, *pofs, *re; |
365 |
|
|
FILE *ofp; |
366 |
|
|
|
367 |
|
|
if ((ecopy = strdup(expr)) == NULL) |
368 |
|
|
err(1, "strdup"); |
369 |
|
|
|
370 |
|
|
re = ecopy + 1; |
371 |
|
|
if ((pofs = strrchr(ecopy, *expr)) == NULL || pofs[-1] == '\\') |
372 |
|
|
errx(1, "%s: missing trailing %c", expr, *expr); |
373 |
|
|
*pofs++ = '\0'; |
374 |
|
|
|
375 |
|
|
if (*pofs != '\0') { |
376 |
|
|
errno = 0; |
377 |
|
|
ofs = strtol(pofs, &ep, 10); |
378 |
|
|
if (*ep != '\0' || errno != 0) |
379 |
|
|
errx(1, "%s: bad offset", pofs); |
380 |
|
|
} else |
381 |
|
|
ofs = 0; |
382 |
|
|
|
383 |
|
|
if (regcomp(&cre, re, REG_BASIC|REG_NOSUB) != 0) |
384 |
|
|
errx(1, "%s: bad regular expression", re); |
385 |
|
|
|
386 |
|
|
if (*expr == '/') |
387 |
|
|
/* /regexp/: Save results to a file. */ |
388 |
|
|
ofp = newfile(); |
389 |
|
|
else { |
390 |
|
|
/* %regexp%: Make a temporary file for overflow. */ |
391 |
|
|
if ((ofp = tmpfile()) == NULL) |
392 |
|
|
err(1, "tmpfile"); |
393 |
|
|
} |
394 |
|
|
|
395 |
|
|
/* Read and output lines until we get a match. */ |
396 |
|
|
first = 1; |
397 |
|
|
while ((p = get_line()) != NULL) { |
398 |
|
|
if (fputs(p, ofp) != 0) |
399 |
|
|
break; |
400 |
|
|
if (!first && regexec(&cre, p, 0, NULL, 0) == 0) |
401 |
|
|
break; |
402 |
|
|
first = 0; |
403 |
|
|
} |
404 |
|
|
|
405 |
|
|
if (p == NULL) |
406 |
|
|
errx(1, "%s: no match", re); |
407 |
|
|
|
408 |
|
|
if (ofs <= 0) { |
409 |
|
|
/* |
410 |
|
|
* Negative (or zero) offset: throw back any lines we should |
411 |
|
|
* not have read yet. |
412 |
|
|
*/ |
413 |
|
|
if (p != NULL) { |
414 |
|
|
toomuch(ofp, -ofs + 1); |
415 |
|
|
nwritten = (intmax_t)truncofs; |
416 |
|
|
} else |
417 |
|
|
nwritten = (intmax_t)ftello(ofp); |
418 |
|
|
} else { |
419 |
|
|
/* |
420 |
|
|
* Positive offset: copy the requested number of lines |
421 |
|
|
* after the match. |
422 |
|
|
*/ |
423 |
|
|
while (--ofs > 0 && (p = get_line()) != NULL) |
424 |
|
|
fputs(p, ofp); |
425 |
|
|
toomuch(NULL, 0); |
426 |
|
|
nwritten = (intmax_t)ftello(ofp); |
427 |
|
|
if (fclose(ofp) != 0) |
428 |
|
|
err(1, "%s", currfile); |
429 |
|
|
} |
430 |
|
|
|
431 |
|
|
if (!sflag && *expr == '/') |
432 |
|
|
printf("%jd\n", nwritten); |
433 |
|
|
|
434 |
|
|
regfree(&cre); |
435 |
|
|
free(ecopy); |
436 |
|
|
} |
437 |
|
|
|
438 |
|
|
/* Handle splits based on line number. */ |
439 |
|
|
void |
440 |
|
|
do_lineno(const char *expr) |
441 |
|
|
{ |
442 |
|
|
long lastline, tgtline; |
443 |
|
|
char *ep, *p; |
444 |
|
|
FILE *ofp; |
445 |
|
|
|
446 |
|
|
errno = 0; |
447 |
|
|
tgtline = strtol(expr, &ep, 10); |
448 |
|
|
if (tgtline <= 0 || errno != 0 || *ep != '\0') |
449 |
|
|
errx(1, "%s: bad line number", expr); |
450 |
|
|
lastline = tgtline; |
451 |
|
|
if (lastline <= lineno) |
452 |
|
|
errx(1, "%s: can't go backwards", expr); |
453 |
|
|
|
454 |
|
|
while (nfiles < maxfiles - 1) { |
455 |
|
|
ofp = newfile(); |
456 |
|
|
while (lineno + 1 != lastline) { |
457 |
|
|
if ((p = get_line()) == NULL) |
458 |
|
|
errx(1, "%ld: out of range", lastline); |
459 |
|
|
if (fputs(p, ofp) != 0) |
460 |
|
|
break; |
461 |
|
|
} |
462 |
|
|
if (!sflag) |
463 |
|
|
printf("%jd\n", (intmax_t)ftello(ofp)); |
464 |
|
|
if (fclose(ofp) != 0) |
465 |
|
|
err(1, "%s", currfile); |
466 |
|
|
if (reps-- == 0) |
467 |
|
|
break; |
468 |
|
|
lastline += tgtline; |
469 |
|
|
} |
470 |
|
|
} |