1 |
|
|
/* |
2 |
|
|
* $OpenBSD: locate.code.c,v 1.19 2015/11/15 07:38:29 deraadt Exp $ |
3 |
|
|
* |
4 |
|
|
* Copyright (c) 1989, 1993 |
5 |
|
|
* The Regents of the University of California. All rights reserved. |
6 |
|
|
* |
7 |
|
|
* This code is derived from software contributed to Berkeley by |
8 |
|
|
* James A. Woods. |
9 |
|
|
* |
10 |
|
|
* Redistribution and use in source and binary forms, with or without |
11 |
|
|
* modification, are permitted provided that the following conditions |
12 |
|
|
* are met: |
13 |
|
|
* 1. Redistributions of source code must retain the above copyright |
14 |
|
|
* notice, this list of conditions and the following disclaimer. |
15 |
|
|
* 2. Redistributions in binary form must reproduce the above copyright |
16 |
|
|
* notice, this list of conditions and the following disclaimer in the |
17 |
|
|
* documentation and/or other materials provided with the distribution. |
18 |
|
|
* 3. Neither the name of the University nor the names of its contributors |
19 |
|
|
* may be used to endorse or promote products derived from this software |
20 |
|
|
* without specific prior written permission. |
21 |
|
|
* |
22 |
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
23 |
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
24 |
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
25 |
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
26 |
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
27 |
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
28 |
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
29 |
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
30 |
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
31 |
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
32 |
|
|
* SUCH DAMAGE. |
33 |
|
|
* |
34 |
|
|
* $Id: locate.code.c,v 1.19 2015/11/15 07:38:29 deraadt Exp $ |
35 |
|
|
*/ |
36 |
|
|
|
37 |
|
|
/* |
38 |
|
|
* PURPOSE: sorted list compressor (works with a modified 'find' |
39 |
|
|
* to encode/decode a filename database) |
40 |
|
|
* |
41 |
|
|
* USAGE: bigram < list > bigrams |
42 |
|
|
* process bigrams (see updatedb) > common_bigrams |
43 |
|
|
* code common_bigrams < list > squozen_list |
44 |
|
|
* |
45 |
|
|
* METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 |
46 |
|
|
* February/March 1983, p. 8). Output format is, per line, an |
47 |
|
|
* offset differential count byte followed by a partially bigram- |
48 |
|
|
* encoded ascii residue. A bigram is a two-character sequence, |
49 |
|
|
* the first 128 most common of which are encoded in one byte. |
50 |
|
|
* |
51 |
|
|
* EXAMPLE: For simple front compression with no bigram encoding, |
52 |
|
|
* if the input is... then the output is... |
53 |
|
|
* |
54 |
|
|
* /usr/src 0 /usr/src |
55 |
|
|
* /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c |
56 |
|
|
* /usr/src/cmd/armadillo.c 14 armadillo.c |
57 |
|
|
* /usr/tmp/zoo 5 tmp/zoo |
58 |
|
|
* |
59 |
|
|
* The codes are: |
60 |
|
|
* |
61 |
|
|
* 0-28 likeliest differential counts + offset to make nonnegative |
62 |
|
|
* 30 switch code for out-of-range count to follow in next word |
63 |
|
|
* 31 an 8 bit char followed |
64 |
|
|
* 128-255 bigram codes (128 most common, as determined by 'updatedb') |
65 |
|
|
* 32-127 single character (printable) ascii residue (ie, literal) |
66 |
|
|
* |
67 |
|
|
* The locate database store any character except newline ('\n') |
68 |
|
|
* and NUL ('\0'). The 8-bit character support don't wast extra |
69 |
|
|
* space until you have characters in file names less than 32 |
70 |
|
|
* or greater than 127. |
71 |
|
|
* |
72 |
|
|
* |
73 |
|
|
* SEE ALSO: updatedb.sh, ../bigram/locate.bigram.c |
74 |
|
|
* |
75 |
|
|
* AUTHOR: James A. Woods, Informatics General Corp., |
76 |
|
|
* NASA Ames Research Center, 10/82 |
77 |
|
|
* 8-bit file names characters: |
78 |
|
|
* Wolfram Schneider, Berlin September 1996 |
79 |
|
|
*/ |
80 |
|
|
|
81 |
|
|
#include <err.h> |
82 |
|
|
#include <errno.h> |
83 |
|
|
#include <stdio.h> |
84 |
|
|
#include <stdlib.h> |
85 |
|
|
#include <string.h> |
86 |
|
|
#include <unistd.h> |
87 |
|
|
#include <limits.h> |
88 |
|
|
|
89 |
|
|
#include "locate.h" |
90 |
|
|
|
91 |
|
|
#define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ |
92 |
|
|
|
93 |
|
|
u_char buf1[PATH_MAX] = " "; |
94 |
|
|
u_char buf2[PATH_MAX]; |
95 |
|
|
u_char bigrams[BGBUFSIZE + 1] = { 0 }; |
96 |
|
|
|
97 |
|
|
#define LOOKUP 1 /* use a lookup array instead a function, 3x faster */ |
98 |
|
|
|
99 |
|
|
#ifdef LOOKUP |
100 |
|
|
#define BGINDEX(x) (big[(u_char)*x][(u_char)*(x + 1)]) |
101 |
|
|
typedef short bg_t; |
102 |
|
|
bg_t big[UCHAR_MAX + 1][UCHAR_MAX + 1]; |
103 |
|
|
#else |
104 |
|
|
#define BGINDEX(x) bgindex(x) |
105 |
|
|
typedef int bg_t; |
106 |
|
|
int bgindex(char *); |
107 |
|
|
#endif /* LOOKUP */ |
108 |
|
|
|
109 |
|
|
|
110 |
|
|
void usage(void); |
111 |
|
|
extern int optind; |
112 |
|
|
extern int optopt; |
113 |
|
|
|
114 |
|
|
int |
115 |
|
|
main(int argc, char *argv[]) |
116 |
|
|
{ |
117 |
|
|
u_char *cp, *oldpath, *path; |
118 |
|
|
int ch, code, count, diffcount, oldcount; |
119 |
|
|
FILE *fp; |
120 |
|
|
int i, j; |
121 |
|
|
|
122 |
✗✓ |
2 |
if (pledge("stdio rpath flock cpath wpath", NULL) == -1) |
123 |
|
|
err(1, "pledge"); |
124 |
|
|
|
125 |
✗✓ |
2 |
while ((ch = getopt(argc, argv, "")) != -1) |
126 |
|
|
switch (ch) { |
127 |
|
|
default: |
128 |
|
|
usage(); |
129 |
|
|
} |
130 |
|
1 |
argc -= optind; |
131 |
|
1 |
argv += optind; |
132 |
|
|
|
133 |
✗✓ |
1 |
if (argc != 1) |
134 |
|
|
usage(); |
135 |
|
|
|
136 |
✗✓ |
1 |
if ((fp = fopen(argv[0], "r")) == NULL) |
137 |
|
|
err(1, "%s", argv[0]); |
138 |
|
|
|
139 |
✗✓ |
1 |
if (pledge("stdio flock rpath cpath wpath", NULL) == -1) |
140 |
|
|
err(1, "pledge"); |
141 |
|
|
|
142 |
|
|
/* First copy bigram array to stdout. */ |
143 |
✗✓ |
1 |
if (fgets(bigrams, sizeof(bigrams), fp) == NULL) |
144 |
|
|
err(1, "fgets"); |
145 |
|
|
|
146 |
✗✓ |
1 |
if (strlen(bigrams) != BGBUFSIZE) |
147 |
|
|
errx(1, "bigram array too small to build db, index more files"); |
148 |
|
|
|
149 |
✗✓ |
1 |
if (fputs(bigrams, stdout) == EOF) |
150 |
|
|
err(1, "stdout"); |
151 |
|
1 |
(void)fclose(fp); |
152 |
|
|
|
153 |
|
|
#ifdef LOOKUP |
154 |
|
|
/* init lookup table */ |
155 |
✓✓ |
514 |
for (i = 0; i < UCHAR_MAX + 1; i++) |
156 |
✓✓ |
131584 |
for (j = 0; j < UCHAR_MAX + 1; j++) |
157 |
|
65536 |
big[i][j] = (bg_t)-1; |
158 |
|
|
|
159 |
✓✓ |
258 |
for (cp = bigrams, i = 0; *cp != '\0'; i += 2, cp += 2) |
160 |
|
128 |
big[(u_char)*cp][(u_char)*(cp + 1)] = (bg_t)i; |
161 |
|
|
|
162 |
|
|
#endif /* LOOKUP */ |
163 |
|
|
|
164 |
|
|
oldpath = buf1; |
165 |
|
|
path = buf2; |
166 |
|
|
oldcount = 0; |
167 |
|
|
|
168 |
✓✓ |
28968 |
while (fgets(path, sizeof(buf2), stdin) != NULL) { |
169 |
|
|
|
170 |
|
|
/* skip empty lines */ |
171 |
✗✓ |
14483 |
if (*path == '\n') |
172 |
|
|
continue; |
173 |
|
|
|
174 |
|
|
/* remove newline */ |
175 |
✓✓ |
1199542 |
for (cp = path; *cp != '\0'; cp++) { |
176 |
|
|
/* chop newline */ |
177 |
✓✓ |
585288 |
if (*cp == '\n') |
178 |
|
14483 |
*cp = '\0'; |
179 |
|
|
} |
180 |
|
|
|
181 |
|
|
/* Skip longest common prefix. */ |
182 |
✓✓ |
988680 |
for (cp = path; *cp == *oldpath; cp++, oldpath++) |
183 |
✓✗ |
479857 |
if (*cp == '\0') |
184 |
|
|
break; |
185 |
|
|
|
186 |
|
14483 |
count = cp - path; |
187 |
|
14483 |
diffcount = count - oldcount + OFFSET; |
188 |
|
|
oldcount = count; |
189 |
✓✓ |
14483 |
if (diffcount < 0 || diffcount > 2 * OFFSET) { |
190 |
✓✗✓✗ ✗✓ |
368 |
if (putchar(SWITCH) == EOF || |
191 |
|
92 |
putw(diffcount, stdout) == EOF) |
192 |
|
|
err(1, "stdout"); |
193 |
|
|
} else |
194 |
✓✗✗✓
|
43173 |
if (putchar(diffcount) == EOF) |
195 |
|
|
err(1, "stdout"); |
196 |
|
|
|
197 |
✓✓ |
127634 |
while (*cp != '\0') { |
198 |
|
|
/* print *two* characters */ |
199 |
|
|
|
200 |
✓✓ |
49334 |
if ((code = BGINDEX(cp)) != (bg_t)-1) { |
201 |
|
|
/* |
202 |
|
|
* print *one* as bigram |
203 |
|
|
* Found, so mark byte with |
204 |
|
|
* parity bit. |
205 |
|
|
*/ |
206 |
✓✗✗✓
|
71496 |
if (putchar((code / 2) | PARITY) == EOF) |
207 |
|
|
err(1, "stdout"); |
208 |
|
23832 |
cp += 2; |
209 |
|
23832 |
} else { |
210 |
✓✓ |
137572 |
for (i = 0; i < 2; i++) { |
211 |
✓✓ |
51004 |
if (*cp == '\0') |
212 |
|
|
break; |
213 |
|
|
|
214 |
|
|
/* print umlauts in file names */ |
215 |
✓✗✗✓
|
86568 |
if (*cp < ASCII_MIN || |
216 |
|
43284 |
*cp > ASCII_MAX) { |
217 |
|
|
if (putchar(UMLAUT) == EOF || |
218 |
|
|
putchar(*cp++) == EOF) |
219 |
|
|
err(1, "stdout"); |
220 |
|
|
} else { |
221 |
|
|
/* normal character */ |
222 |
✓✗✗✓
|
129852 |
if (putchar(*cp++) == EOF) |
223 |
|
|
err(1, "stdout"); |
224 |
|
|
} |
225 |
|
|
} |
226 |
|
|
|
227 |
|
|
} |
228 |
|
|
} |
229 |
|
|
|
230 |
✓✓ |
14483 |
if (path == buf1) { /* swap pointers */ |
231 |
|
|
path = buf2; |
232 |
|
|
oldpath = buf1; |
233 |
|
7241 |
} else { |
234 |
|
|
path = buf1; |
235 |
|
|
oldpath = buf2; |
236 |
|
|
} |
237 |
|
|
} |
238 |
|
|
/* Non-zero status if there were errors */ |
239 |
✓✗✓✗ ✗✓✗✗
|
3 |
if (fflush(stdout) != 0 || ferror(stdout)) |
240 |
|
|
exit(1); |
241 |
|
|
exit(0); |
242 |
|
|
} |
243 |
|
|
|
244 |
|
|
#ifndef LOOKUP |
245 |
|
|
int |
246 |
|
|
bgindex(char *bg) /* Return location of bg in bigrams or -1. */ |
247 |
|
|
{ |
248 |
|
|
char bg0, bg1, *p; |
249 |
|
|
|
250 |
|
|
bg0 = bg[0]; |
251 |
|
|
bg1 = bg[1]; |
252 |
|
|
for (p = bigrams; *p != NULL; p++) |
253 |
|
|
if (*p++ == bg0 && *p == bg1) |
254 |
|
|
break; |
255 |
|
|
return (*p == NULL ? -1 : (--p - bigrams)); |
256 |
|
|
} |
257 |
|
|
#endif /* !LOOKUP */ |
258 |
|
|
|
259 |
|
|
void |
260 |
|
|
usage(void) |
261 |
|
|
{ |
262 |
|
|
(void)fprintf(stderr, |
263 |
|
|
"usage: locate.code common_bigrams < list > squozen_list\n"); |
264 |
|
|
exit(1); |
265 |
|
|
} |