1 |
|
|
/* $OpenBSD: bt_open.c,v 1.19 2015/12/28 22:08:18 mmcc Exp $ */ |
2 |
|
|
|
3 |
|
|
/*- |
4 |
|
|
* Copyright (c) 1990, 1993, 1994 |
5 |
|
|
* The Regents of the University of California. All rights reserved. |
6 |
|
|
* |
7 |
|
|
* This code is derived from software contributed to Berkeley by |
8 |
|
|
* Mike Olson. |
9 |
|
|
* |
10 |
|
|
* Redistribution and use in source and binary forms, with or without |
11 |
|
|
* modification, are permitted provided that the following conditions |
12 |
|
|
* are met: |
13 |
|
|
* 1. Redistributions of source code must retain the above copyright |
14 |
|
|
* notice, this list of conditions and the following disclaimer. |
15 |
|
|
* 2. Redistributions in binary form must reproduce the above copyright |
16 |
|
|
* notice, this list of conditions and the following disclaimer in the |
17 |
|
|
* documentation and/or other materials provided with the distribution. |
18 |
|
|
* 3. Neither the name of the University nor the names of its contributors |
19 |
|
|
* may be used to endorse or promote products derived from this software |
20 |
|
|
* without specific prior written permission. |
21 |
|
|
* |
22 |
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
23 |
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
24 |
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
25 |
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
26 |
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
27 |
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
28 |
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
29 |
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
30 |
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
31 |
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
32 |
|
|
* SUCH DAMAGE. |
33 |
|
|
*/ |
34 |
|
|
|
35 |
|
|
/* |
36 |
|
|
* Implementation of btree access method for 4.4BSD. |
37 |
|
|
* |
38 |
|
|
* The design here was originally based on that of the btree access method |
39 |
|
|
* used in the Postgres database system at UC Berkeley. This implementation |
40 |
|
|
* is wholly independent of the Postgres code. |
41 |
|
|
*/ |
42 |
|
|
|
43 |
|
|
#include <sys/stat.h> |
44 |
|
|
|
45 |
|
|
#include <errno.h> |
46 |
|
|
#include <fcntl.h> |
47 |
|
|
#include <limits.h> |
48 |
|
|
#include <signal.h> |
49 |
|
|
#include <stdio.h> |
50 |
|
|
#include <stdlib.h> |
51 |
|
|
#include <string.h> |
52 |
|
|
#include <unistd.h> |
53 |
|
|
|
54 |
|
|
#include <db.h> |
55 |
|
|
#include "btree.h" |
56 |
|
|
|
57 |
|
|
#ifdef DEBUG |
58 |
|
|
#undef MINPSIZE |
59 |
|
|
#define MINPSIZE 128 |
60 |
|
|
#endif |
61 |
|
|
|
62 |
|
|
static int byteorder(void); |
63 |
|
|
static int nroot(BTREE *); |
64 |
|
|
static int tmp(void); |
65 |
|
|
|
66 |
|
|
/* |
67 |
|
|
* __BT_OPEN -- Open a btree. |
68 |
|
|
* |
69 |
|
|
* Creates and fills a DB struct, and calls the routine that actually |
70 |
|
|
* opens the btree. |
71 |
|
|
* |
72 |
|
|
* Parameters: |
73 |
|
|
* fname: filename (NULL for in-memory trees) |
74 |
|
|
* flags: open flag bits |
75 |
|
|
* mode: open permission bits |
76 |
|
|
* b: BTREEINFO pointer |
77 |
|
|
* |
78 |
|
|
* Returns: |
79 |
|
|
* NULL on failure, pointer to DB on success. |
80 |
|
|
* |
81 |
|
|
*/ |
82 |
|
|
DB * |
83 |
|
|
__bt_open(const char *fname, int flags, int mode, const BTREEINFO *openinfo, |
84 |
|
|
int dflags) |
85 |
|
|
{ |
86 |
|
|
struct stat sb; |
87 |
|
|
BTMETA m; |
88 |
|
|
BTREE *t; |
89 |
|
|
BTREEINFO b; |
90 |
|
|
DB *dbp; |
91 |
|
|
pgno_t ncache; |
92 |
|
|
ssize_t nr; |
93 |
|
|
int machine_lorder, saved_errno; |
94 |
|
|
|
95 |
|
|
t = NULL; |
96 |
|
|
|
97 |
|
|
/* |
98 |
|
|
* Intention is to make sure all of the user's selections are okay |
99 |
|
|
* here and then use them without checking. Can't be complete, since |
100 |
|
|
* we don't know the right page size, lorder or flags until the backing |
101 |
|
|
* file is opened. Also, the file's page size can cause the cachesize |
102 |
|
|
* to change. |
103 |
|
|
*/ |
104 |
|
|
machine_lorder = byteorder(); |
105 |
|
|
if (openinfo) { |
106 |
|
|
b = *openinfo; |
107 |
|
|
|
108 |
|
|
/* Flags: R_DUP. */ |
109 |
|
|
if (b.flags & ~(R_DUP)) |
110 |
|
|
goto einval; |
111 |
|
|
|
112 |
|
|
/* |
113 |
|
|
* Page size must be indx_t aligned and >= MINPSIZE. Default |
114 |
|
|
* page size is set farther on, based on the underlying file |
115 |
|
|
* transfer size. |
116 |
|
|
*/ |
117 |
|
|
if (b.psize && |
118 |
|
|
(b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET + 1 || |
119 |
|
|
b.psize & (sizeof(indx_t) - 1))) |
120 |
|
|
goto einval; |
121 |
|
|
|
122 |
|
|
/* Minimum number of keys per page; absolute minimum is 2. */ |
123 |
|
|
if (b.minkeypage) { |
124 |
|
|
if (b.minkeypage < 2) |
125 |
|
|
goto einval; |
126 |
|
|
} else |
127 |
|
|
b.minkeypage = DEFMINKEYPAGE; |
128 |
|
|
|
129 |
|
|
/* If no comparison, use default comparison and prefix. */ |
130 |
|
|
if (b.compare == NULL) { |
131 |
|
|
b.compare = __bt_defcmp; |
132 |
|
|
if (b.prefix == NULL) |
133 |
|
|
b.prefix = __bt_defpfx; |
134 |
|
|
} |
135 |
|
|
|
136 |
|
|
if (b.lorder == 0) |
137 |
|
|
b.lorder = machine_lorder; |
138 |
|
|
} else { |
139 |
|
|
b.compare = __bt_defcmp; |
140 |
|
|
b.cachesize = 0; |
141 |
|
|
b.flags = 0; |
142 |
|
|
b.lorder = machine_lorder; |
143 |
|
|
b.minkeypage = DEFMINKEYPAGE; |
144 |
|
|
b.prefix = __bt_defpfx; |
145 |
|
|
b.psize = 0; |
146 |
|
|
} |
147 |
|
|
|
148 |
|
|
/* Check for the ubiquitous PDP-11. */ |
149 |
|
|
if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN) |
150 |
|
|
goto einval; |
151 |
|
|
|
152 |
|
|
/* Allocate and initialize DB and BTREE structures. */ |
153 |
|
|
if ((t = calloc(1, sizeof(BTREE))) == NULL) |
154 |
|
|
goto err; |
155 |
|
|
t->bt_fd = -1; /* Don't close unopened fd on error. */ |
156 |
|
|
t->bt_lorder = b.lorder; |
157 |
|
|
t->bt_order = NOT; |
158 |
|
|
t->bt_cmp = b.compare; |
159 |
|
|
t->bt_pfx = b.prefix; |
160 |
|
|
t->bt_rfd = -1; |
161 |
|
|
|
162 |
|
|
if ((t->bt_dbp = dbp = calloc(1, sizeof(DB))) == NULL) |
163 |
|
|
goto err; |
164 |
|
|
if (t->bt_lorder != machine_lorder) |
165 |
|
|
F_SET(t, B_NEEDSWAP); |
166 |
|
|
|
167 |
|
|
dbp->type = DB_BTREE; |
168 |
|
|
dbp->internal = t; |
169 |
|
|
dbp->close = __bt_close; |
170 |
|
|
dbp->del = __bt_delete; |
171 |
|
|
dbp->fd = __bt_fd; |
172 |
|
|
dbp->get = __bt_get; |
173 |
|
|
dbp->put = __bt_put; |
174 |
|
|
dbp->seq = __bt_seq; |
175 |
|
|
dbp->sync = __bt_sync; |
176 |
|
|
|
177 |
|
|
/* |
178 |
|
|
* If no file name was supplied, this is an in-memory btree and we |
179 |
|
|
* open a backing temporary file. Otherwise, it's a disk-based tree. |
180 |
|
|
*/ |
181 |
|
|
if (fname) { |
182 |
|
|
switch (flags & O_ACCMODE) { |
183 |
|
|
case O_RDONLY: |
184 |
|
|
F_SET(t, B_RDONLY); |
185 |
|
|
break; |
186 |
|
|
case O_RDWR: |
187 |
|
|
break; |
188 |
|
|
case O_WRONLY: |
189 |
|
|
default: |
190 |
|
|
goto einval; |
191 |
|
|
} |
192 |
|
|
|
193 |
|
|
if ((t->bt_fd = open(fname, flags | O_CLOEXEC, mode)) < 0) |
194 |
|
|
goto err; |
195 |
|
|
|
196 |
|
|
} else { |
197 |
|
|
if ((flags & O_ACCMODE) != O_RDWR) |
198 |
|
|
goto einval; |
199 |
|
|
if ((t->bt_fd = tmp()) == -1) |
200 |
|
|
goto err; |
201 |
|
|
F_SET(t, B_INMEM); |
202 |
|
|
} |
203 |
|
|
|
204 |
|
|
if (fstat(t->bt_fd, &sb)) |
205 |
|
|
goto err; |
206 |
|
|
if (sb.st_size) { |
207 |
|
|
if ((nr = read(t->bt_fd, &m, sizeof(BTMETA))) < 0) |
208 |
|
|
goto err; |
209 |
|
|
if (nr != sizeof(BTMETA)) |
210 |
|
|
goto eftype; |
211 |
|
|
|
212 |
|
|
/* |
213 |
|
|
* Read in the meta-data. This can change the notion of what |
214 |
|
|
* the lorder, page size and flags are, and, when the page size |
215 |
|
|
* changes, the cachesize value can change too. If the user |
216 |
|
|
* specified the wrong byte order for an existing database, we |
217 |
|
|
* don't bother to return an error, we just clear the NEEDSWAP |
218 |
|
|
* bit. |
219 |
|
|
*/ |
220 |
|
|
if (m.magic == BTREEMAGIC) |
221 |
|
|
F_CLR(t, B_NEEDSWAP); |
222 |
|
|
else { |
223 |
|
|
F_SET(t, B_NEEDSWAP); |
224 |
|
|
M_32_SWAP(m.magic); |
225 |
|
|
M_32_SWAP(m.version); |
226 |
|
|
M_32_SWAP(m.psize); |
227 |
|
|
M_32_SWAP(m.free); |
228 |
|
|
M_32_SWAP(m.nrecs); |
229 |
|
|
M_32_SWAP(m.flags); |
230 |
|
|
} |
231 |
|
|
if (m.magic != BTREEMAGIC || m.version != BTREEVERSION) |
232 |
|
|
goto eftype; |
233 |
|
|
if (m.psize < MINPSIZE || m.psize > MAX_PAGE_OFFSET + 1 || |
234 |
|
|
m.psize & (sizeof(indx_t) - 1)) |
235 |
|
|
goto eftype; |
236 |
|
|
if (m.flags & ~SAVEMETA) |
237 |
|
|
goto eftype; |
238 |
|
|
b.psize = m.psize; |
239 |
|
|
F_SET(t, m.flags); |
240 |
|
|
t->bt_free = m.free; |
241 |
|
|
t->bt_nrecs = m.nrecs; |
242 |
|
|
} else { |
243 |
|
|
/* |
244 |
|
|
* Set the page size to the best value for I/O to this file. |
245 |
|
|
* Don't overflow the page offset type. |
246 |
|
|
*/ |
247 |
|
|
if (b.psize == 0) { |
248 |
|
|
b.psize = sb.st_blksize; |
249 |
|
|
if (b.psize < MINPSIZE) |
250 |
|
|
b.psize = MINPSIZE; |
251 |
|
|
if (b.psize > MAX_PAGE_OFFSET + 1) |
252 |
|
|
b.psize = MAX_PAGE_OFFSET + 1; |
253 |
|
|
} |
254 |
|
|
|
255 |
|
|
/* Set flag if duplicates permitted. */ |
256 |
|
|
if (!(b.flags & R_DUP)) |
257 |
|
|
F_SET(t, B_NODUPS); |
258 |
|
|
|
259 |
|
|
t->bt_free = P_INVALID; |
260 |
|
|
t->bt_nrecs = 0; |
261 |
|
|
F_SET(t, B_METADIRTY); |
262 |
|
|
} |
263 |
|
|
|
264 |
|
|
t->bt_psize = b.psize; |
265 |
|
|
|
266 |
|
|
/* Set the cache size; must be a multiple of the page size. */ |
267 |
|
|
if (b.cachesize && b.cachesize & (b.psize - 1)) |
268 |
|
|
b.cachesize += (~b.cachesize & (b.psize - 1)) + 1; |
269 |
|
|
if (b.cachesize < b.psize * MINCACHE) |
270 |
|
|
b.cachesize = b.psize * MINCACHE; |
271 |
|
|
|
272 |
|
|
/* Calculate number of pages to cache. */ |
273 |
|
|
ncache = (b.cachesize + t->bt_psize - 1) / t->bt_psize; |
274 |
|
|
|
275 |
|
|
/* |
276 |
|
|
* The btree data structure requires that at least two keys can fit on |
277 |
|
|
* a page, but other than that there's no fixed requirement. The user |
278 |
|
|
* specified a minimum number per page, and we translated that into the |
279 |
|
|
* number of bytes a key/data pair can use before being placed on an |
280 |
|
|
* overflow page. This calculation includes the page header, the size |
281 |
|
|
* of the index referencing the leaf item and the size of the leaf item |
282 |
|
|
* structure. Also, don't let the user specify a minkeypage such that |
283 |
|
|
* a key/data pair won't fit even if both key and data are on overflow |
284 |
|
|
* pages. |
285 |
|
|
*/ |
286 |
|
|
t->bt_ovflsize = (t->bt_psize - BTDATAOFF) / b.minkeypage - |
287 |
|
|
(sizeof(indx_t) + NBLEAFDBT(0, 0)); |
288 |
|
|
if (t->bt_ovflsize < NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t)) |
289 |
|
|
t->bt_ovflsize = |
290 |
|
|
NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t); |
291 |
|
|
|
292 |
|
|
/* Initialize the buffer pool. */ |
293 |
|
|
if ((t->bt_mp = |
294 |
|
|
mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL) |
295 |
|
|
goto err; |
296 |
|
|
if (!F_ISSET(t, B_INMEM)) |
297 |
|
|
mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t); |
298 |
|
|
|
299 |
|
|
/* Create a root page if new tree. */ |
300 |
|
|
if (nroot(t) == RET_ERROR) |
301 |
|
|
goto err; |
302 |
|
|
|
303 |
|
|
/* Global flags. */ |
304 |
|
|
if (dflags & DB_LOCK) |
305 |
|
|
F_SET(t, B_DB_LOCK); |
306 |
|
|
if (dflags & DB_SHMEM) |
307 |
|
|
F_SET(t, B_DB_SHMEM); |
308 |
|
|
if (dflags & DB_TXN) |
309 |
|
|
F_SET(t, B_DB_TXN); |
310 |
|
|
|
311 |
|
|
return (dbp); |
312 |
|
|
|
313 |
|
|
einval: errno = EINVAL; |
314 |
|
|
goto err; |
315 |
|
|
|
316 |
|
|
eftype: errno = EFTYPE; |
317 |
|
|
goto err; |
318 |
|
|
|
319 |
|
|
err: saved_errno = errno; |
320 |
|
|
if (t) { |
321 |
|
|
free(t->bt_dbp); |
322 |
|
|
if (t->bt_fd != -1) |
323 |
|
|
(void)close(t->bt_fd); |
324 |
|
|
free(t); |
325 |
|
|
} |
326 |
|
|
errno = saved_errno; |
327 |
|
|
return (NULL); |
328 |
|
|
} |
329 |
|
|
|
330 |
|
|
/* |
331 |
|
|
* NROOT -- Create the root of a new tree. |
332 |
|
|
* |
333 |
|
|
* Parameters: |
334 |
|
|
* t: tree |
335 |
|
|
* |
336 |
|
|
* Returns: |
337 |
|
|
* RET_ERROR, RET_SUCCESS |
338 |
|
|
*/ |
339 |
|
|
static int |
340 |
|
|
nroot(BTREE *t) |
341 |
|
|
{ |
342 |
|
|
PAGE *meta, *root; |
343 |
|
|
pgno_t npg; |
344 |
|
|
|
345 |
|
|
if ((root = mpool_get(t->bt_mp, 1, 0)) != NULL) { |
346 |
|
|
if (root->lower == 0 && |
347 |
|
|
root->pgno == 0 && |
348 |
|
|
root->linp[0] == 0) { |
349 |
|
|
mpool_delete(t->bt_mp, root); |
350 |
|
|
errno = EINVAL; |
351 |
|
|
} else { |
352 |
|
|
mpool_put(t->bt_mp, root, 0); |
353 |
|
|
return (RET_SUCCESS); |
354 |
|
|
} |
355 |
|
|
} |
356 |
|
|
if (errno != EINVAL) /* It's OK to not exist. */ |
357 |
|
|
return (RET_ERROR); |
358 |
|
|
errno = 0; |
359 |
|
|
|
360 |
|
|
if ((meta = mpool_new(t->bt_mp, &npg, MPOOL_PAGE_NEXT)) == NULL) |
361 |
|
|
return (RET_ERROR); |
362 |
|
|
|
363 |
|
|
if ((root = mpool_new(t->bt_mp, &npg, MPOOL_PAGE_NEXT)) == NULL) |
364 |
|
|
return (RET_ERROR); |
365 |
|
|
|
366 |
|
|
if (npg != P_ROOT) |
367 |
|
|
return (RET_ERROR); |
368 |
|
|
root->pgno = npg; |
369 |
|
|
root->prevpg = root->nextpg = P_INVALID; |
370 |
|
|
root->lower = BTDATAOFF; |
371 |
|
|
root->upper = t->bt_psize; |
372 |
|
|
root->flags = P_BLEAF; |
373 |
|
|
memset(meta, 0, t->bt_psize); |
374 |
|
|
mpool_put(t->bt_mp, meta, MPOOL_DIRTY); |
375 |
|
|
mpool_put(t->bt_mp, root, MPOOL_DIRTY); |
376 |
|
|
return (RET_SUCCESS); |
377 |
|
|
} |
378 |
|
|
|
379 |
|
|
static int |
380 |
|
|
tmp(void) |
381 |
|
|
{ |
382 |
|
|
sigset_t set, oset; |
383 |
|
|
int fd, len; |
384 |
|
|
char *envtmp = NULL; |
385 |
|
|
char path[PATH_MAX]; |
386 |
|
|
|
387 |
|
|
if (issetugid() == 0) |
388 |
|
|
envtmp = getenv("TMPDIR"); |
389 |
|
|
len = snprintf(path, |
390 |
|
|
sizeof(path), "%s/bt.XXXXXX", envtmp ? envtmp : "/tmp"); |
391 |
|
|
if (len < 0 || len >= sizeof(path)) { |
392 |
|
|
errno = ENAMETOOLONG; |
393 |
|
|
return(-1); |
394 |
|
|
} |
395 |
|
|
|
396 |
|
|
(void)sigfillset(&set); |
397 |
|
|
(void)sigprocmask(SIG_BLOCK, &set, &oset); |
398 |
|
|
if ((fd = mkostemp(path, O_CLOEXEC)) != -1) |
399 |
|
|
(void)unlink(path); |
400 |
|
|
(void)sigprocmask(SIG_SETMASK, &oset, NULL); |
401 |
|
|
return(fd); |
402 |
|
|
} |
403 |
|
|
|
404 |
|
|
static int |
405 |
|
|
byteorder(void) |
406 |
|
|
{ |
407 |
|
|
u_int32_t x; |
408 |
|
|
u_char *p; |
409 |
|
|
|
410 |
|
|
x = 0x01020304; |
411 |
|
|
p = (u_char *)&x; |
412 |
|
|
switch (*p) { |
413 |
|
|
case 1: |
414 |
|
|
return (BIG_ENDIAN); |
415 |
|
|
case 4: |
416 |
|
|
return (LITTLE_ENDIAN); |
417 |
|
|
default: |
418 |
|
|
return (0); |
419 |
|
|
} |
420 |
|
|
} |
421 |
|
|
|
422 |
|
|
int |
423 |
|
|
__bt_fd(const DB *dbp) |
424 |
|
|
{ |
425 |
|
|
BTREE *t; |
426 |
|
|
|
427 |
|
|
t = dbp->internal; |
428 |
|
|
|
429 |
|
|
/* Toss any page pinned across calls. */ |
430 |
|
|
if (t->bt_pinned != NULL) { |
431 |
|
|
mpool_put(t->bt_mp, t->bt_pinned, 0); |
432 |
|
|
t->bt_pinned = NULL; |
433 |
|
|
} |
434 |
|
|
|
435 |
|
|
/* In-memory database can't have a file descriptor. */ |
436 |
|
|
if (F_ISSET(t, B_INMEM)) { |
437 |
|
|
errno = ENOENT; |
438 |
|
|
return (-1); |
439 |
|
|
} |
440 |
|
|
return (t->bt_fd); |
441 |
|
|
} |