1  | 
     | 
     | 
    /*	$OpenBSD: btree.c,v 1.38 2017/05/26 21:23:14 sthen Exp $ */  | 
    
    
    2  | 
     | 
     | 
     | 
    
    
    3  | 
     | 
     | 
    /*  | 
    
    
    4  | 
     | 
     | 
     * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>  | 
    
    
    5  | 
     | 
     | 
     *  | 
    
    
    6  | 
     | 
     | 
     * Permission to use, copy, modify, and distribute this software for any  | 
    
    
    7  | 
     | 
     | 
     * purpose with or without fee is hereby granted, provided that the above  | 
    
    
    8  | 
     | 
     | 
     * copyright notice and this permission notice appear in all copies.  | 
    
    
    9  | 
     | 
     | 
     *  | 
    
    
    10  | 
     | 
     | 
     * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES  | 
    
    
    11  | 
     | 
     | 
     * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF  | 
    
    
    12  | 
     | 
     | 
     * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR  | 
    
    
    13  | 
     | 
     | 
     * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES  | 
    
    
    14  | 
     | 
     | 
     * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN  | 
    
    
    15  | 
     | 
     | 
     * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF  | 
    
    
    16  | 
     | 
     | 
     * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.  | 
    
    
    17  | 
     | 
     | 
     */  | 
    
    
    18  | 
     | 
     | 
     | 
    
    
    19  | 
     | 
     | 
    #include <sys/types.h>  | 
    
    
    20  | 
     | 
     | 
    #include <sys/tree.h>  | 
    
    
    21  | 
     | 
     | 
    #include <sys/stat.h>  | 
    
    
    22  | 
     | 
     | 
    #include <sys/queue.h>  | 
    
    
    23  | 
     | 
     | 
    #include <sys/uio.h>  | 
    
    
    24  | 
     | 
     | 
     | 
    
    
    25  | 
     | 
     | 
    #include <assert.h>  | 
    
    
    26  | 
     | 
     | 
    #include <err.h>  | 
    
    
    27  | 
     | 
     | 
    #include <errno.h>  | 
    
    
    28  | 
     | 
     | 
    #include <fcntl.h>  | 
    
    
    29  | 
     | 
     | 
    #include <stddef.h>  | 
    
    
    30  | 
     | 
     | 
    #include <stdint.h>  | 
    
    
    31  | 
     | 
     | 
    #include <stdio.h>  | 
    
    
    32  | 
     | 
     | 
    #include <stdlib.h>  | 
    
    
    33  | 
     | 
     | 
    #include <string.h>  | 
    
    
    34  | 
     | 
     | 
    #include <time.h>  | 
    
    
    35  | 
     | 
     | 
    #include <unistd.h>  | 
    
    
    36  | 
     | 
     | 
     | 
    
    
    37  | 
     | 
     | 
    #include "btree.h"  | 
    
    
    38  | 
     | 
     | 
     | 
    
    
    39  | 
     | 
     | 
    /* #define DEBUG */  | 
    
    
    40  | 
     | 
     | 
     | 
    
    
    41  | 
     | 
     | 
    #ifdef DEBUG  | 
    
    
    42  | 
     | 
     | 
    # define DPRINTF(...)	do { fprintf(stderr, "%s:%d: ", __func__, __LINE__); \ | 
    
    
    43  | 
     | 
     | 
    			     fprintf(stderr, __VA_ARGS__); \  | 
    
    
    44  | 
     | 
     | 
    			     fprintf(stderr, "\n"); } while(0)  | 
    
    
    45  | 
     | 
     | 
    #else  | 
    
    
    46  | 
     | 
     | 
    # define DPRINTF(...)  | 
    
    
    47  | 
     | 
     | 
    #endif  | 
    
    
    48  | 
     | 
     | 
     | 
    
    
    49  | 
     | 
     | 
    #define PAGESIZE	 4096  | 
    
    
    50  | 
     | 
     | 
    #define BT_MINKEYS	 4  | 
    
    
    51  | 
     | 
     | 
    #define BT_MAGIC	 0xB3DBB3DB  | 
    
    
    52  | 
     | 
     | 
    #define BT_VERSION	 4  | 
    
    
    53  | 
     | 
     | 
    #define MAXKEYSIZE	 255  | 
    
    
    54  | 
     | 
     | 
     | 
    
    
    55  | 
     | 
     | 
    #define P_INVALID	 0xFFFFFFFF  | 
    
    
    56  | 
     | 
     | 
     | 
    
    
    57  | 
     | 
     | 
    #define F_ISSET(w, f)	 (((w) & (f)) == (f))  | 
    
    
    58  | 
     | 
     | 
    #define MINIMUM(a,b)	 ((a) < (b) ? (a) : (b))  | 
    
    
    59  | 
     | 
     | 
     | 
    
    
    60  | 
     | 
     | 
    typedef uint32_t	 pgno_t;  | 
    
    
    61  | 
     | 
     | 
    typedef uint16_t	 indx_t;  | 
    
    
    62  | 
     | 
     | 
     | 
    
    
    63  | 
     | 
     | 
    /* There are four page types: meta, index, leaf and overflow.  | 
    
    
    64  | 
     | 
     | 
     * They all share the same page header.  | 
    
    
    65  | 
     | 
     | 
     */  | 
    
    
    66  | 
     | 
     | 
    struct page {				/* represents an on-disk page */ | 
    
    
    67  | 
     | 
     | 
    	pgno_t		 pgno;		/* page number */  | 
    
    
    68  | 
     | 
     | 
    #define	P_BRANCH	 0x01		/* branch page */  | 
    
    
    69  | 
     | 
     | 
    #define	P_LEAF		 0x02		/* leaf page */  | 
    
    
    70  | 
     | 
     | 
    #define	P_OVERFLOW	 0x04		/* overflow page */  | 
    
    
    71  | 
     | 
     | 
    #define	P_META		 0x08		/* meta page */  | 
    
    
    72  | 
     | 
     | 
    #define	P_HEAD		 0x10		/* header page */  | 
    
    
    73  | 
     | 
     | 
    	uint32_t	 flags;  | 
    
    
    74  | 
     | 
     | 
    #define lower		 b.fb.fb_lower  | 
    
    
    75  | 
     | 
     | 
    #define upper		 b.fb.fb_upper  | 
    
    
    76  | 
     | 
     | 
    #define p_next_pgno	 b.pb_next_pgno  | 
    
    
    77  | 
     | 
     | 
    	union page_bounds { | 
    
    
    78  | 
     | 
     | 
    		struct { | 
    
    
    79  | 
     | 
     | 
    			indx_t	 fb_lower;	/* lower bound of free space */  | 
    
    
    80  | 
     | 
     | 
    			indx_t	 fb_upper;	/* upper bound of free space */  | 
    
    
    81  | 
     | 
     | 
    		} fb;  | 
    
    
    82  | 
     | 
     | 
    		pgno_t		 pb_next_pgno;	/* overflow page linked list */  | 
    
    
    83  | 
     | 
     | 
    	} b;  | 
    
    
    84  | 
     | 
     | 
    	indx_t		 ptrs[1];		/* dynamic size */  | 
    
    
    85  | 
     | 
     | 
    } __packed;  | 
    
    
    86  | 
     | 
     | 
     | 
    
    
    87  | 
     | 
     | 
    #define PAGEHDRSZ	 offsetof(struct page, ptrs)  | 
    
    
    88  | 
     | 
     | 
     | 
    
    
    89  | 
     | 
     | 
    #define NUMKEYSP(p)	 (((p)->lower - PAGEHDRSZ) >> 1)  | 
    
    
    90  | 
     | 
     | 
    #define NUMKEYS(mp)	 (((mp)->page->lower - PAGEHDRSZ) >> 1)  | 
    
    
    91  | 
     | 
     | 
    #define SIZELEFT(mp)	 (indx_t)((mp)->page->upper - (mp)->page->lower)  | 
    
    
    92  | 
     | 
     | 
    #define PAGEFILL(bt, mp) (1000 * ((bt)->head.psize - PAGEHDRSZ - SIZELEFT(mp)) / \  | 
    
    
    93  | 
     | 
     | 
    				((bt)->head.psize - PAGEHDRSZ))  | 
    
    
    94  | 
     | 
     | 
    #define IS_LEAF(mp)	 F_ISSET((mp)->page->flags, P_LEAF)  | 
    
    
    95  | 
     | 
     | 
    #define IS_BRANCH(mp)	 F_ISSET((mp)->page->flags, P_BRANCH)  | 
    
    
    96  | 
     | 
     | 
    #define IS_OVERFLOW(mp)	 F_ISSET((mp)->page->flags, P_OVERFLOW)  | 
    
    
    97  | 
     | 
     | 
     | 
    
    
    98  | 
     | 
     | 
    struct bt_head {				/* header page content */ | 
    
    
    99  | 
     | 
     | 
    	uint32_t	 magic;  | 
    
    
    100  | 
     | 
     | 
    	uint32_t	 version;  | 
    
    
    101  | 
     | 
     | 
    	uint32_t	 flags;  | 
    
    
    102  | 
     | 
     | 
    	uint32_t	 psize;			/* page size */  | 
    
    
    103  | 
     | 
     | 
    } __packed;  | 
    
    
    104  | 
     | 
     | 
     | 
    
    
    105  | 
     | 
     | 
    struct bt_meta {				/* meta (footer) page content */ | 
    
    
    106  | 
     | 
     | 
    #define BT_TOMBSTONE	 0x01			/* file is replaced */  | 
    
    
    107  | 
     | 
     | 
    	uint32_t	 flags;  | 
    
    
    108  | 
     | 
     | 
    	pgno_t		 root;			/* page number of root page */  | 
    
    
    109  | 
     | 
     | 
    	pgno_t		 prev_meta;		/* previous meta page number */  | 
    
    
    110  | 
     | 
     | 
    	time_t		 created_at;  | 
    
    
    111  | 
     | 
     | 
    	uint32_t	 branch_pages;  | 
    
    
    112  | 
     | 
     | 
    	uint32_t	 leaf_pages;  | 
    
    
    113  | 
     | 
     | 
    	uint32_t	 overflow_pages;  | 
    
    
    114  | 
     | 
     | 
    	uint32_t	 revisions;  | 
    
    
    115  | 
     | 
     | 
    	uint32_t	 depth;  | 
    
    
    116  | 
     | 
     | 
    	uint64_t	 entries;  | 
    
    
    117  | 
     | 
     | 
    	unsigned char	 hash[SHA_DIGEST_LENGTH];  | 
    
    
    118  | 
     | 
     | 
    } __packed;  | 
    
    
    119  | 
     | 
     | 
     | 
    
    
    120  | 
     | 
     | 
    struct btkey { | 
    
    
    121  | 
     | 
     | 
    	size_t			 len;  | 
    
    
    122  | 
     | 
     | 
    	char			 str[MAXKEYSIZE];  | 
    
    
    123  | 
     | 
     | 
    };  | 
    
    
    124  | 
     | 
     | 
     | 
    
    
    125  | 
     | 
     | 
    struct mpage {					/* an in-memory cached page */ | 
    
    
    126  | 
     | 
     | 
    	RB_ENTRY(mpage)		 entry;		/* page cache entry */  | 
    
    
    127  | 
     | 
     | 
    	SIMPLEQ_ENTRY(mpage)	 next;		/* queue of dirty pages */  | 
    
    
    128  | 
     | 
     | 
    	TAILQ_ENTRY(mpage)	 lru_next;	/* LRU queue */  | 
    
    
    129  | 
     | 
     | 
    	struct mpage		*parent;	/* NULL if root */  | 
    
    
    130  | 
     | 
     | 
    	unsigned int		 parent_index;	/* keep track of node index */  | 
    
    
    131  | 
     | 
     | 
    	struct btkey		 prefix;  | 
    
    
    132  | 
     | 
     | 
    	struct page		*page;  | 
    
    
    133  | 
     | 
     | 
    	pgno_t			 pgno;		/* copy of page->pgno */  | 
    
    
    134  | 
     | 
     | 
    	short			 ref;		/* increased by cursors */  | 
    
    
    135  | 
     | 
     | 
    	short			 dirty;		/* 1 if on dirty queue */  | 
    
    
    136  | 
     | 
     | 
    };  | 
    
    
    137  | 
     | 
     | 
    RB_HEAD(page_cache, mpage);  | 
    
    
    138  | 
     | 
     | 
    SIMPLEQ_HEAD(dirty_queue, mpage);  | 
    
    
    139  | 
     | 
     | 
    TAILQ_HEAD(lru_queue, mpage);  | 
    
    
    140  | 
     | 
     | 
     | 
    
    
    141  | 
     | 
     | 
    static int		 mpage_cmp(struct mpage *a, struct mpage *b);  | 
    
    
    142  | 
     | 
     | 
    static struct mpage	*mpage_lookup(struct btree *bt, pgno_t pgno);  | 
    
    
    143  | 
     | 
     | 
    static void		 mpage_add(struct btree *bt, struct mpage *mp);  | 
    
    
    144  | 
     | 
     | 
    static void		 mpage_free(struct mpage *mp);  | 
    
    
    145  | 
     | 
     | 
    static void		 mpage_del(struct btree *bt, struct mpage *mp);  | 
    
    
    146  | 
     | 
     | 
    static void		 mpage_flush(struct btree *bt);  | 
    
    
    147  | 
     | 
     | 
    static struct mpage	*mpage_copy(struct btree *bt, struct mpage *mp);  | 
    
    
    148  | 
     | 
     | 
    static void		 mpage_prune(struct btree *bt);  | 
    
    
    149  | 
     | 
     | 
    static void		 mpage_dirty(struct btree *bt, struct mpage *mp);  | 
    
    
    150  | 
     | 
     | 
    static struct mpage	*mpage_touch(struct btree *bt, struct mpage *mp);  | 
    
    
    151  | 
     | 
     | 
     | 
    
    
    152  | 
     | 
     | 
    RB_PROTOTYPE(page_cache, mpage, entry, mpage_cmp);  | 
    
    
    153  | 
     | 
     | 
    RB_GENERATE(page_cache, mpage, entry, mpage_cmp);  | 
    
    
    154  | 
     | 
     | 
     | 
    
    
    155  | 
     | 
     | 
    struct ppage {					/* ordered list of pages */ | 
    
    
    156  | 
     | 
     | 
    	SLIST_ENTRY(ppage)	 entry;  | 
    
    
    157  | 
     | 
     | 
    	struct mpage		*mpage;  | 
    
    
    158  | 
     | 
     | 
    	unsigned int		 ki;		/* cursor index on page */  | 
    
    
    159  | 
     | 
     | 
    };  | 
    
    
    160  | 
     | 
     | 
    SLIST_HEAD(page_stack, ppage);  | 
    
    
    161  | 
     | 
     | 
     | 
    
    
    162  | 
     | 
     | 
    #define CURSOR_EMPTY(c)		 SLIST_EMPTY(&(c)->stack)  | 
    
    
    163  | 
     | 
     | 
    #define CURSOR_TOP(c)		 SLIST_FIRST(&(c)->stack)  | 
    
    
    164  | 
     | 
     | 
    #define CURSOR_POP(c)		 SLIST_REMOVE_HEAD(&(c)->stack, entry)  | 
    
    
    165  | 
     | 
     | 
    #define CURSOR_PUSH(c,p)	 SLIST_INSERT_HEAD(&(c)->stack, p, entry)  | 
    
    
    166  | 
     | 
     | 
     | 
    
    
    167  | 
     | 
     | 
    struct cursor { | 
    
    
    168  | 
     | 
     | 
    	struct btree		*bt;  | 
    
    
    169  | 
     | 
     | 
    	struct btree_txn	*txn;  | 
    
    
    170  | 
     | 
     | 
    	struct page_stack	 stack;		/* stack of parent pages */  | 
    
    
    171  | 
     | 
     | 
    	short			 initialized;	/* 1 if initialized */  | 
    
    
    172  | 
     | 
     | 
    	short			 eof;		/* 1 if end is reached */  | 
    
    
    173  | 
     | 
     | 
    };  | 
    
    
    174  | 
     | 
     | 
     | 
    
    
    175  | 
     | 
     | 
    #define METAHASHLEN	 offsetof(struct bt_meta, hash)  | 
    
    
    176  | 
     | 
     | 
    #define METADATA(p)	 ((void *)((char *)p + PAGEHDRSZ))  | 
    
    
    177  | 
     | 
     | 
     | 
    
    
    178  | 
     | 
     | 
    struct node { | 
    
    
    179  | 
     | 
     | 
    #define n_pgno		 p.np_pgno  | 
    
    
    180  | 
     | 
     | 
    #define n_dsize		 p.np_dsize  | 
    
    
    181  | 
     | 
     | 
    	union { | 
    
    
    182  | 
     | 
     | 
    		pgno_t		 np_pgno;	/* child page number */  | 
    
    
    183  | 
     | 
     | 
    		uint32_t	 np_dsize;	/* leaf data size */  | 
    
    
    184  | 
     | 
     | 
    	}		 p;  | 
    
    
    185  | 
     | 
     | 
    	uint16_t	 ksize;			/* key size */  | 
    
    
    186  | 
     | 
     | 
    #define F_BIGDATA	 0x01			/* data put on overflow page */  | 
    
    
    187  | 
     | 
     | 
    	uint8_t		 flags;  | 
    
    
    188  | 
     | 
     | 
    	char		 data[1];  | 
    
    
    189  | 
     | 
     | 
    } __packed;  | 
    
    
    190  | 
     | 
     | 
     | 
    
    
    191  | 
     | 
     | 
    struct btree_txn { | 
    
    
    192  | 
     | 
     | 
    	pgno_t			 root;		/* current / new root page */  | 
    
    
    193  | 
     | 
     | 
    	pgno_t			 next_pgno;	/* next unallocated page */  | 
    
    
    194  | 
     | 
     | 
    	struct btree		*bt;		/* btree is ref'd */  | 
    
    
    195  | 
     | 
     | 
    	struct dirty_queue	*dirty_queue;	/* modified pages */  | 
    
    
    196  | 
     | 
     | 
    #define BT_TXN_RDONLY		 0x01		/* read-only transaction */  | 
    
    
    197  | 
     | 
     | 
    #define BT_TXN_ERROR		 0x02		/* an error has occurred */  | 
    
    
    198  | 
     | 
     | 
    	unsigned int		 flags;  | 
    
    
    199  | 
     | 
     | 
    };  | 
    
    
    200  | 
     | 
     | 
     | 
    
    
    201  | 
     | 
     | 
    struct btree { | 
    
    
    202  | 
     | 
     | 
    	int			 fd;  | 
    
    
    203  | 
     | 
     | 
    	char			*path;  | 
    
    
    204  | 
     | 
     | 
    #define BT_FIXPADDING		 0x01		/* internal */  | 
    
    
    205  | 
     | 
     | 
    	unsigned int		 flags;  | 
    
    
    206  | 
     | 
     | 
    	bt_cmp_func		 cmp;		/* user compare function */  | 
    
    
    207  | 
     | 
     | 
    	struct bt_head		 head;  | 
    
    
    208  | 
     | 
     | 
    	struct bt_meta		 meta;  | 
    
    
    209  | 
     | 
     | 
    	struct page_cache	*page_cache;  | 
    
    
    210  | 
     | 
     | 
    	struct lru_queue	*lru_queue;  | 
    
    
    211  | 
     | 
     | 
    	struct btree_txn	*txn;		/* current write transaction */  | 
    
    
    212  | 
     | 
     | 
    	int			 ref;		/* increased by cursors & txn */  | 
    
    
    213  | 
     | 
     | 
    	struct btree_stat	 stat;  | 
    
    
    214  | 
     | 
     | 
    	off_t			 size;		/* current file size */  | 
    
    
    215  | 
     | 
     | 
    };  | 
    
    
    216  | 
     | 
     | 
     | 
    
    
    217  | 
     | 
     | 
    #define NODESIZE	 offsetof(struct node, data)  | 
    
    
    218  | 
     | 
     | 
     | 
    
    
    219  | 
     | 
     | 
    #define INDXSIZE(k)	 (NODESIZE + ((k) == NULL ? 0 : (k)->size))  | 
    
    
    220  | 
     | 
     | 
    #define LEAFSIZE(k, d)	 (NODESIZE + (k)->size + (d)->size)  | 
    
    
    221  | 
     | 
     | 
    #define NODEPTRP(p, i)	 ((struct node *)((char *)(p) + (p)->ptrs[i]))  | 
    
    
    222  | 
     | 
     | 
    #define NODEPTR(mp, i)	 NODEPTRP((mp)->page, i)  | 
    
    
    223  | 
     | 
     | 
    #define NODEKEY(node)	 (void *)((node)->data)  | 
    
    
    224  | 
     | 
     | 
    #define NODEDATA(node)	 (void *)((char *)(node)->data + (node)->ksize)  | 
    
    
    225  | 
     | 
     | 
    #define NODEPGNO(node)	 ((node)->p.np_pgno)  | 
    
    
    226  | 
     | 
     | 
    #define NODEDSZ(node)	 ((node)->p.np_dsize)  | 
    
    
    227  | 
     | 
     | 
     | 
    
    
    228  | 
     | 
     | 
    #define BT_COMMIT_PAGES	 64	/* max number of pages to write in one commit */  | 
    
    
    229  | 
     | 
     | 
    #define BT_MAXCACHE_DEF	 1024	/* max number of pages to keep in cache  */  | 
    
    
    230  | 
     | 
     | 
     | 
    
    
    231  | 
     | 
     | 
    static int		 btree_read_page(struct btree *bt, pgno_t pgno,  | 
    
    
    232  | 
     | 
     | 
    			    struct page *page);  | 
    
    
    233  | 
     | 
     | 
    static struct mpage	*btree_get_mpage(struct btree *bt, pgno_t pgno);  | 
    
    
    234  | 
     | 
     | 
    static int		 btree_search_page_root(struct btree *bt,  | 
    
    
    235  | 
     | 
     | 
    			    struct mpage *root, struct btval *key,  | 
    
    
    236  | 
     | 
     | 
    			    struct cursor *cursor, int modify,  | 
    
    
    237  | 
     | 
     | 
    			    struct mpage **mpp);  | 
    
    
    238  | 
     | 
     | 
    static int		 btree_search_page(struct btree *bt,  | 
    
    
    239  | 
     | 
     | 
    			    struct btree_txn *txn, struct btval *key,  | 
    
    
    240  | 
     | 
     | 
    			    struct cursor *cursor, int modify,  | 
    
    
    241  | 
     | 
     | 
    			    struct mpage **mpp);  | 
    
    
    242  | 
     | 
     | 
     | 
    
    
    243  | 
     | 
     | 
    static int		 btree_write_header(struct btree *bt, int fd);  | 
    
    
    244  | 
     | 
     | 
    static int		 btree_read_header(struct btree *bt);  | 
    
    
    245  | 
     | 
     | 
    static int		 btree_is_meta_page(struct page *p);  | 
    
    
    246  | 
     | 
     | 
    static int		 btree_read_meta(struct btree *bt, pgno_t *p_next);  | 
    
    
    247  | 
     | 
     | 
    static int		 btree_write_meta(struct btree *bt, pgno_t root,  | 
    
    
    248  | 
     | 
     | 
    			    unsigned int flags);  | 
    
    
    249  | 
     | 
     | 
    static void		 btree_ref(struct btree *bt);  | 
    
    
    250  | 
     | 
     | 
     | 
    
    
    251  | 
     | 
     | 
    static struct node	*btree_search_node(struct btree *bt, struct mpage *mp,  | 
    
    
    252  | 
     | 
     | 
    			    struct btval *key, int *exactp, unsigned int *kip);  | 
    
    
    253  | 
     | 
     | 
    static int		 btree_add_node(struct btree *bt, struct mpage *mp,  | 
    
    
    254  | 
     | 
     | 
    			    indx_t indx, struct btval *key, struct btval *data,  | 
    
    
    255  | 
     | 
     | 
    			    pgno_t pgno, uint8_t flags);  | 
    
    
    256  | 
     | 
     | 
    static void		 btree_del_node(struct btree *bt, struct mpage *mp,  | 
    
    
    257  | 
     | 
     | 
    			    indx_t indx);  | 
    
    
    258  | 
     | 
     | 
    static int		 btree_read_data(struct btree *bt, struct mpage *mp,  | 
    
    
    259  | 
     | 
     | 
    			    struct node *leaf, struct btval *data);  | 
    
    
    260  | 
     | 
     | 
     | 
    
    
    261  | 
     | 
     | 
    static int		 btree_rebalance(struct btree *bt, struct mpage *mp);  | 
    
    
    262  | 
     | 
     | 
    static int		 btree_update_key(struct btree *bt, struct mpage *mp,  | 
    
    
    263  | 
     | 
     | 
    			    indx_t indx, struct btval *key);  | 
    
    
    264  | 
     | 
     | 
    static int		 btree_adjust_prefix(struct btree *bt,  | 
    
    
    265  | 
     | 
     | 
    			    struct mpage *src, int delta);  | 
    
    
    266  | 
     | 
     | 
    static int		 btree_move_node(struct btree *bt, struct mpage *src,  | 
    
    
    267  | 
     | 
     | 
    			    indx_t srcindx, struct mpage *dst, indx_t dstindx);  | 
    
    
    268  | 
     | 
     | 
    static int		 btree_merge(struct btree *bt, struct mpage *src,  | 
    
    
    269  | 
     | 
     | 
    			    struct mpage *dst);  | 
    
    
    270  | 
     | 
     | 
    static int		 btree_split(struct btree *bt, struct mpage **mpp,  | 
    
    
    271  | 
     | 
     | 
    			    unsigned int *newindxp, struct btval *newkey,  | 
    
    
    272  | 
     | 
     | 
    			    struct btval *newdata, pgno_t newpgno);  | 
    
    
    273  | 
     | 
     | 
    static struct mpage	*btree_new_page(struct btree *bt, uint32_t flags);  | 
    
    
    274  | 
     | 
     | 
    static int		 btree_write_overflow_data(struct btree *bt,  | 
    
    
    275  | 
     | 
     | 
    			    struct page *p, struct btval *data);  | 
    
    
    276  | 
     | 
     | 
     | 
    
    
    277  | 
     | 
     | 
    static void		 cursor_pop_page(struct cursor *cursor);  | 
    
    
    278  | 
     | 
     | 
    static struct ppage	 *cursor_push_page(struct cursor *cursor,  | 
    
    
    279  | 
     | 
     | 
    			    struct mpage *mp);  | 
    
    
    280  | 
     | 
     | 
     | 
    
    
    281  | 
     | 
     | 
    static int		 bt_set_key(struct btree *bt, struct mpage *mp,  | 
    
    
    282  | 
     | 
     | 
    			    struct node *node, struct btval *key);  | 
    
    
    283  | 
     | 
     | 
    static int		 btree_sibling(struct cursor *cursor, int move_right);  | 
    
    
    284  | 
     | 
     | 
    static int		 btree_cursor_next(struct cursor *cursor,  | 
    
    
    285  | 
     | 
     | 
    			    struct btval *key, struct btval *data);  | 
    
    
    286  | 
     | 
     | 
    static int		 btree_cursor_set(struct cursor *cursor,  | 
    
    
    287  | 
     | 
     | 
    			    struct btval *key, struct btval *data, int *exactp);  | 
    
    
    288  | 
     | 
     | 
    static int		 btree_cursor_first(struct cursor *cursor,  | 
    
    
    289  | 
     | 
     | 
    			    struct btval *key, struct btval *data);  | 
    
    
    290  | 
     | 
     | 
     | 
    
    
    291  | 
     | 
     | 
    static void		 bt_reduce_separator(struct btree *bt, struct node *min,  | 
    
    
    292  | 
     | 
     | 
    			    struct btval *sep);  | 
    
    
    293  | 
     | 
     | 
    static void		 remove_prefix(struct btree *bt, struct btval *key,  | 
    
    
    294  | 
     | 
     | 
    			    size_t pfxlen);  | 
    
    
    295  | 
     | 
     | 
    static void		 expand_prefix(struct btree *bt, struct mpage *mp,  | 
    
    
    296  | 
     | 
     | 
    			    indx_t indx, struct btkey *expkey);  | 
    
    
    297  | 
     | 
     | 
    static void		 concat_prefix(struct btree *bt, char *s1, size_t n1,  | 
    
    
    298  | 
     | 
     | 
    			    char *s2, size_t n2, char *cs, size_t *cn);  | 
    
    
    299  | 
     | 
     | 
    static void		 common_prefix(struct btree *bt, struct btkey *min,  | 
    
    
    300  | 
     | 
     | 
    			    struct btkey *max, struct btkey *pfx);  | 
    
    
    301  | 
     | 
     | 
    static void		 find_common_prefix(struct btree *bt, struct mpage *mp);  | 
    
    
    302  | 
     | 
     | 
     | 
    
    
    303  | 
     | 
     | 
    static size_t		 bt_leaf_size(struct btree *bt, struct btval *key,  | 
    
    
    304  | 
     | 
     | 
    			    struct btval *data);  | 
    
    
    305  | 
     | 
     | 
    static size_t		 bt_branch_size(struct btree *bt, struct btval *key);  | 
    
    
    306  | 
     | 
     | 
     | 
    
    
    307  | 
     | 
     | 
    static pgno_t		 btree_compact_tree(struct btree *bt, pgno_t pgno,  | 
    
    
    308  | 
     | 
     | 
    			    struct btree *btc);  | 
    
    
    309  | 
     | 
     | 
     | 
    
    
    310  | 
     | 
     | 
    static int		 memncmp(const void *s1, size_t n1,  | 
    
    
    311  | 
     | 
     | 
    				 const void *s2, size_t n2);  | 
    
    
    312  | 
     | 
     | 
    static int		 memnrcmp(const void *s1, size_t n1,  | 
    
    
    313  | 
     | 
     | 
    				  const void *s2, size_t n2);  | 
    
    
    314  | 
     | 
     | 
     | 
    
    
    315  | 
     | 
     | 
    static int  | 
    
    
    316  | 
     | 
     | 
    memncmp(const void *s1, size_t n1, const void *s2, size_t n2)  | 
    
    
    317  | 
     | 
     | 
    { | 
    
    
    318  | 
     | 
     | 
    	if (n1 < n2) { | 
    
    
    319  | 
     | 
     | 
    		if (memcmp(s1, s2, n1) == 0)  | 
    
    
    320  | 
     | 
     | 
    			return -1;  | 
    
    
    321  | 
     | 
     | 
    	}  | 
    
    
    322  | 
     | 
     | 
    	else if (n1 > n2) { | 
    
    
    323  | 
     | 
     | 
    		if (memcmp(s1, s2, n2) == 0)  | 
    
    
    324  | 
     | 
     | 
    			return 1;  | 
    
    
    325  | 
     | 
     | 
    	}  | 
    
    
    326  | 
     | 
     | 
    	return memcmp(s1, s2, n1);  | 
    
    
    327  | 
     | 
     | 
    }  | 
    
    
    328  | 
     | 
     | 
     | 
    
    
    329  | 
     | 
     | 
    static int  | 
    
    
    330  | 
     | 
     | 
    memnrcmp(const void *s1, size_t n1, const void *s2, size_t n2)  | 
    
    
    331  | 
     | 
     | 
    { | 
    
    
    332  | 
     | 
     | 
    	const unsigned char	*p1;  | 
    
    
    333  | 
     | 
     | 
    	const unsigned char	*p2;  | 
    
    
    334  | 
     | 
     | 
     | 
    
    
    335  | 
     | 
     | 
    	if (n1 == 0)  | 
    
    
    336  | 
     | 
     | 
    		return n2 == 0 ? 0 : -1;  | 
    
    
    337  | 
     | 
     | 
     | 
    
    
    338  | 
     | 
     | 
    	if (n2 == 0)  | 
    
    
    339  | 
     | 
     | 
    		return n1 == 0 ? 0 : 1;  | 
    
    
    340  | 
     | 
     | 
     | 
    
    
    341  | 
     | 
     | 
    	p1 = (const unsigned char *)s1 + n1 - 1;  | 
    
    
    342  | 
     | 
     | 
    	p2 = (const unsigned char *)s2 + n2 - 1;  | 
    
    
    343  | 
     | 
     | 
     | 
    
    
    344  | 
     | 
     | 
    	while (*p1 == *p2) { | 
    
    
    345  | 
     | 
     | 
    		if (p1 == s1)  | 
    
    
    346  | 
     | 
     | 
    			return (p2 == s2) ? 0 : -1;  | 
    
    
    347  | 
     | 
     | 
    		if (p2 == s2)  | 
    
    
    348  | 
     | 
     | 
    			return (p1 == p2) ? 0 : 1;  | 
    
    
    349  | 
     | 
     | 
    		p1--;  | 
    
    
    350  | 
     | 
     | 
    		p2--;  | 
    
    
    351  | 
     | 
     | 
    	}  | 
    
    
    352  | 
     | 
     | 
    	return *p1 - *p2;  | 
    
    
    353  | 
     | 
     | 
    }  | 
    
    
    354  | 
     | 
     | 
     | 
    
    
    355  | 
     | 
     | 
    int  | 
    
    
    356  | 
     | 
     | 
    btree_cmp(struct btree *bt, const struct btval *a, const struct btval *b)  | 
    
    
    357  | 
     | 
     | 
    { | 
    
    
    358  | 
     | 
     | 
    	return bt->cmp(a, b);  | 
    
    
    359  | 
     | 
     | 
    }  | 
    
    
    360  | 
     | 
     | 
     | 
    
    
    361  | 
     | 
     | 
    static void  | 
    
    
    362  | 
     | 
     | 
    common_prefix(struct btree *bt, struct btkey *min, struct btkey *max,  | 
    
    
    363  | 
     | 
     | 
        struct btkey *pfx)  | 
    
    
    364  | 
     | 
     | 
    { | 
    
    
    365  | 
     | 
     | 
    	size_t		 n = 0;  | 
    
    
    366  | 
     | 
     | 
    	char		*p1;  | 
    
    
    367  | 
     | 
     | 
    	char		*p2;  | 
    
    
    368  | 
     | 
     | 
     | 
    
    
    369  | 
     | 
     | 
    	if (min->len == 0 || max->len == 0) { | 
    
    
    370  | 
     | 
     | 
    		pfx->len = 0;  | 
    
    
    371  | 
     | 
     | 
    		return;  | 
    
    
    372  | 
     | 
     | 
    	}  | 
    
    
    373  | 
     | 
     | 
     | 
    
    
    374  | 
     | 
     | 
    	if (F_ISSET(bt->flags, BT_REVERSEKEY)) { | 
    
    
    375  | 
     | 
     | 
    		p1 = min->str + min->len - 1;  | 
    
    
    376  | 
     | 
     | 
    		p2 = max->str + max->len - 1;  | 
    
    
    377  | 
     | 
     | 
     | 
    
    
    378  | 
     | 
     | 
    		while (*p1 == *p2) { | 
    
    
    379  | 
     | 
     | 
    			if (p1 < min->str || p2 < max->str)  | 
    
    
    380  | 
     | 
     | 
    				break;  | 
    
    
    381  | 
     | 
     | 
    			p1--;  | 
    
    
    382  | 
     | 
     | 
    			p2--;  | 
    
    
    383  | 
     | 
     | 
    			n++;  | 
    
    
    384  | 
     | 
     | 
    		}  | 
    
    
    385  | 
     | 
     | 
     | 
    
    
    386  | 
     | 
     | 
    		assert(n <= (int)sizeof(pfx->str));  | 
    
    
    387  | 
     | 
     | 
    		pfx->len = n;  | 
    
    
    388  | 
     | 
     | 
    		bcopy(p2 + 1, pfx->str, n);  | 
    
    
    389  | 
     | 
     | 
    	} else { | 
    
    
    390  | 
     | 
     | 
    		p1 = min->str;  | 
    
    
    391  | 
     | 
     | 
    		p2 = max->str;  | 
    
    
    392  | 
     | 
     | 
     | 
    
    
    393  | 
     | 
     | 
    		while (*p1 == *p2) { | 
    
    
    394  | 
     | 
     | 
    			if (n == min->len || n == max->len)  | 
    
    
    395  | 
     | 
     | 
    				break;  | 
    
    
    396  | 
     | 
     | 
    			p1++;  | 
    
    
    397  | 
     | 
     | 
    			p2++;  | 
    
    
    398  | 
     | 
     | 
    			n++;  | 
    
    
    399  | 
     | 
     | 
    		}  | 
    
    
    400  | 
     | 
     | 
     | 
    
    
    401  | 
     | 
     | 
    		assert(n <= (int)sizeof(pfx->str));  | 
    
    
    402  | 
     | 
     | 
    		pfx->len = n;  | 
    
    
    403  | 
     | 
     | 
    		bcopy(max->str, pfx->str, n);  | 
    
    
    404  | 
     | 
     | 
    	}  | 
    
    
    405  | 
     | 
     | 
    }  | 
    
    
    406  | 
     | 
     | 
     | 
    
    
    407  | 
     | 
     | 
    static void  | 
    
    
    408  | 
     | 
     | 
    remove_prefix(struct btree *bt, struct btval *key, size_t pfxlen)  | 
    
    
    409  | 
     | 
     | 
    { | 
    
    
    410  | 
     | 
     | 
    	if (pfxlen == 0 || bt->cmp != NULL)  | 
    
    
    411  | 
     | 
     | 
    		return;  | 
    
    
    412  | 
     | 
     | 
     | 
    
    
    413  | 
     | 
     | 
    	DPRINTF("removing %zu bytes of prefix from key [%.*s]", pfxlen, | 
    
    
    414  | 
     | 
     | 
    	    (int)key->size, (char *)key->data);  | 
    
    
    415  | 
     | 
     | 
    	assert(pfxlen <= key->size);  | 
    
    
    416  | 
     | 
     | 
    	key->size -= pfxlen;  | 
    
    
    417  | 
     | 
     | 
    	if (!F_ISSET(bt->flags, BT_REVERSEKEY))  | 
    
    
    418  | 
     | 
     | 
    		key->data = (char *)key->data + pfxlen;  | 
    
    
    419  | 
     | 
     | 
    }  | 
    
    
    420  | 
     | 
     | 
     | 
    
    
    421  | 
     | 
     | 
    static void  | 
    
    
    422  | 
     | 
     | 
    expand_prefix(struct btree *bt, struct mpage *mp, indx_t indx,  | 
    
    
    423  | 
     | 
     | 
        struct btkey *expkey)  | 
    
    
    424  | 
     | 
     | 
    { | 
    
    
    425  | 
     | 
     | 
    	struct node	*node;  | 
    
    
    426  | 
     | 
     | 
     | 
    
    
    427  | 
     | 
     | 
    	node = NODEPTR(mp, indx);  | 
    
    
    428  | 
     | 
     | 
    	expkey->len = sizeof(expkey->str);  | 
    
    
    429  | 
     | 
     | 
    	concat_prefix(bt, mp->prefix.str, mp->prefix.len,  | 
    
    
    430  | 
     | 
     | 
    	    NODEKEY(node), node->ksize, expkey->str, &expkey->len);  | 
    
    
    431  | 
     | 
     | 
    }  | 
    
    
    432  | 
     | 
     | 
     | 
    
    
    433  | 
     | 
     | 
    static int  | 
    
    
    434  | 
     | 
     | 
    bt_cmp(struct btree *bt, const struct btval *key1, const struct btval *key2,  | 
    
    
    435  | 
     | 
     | 
        struct btkey *pfx)  | 
    
    
    436  | 
     | 
     | 
    { | 
    
    
    437  | 
     | 
     | 
    	if (F_ISSET(bt->flags, BT_REVERSEKEY))  | 
    
    
    438  | 
     | 
     | 
    		return memnrcmp(key1->data, key1->size - pfx->len,  | 
    
    
    439  | 
     | 
     | 
    		    key2->data, key2->size);  | 
    
    
    440  | 
     | 
     | 
    	else  | 
    
    
    441  | 
     | 
     | 
    		return memncmp((char *)key1->data + pfx->len, key1->size - pfx->len,  | 
    
    
    442  | 
     | 
     | 
    		    key2->data, key2->size);  | 
    
    
    443  | 
     | 
     | 
    }  | 
    
    
    444  | 
     | 
     | 
     | 
    
    
    445  | 
     | 
     | 
    void  | 
    
    
    446  | 
     | 
     | 
    btval_reset(struct btval *btv)  | 
    
    
    447  | 
     | 
     | 
    { | 
    
    
    448  | 
     | 
     | 
    	if (btv) { | 
    
    
    449  | 
     | 
     | 
    		if (btv->mp)  | 
    
    
    450  | 
     | 
     | 
    			btv->mp->ref--;  | 
    
    
    451  | 
     | 
     | 
    		if (btv->free_data)  | 
    
    
    452  | 
     | 
     | 
    			free(btv->data);  | 
    
    
    453  | 
     | 
     | 
    		memset(btv, 0, sizeof(*btv));  | 
    
    
    454  | 
     | 
     | 
    	}  | 
    
    
    455  | 
     | 
     | 
    }  | 
    
    
    456  | 
     | 
     | 
     | 
    
    
    457  | 
     | 
     | 
    static int  | 
    
    
    458  | 
     | 
     | 
    mpage_cmp(struct mpage *a, struct mpage *b)  | 
    
    
    459  | 
     | 
     | 
    { | 
    
    
    460  | 
     | 
     | 
    	if (a->pgno > b->pgno)  | 
    
    
    461  | 
     | 
     | 
    		return 1;  | 
    
    
    462  | 
     | 
     | 
    	if (a->pgno < b->pgno)  | 
    
    
    463  | 
     | 
     | 
    		return -1;  | 
    
    
    464  | 
     | 
     | 
    	return 0;  | 
    
    
    465  | 
     | 
     | 
    }  | 
    
    
    466  | 
     | 
     | 
     | 
    
    
    467  | 
     | 
     | 
    static struct mpage *  | 
    
    
    468  | 
     | 
     | 
    mpage_lookup(struct btree *bt, pgno_t pgno)  | 
    
    
    469  | 
     | 
     | 
    { | 
    
    
    470  | 
     | 
     | 
    	struct mpage	 find, *mp;  | 
    
    
    471  | 
     | 
     | 
     | 
    
    
    472  | 
     | 
     | 
    	find.pgno = pgno;  | 
    
    
    473  | 
     | 
     | 
    	mp = RB_FIND(page_cache, bt->page_cache, &find);  | 
    
    
    474  | 
     | 
     | 
    	if (mp) { | 
    
    
    475  | 
     | 
     | 
    		bt->stat.hits++;  | 
    
    
    476  | 
     | 
     | 
    		/* Update LRU queue. Move page to the end. */  | 
    
    
    477  | 
     | 
     | 
    		TAILQ_REMOVE(bt->lru_queue, mp, lru_next);  | 
    
    
    478  | 
     | 
     | 
    		TAILQ_INSERT_TAIL(bt->lru_queue, mp, lru_next);  | 
    
    
    479  | 
     | 
     | 
    	}  | 
    
    
    480  | 
     | 
     | 
    	return mp;  | 
    
    
    481  | 
     | 
     | 
    }  | 
    
    
    482  | 
     | 
     | 
     | 
    
    
    483  | 
     | 
     | 
    static void  | 
    
    
    484  | 
     | 
     | 
    mpage_add(struct btree *bt, struct mpage *mp)  | 
    
    
    485  | 
     | 
     | 
    { | 
    
    
    486  | 
     | 
     | 
    	assert(RB_INSERT(page_cache, bt->page_cache, mp) == NULL);  | 
    
    
    487  | 
     | 
     | 
    	bt->stat.cache_size++;  | 
    
    
    488  | 
     | 
     | 
    	TAILQ_INSERT_TAIL(bt->lru_queue, mp, lru_next);  | 
    
    
    489  | 
     | 
     | 
    }  | 
    
    
    490  | 
     | 
     | 
     | 
    
    
    491  | 
     | 
     | 
    static void  | 
    
    
    492  | 
     | 
     | 
    mpage_free(struct mpage *mp)  | 
    
    
    493  | 
     | 
     | 
    { | 
    
    
    494  | 
     | 
     | 
    	if (mp != NULL) { | 
    
    
    495  | 
     | 
     | 
    		free(mp->page);  | 
    
    
    496  | 
     | 
     | 
    		free(mp);  | 
    
    
    497  | 
     | 
     | 
    	}  | 
    
    
    498  | 
     | 
     | 
    }  | 
    
    
    499  | 
     | 
     | 
     | 
    
    
    500  | 
     | 
     | 
    static void  | 
    
    
    501  | 
     | 
     | 
    mpage_del(struct btree *bt, struct mpage *mp)  | 
    
    
    502  | 
     | 
     | 
    { | 
    
    
    503  | 
     | 
     | 
    	assert(RB_REMOVE(page_cache, bt->page_cache, mp) == mp);  | 
    
    
    504  | 
     | 
     | 
    	assert(bt->stat.cache_size > 0);  | 
    
    
    505  | 
     | 
     | 
    	bt->stat.cache_size--;  | 
    
    
    506  | 
     | 
     | 
    	TAILQ_REMOVE(bt->lru_queue, mp, lru_next);  | 
    
    
    507  | 
     | 
     | 
    }  | 
    
    
    508  | 
     | 
     | 
     | 
    
    
    509  | 
     | 
     | 
    static void  | 
    
    
    510  | 
     | 
     | 
    mpage_flush(struct btree *bt)  | 
    
    
    511  | 
     | 
     | 
    { | 
    
    
    512  | 
     | 
     | 
    	struct mpage	*mp;  | 
    
    
    513  | 
     | 
     | 
     | 
    
    
    514  | 
     | 
     | 
    	while ((mp = RB_MIN(page_cache, bt->page_cache)) != NULL) { | 
    
    
    515  | 
     | 
     | 
    		mpage_del(bt, mp);  | 
    
    
    516  | 
     | 
     | 
    		mpage_free(mp);  | 
    
    
    517  | 
     | 
     | 
    	}  | 
    
    
    518  | 
     | 
     | 
    }  | 
    
    
    519  | 
     | 
     | 
     | 
    
    
    520  | 
     | 
     | 
    static struct mpage *  | 
    
    
    521  | 
     | 
     | 
    mpage_copy(struct btree *bt, struct mpage *mp)  | 
    
    
    522  | 
     | 
     | 
    { | 
    
    
    523  | 
     | 
     | 
    	struct mpage	*copy;  | 
    
    
    524  | 
     | 
     | 
     | 
    
    
    525  | 
     | 
     | 
    	if ((copy = calloc(1, sizeof(*copy))) == NULL)  | 
    
    
    526  | 
     | 
     | 
    		return NULL;  | 
    
    
    527  | 
     | 
     | 
    	if ((copy->page = malloc(bt->head.psize)) == NULL) { | 
    
    
    528  | 
     | 
     | 
    		free(copy);  | 
    
    
    529  | 
     | 
     | 
    		return NULL;  | 
    
    
    530  | 
     | 
     | 
    	}  | 
    
    
    531  | 
     | 
     | 
    	bcopy(mp->page, copy->page, bt->head.psize);  | 
    
    
    532  | 
     | 
     | 
    	bcopy(&mp->prefix, ©->prefix, sizeof(mp->prefix));  | 
    
    
    533  | 
     | 
     | 
    	copy->parent = mp->parent;  | 
    
    
    534  | 
     | 
     | 
    	copy->parent_index = mp->parent_index;  | 
    
    
    535  | 
     | 
     | 
    	copy->pgno = mp->pgno;  | 
    
    
    536  | 
     | 
     | 
     | 
    
    
    537  | 
     | 
     | 
    	return copy;  | 
    
    
    538  | 
     | 
     | 
    }  | 
    
    
    539  | 
     | 
     | 
     | 
    
    
    540  | 
     | 
     | 
    /* Remove the least recently used memory pages until the cache size is  | 
    
    
    541  | 
     | 
     | 
     * within the configured bounds. Pages referenced by cursors or returned  | 
    
    
    542  | 
     | 
     | 
     * key/data are not pruned.  | 
    
    
    543  | 
     | 
     | 
     */  | 
    
    
    544  | 
     | 
     | 
    static void  | 
    
    
    545  | 
     | 
     | 
    mpage_prune(struct btree *bt)  | 
    
    
    546  | 
     | 
     | 
    { | 
    
    
    547  | 
     | 
     | 
    	struct mpage	*mp, *next;  | 
    
    
    548  | 
     | 
     | 
     | 
    
    
    549  | 
     | 
     | 
    	for (mp = TAILQ_FIRST(bt->lru_queue); mp; mp = next) { | 
    
    
    550  | 
     | 
     | 
    		if (bt->stat.cache_size <= bt->stat.max_cache)  | 
    
    
    551  | 
     | 
     | 
    			break;  | 
    
    
    552  | 
     | 
     | 
    		next = TAILQ_NEXT(mp, lru_next);  | 
    
    
    553  | 
     | 
     | 
    		if (!mp->dirty && mp->ref <= 0) { | 
    
    
    554  | 
     | 
     | 
    			mpage_del(bt, mp);  | 
    
    
    555  | 
     | 
     | 
    			mpage_free(mp);  | 
    
    
    556  | 
     | 
     | 
    		}  | 
    
    
    557  | 
     | 
     | 
    	}  | 
    
    
    558  | 
     | 
     | 
    }  | 
    
    
    559  | 
     | 
     | 
     | 
    
    
    560  | 
     | 
     | 
    /* Mark a page as dirty and push it on the dirty queue.  | 
    
    
    561  | 
     | 
     | 
     */  | 
    
    
    562  | 
     | 
     | 
    static void  | 
    
    
    563  | 
     | 
     | 
    mpage_dirty(struct btree *bt, struct mpage *mp)  | 
    
    
    564  | 
     | 
     | 
    { | 
    
    
    565  | 
     | 
     | 
    	assert(bt != NULL);  | 
    
    
    566  | 
     | 
     | 
    	assert(bt->txn != NULL);  | 
    
    
    567  | 
     | 
     | 
     | 
    
    
    568  | 
     | 
     | 
    	if (!mp->dirty) { | 
    
    
    569  | 
     | 
     | 
    		mp->dirty = 1;  | 
    
    
    570  | 
     | 
     | 
    		SIMPLEQ_INSERT_TAIL(bt->txn->dirty_queue, mp, next);  | 
    
    
    571  | 
     | 
     | 
    	}  | 
    
    
    572  | 
     | 
     | 
    }  | 
    
    
    573  | 
     | 
     | 
     | 
    
    
    574  | 
     | 
     | 
    /* Touch a page: make it dirty and re-insert into tree with updated pgno.  | 
    
    
    575  | 
     | 
     | 
     */  | 
    
    
    576  | 
     | 
     | 
    static struct mpage *  | 
    
    
    577  | 
     | 
     | 
    mpage_touch(struct btree *bt, struct mpage *mp)  | 
    
    
    578  | 
     | 
     | 
    { | 
    
    
    579  | 
     | 
     | 
    	assert(bt != NULL);  | 
    
    
    580  | 
     | 
     | 
    	assert(bt->txn != NULL);  | 
    
    
    581  | 
     | 
     | 
    	assert(mp != NULL);  | 
    
    
    582  | 
     | 
     | 
     | 
    
    
    583  | 
     | 
     | 
    	if (!mp->dirty) { | 
    
    
    584  | 
     | 
     | 
    		DPRINTF("touching page %u -> %u", mp->pgno, bt->txn->next_pgno); | 
    
    
    585  | 
     | 
     | 
    		if (mp->ref == 0)  | 
    
    
    586  | 
     | 
     | 
    			mpage_del(bt, mp);  | 
    
    
    587  | 
     | 
     | 
    		else { | 
    
    
    588  | 
     | 
     | 
    			if ((mp = mpage_copy(bt, mp)) == NULL)  | 
    
    
    589  | 
     | 
     | 
    				return NULL;  | 
    
    
    590  | 
     | 
     | 
    		}  | 
    
    
    591  | 
     | 
     | 
    		mp->pgno = mp->page->pgno = bt->txn->next_pgno++;  | 
    
    
    592  | 
     | 
     | 
    		mpage_dirty(bt, mp);  | 
    
    
    593  | 
     | 
     | 
    		mpage_add(bt, mp);  | 
    
    
    594  | 
     | 
     | 
     | 
    
    
    595  | 
     | 
     | 
    		/* Update the page number to new touched page. */  | 
    
    
    596  | 
     | 
     | 
    		if (mp->parent != NULL)  | 
    
    
    597  | 
     | 
     | 
    			NODEPGNO(NODEPTR(mp->parent,  | 
    
    
    598  | 
     | 
     | 
    			    mp->parent_index)) = mp->pgno;  | 
    
    
    599  | 
     | 
     | 
    	}  | 
    
    
    600  | 
     | 
     | 
     | 
    
    
    601  | 
     | 
     | 
    	return mp;  | 
    
    
    602  | 
     | 
     | 
    }  | 
    
    
    603  | 
     | 
     | 
     | 
    
    
    604  | 
     | 
     | 
    static int  | 
    
    
    605  | 
     | 
     | 
    btree_read_page(struct btree *bt, pgno_t pgno, struct page *page)  | 
    
    
    606  | 
     | 
     | 
    { | 
    
    
    607  | 
     | 
     | 
    	ssize_t		 rc;  | 
    
    
    608  | 
     | 
     | 
     | 
    
    
    609  | 
     | 
     | 
    	DPRINTF("reading page %u", pgno); | 
    
    
    610  | 
     | 
     | 
    	bt->stat.reads++;  | 
    
    
    611  | 
     | 
     | 
    	if ((rc = pread(bt->fd, page, bt->head.psize, (off_t)pgno*bt->head.psize)) == 0) { | 
    
    
    612  | 
     | 
     | 
    		DPRINTF("page %u doesn't exist", pgno); | 
    
    
    613  | 
     | 
     | 
    		errno = ENOENT;  | 
    
    
    614  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    615  | 
     | 
     | 
    	} else if (rc != (ssize_t)bt->head.psize) { | 
    
    
    616  | 
     | 
     | 
    		if (rc > 0)  | 
    
    
    617  | 
     | 
     | 
    			errno = EINVAL;  | 
    
    
    618  | 
     | 
     | 
    		DPRINTF("read: %s", strerror(errno)); | 
    
    
    619  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    620  | 
     | 
     | 
    	}  | 
    
    
    621  | 
     | 
     | 
     | 
    
    
    622  | 
     | 
     | 
    	if (page->pgno != pgno) { | 
    
    
    623  | 
     | 
     | 
    		DPRINTF("page numbers don't match: %u != %u", pgno, page->pgno); | 
    
    
    624  | 
     | 
     | 
    		errno = EINVAL;  | 
    
    
    625  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    626  | 
     | 
     | 
    	}  | 
    
    
    627  | 
     | 
     | 
     | 
    
    
    628  | 
     | 
     | 
    	DPRINTF("page %u has flags 0x%X", pgno, page->flags); | 
    
    
    629  | 
     | 
     | 
     | 
    
    
    630  | 
     | 
     | 
    	return BT_SUCCESS;  | 
    
    
    631  | 
     | 
     | 
    }  | 
    
    
    632  | 
     | 
     | 
     | 
    
    
    633  | 
     | 
     | 
    int  | 
    
    
    634  | 
     | 
     | 
    btree_sync(struct btree *bt)  | 
    
    
    635  | 
     | 
     | 
    { | 
    
    
    636  | 
     | 
     | 
    	if (!F_ISSET(bt->flags, BT_NOSYNC))  | 
    
    
    637  | 
     | 
     | 
    		return fsync(bt->fd);  | 
    
    
    638  | 
     | 
     | 
    	return 0;  | 
    
    
    639  | 
     | 
     | 
    }  | 
    
    
    640  | 
     | 
     | 
     | 
    
    
    641  | 
     | 
     | 
    struct btree_txn *  | 
    
    
    642  | 
     | 
     | 
    btree_txn_begin(struct btree *bt, int rdonly)  | 
    
    
    643  | 
     | 
     | 
    { | 
    
    
    644  | 
     | 
     | 
    	struct btree_txn	*txn;  | 
    
    
    645  | 
     | 
     | 
     | 
    
    
    646  | 
     | 
     | 
    	if (!rdonly && bt->txn != NULL) { | 
    
    
    647  | 
     | 
     | 
    		DPRINTF("write transaction already begun"); | 
    
    
    648  | 
     | 
     | 
    		errno = EBUSY;  | 
    
    
    649  | 
     | 
     | 
    		return NULL;  | 
    
    
    650  | 
     | 
     | 
    	}  | 
    
    
    651  | 
     | 
     | 
     | 
    
    
    652  | 
     | 
     | 
    	if ((txn = calloc(1, sizeof(*txn))) == NULL) { | 
    
    
    653  | 
     | 
     | 
    		DPRINTF("calloc: %s", strerror(errno)); | 
    
    
    654  | 
     | 
     | 
    		return NULL;  | 
    
    
    655  | 
     | 
     | 
    	}  | 
    
    
    656  | 
     | 
     | 
     | 
    
    
    657  | 
     | 
     | 
    	if (rdonly) { | 
    
    
    658  | 
     | 
     | 
    		txn->flags |= BT_TXN_RDONLY;  | 
    
    
    659  | 
     | 
     | 
    	} else { | 
    
    
    660  | 
     | 
     | 
    		txn->dirty_queue = calloc(1, sizeof(*txn->dirty_queue));  | 
    
    
    661  | 
     | 
     | 
    		if (txn->dirty_queue == NULL) { | 
    
    
    662  | 
     | 
     | 
    			free(txn);  | 
    
    
    663  | 
     | 
     | 
    			return NULL;  | 
    
    
    664  | 
     | 
     | 
    		}  | 
    
    
    665  | 
     | 
     | 
    		SIMPLEQ_INIT(txn->dirty_queue);  | 
    
    
    666  | 
     | 
     | 
     | 
    
    
    667  | 
     | 
     | 
    		DPRINTF("taking write lock on txn %p", txn); | 
    
    
    668  | 
     | 
     | 
    		if (flock(bt->fd, LOCK_EX | LOCK_NB) != 0) { | 
    
    
    669  | 
     | 
     | 
    			DPRINTF("flock: %s", strerror(errno)); | 
    
    
    670  | 
     | 
     | 
    			errno = EBUSY;  | 
    
    
    671  | 
     | 
     | 
    			free(txn->dirty_queue);  | 
    
    
    672  | 
     | 
     | 
    			free(txn);  | 
    
    
    673  | 
     | 
     | 
    			return NULL;  | 
    
    
    674  | 
     | 
     | 
    		}  | 
    
    
    675  | 
     | 
     | 
    		bt->txn = txn;  | 
    
    
    676  | 
     | 
     | 
    	}  | 
    
    
    677  | 
     | 
     | 
     | 
    
    
    678  | 
     | 
     | 
    	txn->bt = bt;  | 
    
    
    679  | 
     | 
     | 
    	btree_ref(bt);  | 
    
    
    680  | 
     | 
     | 
     | 
    
    
    681  | 
     | 
     | 
    	if (btree_read_meta(bt, &txn->next_pgno) != BT_SUCCESS) { | 
    
    
    682  | 
     | 
     | 
    		btree_txn_abort(txn);  | 
    
    
    683  | 
     | 
     | 
    		return NULL;  | 
    
    
    684  | 
     | 
     | 
    	}  | 
    
    
    685  | 
     | 
     | 
     | 
    
    
    686  | 
     | 
     | 
    	txn->root = bt->meta.root;  | 
    
    
    687  | 
     | 
     | 
    	DPRINTF("begin transaction on btree %p, root page %u", bt, txn->root); | 
    
    
    688  | 
     | 
     | 
     | 
    
    
    689  | 
     | 
     | 
    	return txn;  | 
    
    
    690  | 
     | 
     | 
    }  | 
    
    
    691  | 
     | 
     | 
     | 
    
    
    692  | 
     | 
     | 
    void  | 
    
    
    693  | 
     | 
     | 
    btree_txn_abort(struct btree_txn *txn)  | 
    
    
    694  | 
     | 
     | 
    { | 
    
    
    695  | 
     | 
     | 
    	struct mpage	*mp;  | 
    
    
    696  | 
     | 
     | 
    	struct btree	*bt;  | 
    
    
    697  | 
     | 
     | 
     | 
    
    
    698  | 
     | 
     | 
    	if (txn == NULL)  | 
    
    
    699  | 
     | 
     | 
    		return;  | 
    
    
    700  | 
     | 
     | 
     | 
    
    
    701  | 
     | 
     | 
    	bt = txn->bt;  | 
    
    
    702  | 
     | 
     | 
    	DPRINTF("abort transaction on btree %p, root page %u", bt, txn->root); | 
    
    
    703  | 
     | 
     | 
     | 
    
    
    704  | 
     | 
     | 
    	if (!F_ISSET(txn->flags, BT_TXN_RDONLY)) { | 
    
    
    705  | 
     | 
     | 
    		/* Discard all dirty pages.  | 
    
    
    706  | 
     | 
     | 
    		 */  | 
    
    
    707  | 
     | 
     | 
    		while (!SIMPLEQ_EMPTY(txn->dirty_queue)) { | 
    
    
    708  | 
     | 
     | 
    			mp = SIMPLEQ_FIRST(txn->dirty_queue);  | 
    
    
    709  | 
     | 
     | 
    			assert(mp->ref == 0);	/* cursors should be closed */  | 
    
    
    710  | 
     | 
     | 
    			mpage_del(bt, mp);  | 
    
    
    711  | 
     | 
     | 
    			SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next);  | 
    
    
    712  | 
     | 
     | 
    			mpage_free(mp);  | 
    
    
    713  | 
     | 
     | 
    		}  | 
    
    
    714  | 
     | 
     | 
     | 
    
    
    715  | 
     | 
     | 
    		DPRINTF("releasing write lock on txn %p", txn); | 
    
    
    716  | 
     | 
     | 
    		txn->bt->txn = NULL;  | 
    
    
    717  | 
     | 
     | 
    		if (flock(txn->bt->fd, LOCK_UN) != 0) { | 
    
    
    718  | 
     | 
     | 
    			DPRINTF("failed to unlock fd %d: %s", | 
    
    
    719  | 
     | 
     | 
    			    txn->bt->fd, strerror(errno));  | 
    
    
    720  | 
     | 
     | 
    		}  | 
    
    
    721  | 
     | 
     | 
    		free(txn->dirty_queue);  | 
    
    
    722  | 
     | 
     | 
    	}  | 
    
    
    723  | 
     | 
     | 
     | 
    
    
    724  | 
     | 
     | 
    	btree_close(txn->bt);  | 
    
    
    725  | 
     | 
     | 
    	free(txn);  | 
    
    
    726  | 
     | 
     | 
    }  | 
    
    
    727  | 
     | 
     | 
     | 
    
    
    728  | 
     | 
     | 
    int  | 
    
    
    729  | 
     | 
     | 
    btree_txn_commit(struct btree_txn *txn)  | 
    
    
    730  | 
     | 
     | 
    { | 
    
    
    731  | 
     | 
     | 
    	int		 n, done;  | 
    
    
    732  | 
     | 
     | 
    	ssize_t		 rc;  | 
    
    
    733  | 
     | 
     | 
    	off_t		 size;  | 
    
    
    734  | 
     | 
     | 
    	struct mpage	*mp;  | 
    
    
    735  | 
     | 
     | 
    	struct btree	*bt;  | 
    
    
    736  | 
     | 
     | 
    	struct iovec	 iov[BT_COMMIT_PAGES];  | 
    
    
    737  | 
     | 
     | 
     | 
    
    
    738  | 
     | 
     | 
    	assert(txn != NULL);  | 
    
    
    739  | 
     | 
     | 
    	assert(txn->bt != NULL);  | 
    
    
    740  | 
     | 
     | 
     | 
    
    
    741  | 
     | 
     | 
    	bt = txn->bt;  | 
    
    
    742  | 
     | 
     | 
     | 
    
    
    743  | 
     | 
     | 
    	if (F_ISSET(txn->flags, BT_TXN_RDONLY)) { | 
    
    
    744  | 
     | 
     | 
    		DPRINTF("attempt to commit read-only transaction"); | 
    
    
    745  | 
     | 
     | 
    		btree_txn_abort(txn);  | 
    
    
    746  | 
     | 
     | 
    		errno = EPERM;  | 
    
    
    747  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    748  | 
     | 
     | 
    	}  | 
    
    
    749  | 
     | 
     | 
     | 
    
    
    750  | 
     | 
     | 
    	if (txn != bt->txn) { | 
    
    
    751  | 
     | 
     | 
    		DPRINTF("attempt to commit unknown transaction"); | 
    
    
    752  | 
     | 
     | 
    		btree_txn_abort(txn);  | 
    
    
    753  | 
     | 
     | 
    		errno = EINVAL;  | 
    
    
    754  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    755  | 
     | 
     | 
    	}  | 
    
    
    756  | 
     | 
     | 
     | 
    
    
    757  | 
     | 
     | 
    	if (F_ISSET(txn->flags, BT_TXN_ERROR)) { | 
    
    
    758  | 
     | 
     | 
    		DPRINTF("error flag is set, can't commit"); | 
    
    
    759  | 
     | 
     | 
    		btree_txn_abort(txn);  | 
    
    
    760  | 
     | 
     | 
    		errno = EINVAL;  | 
    
    
    761  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    762  | 
     | 
     | 
    	}  | 
    
    
    763  | 
     | 
     | 
     | 
    
    
    764  | 
     | 
     | 
    	if (SIMPLEQ_EMPTY(txn->dirty_queue))  | 
    
    
    765  | 
     | 
     | 
    		goto done;  | 
    
    
    766  | 
     | 
     | 
     | 
    
    
    767  | 
     | 
     | 
    	if (F_ISSET(bt->flags, BT_FIXPADDING)) { | 
    
    
    768  | 
     | 
     | 
    		size = lseek(bt->fd, 0, SEEK_END);  | 
    
    
    769  | 
     | 
     | 
    		size += bt->head.psize - (size % bt->head.psize);  | 
    
    
    770  | 
     | 
     | 
    		DPRINTF("extending to multiple of page size: %llu", size); | 
    
    
    771  | 
     | 
     | 
    		if (ftruncate(bt->fd, size) != 0) { | 
    
    
    772  | 
     | 
     | 
    			DPRINTF("ftruncate: %s", strerror(errno)); | 
    
    
    773  | 
     | 
     | 
    			btree_txn_abort(txn);  | 
    
    
    774  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    775  | 
     | 
     | 
    		}  | 
    
    
    776  | 
     | 
     | 
    		bt->flags &= ~BT_FIXPADDING;  | 
    
    
    777  | 
     | 
     | 
    	}  | 
    
    
    778  | 
     | 
     | 
     | 
    
    
    779  | 
     | 
     | 
    	DPRINTF("committing transaction on btree %p, root page %u", | 
    
    
    780  | 
     | 
     | 
    	    bt, txn->root);  | 
    
    
    781  | 
     | 
     | 
     | 
    
    
    782  | 
     | 
     | 
    	/* Commit up to BT_COMMIT_PAGES dirty pages to disk until done.  | 
    
    
    783  | 
     | 
     | 
    	 */  | 
    
    
    784  | 
     | 
     | 
    	do { | 
    
    
    785  | 
     | 
     | 
    		n = 0;  | 
    
    
    786  | 
     | 
     | 
    		done = 1;  | 
    
    
    787  | 
     | 
     | 
    		SIMPLEQ_FOREACH(mp, txn->dirty_queue, next) { | 
    
    
    788  | 
     | 
     | 
    			DPRINTF("committing page %u", mp->pgno); | 
    
    
    789  | 
     | 
     | 
    			iov[n].iov_len = bt->head.psize;  | 
    
    
    790  | 
     | 
     | 
    			iov[n].iov_base = mp->page;  | 
    
    
    791  | 
     | 
     | 
    			if (++n >= BT_COMMIT_PAGES) { | 
    
    
    792  | 
     | 
     | 
    				done = 0;  | 
    
    
    793  | 
     | 
     | 
    				break;  | 
    
    
    794  | 
     | 
     | 
    			}  | 
    
    
    795  | 
     | 
     | 
    		}  | 
    
    
    796  | 
     | 
     | 
     | 
    
    
    797  | 
     | 
     | 
    		if (n == 0)  | 
    
    
    798  | 
     | 
     | 
    			break;  | 
    
    
    799  | 
     | 
     | 
     | 
    
    
    800  | 
     | 
     | 
    		DPRINTF("committing %u dirty pages", n); | 
    
    
    801  | 
     | 
     | 
    		rc = writev(bt->fd, iov, n);  | 
    
    
    802  | 
     | 
     | 
    		if (rc != (ssize_t)bt->head.psize*n) { | 
    
    
    803  | 
     | 
     | 
    			if (rc > 0)  | 
    
    
    804  | 
     | 
     | 
    				DPRINTF("short write, filesystem full?"); | 
    
    
    805  | 
     | 
     | 
    			else  | 
    
    
    806  | 
     | 
     | 
    				DPRINTF("writev: %s", strerror(errno)); | 
    
    
    807  | 
     | 
     | 
    			btree_txn_abort(txn);  | 
    
    
    808  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    809  | 
     | 
     | 
    		}  | 
    
    
    810  | 
     | 
     | 
     | 
    
    
    811  | 
     | 
     | 
    		/* Remove the dirty flag from the written pages.  | 
    
    
    812  | 
     | 
     | 
    		 */  | 
    
    
    813  | 
     | 
     | 
    		while (!SIMPLEQ_EMPTY(txn->dirty_queue)) { | 
    
    
    814  | 
     | 
     | 
    			mp = SIMPLEQ_FIRST(txn->dirty_queue);  | 
    
    
    815  | 
     | 
     | 
    			mp->dirty = 0;  | 
    
    
    816  | 
     | 
     | 
    			SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next);  | 
    
    
    817  | 
     | 
     | 
    			if (--n == 0)  | 
    
    
    818  | 
     | 
     | 
    				break;  | 
    
    
    819  | 
     | 
     | 
    		}  | 
    
    
    820  | 
     | 
     | 
    	} while (!done);  | 
    
    
    821  | 
     | 
     | 
     | 
    
    
    822  | 
     | 
     | 
    	if (btree_sync(bt) != 0 ||  | 
    
    
    823  | 
     | 
     | 
    	    btree_write_meta(bt, txn->root, 0) != BT_SUCCESS ||  | 
    
    
    824  | 
     | 
     | 
    	    btree_sync(bt) != 0) { | 
    
    
    825  | 
     | 
     | 
    		btree_txn_abort(txn);  | 
    
    
    826  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    827  | 
     | 
     | 
    	}  | 
    
    
    828  | 
     | 
     | 
     | 
    
    
    829  | 
     | 
     | 
    done:  | 
    
    
    830  | 
     | 
     | 
    	mpage_prune(bt);  | 
    
    
    831  | 
     | 
     | 
    	btree_txn_abort(txn);  | 
    
    
    832  | 
     | 
     | 
     | 
    
    
    833  | 
     | 
     | 
    	return BT_SUCCESS;  | 
    
    
    834  | 
     | 
     | 
    }  | 
    
    
    835  | 
     | 
     | 
     | 
    
    
    836  | 
     | 
     | 
    static int  | 
    
    
    837  | 
     | 
     | 
    btree_write_header(struct btree *bt, int fd)  | 
    
    
    838  | 
     | 
     | 
    { | 
    
    
    839  | 
     | 
     | 
    	struct stat	 sb;  | 
    
    
    840  | 
     | 
     | 
    	struct bt_head	*h;  | 
    
    
    841  | 
     | 
     | 
    	struct page	*p;  | 
    
    
    842  | 
     | 
     | 
    	ssize_t		 rc;  | 
    
    
    843  | 
     | 
     | 
    	unsigned int	 psize;  | 
    
    
    844  | 
     | 
     | 
     | 
    
    
    845  | 
     | 
     | 
    	DPRINTF("writing header page"); | 
    
    
    846  | 
     | 
     | 
    	assert(bt != NULL);  | 
    
    
    847  | 
     | 
     | 
     | 
    
    
    848  | 
     | 
     | 
    	/*  | 
    
    
    849  | 
     | 
     | 
    	 * Ask stat for 'optimal blocksize for I/O', but cap to fit in indx_t.  | 
    
    
    850  | 
     | 
     | 
    	 */  | 
    
    
    851  | 
     | 
     | 
    	if (fstat(fd, &sb) == 0)  | 
    
    
    852  | 
     | 
     | 
    		psize = MINIMUM(32*1024, sb.st_blksize);  | 
    
    
    853  | 
     | 
     | 
    	else  | 
    
    
    854  | 
     | 
     | 
    		psize = PAGESIZE;  | 
    
    
    855  | 
     | 
     | 
     | 
    
    
    856  | 
     | 
     | 
    	if ((p = calloc(1, psize)) == NULL)  | 
    
    
    857  | 
     | 
     | 
    		return -1;  | 
    
    
    858  | 
     | 
     | 
    	p->flags = P_HEAD;  | 
    
    
    859  | 
     | 
     | 
     | 
    
    
    860  | 
     | 
     | 
    	h = METADATA(p);  | 
    
    
    861  | 
     | 
     | 
    	h->magic = BT_MAGIC;  | 
    
    
    862  | 
     | 
     | 
    	h->version = BT_VERSION;  | 
    
    
    863  | 
     | 
     | 
    	h->psize = psize;  | 
    
    
    864  | 
     | 
     | 
    	bcopy(h, &bt->head, sizeof(*h));  | 
    
    
    865  | 
     | 
     | 
     | 
    
    
    866  | 
     | 
     | 
    	rc = write(fd, p, bt->head.psize);  | 
    
    
    867  | 
     | 
     | 
    	free(p);  | 
    
    
    868  | 
     | 
     | 
    	if (rc != (ssize_t)bt->head.psize) { | 
    
    
    869  | 
     | 
     | 
    		if (rc > 0)  | 
    
    
    870  | 
     | 
     | 
    			DPRINTF("short write, filesystem full?"); | 
    
    
    871  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    872  | 
     | 
     | 
    	}  | 
    
    
    873  | 
     | 
     | 
     | 
    
    
    874  | 
     | 
     | 
    	return BT_SUCCESS;  | 
    
    
    875  | 
     | 
     | 
    }  | 
    
    
    876  | 
     | 
     | 
     | 
    
    
    877  | 
     | 
     | 
    static int  | 
    
    
    878  | 
     | 
     | 
    btree_read_header(struct btree *bt)  | 
    
    
    879  | 
     | 
     | 
    { | 
    
    
    880  | 
     | 
     | 
    	char		 page[PAGESIZE];  | 
    
    
    881  | 
     | 
     | 
    	struct page	*p;  | 
    
    
    882  | 
     | 
     | 
    	struct bt_head	*h;  | 
    
    
    883  | 
     | 
     | 
    	int		 rc;  | 
    
    
    884  | 
     | 
     | 
     | 
    
    
    885  | 
     | 
     | 
    	assert(bt != NULL);  | 
    
    
    886  | 
     | 
     | 
     | 
    
    
    887  | 
     | 
     | 
    	/* We don't know the page size yet, so use a minimum value.  | 
    
    
    888  | 
     | 
     | 
    	 */  | 
    
    
    889  | 
     | 
     | 
     | 
    
    
    890  | 
     | 
     | 
    	if ((rc = pread(bt->fd, page, PAGESIZE, 0)) == 0) { | 
    
    
    891  | 
     | 
     | 
    		errno = ENOENT;  | 
    
    
    892  | 
     | 
     | 
    		return -1;  | 
    
    
    893  | 
     | 
     | 
    	} else if (rc != PAGESIZE) { | 
    
    
    894  | 
     | 
     | 
    		if (rc > 0)  | 
    
    
    895  | 
     | 
     | 
    			errno = EINVAL;  | 
    
    
    896  | 
     | 
     | 
    		DPRINTF("read: %s", strerror(errno)); | 
    
    
    897  | 
     | 
     | 
    		return -1;  | 
    
    
    898  | 
     | 
     | 
    	}  | 
    
    
    899  | 
     | 
     | 
     | 
    
    
    900  | 
     | 
     | 
    	p = (struct page *)page;  | 
    
    
    901  | 
     | 
     | 
     | 
    
    
    902  | 
     | 
     | 
    	if (!F_ISSET(p->flags, P_HEAD)) { | 
    
    
    903  | 
     | 
     | 
    		DPRINTF("page %d not a header page", p->pgno); | 
    
    
    904  | 
     | 
     | 
    		errno = EINVAL;  | 
    
    
    905  | 
     | 
     | 
    		return -1;  | 
    
    
    906  | 
     | 
     | 
    	}  | 
    
    
    907  | 
     | 
     | 
     | 
    
    
    908  | 
     | 
     | 
    	h = METADATA(p);  | 
    
    
    909  | 
     | 
     | 
    	if (h->magic != BT_MAGIC) { | 
    
    
    910  | 
     | 
     | 
    		DPRINTF("header has invalid magic"); | 
    
    
    911  | 
     | 
     | 
    		errno = EINVAL;  | 
    
    
    912  | 
     | 
     | 
    		return -1;  | 
    
    
    913  | 
     | 
     | 
    	}  | 
    
    
    914  | 
     | 
     | 
     | 
    
    
    915  | 
     | 
     | 
    	if (h->version != BT_VERSION) { | 
    
    
    916  | 
     | 
     | 
    		DPRINTF("database is version %u, expected version %u", | 
    
    
    917  | 
     | 
     | 
    		    bt->head.version, BT_VERSION);  | 
    
    
    918  | 
     | 
     | 
    		errno = EINVAL;  | 
    
    
    919  | 
     | 
     | 
    		return -1;  | 
    
    
    920  | 
     | 
     | 
    	}  | 
    
    
    921  | 
     | 
     | 
     | 
    
    
    922  | 
     | 
     | 
    	bcopy(h, &bt->head, sizeof(*h));  | 
    
    
    923  | 
     | 
     | 
    	return 0;  | 
    
    
    924  | 
     | 
     | 
    }  | 
    
    
    925  | 
     | 
     | 
     | 
    
    
    926  | 
     | 
     | 
    static int  | 
    
    
    927  | 
     | 
     | 
    btree_write_meta(struct btree *bt, pgno_t root, unsigned int flags)  | 
    
    
    928  | 
     | 
     | 
    { | 
    
    
    929  | 
     | 
     | 
    	struct mpage	*mp;  | 
    
    
    930  | 
     | 
     | 
    	struct bt_meta	*meta;  | 
    
    
    931  | 
     | 
     | 
    	ssize_t		 rc;  | 
    
    
    932  | 
     | 
     | 
     | 
    
    
    933  | 
     | 
     | 
    	DPRINTF("writing meta page for root page %u", root); | 
    
    
    934  | 
     | 
     | 
     | 
    
    
    935  | 
     | 
     | 
    	assert(bt != NULL);  | 
    
    
    936  | 
     | 
     | 
    	assert(bt->txn != NULL);  | 
    
    
    937  | 
     | 
     | 
     | 
    
    
    938  | 
     | 
     | 
    	if ((mp = btree_new_page(bt, P_META)) == NULL)  | 
    
    
    939  | 
     | 
     | 
    		return -1;  | 
    
    
    940  | 
     | 
     | 
     | 
    
    
    941  | 
     | 
     | 
    	bt->meta.prev_meta = bt->meta.root;  | 
    
    
    942  | 
     | 
     | 
    	bt->meta.root = root;  | 
    
    
    943  | 
     | 
     | 
    	bt->meta.flags = flags;  | 
    
    
    944  | 
     | 
     | 
    	bt->meta.created_at = time(0);  | 
    
    
    945  | 
     | 
     | 
    	bt->meta.revisions++;  | 
    
    
    946  | 
     | 
     | 
    	SHA1((unsigned char *)&bt->meta, METAHASHLEN, bt->meta.hash);  | 
    
    
    947  | 
     | 
     | 
     | 
    
    
    948  | 
     | 
     | 
    	/* Copy the meta data changes to the new meta page. */  | 
    
    
    949  | 
     | 
     | 
    	meta = METADATA(mp->page);  | 
    
    
    950  | 
     | 
     | 
    	bcopy(&bt->meta, meta, sizeof(*meta));  | 
    
    
    951  | 
     | 
     | 
     | 
    
    
    952  | 
     | 
     | 
    	rc = write(bt->fd, mp->page, bt->head.psize);  | 
    
    
    953  | 
     | 
     | 
    	mp->dirty = 0;  | 
    
    
    954  | 
     | 
     | 
    	SIMPLEQ_REMOVE_HEAD(bt->txn->dirty_queue, next);  | 
    
    
    955  | 
     | 
     | 
    	if (rc != (ssize_t)bt->head.psize) { | 
    
    
    956  | 
     | 
     | 
    		if (rc > 0)  | 
    
    
    957  | 
     | 
     | 
    			DPRINTF("short write, filesystem full?"); | 
    
    
    958  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    959  | 
     | 
     | 
    	}  | 
    
    
    960  | 
     | 
     | 
     | 
    
    
    961  | 
     | 
     | 
    	if ((bt->size = lseek(bt->fd, 0, SEEK_END)) == -1) { | 
    
    
    962  | 
     | 
     | 
    		DPRINTF("failed to update file size: %s", strerror(errno)); | 
    
    
    963  | 
     | 
     | 
    		bt->size = 0;  | 
    
    
    964  | 
     | 
     | 
    	}  | 
    
    
    965  | 
     | 
     | 
     | 
    
    
    966  | 
     | 
     | 
    	return BT_SUCCESS;  | 
    
    
    967  | 
     | 
     | 
    }  | 
    
    
    968  | 
     | 
     | 
     | 
    
    
    969  | 
     | 
     | 
    /* Returns true if page p is a valid meta page, false otherwise.  | 
    
    
    970  | 
     | 
     | 
     */  | 
    
    
    971  | 
     | 
     | 
    static int  | 
    
    
    972  | 
     | 
     | 
    btree_is_meta_page(struct page *p)  | 
    
    
    973  | 
     | 
     | 
    { | 
    
    
    974  | 
     | 
     | 
    	struct bt_meta	*m;  | 
    
    
    975  | 
     | 
     | 
    	unsigned char	 hash[SHA_DIGEST_LENGTH];  | 
    
    
    976  | 
     | 
     | 
     | 
    
    
    977  | 
     | 
     | 
    	m = METADATA(p);  | 
    
    
    978  | 
     | 
     | 
    	if (!F_ISSET(p->flags, P_META)) { | 
    
    
    979  | 
     | 
     | 
    		DPRINTF("page %d not a meta page", p->pgno); | 
    
    
    980  | 
     | 
     | 
    		errno = EINVAL;  | 
    
    
    981  | 
     | 
     | 
    		return 0;  | 
    
    
    982  | 
     | 
     | 
    	}  | 
    
    
    983  | 
     | 
     | 
     | 
    
    
    984  | 
     | 
     | 
    	if (m->root >= p->pgno && m->root != P_INVALID) { | 
    
    
    985  | 
     | 
     | 
    		DPRINTF("page %d points to an invalid root page", p->pgno); | 
    
    
    986  | 
     | 
     | 
    		errno = EINVAL;  | 
    
    
    987  | 
     | 
     | 
    		return 0;  | 
    
    
    988  | 
     | 
     | 
    	}  | 
    
    
    989  | 
     | 
     | 
     | 
    
    
    990  | 
     | 
     | 
    	SHA1((unsigned char *)m, METAHASHLEN, hash);  | 
    
    
    991  | 
     | 
     | 
    	if (bcmp(hash, m->hash, SHA_DIGEST_LENGTH) != 0) { | 
    
    
    992  | 
     | 
     | 
    		DPRINTF("page %d has an invalid digest", p->pgno); | 
    
    
    993  | 
     | 
     | 
    		errno = EINVAL;  | 
    
    
    994  | 
     | 
     | 
    		return 0;  | 
    
    
    995  | 
     | 
     | 
    	}  | 
    
    
    996  | 
     | 
     | 
     | 
    
    
    997  | 
     | 
     | 
    	return 1;  | 
    
    
    998  | 
     | 
     | 
    }  | 
    
    
    999  | 
     | 
     | 
     | 
    
    
    1000  | 
     | 
     | 
    static int  | 
    
    
    1001  | 
     | 
     | 
    btree_read_meta(struct btree *bt, pgno_t *p_next)  | 
    
    
    1002  | 
     | 
     | 
    { | 
    
    
    1003  | 
     | 
     | 
    	struct mpage	*mp;  | 
    
    
    1004  | 
     | 
     | 
    	struct bt_meta	*meta;  | 
    
    
    1005  | 
     | 
     | 
    	pgno_t		 meta_pgno, next_pgno;  | 
    
    
    1006  | 
     | 
     | 
    	off_t		 size;  | 
    
    
    1007  | 
     | 
     | 
     | 
    
    
    1008  | 
     | 
     | 
    	assert(bt != NULL);  | 
    
    
    1009  | 
     | 
     | 
     | 
    
    
    1010  | 
     | 
     | 
    	if ((size = lseek(bt->fd, 0, SEEK_END)) == -1)  | 
    
    
    1011  | 
     | 
     | 
    		goto fail;  | 
    
    
    1012  | 
     | 
     | 
     | 
    
    
    1013  | 
     | 
     | 
    	DPRINTF("btree_read_meta: size = %llu", size); | 
    
    
    1014  | 
     | 
     | 
     | 
    
    
    1015  | 
     | 
     | 
    	if (size < bt->size) { | 
    
    
    1016  | 
     | 
     | 
    		DPRINTF("file has shrunk!"); | 
    
    
    1017  | 
     | 
     | 
    		errno = EIO;  | 
    
    
    1018  | 
     | 
     | 
    		goto fail;  | 
    
    
    1019  | 
     | 
     | 
    	}  | 
    
    
    1020  | 
     | 
     | 
     | 
    
    
    1021  | 
     | 
     | 
    	if (size == bt->head.psize) {		/* there is only the header */ | 
    
    
    1022  | 
     | 
     | 
    		if (p_next != NULL)  | 
    
    
    1023  | 
     | 
     | 
    			*p_next = 1;  | 
    
    
    1024  | 
     | 
     | 
    		return BT_SUCCESS;		/* new file */  | 
    
    
    1025  | 
     | 
     | 
    	}  | 
    
    
    1026  | 
     | 
     | 
     | 
    
    
    1027  | 
     | 
     | 
    	next_pgno = size / bt->head.psize;  | 
    
    
    1028  | 
     | 
     | 
    	if (next_pgno == 0) { | 
    
    
    1029  | 
     | 
     | 
    		DPRINTF("corrupt file"); | 
    
    
    1030  | 
     | 
     | 
    		errno = EIO;  | 
    
    
    1031  | 
     | 
     | 
    		goto fail;  | 
    
    
    1032  | 
     | 
     | 
    	}  | 
    
    
    1033  | 
     | 
     | 
     | 
    
    
    1034  | 
     | 
     | 
    	meta_pgno = next_pgno - 1;  | 
    
    
    1035  | 
     | 
     | 
     | 
    
    
    1036  | 
     | 
     | 
    	if (size % bt->head.psize != 0) { | 
    
    
    1037  | 
     | 
     | 
    		DPRINTF("filesize not a multiple of the page size!"); | 
    
    
    1038  | 
     | 
     | 
    		bt->flags |= BT_FIXPADDING;  | 
    
    
    1039  | 
     | 
     | 
    		next_pgno++;  | 
    
    
    1040  | 
     | 
     | 
    	}  | 
    
    
    1041  | 
     | 
     | 
     | 
    
    
    1042  | 
     | 
     | 
    	if (p_next != NULL)  | 
    
    
    1043  | 
     | 
     | 
    		*p_next = next_pgno;  | 
    
    
    1044  | 
     | 
     | 
     | 
    
    
    1045  | 
     | 
     | 
    	if (size == bt->size) { | 
    
    
    1046  | 
     | 
     | 
    		DPRINTF("size unchanged, keeping current meta page"); | 
    
    
    1047  | 
     | 
     | 
    		if (F_ISSET(bt->meta.flags, BT_TOMBSTONE)) { | 
    
    
    1048  | 
     | 
     | 
    			DPRINTF("file is dead"); | 
    
    
    1049  | 
     | 
     | 
    			errno = ESTALE;  | 
    
    
    1050  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    1051  | 
     | 
     | 
    		} else  | 
    
    
    1052  | 
     | 
     | 
    			return BT_SUCCESS;  | 
    
    
    1053  | 
     | 
     | 
    	}  | 
    
    
    1054  | 
     | 
     | 
    	bt->size = size;  | 
    
    
    1055  | 
     | 
     | 
     | 
    
    
    1056  | 
     | 
     | 
    	while (meta_pgno > 0) { | 
    
    
    1057  | 
     | 
     | 
    		if ((mp = btree_get_mpage(bt, meta_pgno)) == NULL)  | 
    
    
    1058  | 
     | 
     | 
    			break;  | 
    
    
    1059  | 
     | 
     | 
    		if (btree_is_meta_page(mp->page)) { | 
    
    
    1060  | 
     | 
     | 
    			meta = METADATA(mp->page);  | 
    
    
    1061  | 
     | 
     | 
    			DPRINTF("flags = 0x%x", meta->flags); | 
    
    
    1062  | 
     | 
     | 
    			if (F_ISSET(meta->flags, BT_TOMBSTONE)) { | 
    
    
    1063  | 
     | 
     | 
    				DPRINTF("file is dead"); | 
    
    
    1064  | 
     | 
     | 
    				errno = ESTALE;  | 
    
    
    1065  | 
     | 
     | 
    				return BT_FAIL;  | 
    
    
    1066  | 
     | 
     | 
    			} else { | 
    
    
    1067  | 
     | 
     | 
    				/* Make copy of last meta page. */  | 
    
    
    1068  | 
     | 
     | 
    				bcopy(meta, &bt->meta, sizeof(bt->meta));  | 
    
    
    1069  | 
     | 
     | 
    				return BT_SUCCESS;  | 
    
    
    1070  | 
     | 
     | 
    			}  | 
    
    
    1071  | 
     | 
     | 
    		}  | 
    
    
    1072  | 
     | 
     | 
    		--meta_pgno;	/* scan backwards to first valid meta page */  | 
    
    
    1073  | 
     | 
     | 
    	}  | 
    
    
    1074  | 
     | 
     | 
     | 
    
    
    1075  | 
     | 
     | 
    	errno = EIO;  | 
    
    
    1076  | 
     | 
     | 
    fail:  | 
    
    
    1077  | 
     | 
     | 
    	if (p_next != NULL)  | 
    
    
    1078  | 
     | 
     | 
    		*p_next = P_INVALID;  | 
    
    
    1079  | 
     | 
     | 
    	return BT_FAIL;  | 
    
    
    1080  | 
     | 
     | 
    }  | 
    
    
    1081  | 
     | 
     | 
     | 
    
    
    1082  | 
     | 
     | 
    struct btree *  | 
    
    
    1083  | 
     | 
     | 
    btree_open_fd(int fd, unsigned int flags)  | 
    
    
    1084  | 
     | 
     | 
    { | 
    
    
    1085  | 
     | 
     | 
    	struct btree	*bt;  | 
    
    
    1086  | 
     | 
     | 
    	int		 fl;  | 
    
    
    1087  | 
     | 
     | 
     | 
    
    
    1088  | 
     | 
     | 
    	fl = fcntl(fd, F_GETFL);  | 
    
    
    1089  | 
     | 
     | 
    	if (fcntl(fd, F_SETFL, fl | O_APPEND) == -1)  | 
    
    
    1090  | 
     | 
     | 
    		return NULL;  | 
    
    
    1091  | 
     | 
     | 
     | 
    
    
    1092  | 
     | 
     | 
    	if ((bt = calloc(1, sizeof(*bt))) == NULL)  | 
    
    
    1093  | 
     | 
     | 
    		return NULL;  | 
    
    
    1094  | 
     | 
     | 
    	bt->fd = fd;  | 
    
    
    1095  | 
     | 
     | 
    	bt->flags = flags;  | 
    
    
    1096  | 
     | 
     | 
    	bt->flags &= ~BT_FIXPADDING;  | 
    
    
    1097  | 
     | 
     | 
    	bt->ref = 1;  | 
    
    
    1098  | 
     | 
     | 
    	bt->meta.root = P_INVALID;  | 
    
    
    1099  | 
     | 
     | 
     | 
    
    
    1100  | 
     | 
     | 
    	if ((bt->page_cache = calloc(1, sizeof(*bt->page_cache))) == NULL)  | 
    
    
    1101  | 
     | 
     | 
    		goto fail;  | 
    
    
    1102  | 
     | 
     | 
    	bt->stat.max_cache = BT_MAXCACHE_DEF;  | 
    
    
    1103  | 
     | 
     | 
    	RB_INIT(bt->page_cache);  | 
    
    
    1104  | 
     | 
     | 
     | 
    
    
    1105  | 
     | 
     | 
    	if ((bt->lru_queue = calloc(1, sizeof(*bt->lru_queue))) == NULL)  | 
    
    
    1106  | 
     | 
     | 
    		goto fail;  | 
    
    
    1107  | 
     | 
     | 
    	TAILQ_INIT(bt->lru_queue);  | 
    
    
    1108  | 
     | 
     | 
     | 
    
    
    1109  | 
     | 
     | 
    	if (btree_read_header(bt) != 0) { | 
    
    
    1110  | 
     | 
     | 
    		if (errno != ENOENT)  | 
    
    
    1111  | 
     | 
     | 
    			goto fail;  | 
    
    
    1112  | 
     | 
     | 
    		DPRINTF("new database"); | 
    
    
    1113  | 
     | 
     | 
    		btree_write_header(bt, bt->fd);  | 
    
    
    1114  | 
     | 
     | 
    	}  | 
    
    
    1115  | 
     | 
     | 
     | 
    
    
    1116  | 
     | 
     | 
    	if (btree_read_meta(bt, NULL) != 0)  | 
    
    
    1117  | 
     | 
     | 
    		goto fail;  | 
    
    
    1118  | 
     | 
     | 
     | 
    
    
    1119  | 
     | 
     | 
    	DPRINTF("opened database version %u, pagesize %u", | 
    
    
    1120  | 
     | 
     | 
    	    bt->head.version, bt->head.psize);  | 
    
    
    1121  | 
     | 
     | 
    	DPRINTF("timestamp: %s", ctime(&bt->meta.created_at)); | 
    
    
    1122  | 
     | 
     | 
    	DPRINTF("depth: %u", bt->meta.depth); | 
    
    
    1123  | 
     | 
     | 
    	DPRINTF("entries: %llu", bt->meta.entries); | 
    
    
    1124  | 
     | 
     | 
    	DPRINTF("revisions: %u", bt->meta.revisions); | 
    
    
    1125  | 
     | 
     | 
    	DPRINTF("branch pages: %u", bt->meta.branch_pages); | 
    
    
    1126  | 
     | 
     | 
    	DPRINTF("leaf pages: %u", bt->meta.leaf_pages); | 
    
    
    1127  | 
     | 
     | 
    	DPRINTF("overflow pages: %u", bt->meta.overflow_pages); | 
    
    
    1128  | 
     | 
     | 
    	DPRINTF("root: %u", bt->meta.root); | 
    
    
    1129  | 
     | 
     | 
    	DPRINTF("previous meta page: %u", bt->meta.prev_meta); | 
    
    
    1130  | 
     | 
     | 
     | 
    
    
    1131  | 
     | 
     | 
    	return bt;  | 
    
    
    1132  | 
     | 
     | 
     | 
    
    
    1133  | 
     | 
     | 
    fail:  | 
    
    
    1134  | 
     | 
     | 
    	free(bt->lru_queue);  | 
    
    
    1135  | 
     | 
     | 
    	free(bt->page_cache);  | 
    
    
    1136  | 
     | 
     | 
    	free(bt);  | 
    
    
    1137  | 
     | 
     | 
    	return NULL;  | 
    
    
    1138  | 
     | 
     | 
    }  | 
    
    
    1139  | 
     | 
     | 
     | 
    
    
    1140  | 
     | 
     | 
    struct btree *  | 
    
    
    1141  | 
     | 
     | 
    btree_open(const char *path, unsigned int flags, mode_t mode)  | 
    
    
    1142  | 
     | 
     | 
    { | 
    
    
    1143  | 
     | 
     | 
    	int		 fd, oflags;  | 
    
    
    1144  | 
     | 
     | 
    	struct btree	*bt;  | 
    
    
    1145  | 
     | 
     | 
     | 
    
    
    1146  | 
     | 
     | 
    	if (F_ISSET(flags, BT_RDONLY))  | 
    
    
    1147  | 
     | 
     | 
    		oflags = O_RDONLY;  | 
    
    
    1148  | 
     | 
     | 
    	else  | 
    
    
    1149  | 
     | 
     | 
    		oflags = O_RDWR | O_CREAT | O_APPEND;  | 
    
    
    1150  | 
     | 
     | 
     | 
    
    
    1151  | 
     | 
     | 
    	if ((fd = open(path, oflags, mode)) == -1)  | 
    
    
    1152  | 
     | 
     | 
    		return NULL;  | 
    
    
    1153  | 
     | 
     | 
     | 
    
    
    1154  | 
     | 
     | 
    	if ((bt = btree_open_fd(fd, flags)) == NULL)  | 
    
    
    1155  | 
     | 
     | 
    		close(fd);  | 
    
    
    1156  | 
     | 
     | 
    	else { | 
    
    
    1157  | 
     | 
     | 
    		bt->path = strdup(path);  | 
    
    
    1158  | 
     | 
     | 
    		DPRINTF("opened btree %p", bt); | 
    
    
    1159  | 
     | 
     | 
    	}  | 
    
    
    1160  | 
     | 
     | 
     | 
    
    
    1161  | 
     | 
     | 
    	return bt;  | 
    
    
    1162  | 
     | 
     | 
    }  | 
    
    
    1163  | 
     | 
     | 
     | 
    
    
    1164  | 
     | 
     | 
    static void  | 
    
    
    1165  | 
     | 
     | 
    btree_ref(struct btree *bt)  | 
    
    
    1166  | 
     | 
     | 
    { | 
    
    
    1167  | 
     | 
     | 
    	bt->ref++;  | 
    
    
    1168  | 
     | 
     | 
    	DPRINTF("ref is now %d on btree %p", bt->ref, bt); | 
    
    
    1169  | 
     | 
     | 
    }  | 
    
    
    1170  | 
     | 
     | 
     | 
    
    
    1171  | 
     | 
     | 
    void  | 
    
    
    1172  | 
     | 
     | 
    btree_close(struct btree *bt)  | 
    
    
    1173  | 
     | 
     | 
    { | 
    
    
    1174  | 
     | 
     | 
    	if (bt == NULL)  | 
    
    
    1175  | 
     | 
     | 
    		return;  | 
    
    
    1176  | 
     | 
     | 
     | 
    
    
    1177  | 
     | 
     | 
    	if (--bt->ref == 0) { | 
    
    
    1178  | 
     | 
     | 
    		DPRINTF("ref is zero, closing btree %p", bt); | 
    
    
    1179  | 
     | 
     | 
    		close(bt->fd);  | 
    
    
    1180  | 
     | 
     | 
    		mpage_flush(bt);  | 
    
    
    1181  | 
     | 
     | 
    		free(bt->lru_queue);  | 
    
    
    1182  | 
     | 
     | 
    		free(bt->path);  | 
    
    
    1183  | 
     | 
     | 
    		free(bt->page_cache);  | 
    
    
    1184  | 
     | 
     | 
    		free(bt);  | 
    
    
    1185  | 
     | 
     | 
    	} else  | 
    
    
    1186  | 
     | 
     | 
    		DPRINTF("ref is now %d on btree %p", bt->ref, bt); | 
    
    
    1187  | 
     | 
     | 
    }  | 
    
    
    1188  | 
     | 
     | 
     | 
    
    
    1189  | 
     | 
     | 
    /* Search for key within a leaf page, using binary search.  | 
    
    
    1190  | 
     | 
     | 
     * Returns the smallest entry larger or equal to the key.  | 
    
    
    1191  | 
     | 
     | 
     * If exactp is non-null, stores whether the found entry was an exact match  | 
    
    
    1192  | 
     | 
     | 
     * in *exactp (1 or 0).  | 
    
    
    1193  | 
     | 
     | 
     * If kip is non-null, stores the index of the found entry in *kip.  | 
    
    
    1194  | 
     | 
     | 
     * If no entry larger of equal to the key is found, returns NULL.  | 
    
    
    1195  | 
     | 
     | 
     */  | 
    
    
    1196  | 
     | 
     | 
    static struct node *  | 
    
    
    1197  | 
     | 
     | 
    btree_search_node(struct btree *bt, struct mpage *mp, struct btval *key,  | 
    
    
    1198  | 
     | 
     | 
        int *exactp, unsigned int *kip)  | 
    
    
    1199  | 
     | 
     | 
    { | 
    
    
    1200  | 
     | 
     | 
    	unsigned int	 i = 0;  | 
    
    
    1201  | 
     | 
     | 
    	int		 low, high;  | 
    
    
    1202  | 
     | 
     | 
    	int		 rc = 0;  | 
    
    
    1203  | 
     | 
     | 
    	struct node	*node;  | 
    
    
    1204  | 
     | 
     | 
    	struct btval	 nodekey;  | 
    
    
    1205  | 
     | 
     | 
     | 
    
    
    1206  | 
     | 
     | 
    	DPRINTF("searching %lu keys in %s page %u with prefix [%.*s]", | 
    
    
    1207  | 
     | 
     | 
    	    NUMKEYS(mp),  | 
    
    
    1208  | 
     | 
     | 
    	    IS_LEAF(mp) ? "leaf" : "branch",  | 
    
    
    1209  | 
     | 
     | 
    	    mp->pgno, (int)mp->prefix.len, (char *)mp->prefix.str);  | 
    
    
    1210  | 
     | 
     | 
     | 
    
    
    1211  | 
     | 
     | 
    	assert(NUMKEYS(mp) > 0);  | 
    
    
    1212  | 
     | 
     | 
     | 
    
    
    1213  | 
     | 
     | 
    	memset(&nodekey, 0, sizeof(nodekey));  | 
    
    
    1214  | 
     | 
     | 
     | 
    
    
    1215  | 
     | 
     | 
    	low = IS_LEAF(mp) ? 0 : 1;  | 
    
    
    1216  | 
     | 
     | 
    	high = NUMKEYS(mp) - 1;  | 
    
    
    1217  | 
     | 
     | 
    	while (low <= high) { | 
    
    
    1218  | 
     | 
     | 
    		i = (low + high) >> 1;  | 
    
    
    1219  | 
     | 
     | 
    		node = NODEPTR(mp, i);  | 
    
    
    1220  | 
     | 
     | 
     | 
    
    
    1221  | 
     | 
     | 
    		nodekey.size = node->ksize;  | 
    
    
    1222  | 
     | 
     | 
    		nodekey.data = NODEKEY(node);  | 
    
    
    1223  | 
     | 
     | 
     | 
    
    
    1224  | 
     | 
     | 
    		if (bt->cmp)  | 
    
    
    1225  | 
     | 
     | 
    			rc = bt->cmp(key, &nodekey);  | 
    
    
    1226  | 
     | 
     | 
    		else  | 
    
    
    1227  | 
     | 
     | 
    			rc = bt_cmp(bt, key, &nodekey, &mp->prefix);  | 
    
    
    1228  | 
     | 
     | 
     | 
    
    
    1229  | 
     | 
     | 
    		if (IS_LEAF(mp))  | 
    
    
    1230  | 
     | 
     | 
    			DPRINTF("found leaf index %u [%.*s], rc = %d", | 
    
    
    1231  | 
     | 
     | 
    			    i, (int)nodekey.size, (char *)nodekey.data, rc);  | 
    
    
    1232  | 
     | 
     | 
    		else  | 
    
    
    1233  | 
     | 
     | 
    			DPRINTF("found branch index %u [%.*s -> %u], rc = %d", | 
    
    
    1234  | 
     | 
     | 
    			    i, (int)node->ksize, (char *)NODEKEY(node),  | 
    
    
    1235  | 
     | 
     | 
    			    node->n_pgno, rc);  | 
    
    
    1236  | 
     | 
     | 
     | 
    
    
    1237  | 
     | 
     | 
    		if (rc == 0)  | 
    
    
    1238  | 
     | 
     | 
    			break;  | 
    
    
    1239  | 
     | 
     | 
    		if (rc > 0)  | 
    
    
    1240  | 
     | 
     | 
    			low = i + 1;  | 
    
    
    1241  | 
     | 
     | 
    		else  | 
    
    
    1242  | 
     | 
     | 
    			high = i - 1;  | 
    
    
    1243  | 
     | 
     | 
    	}  | 
    
    
    1244  | 
     | 
     | 
     | 
    
    
    1245  | 
     | 
     | 
    	if (rc > 0) {	/* Found entry is less than the key. */ | 
    
    
    1246  | 
     | 
     | 
    		i++;	/* Skip to get the smallest entry larger than key. */  | 
    
    
    1247  | 
     | 
     | 
    		if (i >= NUMKEYS(mp))  | 
    
    
    1248  | 
     | 
     | 
    			/* There is no entry larger or equal to the key. */  | 
    
    
    1249  | 
     | 
     | 
    			return NULL;  | 
    
    
    1250  | 
     | 
     | 
    	}  | 
    
    
    1251  | 
     | 
     | 
    	if (exactp)  | 
    
    
    1252  | 
     | 
     | 
    		*exactp = (rc == 0);  | 
    
    
    1253  | 
     | 
     | 
    	if (kip)	/* Store the key index if requested. */  | 
    
    
    1254  | 
     | 
     | 
    		*kip = i;  | 
    
    
    1255  | 
     | 
     | 
     | 
    
    
    1256  | 
     | 
     | 
    	return NODEPTR(mp, i);  | 
    
    
    1257  | 
     | 
     | 
    }  | 
    
    
    1258  | 
     | 
     | 
     | 
    
    
    1259  | 
     | 
     | 
    static void  | 
    
    
    1260  | 
     | 
     | 
    cursor_pop_page(struct cursor *cursor)  | 
    
    
    1261  | 
     | 
     | 
    { | 
    
    
    1262  | 
     | 
     | 
    	struct ppage	*top;  | 
    
    
    1263  | 
     | 
     | 
     | 
    
    
    1264  | 
     | 
     | 
    	top = CURSOR_TOP(cursor);  | 
    
    
    1265  | 
     | 
     | 
    	CURSOR_POP(cursor);  | 
    
    
    1266  | 
     | 
     | 
    	top->mpage->ref--;  | 
    
    
    1267  | 
     | 
     | 
     | 
    
    
    1268  | 
     | 
     | 
    	DPRINTF("popped page %u off cursor %p", top->mpage->pgno, cursor); | 
    
    
    1269  | 
     | 
     | 
     | 
    
    
    1270  | 
     | 
     | 
    	free(top);  | 
    
    
    1271  | 
     | 
     | 
    }  | 
    
    
    1272  | 
     | 
     | 
     | 
    
    
    1273  | 
     | 
     | 
    static struct ppage *  | 
    
    
    1274  | 
     | 
     | 
    cursor_push_page(struct cursor *cursor, struct mpage *mp)  | 
    
    
    1275  | 
     | 
     | 
    { | 
    
    
    1276  | 
     | 
     | 
    	struct ppage	*ppage;  | 
    
    
    1277  | 
     | 
     | 
     | 
    
    
    1278  | 
     | 
     | 
    	DPRINTF("pushing page %u on cursor %p", mp->pgno, cursor); | 
    
    
    1279  | 
     | 
     | 
     | 
    
    
    1280  | 
     | 
     | 
    	if ((ppage = calloc(1, sizeof(*ppage))) == NULL)  | 
    
    
    1281  | 
     | 
     | 
    		return NULL;  | 
    
    
    1282  | 
     | 
     | 
    	ppage->mpage = mp;  | 
    
    
    1283  | 
     | 
     | 
    	mp->ref++;  | 
    
    
    1284  | 
     | 
     | 
    	CURSOR_PUSH(cursor, ppage);  | 
    
    
    1285  | 
     | 
     | 
    	return ppage;  | 
    
    
    1286  | 
     | 
     | 
    }  | 
    
    
    1287  | 
     | 
     | 
     | 
    
    
    1288  | 
     | 
     | 
    static struct mpage *  | 
    
    
    1289  | 
     | 
     | 
    btree_get_mpage(struct btree *bt, pgno_t pgno)  | 
    
    
    1290  | 
     | 
     | 
    { | 
    
    
    1291  | 
     | 
     | 
    	struct mpage	*mp;  | 
    
    
    1292  | 
     | 
     | 
     | 
    
    
    1293  | 
     | 
     | 
    	mp = mpage_lookup(bt, pgno);  | 
    
    
    1294  | 
     | 
     | 
    	if (mp == NULL) { | 
    
    
    1295  | 
     | 
     | 
    		if ((mp = calloc(1, sizeof(*mp))) == NULL)  | 
    
    
    1296  | 
     | 
     | 
    			return NULL;  | 
    
    
    1297  | 
     | 
     | 
    		if ((mp->page = malloc(bt->head.psize)) == NULL) { | 
    
    
    1298  | 
     | 
     | 
    			free(mp);  | 
    
    
    1299  | 
     | 
     | 
    			return NULL;  | 
    
    
    1300  | 
     | 
     | 
    		}  | 
    
    
    1301  | 
     | 
     | 
    		if (btree_read_page(bt, pgno, mp->page) != BT_SUCCESS) { | 
    
    
    1302  | 
     | 
     | 
    			mpage_free(mp);  | 
    
    
    1303  | 
     | 
     | 
    			return NULL;  | 
    
    
    1304  | 
     | 
     | 
    		}  | 
    
    
    1305  | 
     | 
     | 
    		mp->pgno = pgno;  | 
    
    
    1306  | 
     | 
     | 
    		mpage_add(bt, mp);  | 
    
    
    1307  | 
     | 
     | 
    	} else  | 
    
    
    1308  | 
     | 
     | 
    		DPRINTF("returning page %u from cache", pgno); | 
    
    
    1309  | 
     | 
     | 
     | 
    
    
    1310  | 
     | 
     | 
    	return mp;  | 
    
    
    1311  | 
     | 
     | 
    }  | 
    
    
    1312  | 
     | 
     | 
     | 
    
    
    1313  | 
     | 
     | 
    static void  | 
    
    
    1314  | 
     | 
     | 
    concat_prefix(struct btree *bt, char *s1, size_t n1, char *s2, size_t n2,  | 
    
    
    1315  | 
     | 
     | 
        char *cs, size_t *cn)  | 
    
    
    1316  | 
     | 
     | 
    { | 
    
    
    1317  | 
     | 
     | 
    	assert(*cn >= n1 + n2);  | 
    
    
    1318  | 
     | 
     | 
    	if (F_ISSET(bt->flags, BT_REVERSEKEY)) { | 
    
    
    1319  | 
     | 
     | 
    		bcopy(s2, cs, n2);  | 
    
    
    1320  | 
     | 
     | 
    		bcopy(s1, cs + n2, n1);  | 
    
    
    1321  | 
     | 
     | 
    	} else { | 
    
    
    1322  | 
     | 
     | 
    		bcopy(s1, cs, n1);  | 
    
    
    1323  | 
     | 
     | 
    		bcopy(s2, cs + n1, n2);  | 
    
    
    1324  | 
     | 
     | 
    	}  | 
    
    
    1325  | 
     | 
     | 
    	*cn = n1 + n2;  | 
    
    
    1326  | 
     | 
     | 
    }  | 
    
    
    1327  | 
     | 
     | 
     | 
    
    
    1328  | 
     | 
     | 
    static void  | 
    
    
    1329  | 
     | 
     | 
    find_common_prefix(struct btree *bt, struct mpage *mp)  | 
    
    
    1330  | 
     | 
     | 
    { | 
    
    
    1331  | 
     | 
     | 
    	indx_t			 lbound = 0, ubound = 0;  | 
    
    
    1332  | 
     | 
     | 
    	struct mpage		*lp, *up;  | 
    
    
    1333  | 
     | 
     | 
    	struct btkey		 lprefix, uprefix;  | 
    
    
    1334  | 
     | 
     | 
     | 
    
    
    1335  | 
     | 
     | 
    	mp->prefix.len = 0;  | 
    
    
    1336  | 
     | 
     | 
    	if (bt->cmp != NULL)  | 
    
    
    1337  | 
     | 
     | 
    		return;  | 
    
    
    1338  | 
     | 
     | 
     | 
    
    
    1339  | 
     | 
     | 
    	lp = mp;  | 
    
    
    1340  | 
     | 
     | 
    	while (lp->parent != NULL) { | 
    
    
    1341  | 
     | 
     | 
    		if (lp->parent_index > 0) { | 
    
    
    1342  | 
     | 
     | 
    			lbound = lp->parent_index;  | 
    
    
    1343  | 
     | 
     | 
    			break;  | 
    
    
    1344  | 
     | 
     | 
    		}  | 
    
    
    1345  | 
     | 
     | 
    		lp = lp->parent;  | 
    
    
    1346  | 
     | 
     | 
    	}  | 
    
    
    1347  | 
     | 
     | 
     | 
    
    
    1348  | 
     | 
     | 
    	up = mp;  | 
    
    
    1349  | 
     | 
     | 
    	while (up->parent != NULL) { | 
    
    
    1350  | 
     | 
     | 
    		if (up->parent_index + 1 < (indx_t)NUMKEYS(up->parent)) { | 
    
    
    1351  | 
     | 
     | 
    			ubound = up->parent_index + 1;  | 
    
    
    1352  | 
     | 
     | 
    			break;  | 
    
    
    1353  | 
     | 
     | 
    		}  | 
    
    
    1354  | 
     | 
     | 
    		up = up->parent;  | 
    
    
    1355  | 
     | 
     | 
    	}  | 
    
    
    1356  | 
     | 
     | 
     | 
    
    
    1357  | 
     | 
     | 
    	if (lp->parent != NULL && up->parent != NULL) { | 
    
    
    1358  | 
     | 
     | 
    		expand_prefix(bt, lp->parent, lbound, &lprefix);  | 
    
    
    1359  | 
     | 
     | 
    		expand_prefix(bt, up->parent, ubound, &uprefix);  | 
    
    
    1360  | 
     | 
     | 
    		common_prefix(bt, &lprefix, &uprefix, &mp->prefix);  | 
    
    
    1361  | 
     | 
     | 
    	}  | 
    
    
    1362  | 
     | 
     | 
    	else if (mp->parent)  | 
    
    
    1363  | 
     | 
     | 
    		bcopy(&mp->parent->prefix, &mp->prefix, sizeof(mp->prefix));  | 
    
    
    1364  | 
     | 
     | 
     | 
    
    
    1365  | 
     | 
     | 
    	DPRINTF("found common prefix [%.*s] (len %zu) for page %u", | 
    
    
    1366  | 
     | 
     | 
    	    (int)mp->prefix.len, mp->prefix.str, mp->prefix.len, mp->pgno);  | 
    
    
    1367  | 
     | 
     | 
    }  | 
    
    
    1368  | 
     | 
     | 
     | 
    
    
    1369  | 
     | 
     | 
    static int  | 
    
    
    1370  | 
     | 
     | 
    btree_search_page_root(struct btree *bt, struct mpage *root, struct btval *key,  | 
    
    
    1371  | 
     | 
     | 
        struct cursor *cursor, int modify, struct mpage **mpp)  | 
    
    
    1372  | 
     | 
     | 
    { | 
    
    
    1373  | 
     | 
     | 
    	struct mpage	*mp, *parent;  | 
    
    
    1374  | 
     | 
     | 
     | 
    
    
    1375  | 
     | 
     | 
    	if (cursor && cursor_push_page(cursor, root) == NULL)  | 
    
    
    1376  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    1377  | 
     | 
     | 
     | 
    
    
    1378  | 
     | 
     | 
    	mp = root;  | 
    
    
    1379  | 
     | 
     | 
    	while (IS_BRANCH(mp)) { | 
    
    
    1380  | 
     | 
     | 
    		unsigned int	 i = 0;  | 
    
    
    1381  | 
     | 
     | 
    		struct node	*node;  | 
    
    
    1382  | 
     | 
     | 
     | 
    
    
    1383  | 
     | 
     | 
    		DPRINTF("branch page %u has %lu keys", mp->pgno, NUMKEYS(mp)); | 
    
    
    1384  | 
     | 
     | 
    		assert(NUMKEYS(mp) > 1);  | 
    
    
    1385  | 
     | 
     | 
    		DPRINTF("found index 0 to page %u", NODEPGNO(NODEPTR(mp, 0))); | 
    
    
    1386  | 
     | 
     | 
     | 
    
    
    1387  | 
     | 
     | 
    		if (key == NULL)	/* Initialize cursor to first page. */  | 
    
    
    1388  | 
     | 
     | 
    			i = 0;  | 
    
    
    1389  | 
     | 
     | 
    		else { | 
    
    
    1390  | 
     | 
     | 
    			int	 exact;  | 
    
    
    1391  | 
     | 
     | 
    			node = btree_search_node(bt, mp, key, &exact, &i);  | 
    
    
    1392  | 
     | 
     | 
    			if (node == NULL)  | 
    
    
    1393  | 
     | 
     | 
    				i = NUMKEYS(mp) - 1;  | 
    
    
    1394  | 
     | 
     | 
    			else if (!exact) { | 
    
    
    1395  | 
     | 
     | 
    				assert(i > 0);  | 
    
    
    1396  | 
     | 
     | 
    				i--;  | 
    
    
    1397  | 
     | 
     | 
    			}  | 
    
    
    1398  | 
     | 
     | 
    		}  | 
    
    
    1399  | 
     | 
     | 
     | 
    
    
    1400  | 
     | 
     | 
    		if (key)  | 
    
    
    1401  | 
     | 
     | 
    			DPRINTF("following index %u for key %.*s", | 
    
    
    1402  | 
     | 
     | 
    			    i, (int)key->size, (char *)key->data);  | 
    
    
    1403  | 
     | 
     | 
    		assert(i >= 0 && i < NUMKEYS(mp));  | 
    
    
    1404  | 
     | 
     | 
    		node = NODEPTR(mp, i);  | 
    
    
    1405  | 
     | 
     | 
     | 
    
    
    1406  | 
     | 
     | 
    		if (cursor)  | 
    
    
    1407  | 
     | 
     | 
    			CURSOR_TOP(cursor)->ki = i;  | 
    
    
    1408  | 
     | 
     | 
     | 
    
    
    1409  | 
     | 
     | 
    		parent = mp;  | 
    
    
    1410  | 
     | 
     | 
    		if ((mp = btree_get_mpage(bt, NODEPGNO(node))) == NULL)  | 
    
    
    1411  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    1412  | 
     | 
     | 
    		mp->parent = parent;  | 
    
    
    1413  | 
     | 
     | 
    		mp->parent_index = i;  | 
    
    
    1414  | 
     | 
     | 
    		find_common_prefix(bt, mp);  | 
    
    
    1415  | 
     | 
     | 
     | 
    
    
    1416  | 
     | 
     | 
    		if (cursor && cursor_push_page(cursor, mp) == NULL)  | 
    
    
    1417  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    1418  | 
     | 
     | 
     | 
    
    
    1419  | 
     | 
     | 
    		if (modify && (mp = mpage_touch(bt, mp)) == NULL)  | 
    
    
    1420  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    1421  | 
     | 
     | 
    	}  | 
    
    
    1422  | 
     | 
     | 
     | 
    
    
    1423  | 
     | 
     | 
    	if (!IS_LEAF(mp)) { | 
    
    
    1424  | 
     | 
     | 
    		DPRINTF("internal error, index points to a %02X page!?", | 
    
    
    1425  | 
     | 
     | 
    		    mp->page->flags);  | 
    
    
    1426  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    1427  | 
     | 
     | 
    	}  | 
    
    
    1428  | 
     | 
     | 
     | 
    
    
    1429  | 
     | 
     | 
    	DPRINTF("found leaf page %u for key %.*s", mp->pgno, | 
    
    
    1430  | 
     | 
     | 
    	    key ? (int)key->size : 0, key ? (char *)key->data : NULL);  | 
    
    
    1431  | 
     | 
     | 
     | 
    
    
    1432  | 
     | 
     | 
    	*mpp = mp;  | 
    
    
    1433  | 
     | 
     | 
    	return BT_SUCCESS;  | 
    
    
    1434  | 
     | 
     | 
    }  | 
    
    
    1435  | 
     | 
     | 
     | 
    
    
    1436  | 
     | 
     | 
    /* Search for the page a given key should be in.  | 
    
    
    1437  | 
     | 
     | 
     * Stores a pointer to the found page in *mpp.  | 
    
    
    1438  | 
     | 
     | 
     * If key is NULL, search for the lowest page (used by btree_cursor_first).  | 
    
    
    1439  | 
     | 
     | 
     * If cursor is non-null, pushes parent pages on the cursor stack.  | 
    
    
    1440  | 
     | 
     | 
     * If modify is true, visited pages are updated with new page numbers.  | 
    
    
    1441  | 
     | 
     | 
     */  | 
    
    
    1442  | 
     | 
     | 
    static int  | 
    
    
    1443  | 
     | 
     | 
    btree_search_page(struct btree *bt, struct btree_txn *txn, struct btval *key,  | 
    
    
    1444  | 
     | 
     | 
        struct cursor *cursor, int modify, struct mpage **mpp)  | 
    
    
    1445  | 
     | 
     | 
    { | 
    
    
    1446  | 
     | 
     | 
    	int		 rc;  | 
    
    
    1447  | 
     | 
     | 
    	pgno_t		 root;  | 
    
    
    1448  | 
     | 
     | 
    	struct mpage	*mp;  | 
    
    
    1449  | 
     | 
     | 
     | 
    
    
    1450  | 
     | 
     | 
    	/* Can't modify pages outside a transaction. */  | 
    
    
    1451  | 
     | 
     | 
    	if (txn == NULL && modify) { | 
    
    
    1452  | 
     | 
     | 
    		errno = EINVAL;  | 
    
    
    1453  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    1454  | 
     | 
     | 
    	}  | 
    
    
    1455  | 
     | 
     | 
     | 
    
    
    1456  | 
     | 
     | 
    	/* Choose which root page to start with. If a transaction is given  | 
    
    
    1457  | 
     | 
     | 
             * use the root page from the transaction, otherwise read the last  | 
    
    
    1458  | 
     | 
     | 
             * committed root page.  | 
    
    
    1459  | 
     | 
     | 
    	 */  | 
    
    
    1460  | 
     | 
     | 
    	if (txn == NULL) { | 
    
    
    1461  | 
     | 
     | 
    		if ((rc = btree_read_meta(bt, NULL)) != BT_SUCCESS)  | 
    
    
    1462  | 
     | 
     | 
    			return rc;  | 
    
    
    1463  | 
     | 
     | 
    		root = bt->meta.root;  | 
    
    
    1464  | 
     | 
     | 
    	} else if (F_ISSET(txn->flags, BT_TXN_ERROR)) { | 
    
    
    1465  | 
     | 
     | 
    		DPRINTF("transaction has failed, must abort"); | 
    
    
    1466  | 
     | 
     | 
    		errno = EINVAL;  | 
    
    
    1467  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    1468  | 
     | 
     | 
    	} else  | 
    
    
    1469  | 
     | 
     | 
    		root = txn->root;  | 
    
    
    1470  | 
     | 
     | 
     | 
    
    
    1471  | 
     | 
     | 
    	if (root == P_INVALID) {		/* Tree is empty. */ | 
    
    
    1472  | 
     | 
     | 
    		DPRINTF("tree is empty"); | 
    
    
    1473  | 
     | 
     | 
    		errno = ENOENT;  | 
    
    
    1474  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    1475  | 
     | 
     | 
    	}  | 
    
    
    1476  | 
     | 
     | 
     | 
    
    
    1477  | 
     | 
     | 
    	if ((mp = btree_get_mpage(bt, root)) == NULL)  | 
    
    
    1478  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    1479  | 
     | 
     | 
     | 
    
    
    1480  | 
     | 
     | 
    	DPRINTF("root page has flags 0x%X", mp->page->flags); | 
    
    
    1481  | 
     | 
     | 
     | 
    
    
    1482  | 
     | 
     | 
    	assert(mp->parent == NULL);  | 
    
    
    1483  | 
     | 
     | 
    	assert(mp->prefix.len == 0);  | 
    
    
    1484  | 
     | 
     | 
     | 
    
    
    1485  | 
     | 
     | 
    	if (modify && !mp->dirty) { | 
    
    
    1486  | 
     | 
     | 
    		if ((mp = mpage_touch(bt, mp)) == NULL)  | 
    
    
    1487  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    1488  | 
     | 
     | 
    		txn->root = mp->pgno;  | 
    
    
    1489  | 
     | 
     | 
    	}  | 
    
    
    1490  | 
     | 
     | 
     | 
    
    
    1491  | 
     | 
     | 
    	return btree_search_page_root(bt, mp, key, cursor, modify, mpp);  | 
    
    
    1492  | 
     | 
     | 
    }  | 
    
    
    1493  | 
     | 
     | 
     | 
    
    
    1494  | 
     | 
     | 
    static int  | 
    
    
    1495  | 
     | 
     | 
    btree_read_data(struct btree *bt, struct mpage *mp, struct node *leaf,  | 
    
    
    1496  | 
     | 
     | 
        struct btval *data)  | 
    
    
    1497  | 
     | 
     | 
    { | 
    
    
    1498  | 
     | 
     | 
    	struct mpage	*omp;		/* overflow mpage */  | 
    
    
    1499  | 
     | 
     | 
    	size_t		 psz;  | 
    
    
    1500  | 
     | 
     | 
    	size_t		 max;  | 
    
    
    1501  | 
     | 
     | 
    	size_t		 sz = 0;  | 
    
    
    1502  | 
     | 
     | 
    	pgno_t		 pgno;  | 
    
    
    1503  | 
     | 
     | 
     | 
    
    
    1504  | 
     | 
     | 
    	memset(data, 0, sizeof(*data));  | 
    
    
    1505  | 
     | 
     | 
    	max = bt->head.psize - PAGEHDRSZ;  | 
    
    
    1506  | 
     | 
     | 
     | 
    
    
    1507  | 
     | 
     | 
    	if (!F_ISSET(leaf->flags, F_BIGDATA)) { | 
    
    
    1508  | 
     | 
     | 
    		data->size = leaf->n_dsize;  | 
    
    
    1509  | 
     | 
     | 
    		if (data->size > 0) { | 
    
    
    1510  | 
     | 
     | 
    			if (mp == NULL) { | 
    
    
    1511  | 
     | 
     | 
    				if ((data->data = malloc(data->size)) == NULL)  | 
    
    
    1512  | 
     | 
     | 
    					return BT_FAIL;  | 
    
    
    1513  | 
     | 
     | 
    				bcopy(NODEDATA(leaf), data->data, data->size);  | 
    
    
    1514  | 
     | 
     | 
    				data->free_data = 1;  | 
    
    
    1515  | 
     | 
     | 
    				data->mp = NULL;  | 
    
    
    1516  | 
     | 
     | 
    			} else { | 
    
    
    1517  | 
     | 
     | 
    				data->data = NODEDATA(leaf);  | 
    
    
    1518  | 
     | 
     | 
    				data->free_data = 0;  | 
    
    
    1519  | 
     | 
     | 
    				data->mp = mp;  | 
    
    
    1520  | 
     | 
     | 
    				mp->ref++;  | 
    
    
    1521  | 
     | 
     | 
    			}  | 
    
    
    1522  | 
     | 
     | 
    		}  | 
    
    
    1523  | 
     | 
     | 
    		return BT_SUCCESS;  | 
    
    
    1524  | 
     | 
     | 
    	}  | 
    
    
    1525  | 
     | 
     | 
     | 
    
    
    1526  | 
     | 
     | 
    	/* Read overflow data.  | 
    
    
    1527  | 
     | 
     | 
    	 */  | 
    
    
    1528  | 
     | 
     | 
    	DPRINTF("allocating %u byte for overflow data", leaf->n_dsize); | 
    
    
    1529  | 
     | 
     | 
    	if ((data->data = malloc(leaf->n_dsize)) == NULL)  | 
    
    
    1530  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    1531  | 
     | 
     | 
    	data->size = leaf->n_dsize;  | 
    
    
    1532  | 
     | 
     | 
    	data->free_data = 1;  | 
    
    
    1533  | 
     | 
     | 
    	data->mp = NULL;  | 
    
    
    1534  | 
     | 
     | 
    	bcopy(NODEDATA(leaf), &pgno, sizeof(pgno));  | 
    
    
    1535  | 
     | 
     | 
    	for (sz = 0; sz < data->size; ) { | 
    
    
    1536  | 
     | 
     | 
    		if ((omp = btree_get_mpage(bt, pgno)) == NULL ||  | 
    
    
    1537  | 
     | 
     | 
    		    !F_ISSET(omp->page->flags, P_OVERFLOW)) { | 
    
    
    1538  | 
     | 
     | 
    			DPRINTF("read overflow page %u failed", pgno); | 
    
    
    1539  | 
     | 
     | 
    			free(data->data);  | 
    
    
    1540  | 
     | 
     | 
    			mpage_free(omp);  | 
    
    
    1541  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    1542  | 
     | 
     | 
    		}  | 
    
    
    1543  | 
     | 
     | 
    		psz = data->size - sz;  | 
    
    
    1544  | 
     | 
     | 
    		if (psz > max)  | 
    
    
    1545  | 
     | 
     | 
    			psz = max;  | 
    
    
    1546  | 
     | 
     | 
    		bcopy(omp->page->ptrs, (char *)data->data + sz, psz);  | 
    
    
    1547  | 
     | 
     | 
    		sz += psz;  | 
    
    
    1548  | 
     | 
     | 
    		pgno = omp->page->p_next_pgno;  | 
    
    
    1549  | 
     | 
     | 
    	}  | 
    
    
    1550  | 
     | 
     | 
     | 
    
    
    1551  | 
     | 
     | 
    	return BT_SUCCESS;  | 
    
    
    1552  | 
     | 
     | 
    }  | 
    
    
    1553  | 
     | 
     | 
     | 
    
    
    1554  | 
     | 
     | 
    int  | 
    
    
    1555  | 
     | 
     | 
    btree_txn_get(struct btree *bt, struct btree_txn *txn,  | 
    
    
    1556  | 
     | 
     | 
        struct btval *key, struct btval *data)  | 
    
    
    1557  | 
     | 
     | 
    { | 
    
    
    1558  | 
     | 
     | 
    	int		 rc, exact;  | 
    
    
    1559  | 
     | 
     | 
    	struct node	*leaf;  | 
    
    
    1560  | 
     | 
     | 
    	struct mpage	*mp;  | 
    
    
    1561  | 
     | 
     | 
     | 
    
    
    1562  | 
     | 
     | 
    	assert(key);  | 
    
    
    1563  | 
     | 
     | 
    	assert(data);  | 
    
    
    1564  | 
     | 
     | 
    	DPRINTF("===> get key [%.*s]", (int)key->size, (char *)key->data); | 
    
    
    1565  | 
     | 
     | 
     | 
    
    
    1566  | 
     | 
     | 
    	if (bt != NULL && txn != NULL && bt != txn->bt) { | 
    
    
    1567  | 
     | 
     | 
    		errno = EINVAL;  | 
    
    
    1568  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    1569  | 
     | 
     | 
    	}  | 
    
    
    1570  | 
     | 
     | 
     | 
    
    
    1571  | 
     | 
     | 
    	if (bt == NULL) { | 
    
    
    1572  | 
     | 
     | 
    		if (txn == NULL) { | 
    
    
    1573  | 
     | 
     | 
    			errno = EINVAL;  | 
    
    
    1574  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    1575  | 
     | 
     | 
    		}  | 
    
    
    1576  | 
     | 
     | 
    		bt = txn->bt;  | 
    
    
    1577  | 
     | 
     | 
    	}  | 
    
    
    1578  | 
     | 
     | 
     | 
    
    
    1579  | 
     | 
     | 
    	if (key->size == 0 || key->size > MAXKEYSIZE) { | 
    
    
    1580  | 
     | 
     | 
    		errno = EINVAL;  | 
    
    
    1581  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    1582  | 
     | 
     | 
    	}  | 
    
    
    1583  | 
     | 
     | 
     | 
    
    
    1584  | 
     | 
     | 
    	if ((rc = btree_search_page(bt, txn, key, NULL, 0, &mp)) != BT_SUCCESS)  | 
    
    
    1585  | 
     | 
     | 
    		return rc;  | 
    
    
    1586  | 
     | 
     | 
     | 
    
    
    1587  | 
     | 
     | 
    	leaf = btree_search_node(bt, mp, key, &exact, NULL);  | 
    
    
    1588  | 
     | 
     | 
    	if (leaf && exact)  | 
    
    
    1589  | 
     | 
     | 
    		rc = btree_read_data(bt, mp, leaf, data);  | 
    
    
    1590  | 
     | 
     | 
    	else { | 
    
    
    1591  | 
     | 
     | 
    		errno = ENOENT;  | 
    
    
    1592  | 
     | 
     | 
    		rc = BT_FAIL;  | 
    
    
    1593  | 
     | 
     | 
    	}  | 
    
    
    1594  | 
     | 
     | 
     | 
    
    
    1595  | 
     | 
     | 
    	mpage_prune(bt);  | 
    
    
    1596  | 
     | 
     | 
    	return rc;  | 
    
    
    1597  | 
     | 
     | 
    }  | 
    
    
    1598  | 
     | 
     | 
     | 
    
    
    1599  | 
     | 
     | 
    static int  | 
    
    
    1600  | 
     | 
     | 
    btree_sibling(struct cursor *cursor, int move_right)  | 
    
    
    1601  | 
     | 
     | 
    { | 
    
    
    1602  | 
     | 
     | 
    	int		 rc;  | 
    
    
    1603  | 
     | 
     | 
    	struct node	*indx;  | 
    
    
    1604  | 
     | 
     | 
    	struct ppage	*parent, *top;  | 
    
    
    1605  | 
     | 
     | 
    	struct mpage	*mp;  | 
    
    
    1606  | 
     | 
     | 
     | 
    
    
    1607  | 
     | 
     | 
    	top = CURSOR_TOP(cursor);  | 
    
    
    1608  | 
     | 
     | 
    	if ((parent = SLIST_NEXT(top, entry)) == NULL) { | 
    
    
    1609  | 
     | 
     | 
    		errno = ENOENT;  | 
    
    
    1610  | 
     | 
     | 
    		return BT_FAIL;			/* root has no siblings */  | 
    
    
    1611  | 
     | 
     | 
    	}  | 
    
    
    1612  | 
     | 
     | 
     | 
    
    
    1613  | 
     | 
     | 
    	DPRINTF("parent page is page %u, index %u", | 
    
    
    1614  | 
     | 
     | 
    	    parent->mpage->pgno, parent->ki);  | 
    
    
    1615  | 
     | 
     | 
     | 
    
    
    1616  | 
     | 
     | 
    	cursor_pop_page(cursor);  | 
    
    
    1617  | 
     | 
     | 
    	if (move_right ? (parent->ki + 1 >= NUMKEYS(parent->mpage))  | 
    
    
    1618  | 
     | 
     | 
    		       : (parent->ki == 0)) { | 
    
    
    1619  | 
     | 
     | 
    		DPRINTF("no more keys left, moving to %s sibling", | 
    
    
    1620  | 
     | 
     | 
    		    move_right ? "right" : "left");  | 
    
    
    1621  | 
     | 
     | 
    		if ((rc = btree_sibling(cursor, move_right)) != BT_SUCCESS)  | 
    
    
    1622  | 
     | 
     | 
    			return rc;  | 
    
    
    1623  | 
     | 
     | 
    		parent = CURSOR_TOP(cursor);  | 
    
    
    1624  | 
     | 
     | 
    	} else { | 
    
    
    1625  | 
     | 
     | 
    		if (move_right)  | 
    
    
    1626  | 
     | 
     | 
    			parent->ki++;  | 
    
    
    1627  | 
     | 
     | 
    		else  | 
    
    
    1628  | 
     | 
     | 
    			parent->ki--;  | 
    
    
    1629  | 
     | 
     | 
    		DPRINTF("just moving to %s index key %u", | 
    
    
    1630  | 
     | 
     | 
    		    move_right ? "right" : "left", parent->ki);  | 
    
    
    1631  | 
     | 
     | 
    	}  | 
    
    
    1632  | 
     | 
     | 
    	assert(IS_BRANCH(parent->mpage));  | 
    
    
    1633  | 
     | 
     | 
     | 
    
    
    1634  | 
     | 
     | 
    	indx = NODEPTR(parent->mpage, parent->ki);  | 
    
    
    1635  | 
     | 
     | 
    	if ((mp = btree_get_mpage(cursor->bt, indx->n_pgno)) == NULL)  | 
    
    
    1636  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    1637  | 
     | 
     | 
    	mp->parent = parent->mpage;  | 
    
    
    1638  | 
     | 
     | 
    	mp->parent_index = parent->ki;  | 
    
    
    1639  | 
     | 
     | 
     | 
    
    
    1640  | 
     | 
     | 
    	cursor_push_page(cursor, mp);  | 
    
    
    1641  | 
     | 
     | 
    	find_common_prefix(cursor->bt, mp);  | 
    
    
    1642  | 
     | 
     | 
     | 
    
    
    1643  | 
     | 
     | 
    	return BT_SUCCESS;  | 
    
    
    1644  | 
     | 
     | 
    }  | 
    
    
    1645  | 
     | 
     | 
     | 
    
    
    1646  | 
     | 
     | 
    static int  | 
    
    
    1647  | 
     | 
     | 
    bt_set_key(struct btree *bt, struct mpage *mp, struct node *node,  | 
    
    
    1648  | 
     | 
     | 
        struct btval *key)  | 
    
    
    1649  | 
     | 
     | 
    { | 
    
    
    1650  | 
     | 
     | 
    	if (key == NULL)  | 
    
    
    1651  | 
     | 
     | 
    		return 0;  | 
    
    
    1652  | 
     | 
     | 
     | 
    
    
    1653  | 
     | 
     | 
    	if (mp->prefix.len > 0) { | 
    
    
    1654  | 
     | 
     | 
    		key->size = node->ksize + mp->prefix.len;  | 
    
    
    1655  | 
     | 
     | 
    		key->data = malloc(key->size);  | 
    
    
    1656  | 
     | 
     | 
    		if (key->data == NULL)  | 
    
    
    1657  | 
     | 
     | 
    			return -1;  | 
    
    
    1658  | 
     | 
     | 
    		concat_prefix(bt,  | 
    
    
    1659  | 
     | 
     | 
    		    mp->prefix.str, mp->prefix.len,  | 
    
    
    1660  | 
     | 
     | 
    		    NODEKEY(node), node->ksize,  | 
    
    
    1661  | 
     | 
     | 
    		    key->data, &key->size);  | 
    
    
    1662  | 
     | 
     | 
    		key->free_data = 1;  | 
    
    
    1663  | 
     | 
     | 
    	} else { | 
    
    
    1664  | 
     | 
     | 
    		key->size = node->ksize;  | 
    
    
    1665  | 
     | 
     | 
    		key->data = NODEKEY(node);  | 
    
    
    1666  | 
     | 
     | 
    		key->free_data = 0;  | 
    
    
    1667  | 
     | 
     | 
    		key->mp = mp;  | 
    
    
    1668  | 
     | 
     | 
    		mp->ref++;  | 
    
    
    1669  | 
     | 
     | 
    	}  | 
    
    
    1670  | 
     | 
     | 
     | 
    
    
    1671  | 
     | 
     | 
    	return 0;  | 
    
    
    1672  | 
     | 
     | 
    }  | 
    
    
    1673  | 
     | 
     | 
     | 
    
    
    1674  | 
     | 
     | 
    static int  | 
    
    
    1675  | 
     | 
     | 
    btree_cursor_next(struct cursor *cursor, struct btval *key, struct btval *data)  | 
    
    
    1676  | 
     | 
     | 
    { | 
    
    
    1677  | 
     | 
     | 
    	struct ppage	*top;  | 
    
    
    1678  | 
     | 
     | 
    	struct mpage	*mp;  | 
    
    
    1679  | 
     | 
     | 
    	struct node	*leaf;  | 
    
    
    1680  | 
     | 
     | 
     | 
    
    
    1681  | 
     | 
     | 
    	if (cursor->eof) { | 
    
    
    1682  | 
     | 
     | 
    		errno = ENOENT;  | 
    
    
    1683  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    1684  | 
     | 
     | 
    	}  | 
    
    
    1685  | 
     | 
     | 
     | 
    
    
    1686  | 
     | 
     | 
    	assert(cursor->initialized);  | 
    
    
    1687  | 
     | 
     | 
     | 
    
    
    1688  | 
     | 
     | 
    	top = CURSOR_TOP(cursor);  | 
    
    
    1689  | 
     | 
     | 
    	mp = top->mpage;  | 
    
    
    1690  | 
     | 
     | 
     | 
    
    
    1691  | 
     | 
     | 
    	DPRINTF("cursor_next: top page is %u in cursor %p", mp->pgno, cursor); | 
    
    
    1692  | 
     | 
     | 
     | 
    
    
    1693  | 
     | 
     | 
    	if (top->ki + 1 >= NUMKEYS(mp)) { | 
    
    
    1694  | 
     | 
     | 
    		DPRINTF("=====> move to next sibling page"); | 
    
    
    1695  | 
     | 
     | 
    		if (btree_sibling(cursor, 1) != BT_SUCCESS) { | 
    
    
    1696  | 
     | 
     | 
    			cursor->eof = 1;  | 
    
    
    1697  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    1698  | 
     | 
     | 
    		}  | 
    
    
    1699  | 
     | 
     | 
    		top = CURSOR_TOP(cursor);  | 
    
    
    1700  | 
     | 
     | 
    		mp = top->mpage;  | 
    
    
    1701  | 
     | 
     | 
    		DPRINTF("next page is %u, key index %u", mp->pgno, top->ki); | 
    
    
    1702  | 
     | 
     | 
    	} else  | 
    
    
    1703  | 
     | 
     | 
    		top->ki++;  | 
    
    
    1704  | 
     | 
     | 
     | 
    
    
    1705  | 
     | 
     | 
    	DPRINTF("==> cursor points to page %u with %lu keys, key index %u", | 
    
    
    1706  | 
     | 
     | 
    	    mp->pgno, NUMKEYS(mp), top->ki);  | 
    
    
    1707  | 
     | 
     | 
     | 
    
    
    1708  | 
     | 
     | 
    	assert(IS_LEAF(mp));  | 
    
    
    1709  | 
     | 
     | 
    	leaf = NODEPTR(mp, top->ki);  | 
    
    
    1710  | 
     | 
     | 
     | 
    
    
    1711  | 
     | 
     | 
    	if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS)  | 
    
    
    1712  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    1713  | 
     | 
     | 
     | 
    
    
    1714  | 
     | 
     | 
    	if (bt_set_key(cursor->bt, mp, leaf, key) != 0)  | 
    
    
    1715  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    1716  | 
     | 
     | 
     | 
    
    
    1717  | 
     | 
     | 
    	return BT_SUCCESS;  | 
    
    
    1718  | 
     | 
     | 
    }  | 
    
    
    1719  | 
     | 
     | 
     | 
    
    
    1720  | 
     | 
     | 
    static int  | 
    
    
    1721  | 
     | 
     | 
    btree_cursor_set(struct cursor *cursor, struct btval *key, struct btval *data,  | 
    
    
    1722  | 
     | 
     | 
        int *exactp)  | 
    
    
    1723  | 
     | 
     | 
    { | 
    
    
    1724  | 
     | 
     | 
    	int		 rc;  | 
    
    
    1725  | 
     | 
     | 
    	struct node	*leaf;  | 
    
    
    1726  | 
     | 
     | 
    	struct mpage	*mp;  | 
    
    
    1727  | 
     | 
     | 
    	struct ppage	*top;  | 
    
    
    1728  | 
     | 
     | 
     | 
    
    
    1729  | 
     | 
     | 
    	assert(cursor);  | 
    
    
    1730  | 
     | 
     | 
    	assert(key);  | 
    
    
    1731  | 
     | 
     | 
    	assert(key->size > 0);  | 
    
    
    1732  | 
     | 
     | 
     | 
    
    
    1733  | 
     | 
     | 
    	rc = btree_search_page(cursor->bt, cursor->txn, key, cursor, 0, &mp);  | 
    
    
    1734  | 
     | 
     | 
    	if (rc != BT_SUCCESS)  | 
    
    
    1735  | 
     | 
     | 
    		return rc;  | 
    
    
    1736  | 
     | 
     | 
    	assert(IS_LEAF(mp));  | 
    
    
    1737  | 
     | 
     | 
     | 
    
    
    1738  | 
     | 
     | 
    	top = CURSOR_TOP(cursor);  | 
    
    
    1739  | 
     | 
     | 
    	leaf = btree_search_node(cursor->bt, mp, key, exactp, &top->ki);  | 
    
    
    1740  | 
     | 
     | 
    	if (exactp != NULL && !*exactp) { | 
    
    
    1741  | 
     | 
     | 
    		/* BT_CURSOR_EXACT specified and not an exact match. */  | 
    
    
    1742  | 
     | 
     | 
    		errno = ENOENT;  | 
    
    
    1743  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    1744  | 
     | 
     | 
    	}  | 
    
    
    1745  | 
     | 
     | 
     | 
    
    
    1746  | 
     | 
     | 
    	if (leaf == NULL) { | 
    
    
    1747  | 
     | 
     | 
    		DPRINTF("===> inexact leaf not found, goto sibling"); | 
    
    
    1748  | 
     | 
     | 
    		if (btree_sibling(cursor, 1) != BT_SUCCESS)  | 
    
    
    1749  | 
     | 
     | 
    			return BT_FAIL;		/* no entries matched */  | 
    
    
    1750  | 
     | 
     | 
    		top = CURSOR_TOP(cursor);  | 
    
    
    1751  | 
     | 
     | 
    		top->ki = 0;  | 
    
    
    1752  | 
     | 
     | 
    		mp = top->mpage;  | 
    
    
    1753  | 
     | 
     | 
    		assert(IS_LEAF(mp));  | 
    
    
    1754  | 
     | 
     | 
    		leaf = NODEPTR(mp, 0);  | 
    
    
    1755  | 
     | 
     | 
    	}  | 
    
    
    1756  | 
     | 
     | 
     | 
    
    
    1757  | 
     | 
     | 
    	cursor->initialized = 1;  | 
    
    
    1758  | 
     | 
     | 
    	cursor->eof = 0;  | 
    
    
    1759  | 
     | 
     | 
     | 
    
    
    1760  | 
     | 
     | 
    	if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS)  | 
    
    
    1761  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    1762  | 
     | 
     | 
     | 
    
    
    1763  | 
     | 
     | 
    	if (bt_set_key(cursor->bt, mp, leaf, key) != 0)  | 
    
    
    1764  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    1765  | 
     | 
     | 
    	DPRINTF("==> cursor placed on key %.*s", | 
    
    
    1766  | 
     | 
     | 
    	    (int)key->size, (char *)key->data);  | 
    
    
    1767  | 
     | 
     | 
     | 
    
    
    1768  | 
     | 
     | 
    	return BT_SUCCESS;  | 
    
    
    1769  | 
     | 
     | 
    }  | 
    
    
    1770  | 
     | 
     | 
     | 
    
    
    1771  | 
     | 
     | 
    static int  | 
    
    
    1772  | 
     | 
     | 
    btree_cursor_first(struct cursor *cursor, struct btval *key, struct btval *data)  | 
    
    
    1773  | 
     | 
     | 
    { | 
    
    
    1774  | 
     | 
     | 
    	int		 rc;  | 
    
    
    1775  | 
     | 
     | 
    	struct mpage	*mp;  | 
    
    
    1776  | 
     | 
     | 
    	struct node	*leaf;  | 
    
    
    1777  | 
     | 
     | 
     | 
    
    
    1778  | 
     | 
     | 
    	rc = btree_search_page(cursor->bt, cursor->txn, NULL, cursor, 0, &mp);  | 
    
    
    1779  | 
     | 
     | 
    	if (rc != BT_SUCCESS)  | 
    
    
    1780  | 
     | 
     | 
    		return rc;  | 
    
    
    1781  | 
     | 
     | 
    	assert(IS_LEAF(mp));  | 
    
    
    1782  | 
     | 
     | 
     | 
    
    
    1783  | 
     | 
     | 
    	leaf = NODEPTR(mp, 0);  | 
    
    
    1784  | 
     | 
     | 
    	cursor->initialized = 1;  | 
    
    
    1785  | 
     | 
     | 
    	cursor->eof = 0;  | 
    
    
    1786  | 
     | 
     | 
     | 
    
    
    1787  | 
     | 
     | 
    	if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS)  | 
    
    
    1788  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    1789  | 
     | 
     | 
     | 
    
    
    1790  | 
     | 
     | 
    	if (bt_set_key(cursor->bt, mp, leaf, key) != 0)  | 
    
    
    1791  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    1792  | 
     | 
     | 
     | 
    
    
    1793  | 
     | 
     | 
    	return BT_SUCCESS;  | 
    
    
    1794  | 
     | 
     | 
    }  | 
    
    
    1795  | 
     | 
     | 
     | 
    
    
    1796  | 
     | 
     | 
    int  | 
    
    
    1797  | 
     | 
     | 
    btree_cursor_get(struct cursor *cursor, struct btval *key, struct btval *data,  | 
    
    
    1798  | 
     | 
     | 
        enum cursor_op op)  | 
    
    
    1799  | 
     | 
     | 
    { | 
    
    
    1800  | 
     | 
     | 
    	int		 rc;  | 
    
    
    1801  | 
     | 
     | 
    	int		 exact = 0;  | 
    
    
    1802  | 
     | 
     | 
     | 
    
    
    1803  | 
     | 
     | 
    	assert(cursor);  | 
    
    
    1804  | 
     | 
     | 
     | 
    
    
    1805  | 
     | 
     | 
    	switch (op) { | 
    
    
    1806  | 
     | 
     | 
    	case BT_CURSOR:  | 
    
    
    1807  | 
     | 
     | 
    	case BT_CURSOR_EXACT:  | 
    
    
    1808  | 
     | 
     | 
    		while (CURSOR_TOP(cursor) != NULL)  | 
    
    
    1809  | 
     | 
     | 
    			cursor_pop_page(cursor);  | 
    
    
    1810  | 
     | 
     | 
    		if (key == NULL || key->size == 0 || key->size > MAXKEYSIZE) { | 
    
    
    1811  | 
     | 
     | 
    			errno = EINVAL;  | 
    
    
    1812  | 
     | 
     | 
    			rc = BT_FAIL;  | 
    
    
    1813  | 
     | 
     | 
    		} else if (op == BT_CURSOR_EXACT)  | 
    
    
    1814  | 
     | 
     | 
    			rc = btree_cursor_set(cursor, key, data, &exact);  | 
    
    
    1815  | 
     | 
     | 
    		else  | 
    
    
    1816  | 
     | 
     | 
    			rc = btree_cursor_set(cursor, key, data, NULL);  | 
    
    
    1817  | 
     | 
     | 
    		break;  | 
    
    
    1818  | 
     | 
     | 
    	case BT_NEXT:  | 
    
    
    1819  | 
     | 
     | 
    		if (!cursor->initialized)  | 
    
    
    1820  | 
     | 
     | 
    			rc = btree_cursor_first(cursor, key, data);  | 
    
    
    1821  | 
     | 
     | 
    		else  | 
    
    
    1822  | 
     | 
     | 
    			rc = btree_cursor_next(cursor, key, data);  | 
    
    
    1823  | 
     | 
     | 
    		break;  | 
    
    
    1824  | 
     | 
     | 
    	case BT_FIRST:  | 
    
    
    1825  | 
     | 
     | 
    		while (CURSOR_TOP(cursor) != NULL)  | 
    
    
    1826  | 
     | 
     | 
    			cursor_pop_page(cursor);  | 
    
    
    1827  | 
     | 
     | 
    		rc = btree_cursor_first(cursor, key, data);  | 
    
    
    1828  | 
     | 
     | 
    		break;  | 
    
    
    1829  | 
     | 
     | 
    	default:  | 
    
    
    1830  | 
     | 
     | 
    		DPRINTF("unhandled/unimplemented cursor operation %u", op); | 
    
    
    1831  | 
     | 
     | 
    		rc = BT_FAIL;  | 
    
    
    1832  | 
     | 
     | 
    		break;  | 
    
    
    1833  | 
     | 
     | 
    	}  | 
    
    
    1834  | 
     | 
     | 
     | 
    
    
    1835  | 
     | 
     | 
    	mpage_prune(cursor->bt);  | 
    
    
    1836  | 
     | 
     | 
     | 
    
    
    1837  | 
     | 
     | 
    	return rc;  | 
    
    
    1838  | 
     | 
     | 
    }  | 
    
    
    1839  | 
     | 
     | 
     | 
    
    
    1840  | 
     | 
     | 
    static struct mpage *  | 
    
    
    1841  | 
     | 
     | 
    btree_new_page(struct btree *bt, uint32_t flags)  | 
    
    
    1842  | 
     | 
     | 
    { | 
    
    
    1843  | 
     | 
     | 
    	struct mpage	*mp;  | 
    
    
    1844  | 
     | 
     | 
     | 
    
    
    1845  | 
     | 
     | 
    	assert(bt != NULL);  | 
    
    
    1846  | 
     | 
     | 
    	assert(bt->txn != NULL);  | 
    
    
    1847  | 
     | 
     | 
     | 
    
    
    1848  | 
     | 
     | 
    	DPRINTF("allocating new mpage %u, page size %u", | 
    
    
    1849  | 
     | 
     | 
    	    bt->txn->next_pgno, bt->head.psize);  | 
    
    
    1850  | 
     | 
     | 
    	if ((mp = calloc(1, sizeof(*mp))) == NULL)  | 
    
    
    1851  | 
     | 
     | 
    		return NULL;  | 
    
    
    1852  | 
     | 
     | 
    	if ((mp->page = malloc(bt->head.psize)) == NULL) { | 
    
    
    1853  | 
     | 
     | 
    		free(mp);  | 
    
    
    1854  | 
     | 
     | 
    		return NULL;  | 
    
    
    1855  | 
     | 
     | 
    	}  | 
    
    
    1856  | 
     | 
     | 
    	mp->pgno = mp->page->pgno = bt->txn->next_pgno++;  | 
    
    
    1857  | 
     | 
     | 
    	mp->page->flags = flags;  | 
    
    
    1858  | 
     | 
     | 
    	mp->page->lower = PAGEHDRSZ;  | 
    
    
    1859  | 
     | 
     | 
    	mp->page->upper = bt->head.psize;  | 
    
    
    1860  | 
     | 
     | 
     | 
    
    
    1861  | 
     | 
     | 
    	if (IS_BRANCH(mp))  | 
    
    
    1862  | 
     | 
     | 
    		bt->meta.branch_pages++;  | 
    
    
    1863  | 
     | 
     | 
    	else if (IS_LEAF(mp))  | 
    
    
    1864  | 
     | 
     | 
    		bt->meta.leaf_pages++;  | 
    
    
    1865  | 
     | 
     | 
    	else if (IS_OVERFLOW(mp))  | 
    
    
    1866  | 
     | 
     | 
    		bt->meta.overflow_pages++;  | 
    
    
    1867  | 
     | 
     | 
     | 
    
    
    1868  | 
     | 
     | 
    	mpage_add(bt, mp);  | 
    
    
    1869  | 
     | 
     | 
    	mpage_dirty(bt, mp);  | 
    
    
    1870  | 
     | 
     | 
     | 
    
    
    1871  | 
     | 
     | 
    	return mp;  | 
    
    
    1872  | 
     | 
     | 
    }  | 
    
    
    1873  | 
     | 
     | 
     | 
    
    
    1874  | 
     | 
     | 
    static size_t  | 
    
    
    1875  | 
     | 
     | 
    bt_leaf_size(struct btree *bt, struct btval *key, struct btval *data)  | 
    
    
    1876  | 
     | 
     | 
    { | 
    
    
    1877  | 
     | 
     | 
    	size_t		 sz;  | 
    
    
    1878  | 
     | 
     | 
     | 
    
    
    1879  | 
     | 
     | 
    	sz = LEAFSIZE(key, data);  | 
    
    
    1880  | 
     | 
     | 
    	if (data->size >= bt->head.psize / BT_MINKEYS) { | 
    
    
    1881  | 
     | 
     | 
    		/* put on overflow page */  | 
    
    
    1882  | 
     | 
     | 
    		sz -= data->size - sizeof(pgno_t);  | 
    
    
    1883  | 
     | 
     | 
    	}  | 
    
    
    1884  | 
     | 
     | 
     | 
    
    
    1885  | 
     | 
     | 
    	return sz + sizeof(indx_t);  | 
    
    
    1886  | 
     | 
     | 
    }  | 
    
    
    1887  | 
     | 
     | 
     | 
    
    
    1888  | 
     | 
     | 
    static size_t  | 
    
    
    1889  | 
     | 
     | 
    bt_branch_size(struct btree *bt, struct btval *key)  | 
    
    
    1890  | 
     | 
     | 
    { | 
    
    
    1891  | 
     | 
     | 
    	size_t		 sz;  | 
    
    
    1892  | 
     | 
     | 
     | 
    
    
    1893  | 
     | 
     | 
    	sz = INDXSIZE(key);  | 
    
    
    1894  | 
     | 
     | 
    	if (sz >= bt->head.psize / BT_MINKEYS) { | 
    
    
    1895  | 
     | 
     | 
    		/* put on overflow page */  | 
    
    
    1896  | 
     | 
     | 
    		/* not implemented */  | 
    
    
    1897  | 
     | 
     | 
    		/* sz -= key->size - sizeof(pgno_t); */  | 
    
    
    1898  | 
     | 
     | 
    	}  | 
    
    
    1899  | 
     | 
     | 
     | 
    
    
    1900  | 
     | 
     | 
    	return sz + sizeof(indx_t);  | 
    
    
    1901  | 
     | 
     | 
    }  | 
    
    
    1902  | 
     | 
     | 
     | 
    
    
    1903  | 
     | 
     | 
    static int  | 
    
    
    1904  | 
     | 
     | 
    btree_write_overflow_data(struct btree *bt, struct page *p, struct btval *data)  | 
    
    
    1905  | 
     | 
     | 
    { | 
    
    
    1906  | 
     | 
     | 
    	size_t		 done = 0;  | 
    
    
    1907  | 
     | 
     | 
    	size_t		 sz;  | 
    
    
    1908  | 
     | 
     | 
    	size_t		 max;  | 
    
    
    1909  | 
     | 
     | 
    	pgno_t		*linkp;			/* linked page stored here */  | 
    
    
    1910  | 
     | 
     | 
    	struct mpage	*next = NULL;  | 
    
    
    1911  | 
     | 
     | 
     | 
    
    
    1912  | 
     | 
     | 
    	max = bt->head.psize - PAGEHDRSZ;  | 
    
    
    1913  | 
     | 
     | 
     | 
    
    
    1914  | 
     | 
     | 
    	while (done < data->size) { | 
    
    
    1915  | 
     | 
     | 
    		if (next != NULL)  | 
    
    
    1916  | 
     | 
     | 
    			p = next->page;  | 
    
    
    1917  | 
     | 
     | 
    		linkp = &p->p_next_pgno;  | 
    
    
    1918  | 
     | 
     | 
    		if (data->size - done > max) { | 
    
    
    1919  | 
     | 
     | 
    			/* need another overflow page */  | 
    
    
    1920  | 
     | 
     | 
    			if ((next = btree_new_page(bt, P_OVERFLOW)) == NULL)  | 
    
    
    1921  | 
     | 
     | 
    				return BT_FAIL;  | 
    
    
    1922  | 
     | 
     | 
    			*linkp = next->pgno;  | 
    
    
    1923  | 
     | 
     | 
    			DPRINTF("linking overflow page %u", next->pgno); | 
    
    
    1924  | 
     | 
     | 
    		} else  | 
    
    
    1925  | 
     | 
     | 
    			*linkp = 0;		/* indicates end of list */  | 
    
    
    1926  | 
     | 
     | 
    		sz = data->size - done;  | 
    
    
    1927  | 
     | 
     | 
    		if (sz > max)  | 
    
    
    1928  | 
     | 
     | 
    			sz = max;  | 
    
    
    1929  | 
     | 
     | 
    		DPRINTF("copying %zu bytes to overflow page %u", sz, p->pgno); | 
    
    
    1930  | 
     | 
     | 
    		bcopy((char *)data->data + done, p->ptrs, sz);  | 
    
    
    1931  | 
     | 
     | 
    		done += sz;  | 
    
    
    1932  | 
     | 
     | 
    	}  | 
    
    
    1933  | 
     | 
     | 
     | 
    
    
    1934  | 
     | 
     | 
    	return BT_SUCCESS;  | 
    
    
    1935  | 
     | 
     | 
    }  | 
    
    
    1936  | 
     | 
     | 
     | 
    
    
    1937  | 
     | 
     | 
    /* Key prefix should already be stripped.  | 
    
    
    1938  | 
     | 
     | 
     */  | 
    
    
    1939  | 
     | 
     | 
    static int  | 
    
    
    1940  | 
     | 
     | 
    btree_add_node(struct btree *bt, struct mpage *mp, indx_t indx,  | 
    
    
    1941  | 
     | 
     | 
        struct btval *key, struct btval *data, pgno_t pgno, uint8_t flags)  | 
    
    
    1942  | 
     | 
     | 
    { | 
    
    
    1943  | 
     | 
     | 
    	unsigned int	 i;  | 
    
    
    1944  | 
     | 
     | 
    	size_t		 node_size = NODESIZE;  | 
    
    
    1945  | 
     | 
     | 
    	indx_t		 ofs;  | 
    
    
    1946  | 
     | 
     | 
    	struct node	*node;  | 
    
    
    1947  | 
     | 
     | 
    	struct page	*p;  | 
    
    
    1948  | 
     | 
     | 
    	struct mpage	*ofp = NULL;		/* overflow page */  | 
    
    
    1949  | 
     | 
     | 
     | 
    
    
    1950  | 
     | 
     | 
    	p = mp->page;  | 
    
    
    1951  | 
     | 
     | 
    	assert(p->upper >= p->lower);  | 
    
    
    1952  | 
     | 
     | 
     | 
    
    
    1953  | 
     | 
     | 
    	DPRINTF("add node [%.*s] to %s page %u at index %d, key size %zu", | 
    
    
    1954  | 
     | 
     | 
    	    key ? (int)key->size : 0, key ? (char *)key->data : NULL,  | 
    
    
    1955  | 
     | 
     | 
    	    IS_LEAF(mp) ? "leaf" : "branch",  | 
    
    
    1956  | 
     | 
     | 
    	    mp->pgno, indx, key ? key->size : 0);  | 
    
    
    1957  | 
     | 
     | 
     | 
    
    
    1958  | 
     | 
     | 
    	if (key != NULL)  | 
    
    
    1959  | 
     | 
     | 
    		node_size += key->size;  | 
    
    
    1960  | 
     | 
     | 
     | 
    
    
    1961  | 
     | 
     | 
    	if (IS_LEAF(mp)) { | 
    
    
    1962  | 
     | 
     | 
    		assert(data);  | 
    
    
    1963  | 
     | 
     | 
    		node_size += data->size;  | 
    
    
    1964  | 
     | 
     | 
    		if (F_ISSET(flags, F_BIGDATA)) { | 
    
    
    1965  | 
     | 
     | 
    			/* Data already on overflow page. */  | 
    
    
    1966  | 
     | 
     | 
    			node_size -= data->size - sizeof(pgno_t);  | 
    
    
    1967  | 
     | 
     | 
    		} else if (data->size >= bt->head.psize / BT_MINKEYS) { | 
    
    
    1968  | 
     | 
     | 
    			/* Put data on overflow page. */  | 
    
    
    1969  | 
     | 
     | 
    			DPRINTF("data size is %zu, put on overflow page", | 
    
    
    1970  | 
     | 
     | 
    			    data->size);  | 
    
    
    1971  | 
     | 
     | 
    			node_size -= data->size - sizeof(pgno_t);  | 
    
    
    1972  | 
     | 
     | 
    			if ((ofp = btree_new_page(bt, P_OVERFLOW)) == NULL)  | 
    
    
    1973  | 
     | 
     | 
    				return BT_FAIL;  | 
    
    
    1974  | 
     | 
     | 
    			DPRINTF("allocated overflow page %u", ofp->pgno); | 
    
    
    1975  | 
     | 
     | 
    			flags |= F_BIGDATA;  | 
    
    
    1976  | 
     | 
     | 
    		}  | 
    
    
    1977  | 
     | 
     | 
    	}  | 
    
    
    1978  | 
     | 
     | 
     | 
    
    
    1979  | 
     | 
     | 
    	if (node_size + sizeof(indx_t) > SIZELEFT(mp)) { | 
    
    
    1980  | 
     | 
     | 
    		DPRINTF("not enough room in page %u, got %lu ptrs", | 
    
    
    1981  | 
     | 
     | 
    		    mp->pgno, NUMKEYS(mp));  | 
    
    
    1982  | 
     | 
     | 
    		DPRINTF("upper - lower = %u - %u = %u", p->upper, p->lower, | 
    
    
    1983  | 
     | 
     | 
    		    p->upper - p->lower);  | 
    
    
    1984  | 
     | 
     | 
    		DPRINTF("node size = %zu", node_size); | 
    
    
    1985  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    1986  | 
     | 
     | 
    	}  | 
    
    
    1987  | 
     | 
     | 
     | 
    
    
    1988  | 
     | 
     | 
    	/* Move higher pointers up one slot. */  | 
    
    
    1989  | 
     | 
     | 
    	for (i = NUMKEYS(mp); i > indx; i--)  | 
    
    
    1990  | 
     | 
     | 
    		p->ptrs[i] = p->ptrs[i - 1];  | 
    
    
    1991  | 
     | 
     | 
     | 
    
    
    1992  | 
     | 
     | 
    	/* Adjust free space offsets. */  | 
    
    
    1993  | 
     | 
     | 
    	ofs = p->upper - node_size;  | 
    
    
    1994  | 
     | 
     | 
    	assert(ofs >= p->lower + sizeof(indx_t));  | 
    
    
    1995  | 
     | 
     | 
    	p->ptrs[indx] = ofs;  | 
    
    
    1996  | 
     | 
     | 
    	p->upper = ofs;  | 
    
    
    1997  | 
     | 
     | 
    	p->lower += sizeof(indx_t);  | 
    
    
    1998  | 
     | 
     | 
     | 
    
    
    1999  | 
     | 
     | 
    	/* Write the node data. */  | 
    
    
    2000  | 
     | 
     | 
    	node = NODEPTR(mp, indx);  | 
    
    
    2001  | 
     | 
     | 
    	node->ksize = (key == NULL) ? 0 : key->size;  | 
    
    
    2002  | 
     | 
     | 
    	node->flags = flags;  | 
    
    
    2003  | 
     | 
     | 
    	if (IS_LEAF(mp))  | 
    
    
    2004  | 
     | 
     | 
    		node->n_dsize = data->size;  | 
    
    
    2005  | 
     | 
     | 
    	else  | 
    
    
    2006  | 
     | 
     | 
    		node->n_pgno = pgno;  | 
    
    
    2007  | 
     | 
     | 
     | 
    
    
    2008  | 
     | 
     | 
    	if (key)  | 
    
    
    2009  | 
     | 
     | 
    		bcopy(key->data, NODEKEY(node), key->size);  | 
    
    
    2010  | 
     | 
     | 
     | 
    
    
    2011  | 
     | 
     | 
    	if (IS_LEAF(mp)) { | 
    
    
    2012  | 
     | 
     | 
    		assert(key);  | 
    
    
    2013  | 
     | 
     | 
    		if (ofp == NULL) { | 
    
    
    2014  | 
     | 
     | 
    			if (F_ISSET(flags, F_BIGDATA))  | 
    
    
    2015  | 
     | 
     | 
    				bcopy(data->data, node->data + key->size,  | 
    
    
    2016  | 
     | 
     | 
    				    sizeof(pgno_t));  | 
    
    
    2017  | 
     | 
     | 
    			else  | 
    
    
    2018  | 
     | 
     | 
    				bcopy(data->data, node->data + key->size,  | 
    
    
    2019  | 
     | 
     | 
    				    data->size);  | 
    
    
    2020  | 
     | 
     | 
    		} else { | 
    
    
    2021  | 
     | 
     | 
    			bcopy(&ofp->pgno, node->data + key->size,  | 
    
    
    2022  | 
     | 
     | 
    			    sizeof(pgno_t));  | 
    
    
    2023  | 
     | 
     | 
    			if (btree_write_overflow_data(bt, ofp->page,  | 
    
    
    2024  | 
     | 
     | 
    			    data) == BT_FAIL)  | 
    
    
    2025  | 
     | 
     | 
    				return BT_FAIL;  | 
    
    
    2026  | 
     | 
     | 
    		}  | 
    
    
    2027  | 
     | 
     | 
    	}  | 
    
    
    2028  | 
     | 
     | 
     | 
    
    
    2029  | 
     | 
     | 
    	return BT_SUCCESS;  | 
    
    
    2030  | 
     | 
     | 
    }  | 
    
    
    2031  | 
     | 
     | 
     | 
    
    
    2032  | 
     | 
     | 
    static void  | 
    
    
    2033  | 
     | 
     | 
    btree_del_node(struct btree *bt, struct mpage *mp, indx_t indx)  | 
    
    
    2034  | 
     | 
     | 
    { | 
    
    
    2035  | 
     | 
     | 
    	unsigned int	 sz;  | 
    
    
    2036  | 
     | 
     | 
    	indx_t		 i, j, numkeys, ptr;  | 
    
    
    2037  | 
     | 
     | 
    	struct node	*node;  | 
    
    
    2038  | 
     | 
     | 
    	char		*base;  | 
    
    
    2039  | 
     | 
     | 
     | 
    
    
    2040  | 
     | 
     | 
    	DPRINTF("delete node %u on %s page %u", indx, | 
    
    
    2041  | 
     | 
     | 
    	    IS_LEAF(mp) ? "leaf" : "branch", mp->pgno);  | 
    
    
    2042  | 
     | 
     | 
    	assert(indx < NUMKEYS(mp));  | 
    
    
    2043  | 
     | 
     | 
     | 
    
    
    2044  | 
     | 
     | 
    	node = NODEPTR(mp, indx);  | 
    
    
    2045  | 
     | 
     | 
    	sz = NODESIZE + node->ksize;  | 
    
    
    2046  | 
     | 
     | 
    	if (IS_LEAF(mp)) { | 
    
    
    2047  | 
     | 
     | 
    		if (F_ISSET(node->flags, F_BIGDATA))  | 
    
    
    2048  | 
     | 
     | 
    			sz += sizeof(pgno_t);  | 
    
    
    2049  | 
     | 
     | 
    		else  | 
    
    
    2050  | 
     | 
     | 
    			sz += NODEDSZ(node);  | 
    
    
    2051  | 
     | 
     | 
    	}  | 
    
    
    2052  | 
     | 
     | 
     | 
    
    
    2053  | 
     | 
     | 
    	ptr = mp->page->ptrs[indx];  | 
    
    
    2054  | 
     | 
     | 
    	numkeys = NUMKEYS(mp);  | 
    
    
    2055  | 
     | 
     | 
    	for (i = j = 0; i < numkeys; i++) { | 
    
    
    2056  | 
     | 
     | 
    		if (i != indx) { | 
    
    
    2057  | 
     | 
     | 
    			mp->page->ptrs[j] = mp->page->ptrs[i];  | 
    
    
    2058  | 
     | 
     | 
    			if (mp->page->ptrs[i] < ptr)  | 
    
    
    2059  | 
     | 
     | 
    				mp->page->ptrs[j] += sz;  | 
    
    
    2060  | 
     | 
     | 
    			j++;  | 
    
    
    2061  | 
     | 
     | 
    		}  | 
    
    
    2062  | 
     | 
     | 
    	}  | 
    
    
    2063  | 
     | 
     | 
     | 
    
    
    2064  | 
     | 
     | 
    	base = (char *)mp->page + mp->page->upper;  | 
    
    
    2065  | 
     | 
     | 
    	bcopy(base, base + sz, ptr - mp->page->upper);  | 
    
    
    2066  | 
     | 
     | 
     | 
    
    
    2067  | 
     | 
     | 
    	mp->page->lower -= sizeof(indx_t);  | 
    
    
    2068  | 
     | 
     | 
    	mp->page->upper += sz;  | 
    
    
    2069  | 
     | 
     | 
    }  | 
    
    
    2070  | 
     | 
     | 
     | 
    
    
    2071  | 
     | 
     | 
    struct cursor *  | 
    
    
    2072  | 
     | 
     | 
    btree_txn_cursor_open(struct btree *bt, struct btree_txn *txn)  | 
    
    
    2073  | 
     | 
     | 
    { | 
    
    
    2074  | 
     | 
     | 
    	struct cursor	*cursor;  | 
    
    
    2075  | 
     | 
     | 
     | 
    
    
    2076  | 
     | 
     | 
    	if (bt != NULL && txn != NULL && bt != txn->bt) { | 
    
    
    2077  | 
     | 
     | 
    		errno = EINVAL;  | 
    
    
    2078  | 
     | 
     | 
    		return NULL;  | 
    
    
    2079  | 
     | 
     | 
    	}  | 
    
    
    2080  | 
     | 
     | 
     | 
    
    
    2081  | 
     | 
     | 
    	if (bt == NULL) { | 
    
    
    2082  | 
     | 
     | 
    		if (txn == NULL) { | 
    
    
    2083  | 
     | 
     | 
    			errno = EINVAL;  | 
    
    
    2084  | 
     | 
     | 
    			return NULL;  | 
    
    
    2085  | 
     | 
     | 
    		}  | 
    
    
    2086  | 
     | 
     | 
    		bt = txn->bt;  | 
    
    
    2087  | 
     | 
     | 
    	}  | 
    
    
    2088  | 
     | 
     | 
     | 
    
    
    2089  | 
     | 
     | 
    	if ((cursor = calloc(1, sizeof(*cursor))) != NULL) { | 
    
    
    2090  | 
     | 
     | 
    		SLIST_INIT(&cursor->stack);  | 
    
    
    2091  | 
     | 
     | 
    		cursor->bt = bt;  | 
    
    
    2092  | 
     | 
     | 
    		cursor->txn = txn;  | 
    
    
    2093  | 
     | 
     | 
    		btree_ref(bt);  | 
    
    
    2094  | 
     | 
     | 
    	}  | 
    
    
    2095  | 
     | 
     | 
     | 
    
    
    2096  | 
     | 
     | 
    	return cursor;  | 
    
    
    2097  | 
     | 
     | 
    }  | 
    
    
    2098  | 
     | 
     | 
     | 
    
    
    2099  | 
     | 
     | 
    void  | 
    
    
    2100  | 
     | 
     | 
    btree_cursor_close(struct cursor *cursor)  | 
    
    
    2101  | 
     | 
     | 
    { | 
    
    
    2102  | 
     | 
     | 
    	if (cursor != NULL) { | 
    
    
    2103  | 
     | 
     | 
    		while (!CURSOR_EMPTY(cursor))  | 
    
    
    2104  | 
     | 
     | 
    			cursor_pop_page(cursor);  | 
    
    
    2105  | 
     | 
     | 
     | 
    
    
    2106  | 
     | 
     | 
    		btree_close(cursor->bt);  | 
    
    
    2107  | 
     | 
     | 
    		free(cursor);  | 
    
    
    2108  | 
     | 
     | 
    	}  | 
    
    
    2109  | 
     | 
     | 
    }  | 
    
    
    2110  | 
     | 
     | 
     | 
    
    
    2111  | 
     | 
     | 
    static int  | 
    
    
    2112  | 
     | 
     | 
    btree_update_key(struct btree *bt, struct mpage *mp, indx_t indx,  | 
    
    
    2113  | 
     | 
     | 
        struct btval *key)  | 
    
    
    2114  | 
     | 
     | 
    { | 
    
    
    2115  | 
     | 
     | 
    	indx_t			 ptr, i, numkeys;  | 
    
    
    2116  | 
     | 
     | 
    	int			 delta;  | 
    
    
    2117  | 
     | 
     | 
    	size_t			 len;  | 
    
    
    2118  | 
     | 
     | 
    	struct node		*node;  | 
    
    
    2119  | 
     | 
     | 
    	char			*base;  | 
    
    
    2120  | 
     | 
     | 
     | 
    
    
    2121  | 
     | 
     | 
    	node = NODEPTR(mp, indx);  | 
    
    
    2122  | 
     | 
     | 
    	ptr = mp->page->ptrs[indx];  | 
    
    
    2123  | 
     | 
     | 
    	DPRINTF("update key %u (ofs %u) [%.*s] to [%.*s] on page %u", | 
    
    
    2124  | 
     | 
     | 
    	    indx, ptr,  | 
    
    
    2125  | 
     | 
     | 
    	    (int)node->ksize, (char *)NODEKEY(node),  | 
    
    
    2126  | 
     | 
     | 
    	    (int)key->size, (char *)key->data,  | 
    
    
    2127  | 
     | 
     | 
    	    mp->pgno);  | 
    
    
    2128  | 
     | 
     | 
     | 
    
    
    2129  | 
     | 
     | 
    	if (key->size != node->ksize) { | 
    
    
    2130  | 
     | 
     | 
    		delta = key->size - node->ksize;  | 
    
    
    2131  | 
     | 
     | 
    		if (delta > 0 && SIZELEFT(mp) < delta) { | 
    
    
    2132  | 
     | 
     | 
    			DPRINTF("OUCH! Not enough room, delta = %d", delta); | 
    
    
    2133  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    2134  | 
     | 
     | 
    		}  | 
    
    
    2135  | 
     | 
     | 
     | 
    
    
    2136  | 
     | 
     | 
    		numkeys = NUMKEYS(mp);  | 
    
    
    2137  | 
     | 
     | 
    		for (i = 0; i < numkeys; i++) { | 
    
    
    2138  | 
     | 
     | 
    			if (mp->page->ptrs[i] <= ptr)  | 
    
    
    2139  | 
     | 
     | 
    				mp->page->ptrs[i] -= delta;  | 
    
    
    2140  | 
     | 
     | 
    		}  | 
    
    
    2141  | 
     | 
     | 
     | 
    
    
    2142  | 
     | 
     | 
    		base = (char *)mp->page + mp->page->upper;  | 
    
    
    2143  | 
     | 
     | 
    		len = ptr - mp->page->upper + NODESIZE;  | 
    
    
    2144  | 
     | 
     | 
    		bcopy(base, base - delta, len);  | 
    
    
    2145  | 
     | 
     | 
    		mp->page->upper -= delta;  | 
    
    
    2146  | 
     | 
     | 
     | 
    
    
    2147  | 
     | 
     | 
    		node = NODEPTR(mp, indx);  | 
    
    
    2148  | 
     | 
     | 
    		node->ksize = key->size;  | 
    
    
    2149  | 
     | 
     | 
    	}  | 
    
    
    2150  | 
     | 
     | 
     | 
    
    
    2151  | 
     | 
     | 
    	bcopy(key->data, NODEKEY(node), key->size);  | 
    
    
    2152  | 
     | 
     | 
     | 
    
    
    2153  | 
     | 
     | 
    	return BT_SUCCESS;  | 
    
    
    2154  | 
     | 
     | 
    }  | 
    
    
    2155  | 
     | 
     | 
     | 
    
    
    2156  | 
     | 
     | 
    static int  | 
    
    
    2157  | 
     | 
     | 
    btree_adjust_prefix(struct btree *bt, struct mpage *src, int delta)  | 
    
    
    2158  | 
     | 
     | 
    { | 
    
    
    2159  | 
     | 
     | 
    	indx_t		 i;  | 
    
    
    2160  | 
     | 
     | 
    	struct node	*node;  | 
    
    
    2161  | 
     | 
     | 
    	struct btkey	 tmpkey;  | 
    
    
    2162  | 
     | 
     | 
    	struct btval	 key;  | 
    
    
    2163  | 
     | 
     | 
     | 
    
    
    2164  | 
     | 
     | 
    	DPRINTF("adjusting prefix lengths on page %u with delta %d", | 
    
    
    2165  | 
     | 
     | 
    	    src->pgno, delta);  | 
    
    
    2166  | 
     | 
     | 
    	assert(delta != 0);  | 
    
    
    2167  | 
     | 
     | 
     | 
    
    
    2168  | 
     | 
     | 
    	for (i = 0; i < NUMKEYS(src); i++) { | 
    
    
    2169  | 
     | 
     | 
    		node = NODEPTR(src, i);  | 
    
    
    2170  | 
     | 
     | 
    		tmpkey.len = node->ksize - delta;  | 
    
    
    2171  | 
     | 
     | 
    		if (delta > 0) { | 
    
    
    2172  | 
     | 
     | 
    			if (F_ISSET(bt->flags, BT_REVERSEKEY))  | 
    
    
    2173  | 
     | 
     | 
    				bcopy(NODEKEY(node), tmpkey.str, tmpkey.len);  | 
    
    
    2174  | 
     | 
     | 
    			else  | 
    
    
    2175  | 
     | 
     | 
    				bcopy((char *)NODEKEY(node) + delta, tmpkey.str,  | 
    
    
    2176  | 
     | 
     | 
    				    tmpkey.len);  | 
    
    
    2177  | 
     | 
     | 
    		} else { | 
    
    
    2178  | 
     | 
     | 
    			if (F_ISSET(bt->flags, BT_REVERSEKEY)) { | 
    
    
    2179  | 
     | 
     | 
    				bcopy(NODEKEY(node), tmpkey.str, node->ksize);  | 
    
    
    2180  | 
     | 
     | 
    				bcopy(src->prefix.str, tmpkey.str + node->ksize,  | 
    
    
    2181  | 
     | 
     | 
    				    -delta);  | 
    
    
    2182  | 
     | 
     | 
    			} else { | 
    
    
    2183  | 
     | 
     | 
    				bcopy(src->prefix.str + src->prefix.len + delta,  | 
    
    
    2184  | 
     | 
     | 
    				    tmpkey.str, -delta);  | 
    
    
    2185  | 
     | 
     | 
    				bcopy(NODEKEY(node), tmpkey.str - delta,  | 
    
    
    2186  | 
     | 
     | 
    				    node->ksize);  | 
    
    
    2187  | 
     | 
     | 
    			}  | 
    
    
    2188  | 
     | 
     | 
    		}  | 
    
    
    2189  | 
     | 
     | 
    		key.size = tmpkey.len;  | 
    
    
    2190  | 
     | 
     | 
    		key.data = tmpkey.str;  | 
    
    
    2191  | 
     | 
     | 
    		if (btree_update_key(bt, src, i, &key) != BT_SUCCESS)  | 
    
    
    2192  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    2193  | 
     | 
     | 
    	}  | 
    
    
    2194  | 
     | 
     | 
     | 
    
    
    2195  | 
     | 
     | 
    	return BT_SUCCESS;  | 
    
    
    2196  | 
     | 
     | 
    }  | 
    
    
    2197  | 
     | 
     | 
     | 
    
    
    2198  | 
     | 
     | 
    /* Move a node from src to dst.  | 
    
    
    2199  | 
     | 
     | 
     */  | 
    
    
    2200  | 
     | 
     | 
    static int  | 
    
    
    2201  | 
     | 
     | 
    btree_move_node(struct btree *bt, struct mpage *src, indx_t srcindx,  | 
    
    
    2202  | 
     | 
     | 
        struct mpage *dst, indx_t dstindx)  | 
    
    
    2203  | 
     | 
     | 
    { | 
    
    
    2204  | 
     | 
     | 
    	int			 rc;  | 
    
    
    2205  | 
     | 
     | 
    	unsigned int		 pfxlen, mp_pfxlen = 0;  | 
    
    
    2206  | 
     | 
     | 
    	struct node		*srcnode;  | 
    
    
    2207  | 
     | 
     | 
    	struct mpage		*mp = NULL;  | 
    
    
    2208  | 
     | 
     | 
    	struct btkey		 tmpkey, srckey;  | 
    
    
    2209  | 
     | 
     | 
    	struct btval		 key, data;  | 
    
    
    2210  | 
     | 
     | 
     | 
    
    
    2211  | 
     | 
     | 
    	assert(src->parent);  | 
    
    
    2212  | 
     | 
     | 
    	assert(dst->parent);  | 
    
    
    2213  | 
     | 
     | 
     | 
    
    
    2214  | 
     | 
     | 
    	srcnode = NODEPTR(src, srcindx);  | 
    
    
    2215  | 
     | 
     | 
    	DPRINTF("moving %s node %u [%.*s] on page %u to node %u on page %u", | 
    
    
    2216  | 
     | 
     | 
    	    IS_LEAF(src) ? "leaf" : "branch",  | 
    
    
    2217  | 
     | 
     | 
    	    srcindx,  | 
    
    
    2218  | 
     | 
     | 
    	    (int)srcnode->ksize, (char *)NODEKEY(srcnode),  | 
    
    
    2219  | 
     | 
     | 
    	    src->pgno,  | 
    
    
    2220  | 
     | 
     | 
    	    dstindx, dst->pgno);  | 
    
    
    2221  | 
     | 
     | 
     | 
    
    
    2222  | 
     | 
     | 
    	find_common_prefix(bt, src);  | 
    
    
    2223  | 
     | 
     | 
     | 
    
    
    2224  | 
     | 
     | 
    	if (IS_BRANCH(src)) { | 
    
    
    2225  | 
     | 
     | 
    		/* Need to check if the page the moved node points to  | 
    
    
    2226  | 
     | 
     | 
    		 * changes prefix.  | 
    
    
    2227  | 
     | 
     | 
    		 */  | 
    
    
    2228  | 
     | 
     | 
    		if ((mp = btree_get_mpage(bt, NODEPGNO(srcnode))) == NULL)  | 
    
    
    2229  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    2230  | 
     | 
     | 
    		mp->parent = src;  | 
    
    
    2231  | 
     | 
     | 
    		mp->parent_index = srcindx;  | 
    
    
    2232  | 
     | 
     | 
    		find_common_prefix(bt, mp);  | 
    
    
    2233  | 
     | 
     | 
    		mp_pfxlen = mp->prefix.len;  | 
    
    
    2234  | 
     | 
     | 
    	}  | 
    
    
    2235  | 
     | 
     | 
     | 
    
    
    2236  | 
     | 
     | 
    	/* Mark src and dst as dirty. */  | 
    
    
    2237  | 
     | 
     | 
    	if ((src = mpage_touch(bt, src)) == NULL ||  | 
    
    
    2238  | 
     | 
     | 
    	    (dst = mpage_touch(bt, dst)) == NULL)  | 
    
    
    2239  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    2240  | 
     | 
     | 
     | 
    
    
    2241  | 
     | 
     | 
    	find_common_prefix(bt, dst);  | 
    
    
    2242  | 
     | 
     | 
     | 
    
    
    2243  | 
     | 
     | 
    	/* Check if src node has destination page prefix. Otherwise the  | 
    
    
    2244  | 
     | 
     | 
    	 * destination page must expand its prefix on all its nodes.  | 
    
    
    2245  | 
     | 
     | 
    	 */  | 
    
    
    2246  | 
     | 
     | 
    	srckey.len = srcnode->ksize;  | 
    
    
    2247  | 
     | 
     | 
    	bcopy(NODEKEY(srcnode), srckey.str, srckey.len);  | 
    
    
    2248  | 
     | 
     | 
    	common_prefix(bt, &srckey, &dst->prefix, &tmpkey);  | 
    
    
    2249  | 
     | 
     | 
    	if (tmpkey.len != dst->prefix.len) { | 
    
    
    2250  | 
     | 
     | 
    		if (btree_adjust_prefix(bt, dst,  | 
    
    
    2251  | 
     | 
     | 
    		    tmpkey.len - dst->prefix.len) != BT_SUCCESS)  | 
    
    
    2252  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    2253  | 
     | 
     | 
    		bcopy(&tmpkey, &dst->prefix, sizeof(tmpkey));  | 
    
    
    2254  | 
     | 
     | 
    	}  | 
    
    
    2255  | 
     | 
     | 
     | 
    
    
    2256  | 
     | 
     | 
    	if (srcindx == 0 && IS_BRANCH(src)) { | 
    
    
    2257  | 
     | 
     | 
    		struct mpage	*low;  | 
    
    
    2258  | 
     | 
     | 
     | 
    
    
    2259  | 
     | 
     | 
    		/* must find the lowest key below src  | 
    
    
    2260  | 
     | 
     | 
    		 */  | 
    
    
    2261  | 
     | 
     | 
    		assert(btree_search_page_root(bt, src, NULL, NULL, 0,  | 
    
    
    2262  | 
     | 
     | 
    		    &low) == BT_SUCCESS);  | 
    
    
    2263  | 
     | 
     | 
    		expand_prefix(bt, low, 0, &srckey);  | 
    
    
    2264  | 
     | 
     | 
    		DPRINTF("found lowest key [%.*s] on leaf page %u", | 
    
    
    2265  | 
     | 
     | 
    		    (int)srckey.len, srckey.str, low->pgno);  | 
    
    
    2266  | 
     | 
     | 
    	} else { | 
    
    
    2267  | 
     | 
     | 
    		srckey.len = srcnode->ksize;  | 
    
    
    2268  | 
     | 
     | 
    		bcopy(NODEKEY(srcnode), srckey.str, srcnode->ksize);  | 
    
    
    2269  | 
     | 
     | 
    	}  | 
    
    
    2270  | 
     | 
     | 
    	find_common_prefix(bt, src);  | 
    
    
    2271  | 
     | 
     | 
     | 
    
    
    2272  | 
     | 
     | 
    	/* expand the prefix */  | 
    
    
    2273  | 
     | 
     | 
    	tmpkey.len = sizeof(tmpkey.str);  | 
    
    
    2274  | 
     | 
     | 
    	concat_prefix(bt, src->prefix.str, src->prefix.len,  | 
    
    
    2275  | 
     | 
     | 
    	    srckey.str, srckey.len, tmpkey.str, &tmpkey.len);  | 
    
    
    2276  | 
     | 
     | 
     | 
    
    
    2277  | 
     | 
     | 
    	/* Add the node to the destination page. Adjust prefix for  | 
    
    
    2278  | 
     | 
     | 
    	 * destination page.  | 
    
    
    2279  | 
     | 
     | 
    	 */  | 
    
    
    2280  | 
     | 
     | 
    	key.size = tmpkey.len;  | 
    
    
    2281  | 
     | 
     | 
    	key.data = tmpkey.str;  | 
    
    
    2282  | 
     | 
     | 
    	remove_prefix(bt, &key, dst->prefix.len);  | 
    
    
    2283  | 
     | 
     | 
    	data.size = NODEDSZ(srcnode);  | 
    
    
    2284  | 
     | 
     | 
    	data.data = NODEDATA(srcnode);  | 
    
    
    2285  | 
     | 
     | 
    	rc = btree_add_node(bt, dst, dstindx, &key, &data, NODEPGNO(srcnode),  | 
    
    
    2286  | 
     | 
     | 
    	    srcnode->flags);  | 
    
    
    2287  | 
     | 
     | 
    	if (rc != BT_SUCCESS)  | 
    
    
    2288  | 
     | 
     | 
    		return rc;  | 
    
    
    2289  | 
     | 
     | 
     | 
    
    
    2290  | 
     | 
     | 
    	/* Delete the node from the source page.  | 
    
    
    2291  | 
     | 
     | 
    	 */  | 
    
    
    2292  | 
     | 
     | 
    	btree_del_node(bt, src, srcindx);  | 
    
    
    2293  | 
     | 
     | 
     | 
    
    
    2294  | 
     | 
     | 
    	/* Update the parent separators.  | 
    
    
    2295  | 
     | 
     | 
    	 */  | 
    
    
    2296  | 
     | 
     | 
    	if (srcindx == 0 && src->parent_index != 0) { | 
    
    
    2297  | 
     | 
     | 
    		expand_prefix(bt, src, 0, &tmpkey);  | 
    
    
    2298  | 
     | 
     | 
    		key.size = tmpkey.len;  | 
    
    
    2299  | 
     | 
     | 
    		key.data = tmpkey.str;  | 
    
    
    2300  | 
     | 
     | 
    		remove_prefix(bt, &key, src->parent->prefix.len);  | 
    
    
    2301  | 
     | 
     | 
     | 
    
    
    2302  | 
     | 
     | 
    		DPRINTF("update separator for source page %u to [%.*s]", | 
    
    
    2303  | 
     | 
     | 
    		    src->pgno, (int)key.size, (char *)key.data);  | 
    
    
    2304  | 
     | 
     | 
    		if (btree_update_key(bt, src->parent, src->parent_index,  | 
    
    
    2305  | 
     | 
     | 
    		    &key) != BT_SUCCESS)  | 
    
    
    2306  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    2307  | 
     | 
     | 
    	}  | 
    
    
    2308  | 
     | 
     | 
     | 
    
    
    2309  | 
     | 
     | 
    	if (srcindx == 0 && IS_BRANCH(src)) { | 
    
    
    2310  | 
     | 
     | 
    		struct btval	 nullkey;  | 
    
    
    2311  | 
     | 
     | 
    		nullkey.size = 0;  | 
    
    
    2312  | 
     | 
     | 
    		assert(btree_update_key(bt, src, 0, &nullkey) == BT_SUCCESS);  | 
    
    
    2313  | 
     | 
     | 
    	}  | 
    
    
    2314  | 
     | 
     | 
     | 
    
    
    2315  | 
     | 
     | 
    	if (dstindx == 0 && dst->parent_index != 0) { | 
    
    
    2316  | 
     | 
     | 
    		expand_prefix(bt, dst, 0, &tmpkey);  | 
    
    
    2317  | 
     | 
     | 
    		key.size = tmpkey.len;  | 
    
    
    2318  | 
     | 
     | 
    		key.data = tmpkey.str;  | 
    
    
    2319  | 
     | 
     | 
    		remove_prefix(bt, &key, dst->parent->prefix.len);  | 
    
    
    2320  | 
     | 
     | 
     | 
    
    
    2321  | 
     | 
     | 
    		DPRINTF("update separator for destination page %u to [%.*s]", | 
    
    
    2322  | 
     | 
     | 
    		    dst->pgno, (int)key.size, (char *)key.data);  | 
    
    
    2323  | 
     | 
     | 
    		if (btree_update_key(bt, dst->parent, dst->parent_index,  | 
    
    
    2324  | 
     | 
     | 
    		    &key) != BT_SUCCESS)  | 
    
    
    2325  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    2326  | 
     | 
     | 
    	}  | 
    
    
    2327  | 
     | 
     | 
     | 
    
    
    2328  | 
     | 
     | 
    	if (dstindx == 0 && IS_BRANCH(dst)) { | 
    
    
    2329  | 
     | 
     | 
    		struct btval	 nullkey;  | 
    
    
    2330  | 
     | 
     | 
    		nullkey.size = 0;  | 
    
    
    2331  | 
     | 
     | 
    		assert(btree_update_key(bt, dst, 0, &nullkey) == BT_SUCCESS);  | 
    
    
    2332  | 
     | 
     | 
    	}  | 
    
    
    2333  | 
     | 
     | 
     | 
    
    
    2334  | 
     | 
     | 
    	/* We can get a new page prefix here!  | 
    
    
    2335  | 
     | 
     | 
    	 * Must update keys in all nodes of this page!  | 
    
    
    2336  | 
     | 
     | 
    	 */  | 
    
    
    2337  | 
     | 
     | 
    	pfxlen = src->prefix.len;  | 
    
    
    2338  | 
     | 
     | 
    	find_common_prefix(bt, src);  | 
    
    
    2339  | 
     | 
     | 
    	if (src->prefix.len != pfxlen) { | 
    
    
    2340  | 
     | 
     | 
    		if (btree_adjust_prefix(bt, src,  | 
    
    
    2341  | 
     | 
     | 
    		    src->prefix.len - pfxlen) != BT_SUCCESS)  | 
    
    
    2342  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    2343  | 
     | 
     | 
    	}  | 
    
    
    2344  | 
     | 
     | 
     | 
    
    
    2345  | 
     | 
     | 
    	pfxlen = dst->prefix.len;  | 
    
    
    2346  | 
     | 
     | 
    	find_common_prefix(bt, dst);  | 
    
    
    2347  | 
     | 
     | 
    	if (dst->prefix.len != pfxlen) { | 
    
    
    2348  | 
     | 
     | 
    		if (btree_adjust_prefix(bt, dst,  | 
    
    
    2349  | 
     | 
     | 
    		    dst->prefix.len - pfxlen) != BT_SUCCESS)  | 
    
    
    2350  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    2351  | 
     | 
     | 
    	}  | 
    
    
    2352  | 
     | 
     | 
     | 
    
    
    2353  | 
     | 
     | 
    	if (IS_BRANCH(dst)) { | 
    
    
    2354  | 
     | 
     | 
    		assert(mp);  | 
    
    
    2355  | 
     | 
     | 
    		mp->parent = dst;  | 
    
    
    2356  | 
     | 
     | 
    		mp->parent_index = dstindx;  | 
    
    
    2357  | 
     | 
     | 
    		find_common_prefix(bt, mp);  | 
    
    
    2358  | 
     | 
     | 
    		if (mp->prefix.len != mp_pfxlen) { | 
    
    
    2359  | 
     | 
     | 
    			DPRINTF("moved branch node has changed prefix"); | 
    
    
    2360  | 
     | 
     | 
    			if ((mp = mpage_touch(bt, mp)) == NULL)  | 
    
    
    2361  | 
     | 
     | 
    				return BT_FAIL;  | 
    
    
    2362  | 
     | 
     | 
    			if (btree_adjust_prefix(bt, mp,  | 
    
    
    2363  | 
     | 
     | 
    			    mp->prefix.len - mp_pfxlen) != BT_SUCCESS)  | 
    
    
    2364  | 
     | 
     | 
    				return BT_FAIL;  | 
    
    
    2365  | 
     | 
     | 
    		}  | 
    
    
    2366  | 
     | 
     | 
    	}  | 
    
    
    2367  | 
     | 
     | 
     | 
    
    
    2368  | 
     | 
     | 
    	return BT_SUCCESS;  | 
    
    
    2369  | 
     | 
     | 
    }  | 
    
    
    2370  | 
     | 
     | 
     | 
    
    
    2371  | 
     | 
     | 
    static int  | 
    
    
    2372  | 
     | 
     | 
    btree_merge(struct btree *bt, struct mpage *src, struct mpage *dst)  | 
    
    
    2373  | 
     | 
     | 
    { | 
    
    
    2374  | 
     | 
     | 
    	int			 rc;  | 
    
    
    2375  | 
     | 
     | 
    	indx_t			 i;  | 
    
    
    2376  | 
     | 
     | 
    	unsigned int		 pfxlen;  | 
    
    
    2377  | 
     | 
     | 
    	struct node		*srcnode;  | 
    
    
    2378  | 
     | 
     | 
    	struct btkey		 tmpkey, dstpfx;  | 
    
    
    2379  | 
     | 
     | 
    	struct btval		 key, data;  | 
    
    
    2380  | 
     | 
     | 
     | 
    
    
    2381  | 
     | 
     | 
    	DPRINTF("merging page %u and %u", src->pgno, dst->pgno); | 
    
    
    2382  | 
     | 
     | 
     | 
    
    
    2383  | 
     | 
     | 
    	assert(src->parent);	/* can't merge root page */  | 
    
    
    2384  | 
     | 
     | 
    	assert(dst->parent);  | 
    
    
    2385  | 
     | 
     | 
    	assert(bt->txn != NULL);  | 
    
    
    2386  | 
     | 
     | 
     | 
    
    
    2387  | 
     | 
     | 
    	/* Mark src and dst as dirty. */  | 
    
    
    2388  | 
     | 
     | 
    	if ((src = mpage_touch(bt, src)) == NULL ||  | 
    
    
    2389  | 
     | 
     | 
    	    (dst = mpage_touch(bt, dst)) == NULL)  | 
    
    
    2390  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    2391  | 
     | 
     | 
     | 
    
    
    2392  | 
     | 
     | 
    	find_common_prefix(bt, src);  | 
    
    
    2393  | 
     | 
     | 
    	find_common_prefix(bt, dst);  | 
    
    
    2394  | 
     | 
     | 
     | 
    
    
    2395  | 
     | 
     | 
    	/* Check if source nodes has destination page prefix. Otherwise  | 
    
    
    2396  | 
     | 
     | 
    	 * the destination page must expand its prefix on all its nodes.  | 
    
    
    2397  | 
     | 
     | 
    	 */  | 
    
    
    2398  | 
     | 
     | 
    	common_prefix(bt, &src->prefix, &dst->prefix, &dstpfx);  | 
    
    
    2399  | 
     | 
     | 
    	if (dstpfx.len != dst->prefix.len) { | 
    
    
    2400  | 
     | 
     | 
    		if (btree_adjust_prefix(bt, dst,  | 
    
    
    2401  | 
     | 
     | 
    		    dstpfx.len - dst->prefix.len) != BT_SUCCESS)  | 
    
    
    2402  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    2403  | 
     | 
     | 
    		bcopy(&dstpfx, &dst->prefix, sizeof(dstpfx));  | 
    
    
    2404  | 
     | 
     | 
    	}  | 
    
    
    2405  | 
     | 
     | 
     | 
    
    
    2406  | 
     | 
     | 
    	/* Move all nodes from src to dst.  | 
    
    
    2407  | 
     | 
     | 
    	 */  | 
    
    
    2408  | 
     | 
     | 
    	for (i = 0; i < NUMKEYS(src); i++) { | 
    
    
    2409  | 
     | 
     | 
    		srcnode = NODEPTR(src, i);  | 
    
    
    2410  | 
     | 
     | 
     | 
    
    
    2411  | 
     | 
     | 
    		/* If branch node 0 (implicit key), find the real key.  | 
    
    
    2412  | 
     | 
     | 
    		 */  | 
    
    
    2413  | 
     | 
     | 
    		if (i == 0 && IS_BRANCH(src)) { | 
    
    
    2414  | 
     | 
     | 
    			struct mpage	*low;  | 
    
    
    2415  | 
     | 
     | 
     | 
    
    
    2416  | 
     | 
     | 
    			/* must find the lowest key below src  | 
    
    
    2417  | 
     | 
     | 
    			 */  | 
    
    
    2418  | 
     | 
     | 
    			assert(btree_search_page_root(bt, src, NULL, NULL, 0,  | 
    
    
    2419  | 
     | 
     | 
    			    &low) == BT_SUCCESS);  | 
    
    
    2420  | 
     | 
     | 
    			expand_prefix(bt, low, 0, &tmpkey);  | 
    
    
    2421  | 
     | 
     | 
    			DPRINTF("found lowest key [%.*s] on leaf page %u", | 
    
    
    2422  | 
     | 
     | 
    			    (int)tmpkey.len, tmpkey.str, low->pgno);  | 
    
    
    2423  | 
     | 
     | 
    		} else { | 
    
    
    2424  | 
     | 
     | 
    			expand_prefix(bt, src, i, &tmpkey);  | 
    
    
    2425  | 
     | 
     | 
    		}  | 
    
    
    2426  | 
     | 
     | 
     | 
    
    
    2427  | 
     | 
     | 
    		key.size = tmpkey.len;  | 
    
    
    2428  | 
     | 
     | 
    		key.data = tmpkey.str;  | 
    
    
    2429  | 
     | 
     | 
     | 
    
    
    2430  | 
     | 
     | 
    		remove_prefix(bt, &key, dst->prefix.len);  | 
    
    
    2431  | 
     | 
     | 
    		data.size = NODEDSZ(srcnode);  | 
    
    
    2432  | 
     | 
     | 
    		data.data = NODEDATA(srcnode);  | 
    
    
    2433  | 
     | 
     | 
    		rc = btree_add_node(bt, dst, NUMKEYS(dst), &key,  | 
    
    
    2434  | 
     | 
     | 
    		    &data, NODEPGNO(srcnode), srcnode->flags);  | 
    
    
    2435  | 
     | 
     | 
    		if (rc != BT_SUCCESS)  | 
    
    
    2436  | 
     | 
     | 
    			return rc;  | 
    
    
    2437  | 
     | 
     | 
    	}  | 
    
    
    2438  | 
     | 
     | 
     | 
    
    
    2439  | 
     | 
     | 
    	DPRINTF("dst page %u now has %lu keys (%.1f%% filled)", | 
    
    
    2440  | 
     | 
     | 
    	    dst->pgno, NUMKEYS(dst), (float)PAGEFILL(bt, dst) / 10);  | 
    
    
    2441  | 
     | 
     | 
     | 
    
    
    2442  | 
     | 
     | 
    	/* Unlink the src page from parent.  | 
    
    
    2443  | 
     | 
     | 
    	 */  | 
    
    
    2444  | 
     | 
     | 
    	btree_del_node(bt, src->parent, src->parent_index);  | 
    
    
    2445  | 
     | 
     | 
    	if (src->parent_index == 0) { | 
    
    
    2446  | 
     | 
     | 
    		key.size = 0;  | 
    
    
    2447  | 
     | 
     | 
    		if (btree_update_key(bt, src->parent, 0, &key) != BT_SUCCESS)  | 
    
    
    2448  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    2449  | 
     | 
     | 
     | 
    
    
    2450  | 
     | 
     | 
    		pfxlen = src->prefix.len;  | 
    
    
    2451  | 
     | 
     | 
    		find_common_prefix(bt, src);  | 
    
    
    2452  | 
     | 
     | 
    		assert (src->prefix.len == pfxlen);  | 
    
    
    2453  | 
     | 
     | 
    	}  | 
    
    
    2454  | 
     | 
     | 
     | 
    
    
    2455  | 
     | 
     | 
    	if (IS_LEAF(src))  | 
    
    
    2456  | 
     | 
     | 
    		bt->meta.leaf_pages--;  | 
    
    
    2457  | 
     | 
     | 
    	else  | 
    
    
    2458  | 
     | 
     | 
    		bt->meta.branch_pages--;  | 
    
    
    2459  | 
     | 
     | 
     | 
    
    
    2460  | 
     | 
     | 
    	return btree_rebalance(bt, src->parent);  | 
    
    
    2461  | 
     | 
     | 
    }  | 
    
    
    2462  | 
     | 
     | 
     | 
    
    
    2463  | 
     | 
     | 
    #define FILL_THRESHOLD	 250  | 
    
    
    2464  | 
     | 
     | 
     | 
    
    
    2465  | 
     | 
     | 
    static int  | 
    
    
    2466  | 
     | 
     | 
    btree_rebalance(struct btree *bt, struct mpage *mp)  | 
    
    
    2467  | 
     | 
     | 
    { | 
    
    
    2468  | 
     | 
     | 
    	struct node	*node;  | 
    
    
    2469  | 
     | 
     | 
    	struct mpage	*parent;  | 
    
    
    2470  | 
     | 
     | 
    	struct mpage	*root;  | 
    
    
    2471  | 
     | 
     | 
    	struct mpage	*neighbor = NULL;  | 
    
    
    2472  | 
     | 
     | 
    	indx_t		 si = 0, di = 0;  | 
    
    
    2473  | 
     | 
     | 
     | 
    
    
    2474  | 
     | 
     | 
    	assert(bt != NULL);  | 
    
    
    2475  | 
     | 
     | 
    	assert(bt->txn != NULL);  | 
    
    
    2476  | 
     | 
     | 
    	assert(mp != NULL);  | 
    
    
    2477  | 
     | 
     | 
     | 
    
    
    2478  | 
     | 
     | 
    	DPRINTF("rebalancing %s page %u (has %lu keys, %.1f%% full)", | 
    
    
    2479  | 
     | 
     | 
    	    IS_LEAF(mp) ? "leaf" : "branch",  | 
    
    
    2480  | 
     | 
     | 
    	    mp->pgno, NUMKEYS(mp), (float)PAGEFILL(bt, mp) / 10);  | 
    
    
    2481  | 
     | 
     | 
     | 
    
    
    2482  | 
     | 
     | 
    	if (PAGEFILL(bt, mp) >= FILL_THRESHOLD) { | 
    
    
    2483  | 
     | 
     | 
    		DPRINTF("no need to rebalance page %u, above fill threshold", | 
    
    
    2484  | 
     | 
     | 
    		    mp->pgno);  | 
    
    
    2485  | 
     | 
     | 
    		return BT_SUCCESS;  | 
    
    
    2486  | 
     | 
     | 
    	}  | 
    
    
    2487  | 
     | 
     | 
     | 
    
    
    2488  | 
     | 
     | 
    	parent = mp->parent;  | 
    
    
    2489  | 
     | 
     | 
     | 
    
    
    2490  | 
     | 
     | 
    	if (parent == NULL) { | 
    
    
    2491  | 
     | 
     | 
    		if (NUMKEYS(mp) == 0) { | 
    
    
    2492  | 
     | 
     | 
    			DPRINTF("tree is completely empty"); | 
    
    
    2493  | 
     | 
     | 
    			bt->txn->root = P_INVALID;  | 
    
    
    2494  | 
     | 
     | 
    			bt->meta.depth--;  | 
    
    
    2495  | 
     | 
     | 
    			bt->meta.leaf_pages--;  | 
    
    
    2496  | 
     | 
     | 
    		} else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { | 
    
    
    2497  | 
     | 
     | 
    			DPRINTF("collapsing root page!"); | 
    
    
    2498  | 
     | 
     | 
    			bt->txn->root = NODEPGNO(NODEPTR(mp, 0));  | 
    
    
    2499  | 
     | 
     | 
    			if ((root = btree_get_mpage(bt, bt->txn->root)) == NULL)  | 
    
    
    2500  | 
     | 
     | 
    				return BT_FAIL;  | 
    
    
    2501  | 
     | 
     | 
    			root->parent = NULL;  | 
    
    
    2502  | 
     | 
     | 
    			bt->meta.depth--;  | 
    
    
    2503  | 
     | 
     | 
    			bt->meta.branch_pages--;  | 
    
    
    2504  | 
     | 
     | 
    		} else  | 
    
    
    2505  | 
     | 
     | 
    			DPRINTF("root page doesn't need rebalancing"); | 
    
    
    2506  | 
     | 
     | 
    		return BT_SUCCESS;  | 
    
    
    2507  | 
     | 
     | 
    	}  | 
    
    
    2508  | 
     | 
     | 
     | 
    
    
    2509  | 
     | 
     | 
    	/* The parent (branch page) must have at least 2 pointers,  | 
    
    
    2510  | 
     | 
     | 
    	 * otherwise the tree is invalid.  | 
    
    
    2511  | 
     | 
     | 
    	 */  | 
    
    
    2512  | 
     | 
     | 
    	assert(NUMKEYS(parent) > 1);  | 
    
    
    2513  | 
     | 
     | 
     | 
    
    
    2514  | 
     | 
     | 
    	/* Leaf page fill factor is below the threshold.  | 
    
    
    2515  | 
     | 
     | 
    	 * Try to move keys from left or right neighbor, or  | 
    
    
    2516  | 
     | 
     | 
    	 * merge with a neighbor page.  | 
    
    
    2517  | 
     | 
     | 
    	 */  | 
    
    
    2518  | 
     | 
     | 
     | 
    
    
    2519  | 
     | 
     | 
    	/* Find neighbors.  | 
    
    
    2520  | 
     | 
     | 
    	 */  | 
    
    
    2521  | 
     | 
     | 
    	if (mp->parent_index == 0) { | 
    
    
    2522  | 
     | 
     | 
    		/* We're the leftmost leaf in our parent.  | 
    
    
    2523  | 
     | 
     | 
    		 */  | 
    
    
    2524  | 
     | 
     | 
    		DPRINTF("reading right neighbor"); | 
    
    
    2525  | 
     | 
     | 
    		node = NODEPTR(parent, mp->parent_index + 1);  | 
    
    
    2526  | 
     | 
     | 
    		if ((neighbor = btree_get_mpage(bt, NODEPGNO(node))) == NULL)  | 
    
    
    2527  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    2528  | 
     | 
     | 
    		neighbor->parent_index = mp->parent_index + 1;  | 
    
    
    2529  | 
     | 
     | 
    		si = 0;  | 
    
    
    2530  | 
     | 
     | 
    		di = NUMKEYS(mp);  | 
    
    
    2531  | 
     | 
     | 
    	} else { | 
    
    
    2532  | 
     | 
     | 
    		/* There is at least one neighbor to the left.  | 
    
    
    2533  | 
     | 
     | 
    		 */  | 
    
    
    2534  | 
     | 
     | 
    		DPRINTF("reading left neighbor"); | 
    
    
    2535  | 
     | 
     | 
    		node = NODEPTR(parent, mp->parent_index - 1);  | 
    
    
    2536  | 
     | 
     | 
    		if ((neighbor = btree_get_mpage(bt, NODEPGNO(node))) == NULL)  | 
    
    
    2537  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    2538  | 
     | 
     | 
    		neighbor->parent_index = mp->parent_index - 1;  | 
    
    
    2539  | 
     | 
     | 
    		si = NUMKEYS(neighbor) - 1;  | 
    
    
    2540  | 
     | 
     | 
    		di = 0;  | 
    
    
    2541  | 
     | 
     | 
    	}  | 
    
    
    2542  | 
     | 
     | 
    	neighbor->parent = parent;  | 
    
    
    2543  | 
     | 
     | 
     | 
    
    
    2544  | 
     | 
     | 
    	DPRINTF("found neighbor page %u (%lu keys, %.1f%% full)", | 
    
    
    2545  | 
     | 
     | 
    	    neighbor->pgno, NUMKEYS(neighbor), (float)PAGEFILL(bt, neighbor) / 10);  | 
    
    
    2546  | 
     | 
     | 
     | 
    
    
    2547  | 
     | 
     | 
    	/* If the neighbor page is above threshold and has at least two  | 
    
    
    2548  | 
     | 
     | 
    	 * keys, move one key from it.  | 
    
    
    2549  | 
     | 
     | 
    	 *  | 
    
    
    2550  | 
     | 
     | 
    	 * Otherwise we should try to merge them, but that might not be  | 
    
    
    2551  | 
     | 
     | 
    	 * possible, even if both are below threshold, as prefix expansion  | 
    
    
    2552  | 
     | 
     | 
    	 * might make keys larger. FIXME: detect this  | 
    
    
    2553  | 
     | 
     | 
    	 */  | 
    
    
    2554  | 
     | 
     | 
    	if (PAGEFILL(bt, neighbor) >= FILL_THRESHOLD && NUMKEYS(neighbor) >= 2)  | 
    
    
    2555  | 
     | 
     | 
    		return btree_move_node(bt, neighbor, si, mp, di);  | 
    
    
    2556  | 
     | 
     | 
    	else { /* FIXME: if (has_enough_room()) */ | 
    
    
    2557  | 
     | 
     | 
    		if (mp->parent_index == 0)  | 
    
    
    2558  | 
     | 
     | 
    			return btree_merge(bt, neighbor, mp);  | 
    
    
    2559  | 
     | 
     | 
    		else  | 
    
    
    2560  | 
     | 
     | 
    			return btree_merge(bt, mp, neighbor);  | 
    
    
    2561  | 
     | 
     | 
    	}  | 
    
    
    2562  | 
     | 
     | 
    }  | 
    
    
    2563  | 
     | 
     | 
     | 
    
    
    2564  | 
     | 
     | 
    int  | 
    
    
    2565  | 
     | 
     | 
    btree_txn_del(struct btree *bt, struct btree_txn *txn,  | 
    
    
    2566  | 
     | 
     | 
        struct btval *key, struct btval *data)  | 
    
    
    2567  | 
     | 
     | 
    { | 
    
    
    2568  | 
     | 
     | 
    	int		 rc, exact, close_txn = 0;  | 
    
    
    2569  | 
     | 
     | 
    	unsigned int	 ki;  | 
    
    
    2570  | 
     | 
     | 
    	struct node	*leaf;  | 
    
    
    2571  | 
     | 
     | 
    	struct mpage	*mp;  | 
    
    
    2572  | 
     | 
     | 
     | 
    
    
    2573  | 
     | 
     | 
    	DPRINTF("========> delete key %.*s", (int)key->size, (char *)key->data); | 
    
    
    2574  | 
     | 
     | 
     | 
    
    
    2575  | 
     | 
     | 
    	assert(key != NULL);  | 
    
    
    2576  | 
     | 
     | 
     | 
    
    
    2577  | 
     | 
     | 
    	if (bt != NULL && txn != NULL && bt != txn->bt) { | 
    
    
    2578  | 
     | 
     | 
    		errno = EINVAL;  | 
    
    
    2579  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    2580  | 
     | 
     | 
    	}  | 
    
    
    2581  | 
     | 
     | 
     | 
    
    
    2582  | 
     | 
     | 
    	if (txn != NULL && F_ISSET(txn->flags, BT_TXN_RDONLY)) { | 
    
    
    2583  | 
     | 
     | 
    		errno = EINVAL;  | 
    
    
    2584  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    2585  | 
     | 
     | 
    	}  | 
    
    
    2586  | 
     | 
     | 
     | 
    
    
    2587  | 
     | 
     | 
    	if (bt == NULL) { | 
    
    
    2588  | 
     | 
     | 
    		if (txn == NULL) { | 
    
    
    2589  | 
     | 
     | 
    			errno = EINVAL;  | 
    
    
    2590  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    2591  | 
     | 
     | 
    		}  | 
    
    
    2592  | 
     | 
     | 
    		bt = txn->bt;  | 
    
    
    2593  | 
     | 
     | 
    	}  | 
    
    
    2594  | 
     | 
     | 
     | 
    
    
    2595  | 
     | 
     | 
    	if (key->size == 0 || key->size > MAXKEYSIZE) { | 
    
    
    2596  | 
     | 
     | 
    		errno = EINVAL;  | 
    
    
    2597  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    2598  | 
     | 
     | 
    	}  | 
    
    
    2599  | 
     | 
     | 
     | 
    
    
    2600  | 
     | 
     | 
    	if (txn == NULL) { | 
    
    
    2601  | 
     | 
     | 
    		close_txn = 1;  | 
    
    
    2602  | 
     | 
     | 
    		if ((txn = btree_txn_begin(bt, 0)) == NULL)  | 
    
    
    2603  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    2604  | 
     | 
     | 
    	}  | 
    
    
    2605  | 
     | 
     | 
     | 
    
    
    2606  | 
     | 
     | 
    	if ((rc = btree_search_page(bt, txn, key, NULL, 1, &mp)) != BT_SUCCESS)  | 
    
    
    2607  | 
     | 
     | 
    		goto done;  | 
    
    
    2608  | 
     | 
     | 
     | 
    
    
    2609  | 
     | 
     | 
    	leaf = btree_search_node(bt, mp, key, &exact, &ki);  | 
    
    
    2610  | 
     | 
     | 
    	if (leaf == NULL || !exact) { | 
    
    
    2611  | 
     | 
     | 
    		errno = ENOENT;  | 
    
    
    2612  | 
     | 
     | 
    		rc = BT_FAIL;  | 
    
    
    2613  | 
     | 
     | 
    		goto done;  | 
    
    
    2614  | 
     | 
     | 
    	}  | 
    
    
    2615  | 
     | 
     | 
     | 
    
    
    2616  | 
     | 
     | 
    	if (data && (rc = btree_read_data(bt, NULL, leaf, data)) != BT_SUCCESS)  | 
    
    
    2617  | 
     | 
     | 
    		goto done;  | 
    
    
    2618  | 
     | 
     | 
     | 
    
    
    2619  | 
     | 
     | 
    	btree_del_node(bt, mp, ki);  | 
    
    
    2620  | 
     | 
     | 
    	bt->meta.entries--;  | 
    
    
    2621  | 
     | 
     | 
    	rc = btree_rebalance(bt, mp);  | 
    
    
    2622  | 
     | 
     | 
    	if (rc != BT_SUCCESS)  | 
    
    
    2623  | 
     | 
     | 
    		txn->flags |= BT_TXN_ERROR;  | 
    
    
    2624  | 
     | 
     | 
     | 
    
    
    2625  | 
     | 
     | 
    done:  | 
    
    
    2626  | 
     | 
     | 
    	if (close_txn) { | 
    
    
    2627  | 
     | 
     | 
    		if (rc == BT_SUCCESS)  | 
    
    
    2628  | 
     | 
     | 
    			rc = btree_txn_commit(txn);  | 
    
    
    2629  | 
     | 
     | 
    		else  | 
    
    
    2630  | 
     | 
     | 
    			btree_txn_abort(txn);  | 
    
    
    2631  | 
     | 
     | 
    	}  | 
    
    
    2632  | 
     | 
     | 
    	mpage_prune(bt);  | 
    
    
    2633  | 
     | 
     | 
    	return rc;  | 
    
    
    2634  | 
     | 
     | 
    }  | 
    
    
    2635  | 
     | 
     | 
     | 
    
    
    2636  | 
     | 
     | 
    /* Reduce the length of the prefix separator <sep> to the minimum length that  | 
    
    
    2637  | 
     | 
     | 
     * still makes it uniquely distinguishable from <min>.  | 
    
    
    2638  | 
     | 
     | 
     *  | 
    
    
    2639  | 
     | 
     | 
     * <min> is guaranteed to be sorted less than <sep>  | 
    
    
    2640  | 
     | 
     | 
     *  | 
    
    
    2641  | 
     | 
     | 
     * On return, <sep> is modified to the minimum length.  | 
    
    
    2642  | 
     | 
     | 
     */  | 
    
    
    2643  | 
     | 
     | 
    static void  | 
    
    
    2644  | 
     | 
     | 
    bt_reduce_separator(struct btree *bt, struct node *min, struct btval *sep)  | 
    
    
    2645  | 
     | 
     | 
    { | 
    
    
    2646  | 
     | 
     | 
    	size_t		 n = 0;  | 
    
    
    2647  | 
     | 
     | 
    	char		*p1;  | 
    
    
    2648  | 
     | 
     | 
    	char		*p2;  | 
    
    
    2649  | 
     | 
     | 
     | 
    
    
    2650  | 
     | 
     | 
    	if (F_ISSET(bt->flags, BT_REVERSEKEY)) { | 
    
    
    2651  | 
     | 
     | 
     | 
    
    
    2652  | 
     | 
     | 
    		assert(sep->size > 0);  | 
    
    
    2653  | 
     | 
     | 
     | 
    
    
    2654  | 
     | 
     | 
    		p1 = (char *)NODEKEY(min) + min->ksize - 1;  | 
    
    
    2655  | 
     | 
     | 
    		p2 = (char *)sep->data + sep->size - 1;  | 
    
    
    2656  | 
     | 
     | 
     | 
    
    
    2657  | 
     | 
     | 
    		while (p1 >= (char *)NODEKEY(min) && *p1 == *p2) { | 
    
    
    2658  | 
     | 
     | 
    			assert(p2 > (char *)sep->data);  | 
    
    
    2659  | 
     | 
     | 
    			p1--;  | 
    
    
    2660  | 
     | 
     | 
    			p2--;  | 
    
    
    2661  | 
     | 
     | 
    			n++;  | 
    
    
    2662  | 
     | 
     | 
    		}  | 
    
    
    2663  | 
     | 
     | 
     | 
    
    
    2664  | 
     | 
     | 
    		sep->data = p2;  | 
    
    
    2665  | 
     | 
     | 
    		sep->size = n + 1;  | 
    
    
    2666  | 
     | 
     | 
    	} else { | 
    
    
    2667  | 
     | 
     | 
     | 
    
    
    2668  | 
     | 
     | 
    		assert(min->ksize > 0);  | 
    
    
    2669  | 
     | 
     | 
    		assert(sep->size > 0);  | 
    
    
    2670  | 
     | 
     | 
     | 
    
    
    2671  | 
     | 
     | 
    		p1 = (char *)NODEKEY(min);  | 
    
    
    2672  | 
     | 
     | 
    		p2 = (char *)sep->data;  | 
    
    
    2673  | 
     | 
     | 
     | 
    
    
    2674  | 
     | 
     | 
    		while (*p1 == *p2) { | 
    
    
    2675  | 
     | 
     | 
    			p1++;  | 
    
    
    2676  | 
     | 
     | 
    			p2++;  | 
    
    
    2677  | 
     | 
     | 
    			n++;  | 
    
    
    2678  | 
     | 
     | 
    			if (n == min->ksize || n == sep->size)  | 
    
    
    2679  | 
     | 
     | 
    				break;  | 
    
    
    2680  | 
     | 
     | 
    		}  | 
    
    
    2681  | 
     | 
     | 
     | 
    
    
    2682  | 
     | 
     | 
    		sep->size = n + 1;  | 
    
    
    2683  | 
     | 
     | 
    	}  | 
    
    
    2684  | 
     | 
     | 
     | 
    
    
    2685  | 
     | 
     | 
    	DPRINTF("reduced separator to [%.*s] > [%.*s]", | 
    
    
    2686  | 
     | 
     | 
    	    (int)sep->size, (char *)sep->data,  | 
    
    
    2687  | 
     | 
     | 
    	    (int)min->ksize, (char *)NODEKEY(min));  | 
    
    
    2688  | 
     | 
     | 
    }  | 
    
    
    2689  | 
     | 
     | 
     | 
    
    
    2690  | 
     | 
     | 
    /* Split page <*mpp>, and insert <key,(data|newpgno)> in either left or  | 
    
    
    2691  | 
     | 
     | 
     * right sibling, at index <*newindxp> (as if unsplit). Updates *mpp and  | 
    
    
    2692  | 
     | 
     | 
     * *newindxp with the actual values after split, ie if *mpp and *newindxp  | 
    
    
    2693  | 
     | 
     | 
     * refer to a node in the new right sibling page.  | 
    
    
    2694  | 
     | 
     | 
     */  | 
    
    
    2695  | 
     | 
     | 
    static int  | 
    
    
    2696  | 
     | 
     | 
    btree_split(struct btree *bt, struct mpage **mpp, unsigned int *newindxp,  | 
    
    
    2697  | 
     | 
     | 
        struct btval *newkey, struct btval *newdata, pgno_t newpgno)  | 
    
    
    2698  | 
     | 
     | 
    { | 
    
    
    2699  | 
     | 
     | 
    	uint8_t		 flags;  | 
    
    
    2700  | 
     | 
     | 
    	int		 rc = BT_SUCCESS, ins_new = 0;  | 
    
    
    2701  | 
     | 
     | 
    	indx_t		 newindx;  | 
    
    
    2702  | 
     | 
     | 
    	pgno_t		 pgno = 0;  | 
    
    
    2703  | 
     | 
     | 
    	size_t		 orig_pfx_len, left_pfx_diff, right_pfx_diff, pfx_diff;  | 
    
    
    2704  | 
     | 
     | 
    	unsigned int	 i, j, split_indx;  | 
    
    
    2705  | 
     | 
     | 
    	struct node	*node;  | 
    
    
    2706  | 
     | 
     | 
    	struct mpage	*pright, *p, *mp;  | 
    
    
    2707  | 
     | 
     | 
    	struct btval	 sepkey, rkey, rdata;  | 
    
    
    2708  | 
     | 
     | 
    	struct btkey	 tmpkey;  | 
    
    
    2709  | 
     | 
     | 
    	struct page	*copy;  | 
    
    
    2710  | 
     | 
     | 
     | 
    
    
    2711  | 
     | 
     | 
    	assert(bt != NULL);  | 
    
    
    2712  | 
     | 
     | 
    	assert(bt->txn != NULL);  | 
    
    
    2713  | 
     | 
     | 
     | 
    
    
    2714  | 
     | 
     | 
    	mp = *mpp;  | 
    
    
    2715  | 
     | 
     | 
    	newindx = *newindxp;  | 
    
    
    2716  | 
     | 
     | 
     | 
    
    
    2717  | 
     | 
     | 
    	DPRINTF("-----> splitting %s page %u and adding [%.*s] at index %d", | 
    
    
    2718  | 
     | 
     | 
    	    IS_LEAF(mp) ? "leaf" : "branch", mp->pgno,  | 
    
    
    2719  | 
     | 
     | 
    	    (int)newkey->size, (char *)newkey->data, *newindxp);  | 
    
    
    2720  | 
     | 
     | 
    	DPRINTF("page %u has prefix [%.*s]", mp->pgno, | 
    
    
    2721  | 
     | 
     | 
    	    (int)mp->prefix.len, (char *)mp->prefix.str);  | 
    
    
    2722  | 
     | 
     | 
    	orig_pfx_len = mp->prefix.len;  | 
    
    
    2723  | 
     | 
     | 
     | 
    
    
    2724  | 
     | 
     | 
    	if (mp->parent == NULL) { | 
    
    
    2725  | 
     | 
     | 
    		if ((mp->parent = btree_new_page(bt, P_BRANCH)) == NULL)  | 
    
    
    2726  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    2727  | 
     | 
     | 
    		mp->parent_index = 0;  | 
    
    
    2728  | 
     | 
     | 
    		bt->txn->root = mp->parent->pgno;  | 
    
    
    2729  | 
     | 
     | 
    		DPRINTF("root split! new root = %u", mp->parent->pgno); | 
    
    
    2730  | 
     | 
     | 
    		bt->meta.depth++;  | 
    
    
    2731  | 
     | 
     | 
     | 
    
    
    2732  | 
     | 
     | 
    		/* Add left (implicit) pointer. */  | 
    
    
    2733  | 
     | 
     | 
    		if (btree_add_node(bt, mp->parent, 0, NULL, NULL,  | 
    
    
    2734  | 
     | 
     | 
    		    mp->pgno, 0) != BT_SUCCESS)  | 
    
    
    2735  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    2736  | 
     | 
     | 
    	} else { | 
    
    
    2737  | 
     | 
     | 
    		DPRINTF("parent branch page is %u", mp->parent->pgno); | 
    
    
    2738  | 
     | 
     | 
    	}  | 
    
    
    2739  | 
     | 
     | 
     | 
    
    
    2740  | 
     | 
     | 
    	/* Create a right sibling. */  | 
    
    
    2741  | 
     | 
     | 
    	if ((pright = btree_new_page(bt, mp->page->flags)) == NULL)  | 
    
    
    2742  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    2743  | 
     | 
     | 
    	pright->parent = mp->parent;  | 
    
    
    2744  | 
     | 
     | 
    	pright->parent_index = mp->parent_index + 1;  | 
    
    
    2745  | 
     | 
     | 
    	DPRINTF("new right sibling: page %u", pright->pgno); | 
    
    
    2746  | 
     | 
     | 
     | 
    
    
    2747  | 
     | 
     | 
    	/* Move half of the keys to the right sibling. */  | 
    
    
    2748  | 
     | 
     | 
    	if ((copy = malloc(bt->head.psize)) == NULL)  | 
    
    
    2749  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    2750  | 
     | 
     | 
    	bcopy(mp->page, copy, bt->head.psize);  | 
    
    
    2751  | 
     | 
     | 
    	assert(mp->ref == 0);				/* XXX */  | 
    
    
    2752  | 
     | 
     | 
    	memset(&mp->page->ptrs, 0, bt->head.psize - PAGEHDRSZ);  | 
    
    
    2753  | 
     | 
     | 
    	mp->page->lower = PAGEHDRSZ;  | 
    
    
    2754  | 
     | 
     | 
    	mp->page->upper = bt->head.psize;  | 
    
    
    2755  | 
     | 
     | 
     | 
    
    
    2756  | 
     | 
     | 
    	split_indx = NUMKEYSP(copy) / 2 + 1;  | 
    
    
    2757  | 
     | 
     | 
     | 
    
    
    2758  | 
     | 
     | 
    	/* First find the separating key between the split pages.  | 
    
    
    2759  | 
     | 
     | 
    	 */  | 
    
    
    2760  | 
     | 
     | 
    	memset(&sepkey, 0, sizeof(sepkey));  | 
    
    
    2761  | 
     | 
     | 
    	if (newindx == split_indx) { | 
    
    
    2762  | 
     | 
     | 
    		sepkey.size = newkey->size;  | 
    
    
    2763  | 
     | 
     | 
    		sepkey.data = newkey->data;  | 
    
    
    2764  | 
     | 
     | 
    		remove_prefix(bt, &sepkey, mp->prefix.len);  | 
    
    
    2765  | 
     | 
     | 
    	} else { | 
    
    
    2766  | 
     | 
     | 
    		node = NODEPTRP(copy, split_indx);  | 
    
    
    2767  | 
     | 
     | 
    		sepkey.size = node->ksize;  | 
    
    
    2768  | 
     | 
     | 
    		sepkey.data = NODEKEY(node);  | 
    
    
    2769  | 
     | 
     | 
    	}  | 
    
    
    2770  | 
     | 
     | 
     | 
    
    
    2771  | 
     | 
     | 
    	if (IS_LEAF(mp) && bt->cmp == NULL) { | 
    
    
    2772  | 
     | 
     | 
    		/* Find the smallest separator. */  | 
    
    
    2773  | 
     | 
     | 
    		/* Ref: Prefix B-trees, R. Bayer, K. Unterauer, 1977 */  | 
    
    
    2774  | 
     | 
     | 
    		node = NODEPTRP(copy, split_indx - 1);  | 
    
    
    2775  | 
     | 
     | 
    		bt_reduce_separator(bt, node, &sepkey);  | 
    
    
    2776  | 
     | 
     | 
    	}  | 
    
    
    2777  | 
     | 
     | 
     | 
    
    
    2778  | 
     | 
     | 
    	/* Fix separator wrt parent prefix. */  | 
    
    
    2779  | 
     | 
     | 
    	if (bt->cmp == NULL) { | 
    
    
    2780  | 
     | 
     | 
    		tmpkey.len = sizeof(tmpkey.str);  | 
    
    
    2781  | 
     | 
     | 
    		concat_prefix(bt, mp->prefix.str, mp->prefix.len,  | 
    
    
    2782  | 
     | 
     | 
    		    sepkey.data, sepkey.size, tmpkey.str, &tmpkey.len);  | 
    
    
    2783  | 
     | 
     | 
    		sepkey.data = tmpkey.str;  | 
    
    
    2784  | 
     | 
     | 
    		sepkey.size = tmpkey.len;  | 
    
    
    2785  | 
     | 
     | 
    	}  | 
    
    
    2786  | 
     | 
     | 
     | 
    
    
    2787  | 
     | 
     | 
    	DPRINTF("separator is [%.*s]", (int)sepkey.size, (char *)sepkey.data); | 
    
    
    2788  | 
     | 
     | 
     | 
    
    
    2789  | 
     | 
     | 
    	/* Copy separator key to the parent.  | 
    
    
    2790  | 
     | 
     | 
    	 */  | 
    
    
    2791  | 
     | 
     | 
    	if (SIZELEFT(pright->parent) < bt_branch_size(bt, &sepkey)) { | 
    
    
    2792  | 
     | 
     | 
    		rc = btree_split(bt, &pright->parent, &pright->parent_index,  | 
    
    
    2793  | 
     | 
     | 
    		    &sepkey, NULL, pright->pgno);  | 
    
    
    2794  | 
     | 
     | 
     | 
    
    
    2795  | 
     | 
     | 
    		/* Right page might now have changed parent.  | 
    
    
    2796  | 
     | 
     | 
    		 * Check if left page also changed parent.  | 
    
    
    2797  | 
     | 
     | 
    		 */  | 
    
    
    2798  | 
     | 
     | 
    		if (pright->parent != mp->parent &&  | 
    
    
    2799  | 
     | 
     | 
    		    mp->parent_index >= NUMKEYS(mp->parent)) { | 
    
    
    2800  | 
     | 
     | 
    			mp->parent = pright->parent;  | 
    
    
    2801  | 
     | 
     | 
    			mp->parent_index = pright->parent_index - 1;  | 
    
    
    2802  | 
     | 
     | 
    		}  | 
    
    
    2803  | 
     | 
     | 
    	} else { | 
    
    
    2804  | 
     | 
     | 
    		remove_prefix(bt, &sepkey, pright->parent->prefix.len);  | 
    
    
    2805  | 
     | 
     | 
    		rc = btree_add_node(bt, pright->parent, pright->parent_index,  | 
    
    
    2806  | 
     | 
     | 
    		    &sepkey, NULL, pright->pgno, 0);  | 
    
    
    2807  | 
     | 
     | 
    	}  | 
    
    
    2808  | 
     | 
     | 
    	if (rc != BT_SUCCESS) { | 
    
    
    2809  | 
     | 
     | 
    		free(copy);  | 
    
    
    2810  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    2811  | 
     | 
     | 
    	}  | 
    
    
    2812  | 
     | 
     | 
     | 
    
    
    2813  | 
     | 
     | 
    	/* Update prefix for right and left page, if the parent was split.  | 
    
    
    2814  | 
     | 
     | 
    	 */  | 
    
    
    2815  | 
     | 
     | 
    	find_common_prefix(bt, pright);  | 
    
    
    2816  | 
     | 
     | 
    	assert(orig_pfx_len <= pright->prefix.len);  | 
    
    
    2817  | 
     | 
     | 
    	right_pfx_diff = pright->prefix.len - orig_pfx_len;  | 
    
    
    2818  | 
     | 
     | 
     | 
    
    
    2819  | 
     | 
     | 
    	find_common_prefix(bt, mp);  | 
    
    
    2820  | 
     | 
     | 
    	assert(orig_pfx_len <= mp->prefix.len);  | 
    
    
    2821  | 
     | 
     | 
    	left_pfx_diff = mp->prefix.len - orig_pfx_len;  | 
    
    
    2822  | 
     | 
     | 
     | 
    
    
    2823  | 
     | 
     | 
    	for (i = j = 0; i <= NUMKEYSP(copy); j++) { | 
    
    
    2824  | 
     | 
     | 
    		if (i < split_indx) { | 
    
    
    2825  | 
     | 
     | 
    			/* Re-insert in left sibling. */  | 
    
    
    2826  | 
     | 
     | 
    			p = mp;  | 
    
    
    2827  | 
     | 
     | 
    			pfx_diff = left_pfx_diff;  | 
    
    
    2828  | 
     | 
     | 
    		} else { | 
    
    
    2829  | 
     | 
     | 
    			/* Insert in right sibling. */  | 
    
    
    2830  | 
     | 
     | 
    			if (i == split_indx)  | 
    
    
    2831  | 
     | 
     | 
    				/* Reset insert index for right sibling. */  | 
    
    
    2832  | 
     | 
     | 
    				j = (i == newindx && ins_new);  | 
    
    
    2833  | 
     | 
     | 
    			p = pright;  | 
    
    
    2834  | 
     | 
     | 
    			pfx_diff = right_pfx_diff;  | 
    
    
    2835  | 
     | 
     | 
    		}  | 
    
    
    2836  | 
     | 
     | 
     | 
    
    
    2837  | 
     | 
     | 
    		if (i == newindx && !ins_new) { | 
    
    
    2838  | 
     | 
     | 
    			/* Insert the original entry that caused the split. */  | 
    
    
    2839  | 
     | 
     | 
    			rkey.data = newkey->data;  | 
    
    
    2840  | 
     | 
     | 
    			rkey.size = newkey->size;  | 
    
    
    2841  | 
     | 
     | 
    			if (IS_LEAF(mp)) { | 
    
    
    2842  | 
     | 
     | 
    				rdata.data = newdata->data;  | 
    
    
    2843  | 
     | 
     | 
    				rdata.size = newdata->size;  | 
    
    
    2844  | 
     | 
     | 
    			} else  | 
    
    
    2845  | 
     | 
     | 
    				pgno = newpgno;  | 
    
    
    2846  | 
     | 
     | 
    			flags = 0;  | 
    
    
    2847  | 
     | 
     | 
    			pfx_diff = p->prefix.len;  | 
    
    
    2848  | 
     | 
     | 
     | 
    
    
    2849  | 
     | 
     | 
    			ins_new = 1;  | 
    
    
    2850  | 
     | 
     | 
     | 
    
    
    2851  | 
     | 
     | 
    			/* Update page and index for the new key. */  | 
    
    
    2852  | 
     | 
     | 
    			*newindxp = j;  | 
    
    
    2853  | 
     | 
     | 
    			*mpp = p;  | 
    
    
    2854  | 
     | 
     | 
    		} else if (i == NUMKEYSP(copy)) { | 
    
    
    2855  | 
     | 
     | 
    			break;  | 
    
    
    2856  | 
     | 
     | 
    		} else { | 
    
    
    2857  | 
     | 
     | 
    			node = NODEPTRP(copy, i);  | 
    
    
    2858  | 
     | 
     | 
    			rkey.data = NODEKEY(node);  | 
    
    
    2859  | 
     | 
     | 
    			rkey.size = node->ksize;  | 
    
    
    2860  | 
     | 
     | 
    			if (IS_LEAF(mp)) { | 
    
    
    2861  | 
     | 
     | 
    				rdata.data = NODEDATA(node);  | 
    
    
    2862  | 
     | 
     | 
    				rdata.size = node->n_dsize;  | 
    
    
    2863  | 
     | 
     | 
    			} else  | 
    
    
    2864  | 
     | 
     | 
    				pgno = node->n_pgno;  | 
    
    
    2865  | 
     | 
     | 
    			flags = node->flags;  | 
    
    
    2866  | 
     | 
     | 
     | 
    
    
    2867  | 
     | 
     | 
    			i++;  | 
    
    
    2868  | 
     | 
     | 
    		}  | 
    
    
    2869  | 
     | 
     | 
     | 
    
    
    2870  | 
     | 
     | 
    		if (!IS_LEAF(mp) && j == 0) { | 
    
    
    2871  | 
     | 
     | 
    			/* First branch index doesn't need key data. */  | 
    
    
    2872  | 
     | 
     | 
    			rkey.size = 0;  | 
    
    
    2873  | 
     | 
     | 
    		} else  | 
    
    
    2874  | 
     | 
     | 
    			remove_prefix(bt, &rkey, pfx_diff);  | 
    
    
    2875  | 
     | 
     | 
     | 
    
    
    2876  | 
     | 
     | 
    		rc = btree_add_node(bt, p, j, &rkey, &rdata, pgno,flags);  | 
    
    
    2877  | 
     | 
     | 
    	}  | 
    
    
    2878  | 
     | 
     | 
     | 
    
    
    2879  | 
     | 
     | 
    	free(copy);  | 
    
    
    2880  | 
     | 
     | 
    	return rc;  | 
    
    
    2881  | 
     | 
     | 
    }  | 
    
    
    2882  | 
     | 
     | 
     | 
    
    
    2883  | 
     | 
     | 
    int  | 
    
    
    2884  | 
     | 
     | 
    btree_txn_put(struct btree *bt, struct btree_txn *txn,  | 
    
    
    2885  | 
     | 
     | 
        struct btval *key, struct btval *data, unsigned int flags)  | 
    
    
    2886  | 
     | 
     | 
    { | 
    
    
    2887  | 
     | 
     | 
    	int		 rc = BT_SUCCESS, exact, close_txn = 0;  | 
    
    
    2888  | 
     | 
     | 
    	unsigned int	 ki;  | 
    
    
    2889  | 
     | 
     | 
    	struct node	*leaf;  | 
    
    
    2890  | 
     | 
     | 
    	struct mpage	*mp;  | 
    
    
    2891  | 
     | 
     | 
    	struct btval	 xkey;  | 
    
    
    2892  | 
     | 
     | 
     | 
    
    
    2893  | 
     | 
     | 
    	assert(key != NULL);  | 
    
    
    2894  | 
     | 
     | 
    	assert(data != NULL);  | 
    
    
    2895  | 
     | 
     | 
     | 
    
    
    2896  | 
     | 
     | 
    	if (bt != NULL && txn != NULL && bt != txn->bt) { | 
    
    
    2897  | 
     | 
     | 
    		errno = EINVAL;  | 
    
    
    2898  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    2899  | 
     | 
     | 
    	}  | 
    
    
    2900  | 
     | 
     | 
     | 
    
    
    2901  | 
     | 
     | 
    	if (txn != NULL && F_ISSET(txn->flags, BT_TXN_RDONLY)) { | 
    
    
    2902  | 
     | 
     | 
    		errno = EINVAL;  | 
    
    
    2903  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    2904  | 
     | 
     | 
    	}  | 
    
    
    2905  | 
     | 
     | 
     | 
    
    
    2906  | 
     | 
     | 
    	if (bt == NULL) { | 
    
    
    2907  | 
     | 
     | 
    		if (txn == NULL) { | 
    
    
    2908  | 
     | 
     | 
    			errno = EINVAL;  | 
    
    
    2909  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    2910  | 
     | 
     | 
    		}  | 
    
    
    2911  | 
     | 
     | 
    		bt = txn->bt;  | 
    
    
    2912  | 
     | 
     | 
    	}  | 
    
    
    2913  | 
     | 
     | 
     | 
    
    
    2914  | 
     | 
     | 
    	if (key->size == 0 || key->size > MAXKEYSIZE) { | 
    
    
    2915  | 
     | 
     | 
    		errno = EINVAL;  | 
    
    
    2916  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    2917  | 
     | 
     | 
    	}  | 
    
    
    2918  | 
     | 
     | 
     | 
    
    
    2919  | 
     | 
     | 
    	DPRINTF("==> put key %.*s, size %zu, data size %zu", | 
    
    
    2920  | 
     | 
     | 
    		(int)key->size, (char *)key->data, key->size, data->size);  | 
    
    
    2921  | 
     | 
     | 
     | 
    
    
    2922  | 
     | 
     | 
    	if (txn == NULL) { | 
    
    
    2923  | 
     | 
     | 
    		close_txn = 1;  | 
    
    
    2924  | 
     | 
     | 
    		if ((txn = btree_txn_begin(bt, 0)) == NULL)  | 
    
    
    2925  | 
     | 
     | 
    			return BT_FAIL;  | 
    
    
    2926  | 
     | 
     | 
    	}  | 
    
    
    2927  | 
     | 
     | 
     | 
    
    
    2928  | 
     | 
     | 
    	rc = btree_search_page(bt, txn, key, NULL, 1, &mp);  | 
    
    
    2929  | 
     | 
     | 
    	if (rc == BT_SUCCESS) { | 
    
    
    2930  | 
     | 
     | 
    		leaf = btree_search_node(bt, mp, key, &exact, &ki);  | 
    
    
    2931  | 
     | 
     | 
    		if (leaf && exact) { | 
    
    
    2932  | 
     | 
     | 
    			if (F_ISSET(flags, BT_NOOVERWRITE)) { | 
    
    
    2933  | 
     | 
     | 
    				DPRINTF("duplicate key %.*s", | 
    
    
    2934  | 
     | 
     | 
    				    (int)key->size, (char *)key->data);  | 
    
    
    2935  | 
     | 
     | 
    				errno = EEXIST;  | 
    
    
    2936  | 
     | 
     | 
    				rc = BT_FAIL;  | 
    
    
    2937  | 
     | 
     | 
    				goto done;  | 
    
    
    2938  | 
     | 
     | 
    			}  | 
    
    
    2939  | 
     | 
     | 
    			btree_del_node(bt, mp, ki);  | 
    
    
    2940  | 
     | 
     | 
    		}  | 
    
    
    2941  | 
     | 
     | 
    		if (leaf == NULL) {		/* append if not found */ | 
    
    
    2942  | 
     | 
     | 
    			ki = NUMKEYS(mp);  | 
    
    
    2943  | 
     | 
     | 
    			DPRINTF("appending key at index %d", ki); | 
    
    
    2944  | 
     | 
     | 
    		}  | 
    
    
    2945  | 
     | 
     | 
    	} else if (errno == ENOENT) { | 
    
    
    2946  | 
     | 
     | 
    		/* new file, just write a root leaf page */  | 
    
    
    2947  | 
     | 
     | 
    		DPRINTF("allocating new root leaf page"); | 
    
    
    2948  | 
     | 
     | 
    		if ((mp = btree_new_page(bt, P_LEAF)) == NULL) { | 
    
    
    2949  | 
     | 
     | 
    			rc = BT_FAIL;  | 
    
    
    2950  | 
     | 
     | 
    			goto done;  | 
    
    
    2951  | 
     | 
     | 
    		}  | 
    
    
    2952  | 
     | 
     | 
    		txn->root = mp->pgno;  | 
    
    
    2953  | 
     | 
     | 
    		bt->meta.depth++;  | 
    
    
    2954  | 
     | 
     | 
    		ki = 0;  | 
    
    
    2955  | 
     | 
     | 
    	}  | 
    
    
    2956  | 
     | 
     | 
    	else  | 
    
    
    2957  | 
     | 
     | 
    		goto done;  | 
    
    
    2958  | 
     | 
     | 
     | 
    
    
    2959  | 
     | 
     | 
    	assert(IS_LEAF(mp));  | 
    
    
    2960  | 
     | 
     | 
    	DPRINTF("there are %lu keys, should insert new key at index %u", | 
    
    
    2961  | 
     | 
     | 
    		NUMKEYS(mp), ki);  | 
    
    
    2962  | 
     | 
     | 
     | 
    
    
    2963  | 
     | 
     | 
    	/* Copy the key pointer as it is modified by the prefix code. The  | 
    
    
    2964  | 
     | 
     | 
    	 * caller might have malloc'ed the data.  | 
    
    
    2965  | 
     | 
     | 
    	 */  | 
    
    
    2966  | 
     | 
     | 
    	xkey.data = key->data;  | 
    
    
    2967  | 
     | 
     | 
    	xkey.size = key->size;  | 
    
    
    2968  | 
     | 
     | 
     | 
    
    
    2969  | 
     | 
     | 
    	if (SIZELEFT(mp) < bt_leaf_size(bt, key, data)) { | 
    
    
    2970  | 
     | 
     | 
    		rc = btree_split(bt, &mp, &ki, &xkey, data, P_INVALID);  | 
    
    
    2971  | 
     | 
     | 
    	} else { | 
    
    
    2972  | 
     | 
     | 
    		/* There is room already in this leaf page. */  | 
    
    
    2973  | 
     | 
     | 
    		remove_prefix(bt, &xkey, mp->prefix.len);  | 
    
    
    2974  | 
     | 
     | 
    		rc = btree_add_node(bt, mp, ki, &xkey, data, 0, 0);  | 
    
    
    2975  | 
     | 
     | 
    	}  | 
    
    
    2976  | 
     | 
     | 
     | 
    
    
    2977  | 
     | 
     | 
    	if (rc != BT_SUCCESS)  | 
    
    
    2978  | 
     | 
     | 
    		txn->flags |= BT_TXN_ERROR;  | 
    
    
    2979  | 
     | 
     | 
    	else  | 
    
    
    2980  | 
     | 
     | 
    		bt->meta.entries++;  | 
    
    
    2981  | 
     | 
     | 
     | 
    
    
    2982  | 
     | 
     | 
    done:  | 
    
    
    2983  | 
     | 
     | 
    	if (close_txn) { | 
    
    
    2984  | 
     | 
     | 
    		if (rc == BT_SUCCESS)  | 
    
    
    2985  | 
     | 
     | 
    			rc = btree_txn_commit(txn);  | 
    
    
    2986  | 
     | 
     | 
    		else  | 
    
    
    2987  | 
     | 
     | 
    			btree_txn_abort(txn);  | 
    
    
    2988  | 
     | 
     | 
    	}  | 
    
    
    2989  | 
     | 
     | 
    	mpage_prune(bt);  | 
    
    
    2990  | 
     | 
     | 
    	return rc;  | 
    
    
    2991  | 
     | 
     | 
    }  | 
    
    
    2992  | 
     | 
     | 
     | 
    
    
    2993  | 
     | 
     | 
    static pgno_t  | 
    
    
    2994  | 
     | 
     | 
    btree_compact_tree(struct btree *bt, pgno_t pgno, struct btree *btc)  | 
    
    
    2995  | 
     | 
     | 
    { | 
    
    
    2996  | 
     | 
     | 
    	ssize_t		 rc;  | 
    
    
    2997  | 
     | 
     | 
    	indx_t		 i;  | 
    
    
    2998  | 
     | 
     | 
    	pgno_t		*pnext, next;  | 
    
    
    2999  | 
     | 
     | 
    	struct node	*node;  | 
    
    
    3000  | 
     | 
     | 
    	struct page	*p;  | 
    
    
    3001  | 
     | 
     | 
    	struct mpage	*mp;  | 
    
    
    3002  | 
     | 
     | 
     | 
    
    
    3003  | 
     | 
     | 
    	/* Get the page and make a copy of it.  | 
    
    
    3004  | 
     | 
     | 
    	 */  | 
    
    
    3005  | 
     | 
     | 
    	if ((mp = btree_get_mpage(bt, pgno)) == NULL)  | 
    
    
    3006  | 
     | 
     | 
    		return P_INVALID;  | 
    
    
    3007  | 
     | 
     | 
    	if ((p = malloc(bt->head.psize)) == NULL)  | 
    
    
    3008  | 
     | 
     | 
    		return P_INVALID;  | 
    
    
    3009  | 
     | 
     | 
    	bcopy(mp->page, p, bt->head.psize);  | 
    
    
    3010  | 
     | 
     | 
     | 
    
    
    3011  | 
     | 
     | 
            /* Go through all nodes in the (copied) page and update the  | 
    
    
    3012  | 
     | 
     | 
             * page pointers.  | 
    
    
    3013  | 
     | 
     | 
    	 */  | 
    
    
    3014  | 
     | 
     | 
    	if (F_ISSET(p->flags, P_BRANCH)) { | 
    
    
    3015  | 
     | 
     | 
    		for (i = 0; i < NUMKEYSP(p); i++) { | 
    
    
    3016  | 
     | 
     | 
    			node = NODEPTRP(p, i);  | 
    
    
    3017  | 
     | 
     | 
    			node->n_pgno = btree_compact_tree(bt, node->n_pgno, btc);  | 
    
    
    3018  | 
     | 
     | 
    			if (node->n_pgno == P_INVALID) { | 
    
    
    3019  | 
     | 
     | 
    				free(p);  | 
    
    
    3020  | 
     | 
     | 
    				return P_INVALID;  | 
    
    
    3021  | 
     | 
     | 
    			}  | 
    
    
    3022  | 
     | 
     | 
    		}  | 
    
    
    3023  | 
     | 
     | 
    	} else if (F_ISSET(p->flags, P_LEAF)) { | 
    
    
    3024  | 
     | 
     | 
    		for (i = 0; i < NUMKEYSP(p); i++) { | 
    
    
    3025  | 
     | 
     | 
    			node = NODEPTRP(p, i);  | 
    
    
    3026  | 
     | 
     | 
    			if (F_ISSET(node->flags, F_BIGDATA)) { | 
    
    
    3027  | 
     | 
     | 
    				bcopy(NODEDATA(node), &next, sizeof(next));  | 
    
    
    3028  | 
     | 
     | 
    				next = btree_compact_tree(bt, next, btc);  | 
    
    
    3029  | 
     | 
     | 
    				if (next == P_INVALID) { | 
    
    
    3030  | 
     | 
     | 
    					free(p);  | 
    
    
    3031  | 
     | 
     | 
    					return P_INVALID;  | 
    
    
    3032  | 
     | 
     | 
    				}  | 
    
    
    3033  | 
     | 
     | 
    				bcopy(&next, NODEDATA(node), sizeof(next));  | 
    
    
    3034  | 
     | 
     | 
    			}  | 
    
    
    3035  | 
     | 
     | 
    		}  | 
    
    
    3036  | 
     | 
     | 
    	} else if (F_ISSET(p->flags, P_OVERFLOW)) { | 
    
    
    3037  | 
     | 
     | 
    		pnext = &p->p_next_pgno;  | 
    
    
    3038  | 
     | 
     | 
    		if (*pnext > 0) { | 
    
    
    3039  | 
     | 
     | 
    			*pnext = btree_compact_tree(bt, *pnext, btc);  | 
    
    
    3040  | 
     | 
     | 
    			if (*pnext == P_INVALID) { | 
    
    
    3041  | 
     | 
     | 
    				free(p);  | 
    
    
    3042  | 
     | 
     | 
    				return P_INVALID;  | 
    
    
    3043  | 
     | 
     | 
    			}  | 
    
    
    3044  | 
     | 
     | 
    		}  | 
    
    
    3045  | 
     | 
     | 
    	} else  | 
    
    
    3046  | 
     | 
     | 
    		assert(0);  | 
    
    
    3047  | 
     | 
     | 
     | 
    
    
    3048  | 
     | 
     | 
    	pgno = p->pgno = btc->txn->next_pgno++;  | 
    
    
    3049  | 
     | 
     | 
    	rc = write(btc->fd, p, bt->head.psize);  | 
    
    
    3050  | 
     | 
     | 
    	free(p);  | 
    
    
    3051  | 
     | 
     | 
    	if (rc != (ssize_t)bt->head.psize)  | 
    
    
    3052  | 
     | 
     | 
    		return P_INVALID;  | 
    
    
    3053  | 
     | 
     | 
    	mpage_prune(bt);  | 
    
    
    3054  | 
     | 
     | 
    	return pgno;  | 
    
    
    3055  | 
     | 
     | 
    }  | 
    
    
    3056  | 
     | 
     | 
     | 
    
    
    3057  | 
     | 
     | 
    int  | 
    
    
    3058  | 
     | 
     | 
    btree_compact(struct btree *bt)  | 
    
    
    3059  | 
     | 
     | 
    { | 
    
    
    3060  | 
     | 
     | 
    	char			*compact_path = NULL;  | 
    
    
    3061  | 
     | 
     | 
    	struct btree		*btc;  | 
    
    
    3062  | 
     | 
     | 
    	struct btree_txn	*txn, *txnc = NULL;  | 
    
    
    3063  | 
     | 
     | 
    	int			 fd;  | 
    
    
    3064  | 
     | 
     | 
    	pgno_t			 root;  | 
    
    
    3065  | 
     | 
     | 
     | 
    
    
    3066  | 
     | 
     | 
    	assert(bt != NULL);  | 
    
    
    3067  | 
     | 
     | 
     | 
    
    
    3068  | 
     | 
     | 
    	DPRINTF("compacting btree %p with path %s", bt, bt->path); | 
    
    
    3069  | 
     | 
     | 
     | 
    
    
    3070  | 
     | 
     | 
    	if (bt->path == NULL) { | 
    
    
    3071  | 
     | 
     | 
    		errno = EINVAL;  | 
    
    
    3072  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    3073  | 
     | 
     | 
    	}  | 
    
    
    3074  | 
     | 
     | 
     | 
    
    
    3075  | 
     | 
     | 
    	if ((txn = btree_txn_begin(bt, 0)) == NULL)  | 
    
    
    3076  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    3077  | 
     | 
     | 
     | 
    
    
    3078  | 
     | 
     | 
    	if (asprintf(&compact_path, "%s.compact.XXXXXX", bt->path) == -1) { | 
    
    
    3079  | 
     | 
     | 
    		btree_txn_abort(txn);  | 
    
    
    3080  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    3081  | 
     | 
     | 
    	}  | 
    
    
    3082  | 
     | 
     | 
    	fd = mkstemp(compact_path);  | 
    
    
    3083  | 
     | 
     | 
    	if (fd == -1) { | 
    
    
    3084  | 
     | 
     | 
    		free(compact_path);  | 
    
    
    3085  | 
     | 
     | 
    		btree_txn_abort(txn);  | 
    
    
    3086  | 
     | 
     | 
    		return BT_FAIL;  | 
    
    
    3087  | 
     | 
     | 
    	}  | 
    
    
    3088  | 
     | 
     | 
     | 
    
    
    3089  | 
     | 
     | 
    	if ((btc = btree_open_fd(fd, 0)) == NULL)  | 
    
    
    3090  | 
     | 
     | 
    		goto failed;  | 
    
    
    3091  | 
     | 
     | 
    	bcopy(&bt->meta, &btc->meta, sizeof(bt->meta));  | 
    
    
    3092  | 
     | 
     | 
    	btc->meta.revisions = 0;  | 
    
    
    3093  | 
     | 
     | 
     | 
    
    
    3094  | 
     | 
     | 
    	if ((txnc = btree_txn_begin(btc, 0)) == NULL)  | 
    
    
    3095  | 
     | 
     | 
    		goto failed;  | 
    
    
    3096  | 
     | 
     | 
     | 
    
    
    3097  | 
     | 
     | 
    	if (bt->meta.root != P_INVALID) { | 
    
    
    3098  | 
     | 
     | 
    		root = btree_compact_tree(bt, bt->meta.root, btc);  | 
    
    
    3099  | 
     | 
     | 
    		if (root == P_INVALID)  | 
    
    
    3100  | 
     | 
     | 
    			goto failed;  | 
    
    
    3101  | 
     | 
     | 
    		if (btree_write_meta(btc, root, 0) != BT_SUCCESS)  | 
    
    
    3102  | 
     | 
     | 
    			goto failed;  | 
    
    
    3103  | 
     | 
     | 
    	}  | 
    
    
    3104  | 
     | 
     | 
     | 
    
    
    3105  | 
     | 
     | 
    	fsync(fd);  | 
    
    
    3106  | 
     | 
     | 
     | 
    
    
    3107  | 
     | 
     | 
    	DPRINTF("renaming %s to %s", compact_path, bt->path); | 
    
    
    3108  | 
     | 
     | 
    	if (rename(compact_path, bt->path) != 0)  | 
    
    
    3109  | 
     | 
     | 
    		goto failed;  | 
    
    
    3110  | 
     | 
     | 
     | 
    
    
    3111  | 
     | 
     | 
    	/* Write a "tombstone" meta page so other processes can pick up  | 
    
    
    3112  | 
     | 
     | 
    	 * the change and re-open the file.  | 
    
    
    3113  | 
     | 
     | 
    	 */  | 
    
    
    3114  | 
     | 
     | 
    	if (btree_write_meta(bt, P_INVALID, BT_TOMBSTONE) != BT_SUCCESS)  | 
    
    
    3115  | 
     | 
     | 
    		goto failed;  | 
    
    
    3116  | 
     | 
     | 
     | 
    
    
    3117  | 
     | 
     | 
    	btree_txn_abort(txn);  | 
    
    
    3118  | 
     | 
     | 
    	btree_txn_abort(txnc);  | 
    
    
    3119  | 
     | 
     | 
    	free(compact_path);  | 
    
    
    3120  | 
     | 
     | 
    	btree_close(btc);  | 
    
    
    3121  | 
     | 
     | 
    	mpage_prune(bt);  | 
    
    
    3122  | 
     | 
     | 
    	return 0;  | 
    
    
    3123  | 
     | 
     | 
     | 
    
    
    3124  | 
     | 
     | 
    failed:  | 
    
    
    3125  | 
     | 
     | 
    	btree_txn_abort(txn);  | 
    
    
    3126  | 
     | 
     | 
    	btree_txn_abort(txnc);  | 
    
    
    3127  | 
     | 
     | 
    	unlink(compact_path);  | 
    
    
    3128  | 
     | 
     | 
    	free(compact_path);  | 
    
    
    3129  | 
     | 
     | 
    	btree_close(btc);  | 
    
    
    3130  | 
     | 
     | 
    	mpage_prune(bt);  | 
    
    
    3131  | 
     | 
     | 
    	return BT_FAIL;  | 
    
    
    3132  | 
     | 
     | 
    }  | 
    
    
    3133  | 
     | 
     | 
     | 
    
    
    3134  | 
     | 
     | 
    /* Reverts the last change. Truncates the file at the last root page.  | 
    
    
    3135  | 
     | 
     | 
     */  | 
    
    
    3136  | 
     | 
     | 
    int  | 
    
    
    3137  | 
     | 
     | 
    btree_revert(struct btree *bt)  | 
    
    
    3138  | 
     | 
     | 
    { | 
    
    
    3139  | 
     | 
     | 
    	if (btree_read_meta(bt, NULL) != 0)  | 
    
    
    3140  | 
     | 
     | 
    		return -1;  | 
    
    
    3141  | 
     | 
     | 
     | 
    
    
    3142  | 
     | 
     | 
    	DPRINTF("truncating file at page %u", bt->meta.root); | 
    
    
    3143  | 
     | 
     | 
    	return ftruncate(bt->fd, bt->head.psize * bt->meta.root);  | 
    
    
    3144  | 
     | 
     | 
    }  | 
    
    
    3145  | 
     | 
     | 
     | 
    
    
    3146  | 
     | 
     | 
    void  | 
    
    
    3147  | 
     | 
     | 
    btree_set_cache_size(struct btree *bt, unsigned int cache_size)  | 
    
    
    3148  | 
     | 
     | 
    { | 
    
    
    3149  | 
     | 
     | 
    	bt->stat.max_cache = cache_size;  | 
    
    
    3150  | 
     | 
     | 
    }  | 
    
    
    3151  | 
     | 
     | 
     | 
    
    
    3152  | 
     | 
     | 
    unsigned int  | 
    
    
    3153  | 
     | 
     | 
    btree_get_flags(struct btree *bt)  | 
    
    
    3154  | 
     | 
     | 
    { | 
    
    
    3155  | 
     | 
     | 
    	return (bt->flags & ~BT_FIXPADDING);  | 
    
    
    3156  | 
     | 
     | 
    }  | 
    
    
    3157  | 
     | 
     | 
     | 
    
    
    3158  | 
     | 
     | 
    const char *  | 
    
    
    3159  | 
     | 
     | 
    btree_get_path(struct btree *bt)  | 
    
    
    3160  | 
     | 
     | 
    { | 
    
    
    3161  | 
     | 
     | 
    	return bt->path;  | 
    
    
    3162  | 
     | 
     | 
    }  | 
    
    
    3163  | 
     | 
     | 
     | 
    
    
    3164  | 
     | 
     | 
    const struct btree_stat *  | 
    
    
    3165  | 
     | 
     | 
    btree_stat(struct btree *bt)  | 
    
    
    3166  | 
     | 
     | 
    { | 
    
    
    3167  | 
     | 
     | 
    	if (bt == NULL)  | 
    
    
    3168  | 
     | 
     | 
    		return NULL;  | 
    
    
    3169  | 
     | 
     | 
     | 
    
    
    3170  | 
     | 
     | 
    	bt->stat.branch_pages = bt->meta.branch_pages;  | 
    
    
    3171  | 
     | 
     | 
    	bt->stat.leaf_pages = bt->meta.leaf_pages;  | 
    
    
    3172  | 
     | 
     | 
    	bt->stat.overflow_pages = bt->meta.overflow_pages;  | 
    
    
    3173  | 
     | 
     | 
    	bt->stat.revisions = bt->meta.revisions;  | 
    
    
    3174  | 
     | 
     | 
    	bt->stat.depth = bt->meta.depth;  | 
    
    
    3175  | 
     | 
     | 
    	bt->stat.entries = bt->meta.entries;  | 
    
    
    3176  | 
     | 
     | 
    	bt->stat.psize = bt->head.psize;  | 
    
    
    3177  | 
     | 
     | 
    	bt->stat.created_at = bt->meta.created_at;  | 
    
    
    3178  | 
     | 
     | 
     | 
    
    
    3179  | 
     | 
     | 
    	return &bt->stat;  | 
    
    
    3180  | 
     | 
     | 
    }  | 
    
    
    3181  | 
     | 
     | 
     |