Read-only mirror of official repo on openldap.org. Issues and pull requests here are ignored. Use OpenLDAP ITS for issues.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
lmdb/libraries/libmdb/mdb.c

2462 lines
59 KiB

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/queue.h>
#include <sys/param.h>
#include <sys/uio.h>
#include <sys/mman.h>
#ifdef HAVE_SYS_FILE_H
#include <sys/file.h>
#endif
#include <assert.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <openssl/sha.h>
#include "mdb.h"
#define DEBUG
#ifdef DEBUG
# define DPRINTF(...) do { fprintf(stderr, "%s:%d: ", __func__, __LINE__); \
fprintf(stderr, __VA_ARGS__); \
fprintf(stderr, "\n"); } while(0)
#else
# define DPRINTF(...)
#endif
#define PAGESIZE 4096
#define MDB_MINKEYS 4
#define MDB_MAGIC 0xBEEFC0DE
#define MDB_VERSION 1
#define MAXKEYSIZE 255
#define P_INVALID 0xFFFFFFFF
#define F_ISSET(w, f) (((w) & (f)) == (f))
typedef ulong pgno_t;
typedef uint16_t indx_t;
/* Common header for all page types. Overflow pages
* occupy a number of contiguous pages with no
* headers on any page after the first.
*/
typedef struct MDB_page { /* represents a page of storage */
pgno_t mp_pgno; /* page number */
#define P_BRANCH 0x01 /* branch page */
#define P_LEAF 0x02 /* leaf page */
#define P_OVERFLOW 0x04 /* overflow page */
#define P_META 0x08 /* meta page */
#define P_HEAD 0x10 /* header page */
#define P_DIRTY 0x20 /* dirty page */
uint32_t mp_flags;
#define mp_lower mp_pb.pb.pb_lower
#define mp_upper mp_pb.pb.pb_upper
#define mp_pages mp_pb.pb_pages
union page_bounds {
struct {
indx_t pb_lower; /* lower bound of free space */
indx_t pb_upper; /* upper bound of free space */
} pb;
uint32_t pb_pages; /* number of overflow pages */
} mp_pb;
indx_t mp_ptrs[1]; /* dynamic size */
} MDB_page;
#define PAGEHDRSZ offsetof(MDB_page, mp_ptrs)
#define NUMKEYS(p) (((p)->mp_lower - PAGEHDRSZ) >> 1)
#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower)
#define PAGEFILL(env, p) (1000 * ((env)->me_head.mh_psize - PAGEHDRSZ - SIZELEFT(p)) / \
((env)->me_head.mh_psize - PAGEHDRSZ))
#define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF)
#define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH)
#define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW)
typedef struct MDB_head { /* header page content */
uint32_t mh_magic;
uint32_t mh_version;
uint32_t mh_flags;
uint32_t mh_psize; /* page size */
void *mh_address; /* address for fixed mapping */
size_t mh_mapsize; /* size of mmap region */
} MDB_head;
typedef struct MDB_rootstat {
pgno_t mr_root; /* page number of root page */
MDB_stat mr_stat;
} MDB_rootstat;
typedef struct MDB_meta { /* meta (footer) page content */
pgno_t mm_root; /* page number of root page */
MDB_stat mm_stat;
pgno_t mm_prev_meta; /* previous meta page number */
#define MDB_TOMBSTONE 0x01 /* file is replaced */
uint32_t mm_flags;
#define mm_revisions mm_stat.ms_revisions
#define mm_created_at mm_stat.ms_created_at
uint8_t mm_hash[SHA_DIGEST_LENGTH];
} MDB_meta;
typedef struct MDB_dhead { /* a dirty page */
SIMPLEQ_ENTRY(MDB_dpage) md_next; /* queue of dirty pages */
MDB_page *md_parent;
int md_pi; /* parent index */
int md_num;
} MDB_dhead;
typedef struct MDB_dpage {
MDB_dhead h;
MDB_page p;
} MDB_dpage;
SIMPLEQ_HEAD(dirty_queue, MDB_dpage);
typedef struct MDB_pageparent {
MDB_page *mp_page;
MDB_page *mp_parent;
int mp_pi;
} MDB_pageparent;
static MDB_dpage *mdb_newpage(MDB_txn *txn, MDB_page *parent, int parent_idx, int num);
static int mdb_touch(MDB_txn *txn, MDB_pageparent *mp);
typedef struct MDB_ppage { /* ordered list of pages */
SLIST_ENTRY(MDB_ppage) mp_entry;
MDB_page *mp_page;
unsigned int mp_ki; /* cursor index on page */
} MDB_ppage;
SLIST_HEAD(page_stack, MDB_ppage);
#define CURSOR_EMPTY(c) SLIST_EMPTY(&(c)->mc_stack)
#define CURSOR_TOP(c) SLIST_FIRST(&(c)->mc_stack)
#define CURSOR_POP(c) SLIST_REMOVE_HEAD(&(c)->mc_stack, mp_entry)
#define CURSOR_PUSH(c,p) SLIST_INSERT_HEAD(&(c)->mc_stack, p, mp_entry)
struct MDB_cursor {
MDB_db *mc_db;
MDB_txn *mc_txn;
struct page_stack mc_stack; /* stack of parent pages */
short mc_initialized; /* 1 if initialized */
short mc_eof; /* 1 if end is reached */
};
#define METAHASHLEN offsetof(MDB_meta, mm_hash)
#define METADATA(p) ((void *)((char *)p + PAGEHDRSZ))
typedef struct MDB_node {
#define mn_pgno mn_p.np_pgno
#define mn_dsize mn_p.np_dsize
union {
pgno_t np_pgno; /* child page number */
uint32_t np_dsize; /* leaf data size */
} mn_p;
uint16_t mn_ksize; /* key size */
#define F_BIGDATA 0x01 /* data put on overflow page */
uint8_t mn_flags;
char mn_data[1];
} MDB_node;
struct MDB_txn {
pgno_t mt_root; /* current / new root page */
pgno_t mt_next_pgno; /* next unallocated page */
pgno_t mt_first_pgno;
MDB_env *mt_env;
struct dirty_queue *mt_dirty_queue; /* modified pages */
#define MDB_TXN_RDONLY 0x01 /* read-only transaction */
#define MDB_TXN_ERROR 0x02 /* an error has occurred */
unsigned int mt_flags;
};
/* Must be same as MDB_db, minus md_root/md_stat */
typedef struct MDB_db0 {
unsigned int md_flags;
MDB_cmp_func *md_cmp; /* user compare function */
MDB_rel_func *md_rel; /* user relocate function */
MDB_db *md_parent; /* parent tree */
MDB_env *md_env;
} MDB_db0;
struct MDB_db {
unsigned int md_flags;
MDB_cmp_func *md_cmp; /* user compare function */
MDB_rel_func *md_rel; /* user relocate function */
MDB_db *md_parent; /* parent tree */
MDB_env *md_env;
pgno_t md_root; /* page number of root page */
MDB_stat md_stat;
};
struct MDB_env {
int me_fd;
#define MDB_FIXPADDING 0x01 /* internal */
uint32_t me_flags;
char *me_path;
char *me_map;
MDB_head me_head;
MDB_db0 me_db; /* first DB, overlaps with meta */
MDB_meta me_meta;
MDB_txn *me_txn; /* current write transaction */
size_t me_mapsize;
off_t me_size; /* current file size */
};
#define NODESIZE offsetof(MDB_node, mn_data)
#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))
#define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size)
#define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i]))
#define NODEKEY(node) (void *)((node)->mn_data)
#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize)
#define NODEPGNO(node) ((node)->mn_pgno)
#define NODEDSZ(node) ((node)->mn_dsize)
#define MDB_COMMIT_PAGES 64 /* max number of pages to write in one commit */
#define MDB_MAXCACHE_DEF 1024 /* max number of pages to keep in cache */
static int mdb_search_page_root(MDB_db *db,
MDB_val *key,
MDB_cursor *cursor, int modify,
MDB_pageparent *mpp);
static int mdb_search_page(MDB_db *db,
MDB_txn *txn, MDB_val *key,
MDB_cursor *cursor, int modify,
MDB_pageparent *mpp);
static int mdbenv_write_header(MDB_env *env);
static int mdbenv_read_header(MDB_env *env);
static int mdb_check_meta_page(MDB_page *p);
static int mdbenv_read_meta(MDB_env *env, pgno_t *p_next);
static int mdbenv_write_meta(MDB_env *env, pgno_t root,
unsigned int flags);
static MDB_page *mdbenv_get_page(MDB_env *env, pgno_t pgno);
static MDB_node *mdb_search_node(MDB_db *db, MDB_page *mp,
MDB_val *key, int *exactp, unsigned int *kip);
static int mdb_add_node(MDB_db *bt, MDB_page *mp,
indx_t indx, MDB_val *key, MDB_val *data,
pgno_t pgno, uint8_t flags);
static void mdb_del_node(MDB_db *bt, MDB_page *mp,
indx_t indx);
static int mdb_read_data(MDB_db *bt, MDB_page *mp,
MDB_node *leaf, MDB_val *data);
static int mdb_rebalance(MDB_db *bt, MDB_pageparent *mp);
static int mdb_update_key(MDB_db *bt, MDB_page *mp,
indx_t indx, MDB_val *key);
static int mdb_move_node(MDB_db *bt,
MDB_pageparent *src, indx_t srcindx,
MDB_pageparent *dst, indx_t dstindx);
static int mdb_merge(MDB_db *bt, MDB_pageparent *src,
MDB_pageparent *dst);
static int mdb_split(MDB_db *bt, MDB_page **mpp,
unsigned int *newindxp, MDB_val *newkey,
MDB_val *newdata, pgno_t newpgno);
static MDB_dpage *mdbenv_new_page(MDB_env *env, uint32_t flags, int num);
static void cursor_pop_page(MDB_cursor *cursor);
static MDB_ppage *cursor_push_page(MDB_cursor *cursor,
MDB_page *mp);
static int mdb_set_key(MDB_db *bt, MDB_page *mp,
MDB_node *node, MDB_val *key);
static int mdb_sibling(MDB_cursor *cursor, int move_right);
static int mdb_cursor_next(MDB_cursor *cursor,
MDB_val *key, MDB_val *data);
static int mdb_cursor_set(MDB_cursor *cursor,
MDB_val *key, MDB_val *data, int *exactp);
static int mdb_cursor_first(MDB_cursor *cursor,
MDB_val *key, MDB_val *data);
static size_t mdb_leaf_size(MDB_db *bt, MDB_val *key,
MDB_val *data);
static size_t mdb_branch_size(MDB_db *bt, MDB_val *key);
static pgno_t mdbenv_compact_tree(MDB_env *env, pgno_t pgno,
MDB_env *envc);
static int memncmp(const void *s1, size_t n1,
const void *s2, size_t n2);
static int memnrcmp(const void *s1, size_t n1,
const void *s2, size_t n2);
static int
memncmp(const void *s1, size_t n1, const void *s2, size_t n2)
{
if (n1 < n2) {
if (memcmp(s1, s2, n1) == 0)
return -1;
}
else if (n1 > n2) {
if (memcmp(s1, s2, n2) == 0)
return 1;
}
return memcmp(s1, s2, n1);
}
static int
memnrcmp(const void *s1, size_t n1, const void *s2, size_t n2)
{
const unsigned char *p1;
const unsigned char *p2;
if (n1 == 0)
return n2 == 0 ? 0 : -1;
if (n2 == 0)
return n1 == 0 ? 0 : 1;
p1 = (const unsigned char *)s1 + n1 - 1;
p2 = (const unsigned char *)s2 + n2 - 1;
while (*p1 == *p2) {
if (p1 == s1)
return (p2 == s2) ? 0 : -1;
if (p2 == s2)
return (p1 == p2) ? 0 : 1;
p1--;
p2--;
}
return *p1 - *p2;
}
int
mdb_cmp(MDB_db *db, const MDB_val *a, const MDB_val *b)
{
return db->md_cmp(a, b);
}
static int
_mdb_cmp(MDB_db *db, const MDB_val *key1, const MDB_val *key2)
{
if (F_ISSET(db->md_flags, MDB_REVERSEKEY))
return memnrcmp(key1->mv_data, key1->mv_size, key2->mv_data, key2->mv_size);
else
return memncmp((char *)key1->mv_data, key1->mv_size, key2->mv_data, key2->mv_size);
}
/* Allocate new page(s) for writing */
static MDB_dpage *
mdb_newpage(MDB_txn *txn, MDB_page *parent, int parent_idx, int num)
{
MDB_dpage *dp;
if ((dp = malloc(txn->mt_env->me_head.mh_psize * num + sizeof(MDB_dhead))) == NULL)
return NULL;
dp->h.md_num = num;
dp->h.md_parent = parent;
dp->h.md_pi = parent_idx;
SIMPLEQ_INSERT_TAIL(txn->mt_dirty_queue, dp, h.md_next);
dp->p.mp_pgno = txn->mt_next_pgno;
txn->mt_next_pgno += num;
return dp;
}
/* Touch a page: make it dirty and re-insert into tree with updated pgno.
*/
static int
mdb_touch(MDB_txn *txn, MDB_pageparent *pp)
{
int rc;
MDB_page *mp = pp->mp_page;
pgno_t pgno;
assert(txn != NULL);
assert(pp != NULL);
if (!F_ISSET(mp->mp_flags, P_DIRTY)) {
MDB_dpage *dp;
DPRINTF("touching page %lu -> %lu", mp->mp_pgno, txn->mt_next_pgno);
if ((dp = mdb_newpage(txn, pp->mp_parent, pp->mp_pi, 1)) == NULL)
return ENOMEM;
pgno = dp->p.mp_pgno;
bcopy(mp, &dp->p, txn->mt_env->me_head.mh_psize);
mp = &dp->p;
mp->mp_pgno = pgno;
mp->mp_flags |= P_DIRTY;
/* Update the page number to new touched page. */
if (pp->mp_parent != NULL)
NODEPGNO(NODEPTR(pp->mp_parent, pp->mp_pi)) = mp->mp_pgno;
pp->mp_page = mp;
}
return 0;
}
int
mdbenv_sync(MDB_env *env)
{
int rc = 0;
if (!F_ISSET(env->me_flags, MDB_NOSYNC)) {
if (fsync(env->me_fd))
rc = errno;
}
return rc;
}
int
mdb_txn_begin(MDB_env *env, int rdonly, MDB_txn **ret)
{
MDB_txn *txn;
int rc;
if (!rdonly && env->me_txn != NULL) {
DPRINTF("write transaction already begun");
return EBUSY;
}
if ((txn = calloc(1, sizeof(*txn))) == NULL) {
DPRINTF("calloc: %s", strerror(errno));
return ENOMEM;
}
if (rdonly) {
txn->mt_flags |= MDB_TXN_RDONLY;
} else {
txn->mt_dirty_queue = calloc(1, sizeof(*txn->mt_dirty_queue));
if (txn->mt_dirty_queue == NULL) {
free(txn);
return ENOMEM;
}
SIMPLEQ_INIT(txn->mt_dirty_queue);
#if 0
DPRINTF("taking write lock on txn %p", txn);
if (flock(bt->fd, LOCK_EX | LOCK_NB) != 0) {
DPRINTF("flock: %s", strerror(errno));
errno = EBUSY;
free(txn->dirty_queue);
free(txn);
return NULL;
}
#endif
env->me_txn = txn;
}
txn->mt_env = env;
if ((rc = mdbenv_read_meta(env, &txn->mt_next_pgno)) != MDB_SUCCESS) {
mdb_txn_abort(txn);
return rc;
}
txn->mt_first_pgno = txn->mt_next_pgno;
txn->mt_root = env->me_meta.mm_root;
DPRINTF("begin transaction on mdbenv %p, root page %lu", env, txn->mt_root);
*ret = txn;
return MDB_SUCCESS;
}
void
mdb_txn_abort(MDB_txn *txn)
{
MDB_dpage *dp;
MDB_env *env;
if (txn == NULL)
return;
env = txn->mt_env;
DPRINTF("abort transaction on mdbenv %p, root page %lu", env, txn->mt_root);
if (!F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
/* Discard all dirty pages.
*/
while (!SIMPLEQ_EMPTY(txn->mt_dirty_queue)) {
dp = SIMPLEQ_FIRST(txn->mt_dirty_queue);
SIMPLEQ_REMOVE_HEAD(txn->mt_dirty_queue, h.md_next);
free(dp);
}
#if 0
DPRINTF("releasing write lock on txn %p", txn);
txn->bt->txn = NULL;
if (flock(txn->bt->fd, LOCK_UN) != 0) {
DPRINTF("failed to unlock fd %d: %s",
txn->bt->fd, strerror(errno));
}
#endif
free(txn->mt_dirty_queue);
}
if (txn == env->me_txn)
env->me_txn = NULL;
free(txn);
}
int
mdb_txn_commit(MDB_txn *txn)
{
int n, done;
ssize_t rc;
off_t size;
MDB_dpage *dp;
MDB_env *env;
struct iovec iov[MDB_COMMIT_PAGES];
assert(txn != NULL);
assert(txn->mt_env != NULL);
env = txn->mt_env;
if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
DPRINTF("attempt to commit read-only transaction");
mdb_txn_abort(txn);
return EPERM;
}
if (txn != env->me_txn) {
DPRINTF("attempt to commit unknown transaction");
mdb_txn_abort(txn);
return EINVAL;
}
if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) {
DPRINTF("error flag is set, can't commit");
mdb_txn_abort(txn);
return EINVAL;
}
if (SIMPLEQ_EMPTY(txn->mt_dirty_queue))
goto done;
if (F_ISSET(env->me_flags, MDB_FIXPADDING)) {
size = lseek(env->me_fd, 0, SEEK_END);
size += env->me_head.mh_psize - (size % env->me_head.mh_psize);
DPRINTF("extending to multiple of page size: %lu", size);
if (ftruncate(env->me_fd, size) != 0) {
n = errno;
DPRINTF("ftruncate: %s", strerror(errno));
mdb_txn_abort(txn);
return n;
}
env->me_flags ^= MDB_FIXPADDING;
}
DPRINTF("committing transaction on mdbenv %p, root page %lu",
env, txn->mt_root);
/* Commit up to MDB_COMMIT_PAGES dirty pages to disk until done.
*/
do {
n = 0;
done = 1;
SIMPLEQ_FOREACH(dp, txn->mt_dirty_queue, h.md_next) {
DPRINTF("committing page %lu", dp->p.mp_pgno);
iov[n].iov_len = env->me_head.mh_psize * dp->h.md_num;
iov[n].iov_base = &dp->p;
/* clear dirty flag */
dp->p.mp_flags &= ~P_DIRTY;
if (++n >= MDB_COMMIT_PAGES) {
done = 0;
break;
}
}
if (n == 0)
break;
DPRINTF("committing %u dirty pages", n);
rc = writev(env->me_fd, iov, n);
if (rc != (ssize_t)env->me_head.mh_psize*n) {
n = errno;
if (rc > 0)
DPRINTF("short write, filesystem full?");
else
DPRINTF("writev: %s", strerror(errno));
mdb_txn_abort(txn);
return n;
}
/* Drop the dirty pages.
*/
while (!SIMPLEQ_EMPTY(txn->mt_dirty_queue)) {
dp = SIMPLEQ_FIRST(txn->mt_dirty_queue);
SIMPLEQ_REMOVE_HEAD(txn->mt_dirty_queue, h.md_next);
free(dp);
if (--n == 0)
break;
}
} while (!done);
if ((n = mdbenv_sync(env)) != 0 ||
(n = mdbenv_write_meta(env, txn->mt_root, 0)) != MDB_SUCCESS ||
(n = mdbenv_sync(env)) != 0) {
mdb_txn_abort(txn);
return n;
}
txn = NULL;
env->me_txn = NULL;
done:
mdb_txn_abort(txn);
return MDB_SUCCESS;
}
static int
mdbenv_write_header(MDB_env *env)
{
struct stat sb;
MDB_head *h;
MDB_page *p;
ssize_t rc;
unsigned int psize;
DPRINTF("writing header page");
assert(env != NULL);
psize = sysconf(_SC_PAGE_SIZE);
if ((p = calloc(1, psize)) == NULL)
return ENOMEM;
p->mp_flags = P_HEAD;
env->me_head.mh_psize = psize;
env->me_head.mh_flags = env->me_flags & 0xffff;
h = METADATA(p);
bcopy(&env->me_head, h, sizeof(*h));
rc = write(env->me_fd, p, env->me_head.mh_psize);
free(p);
if (rc != (ssize_t)env->me_head.mh_psize) {
int err = errno;
if (rc > 0)
DPRINTF("short write, filesystem full?");
return err;
}
return MDB_SUCCESS;
}
static int
mdbenv_read_header(MDB_env *env)
{
char page[PAGESIZE];
MDB_page *p;
MDB_head *h;
int rc;
assert(env != NULL);
/* We don't know the page size yet, so use a minimum value.
*/
if ((rc = pread(env->me_fd, page, PAGESIZE, 0)) == 0) {
return ENOENT;
} else if (rc != PAGESIZE) {
if (rc > 0)
errno = EINVAL;
DPRINTF("read: %s", strerror(errno));
return errno;
}
p = (MDB_page *)page;
if (!F_ISSET(p->mp_flags, P_HEAD)) {
DPRINTF("page %lu not a header page", p->mp_pgno);
return EINVAL;
}
h = METADATA(p);
if (h->mh_magic != MDB_MAGIC) {
DPRINTF("header has invalid magic");
return EINVAL;
}
if (h->mh_version != MDB_VERSION) {
DPRINTF("database is version %u, expected version %u",
h->mh_version, MDB_VERSION);
return EINVAL;
}
bcopy(h, &env->me_head, sizeof(*h));
return 0;
}
static int
mdbenv_write_meta(MDB_env *env, pgno_t root, unsigned int flags)
{
MDB_dpage *dp;
MDB_meta *meta;
ssize_t rc;
DPRINTF("writing meta page for root page %lu", root);
assert(env != NULL);
assert(env->me_txn != NULL);
if ((dp = mdbenv_new_page(env, P_META, 1)) == NULL)
return ENOMEM;
env->me_meta.mm_prev_meta = env->me_meta.mm_root;
env->me_meta.mm_root = root;
env->me_meta.mm_flags = flags;
env->me_meta.mm_created_at = time(0);
env->me_meta.mm_revisions++;
SHA1((unsigned char *)&env->me_meta, METAHASHLEN, env->me_meta.mm_hash);
/* Copy the meta data changes to the new meta page. */
meta = METADATA(&dp->p);
bcopy(&env->me_meta, meta, sizeof(*meta));
rc = write(env->me_fd, &dp->p, env->me_head.mh_psize);
SIMPLEQ_REMOVE_HEAD(env->me_txn->mt_dirty_queue, h.md_next);
free(dp);
if (rc != (ssize_t)env->me_head.mh_psize) {
int err = errno;
if (rc > 0)
DPRINTF("short write, filesystem full?");
return err;
}
if ((env->me_size = lseek(env->me_fd, 0, SEEK_END)) == -1) {
DPRINTF("failed to update file size: %s", strerror(errno));
env->me_size = 0;
}
return MDB_SUCCESS;
}
/* Returns true if page p is a valid meta page, false otherwise.
*/
static int
mdb_check_meta_page(MDB_page *p)
{
MDB_meta *m;
unsigned char hash[SHA_DIGEST_LENGTH];
m = METADATA(p);
if (!F_ISSET(p->mp_flags, P_META)) {
DPRINTF("page %lu not a meta page", p->mp_pgno);
return EINVAL;
}
if (m->mm_root >= p->mp_pgno && m->mm_root != P_INVALID) {
DPRINTF("page %lu points to an invalid root page", p->mp_pgno);
return EINVAL;
}
SHA1((unsigned char *)m, METAHASHLEN, hash);
if (bcmp(hash, m->mm_hash, SHA_DIGEST_LENGTH) != 0) {
DPRINTF("page %lu has an invalid digest", p->mp_pgno);
return EINVAL;
}
return 0;
}
static int
mdbenv_read_meta(MDB_env *env, pgno_t *p_next)
{
MDB_page *mp;
MDB_meta *meta;
pgno_t meta_pgno, next_pgno;
off_t size;
assert(env != NULL);
if ((size = lseek(env->me_fd, 0, SEEK_END)) == -1)
goto fail;
DPRINTF("mdbenv_read_meta: size = %lu", size);
if (size < env->me_size) {
DPRINTF("file has shrunk!");
errno = EIO;
goto fail;
}
if (size == env->me_head.mh_psize) { /* there is only the header */
if (p_next != NULL)
*p_next = 1;
env->me_meta.mm_stat.ms_psize = env->me_head.mh_psize;
return MDB_SUCCESS; /* new file */
}
next_pgno = size / env->me_head.mh_psize;
if (next_pgno == 0) {
DPRINTF("corrupt file");
errno = EIO;
goto fail;
}
meta_pgno = next_pgno - 1;
if (size % env->me_head.mh_psize != 0) {
DPRINTF("filesize not a multiple of the page size!");
env->me_flags |= MDB_FIXPADDING;
next_pgno++;
}
if (p_next != NULL)
*p_next = next_pgno;
if (size == env->me_size) {
DPRINTF("size unchanged, keeping current meta page");
if (F_ISSET(env->me_meta.mm_flags, MDB_TOMBSTONE)) {
DPRINTF("file is dead");
errno = ESTALE;
return MDB_FAIL;
} else
return MDB_SUCCESS;
} else {
env->me_size = size;
}
while (meta_pgno > 0) {
if ((mp = mdbenv_get_page(env, meta_pgno)) == NULL)
break;
if (!mdb_check_meta_page(mp)) {
meta = METADATA(mp);
DPRINTF("flags = 0x%x", meta->mm_flags);
if (F_ISSET(meta->mm_flags, MDB_TOMBSTONE)) {
DPRINTF("file is dead");
errno = ESTALE;
return MDB_FAIL;
} else {
/* Make copy of last meta page. */
bcopy(meta, &env->me_meta, sizeof(env->me_meta));
return MDB_SUCCESS;
}
}
--meta_pgno; /* scan backwards to first valid meta page */
}
errno = EIO;
fail:
if (p_next != NULL)
*p_next = P_INVALID;
return MDB_FAIL;
}
int
mdbenv_create(MDB_env **env, size_t size)
{
MDB_env *e;
e = calloc(1, sizeof(*e));
if (!e) return ENOMEM;
e->me_head.mh_magic = MDB_MAGIC;
e->me_head.mh_version = MDB_VERSION;
e->me_mapsize = e->me_head.mh_mapsize = size;
e->me_db.md_env = e;
*env = e;
return MDB_SUCCESS;
}
int
mdbenv_open2(MDB_env *env, unsigned int flags)
{
int i, newenv = 0;
i = fcntl(env->me_fd, F_GETFL, 0);
if (fcntl(env->me_fd, F_SETFL, i | O_APPEND) == -1)
return errno;
env->me_flags = flags;
env->me_flags &= ~MDB_FIXPADDING;
env->me_meta.mm_root = P_INVALID;
if ((i = mdbenv_read_header(env)) != 0) {
if (i != ENOENT)
return i;
DPRINTF("new mdbenv");
newenv = 1;
}
if (!env->me_mapsize)
env->me_mapsize = env->me_head.mh_mapsize;
i = MAP_SHARED;
if (env->me_head.mh_address && (flags & MDB_FIXEDMAP))
i |= MAP_FIXED;
env->me_map = mmap(env->me_head.mh_address, env->me_mapsize, PROT_READ, i,
env->me_fd, 0);
if (env->me_map == MAP_FAILED)
return errno;
if (newenv) {
env->me_head.mh_mapsize = env->me_mapsize;
if (flags & MDB_FIXEDMAP)
env->me_head.mh_address = env->me_map;
i = mdbenv_write_header(env);
if (i != MDB_SUCCESS) {
munmap(env->me_map, env->me_mapsize);
return i;
}
}
if ((i = mdbenv_read_meta(env, NULL)) != 0)
return i;
DPRINTF("opened database version %u, pagesize %u",
env->me_head.mh_version, env->me_head.mh_psize);
DPRINTF("timestamp: %s", ctime(&env->me_meta.mm_created_at));
DPRINTF("depth: %u", env->me_meta.mm_stat.ms_depth);
DPRINTF("entries: %lu", env->me_meta.mm_stat.ms_entries);
DPRINTF("revisions: %lu", env->me_meta.mm_stat.ms_revisions);
DPRINTF("branch pages: %lu", env->me_meta.mm_stat.ms_branch_pages);
DPRINTF("leaf pages: %lu", env->me_meta.mm_stat.ms_leaf_pages);
DPRINTF("overflow pages: %lu", env->me_meta.mm_stat.ms_overflow_pages);
DPRINTF("root: %lu", env->me_meta.mm_root);
DPRINTF("previous me_meta.me_page: %lu", env->me_meta.mm_prev_meta);
return MDB_SUCCESS;
}
int
mdbenv_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode)
{
int oflags, rc;
if (F_ISSET(flags, MDB_RDONLY))
oflags = O_RDONLY;
else
oflags = O_RDWR | O_CREAT | O_APPEND;
if ((env->me_fd = open(path, oflags, mode)) == -1)
return errno;
if ((rc = mdbenv_open2(env, flags)) != MDB_SUCCESS) {
close(env->me_fd);
env->me_fd = -1;
} else {
env->me_path = strdup(path);
DPRINTF("opened dbenv %p", env);
}
return rc;
}
void
mdbenv_close(MDB_env *env)
{
if (env == NULL)
return;
free(env->me_path);
if (env->me_map) {
munmap(env->me_map, env->me_mapsize);
}
close(env->me_fd);
}
/* Search for key within a leaf page, using binary search.
* Returns the smallest entry larger or equal to the key.
* If exactp is non-null, stores whether the found entry was an exact match
* in *exactp (1 or 0).
* If kip is non-null, stores the index of the found entry in *kip.
* If no entry larger or equal to the key is found, returns NULL.
*/
static MDB_node *
mdb_search_node(MDB_db *bt, MDB_page *mp, MDB_val *key,
int *exactp, unsigned int *kip)
{
unsigned int i = 0;
int low, high;
int rc = 0;
MDB_node *node;
MDB_val nodekey;
DPRINTF("searching %lu keys in %s page %lu",
NUMKEYS(mp),
IS_LEAF(mp) ? "leaf" : "branch",
mp->mp_pgno);
assert(NUMKEYS(mp) > 0);
bzero(&nodekey, sizeof(nodekey));
low = IS_LEAF(mp) ? 0 : 1;
high = NUMKEYS(mp) - 1;
while (low <= high) {
i = (low + high) >> 1;
node = NODEPTR(mp, i);
nodekey.mv_size = node->mn_ksize;
nodekey.mv_data = NODEKEY(node);
if (bt->md_cmp)
rc = bt->md_cmp(key, &nodekey);
else
rc = _mdb_cmp(bt, key, &nodekey);
if (IS_LEAF(mp))
DPRINTF("found leaf index %u [%.*s], rc = %i",
i, (int)nodekey.mv_size, (char *)nodekey.mv_data, rc);
else
DPRINTF("found branch index %u [%.*s -> %lu], rc = %i",
i, (int)node->mn_ksize, (char *)NODEKEY(node),
node->mn_pgno, rc);
if (rc == 0)
break;
if (rc > 0)
low = i + 1;
else
high = i - 1;
}
if (rc > 0) { /* Found entry is less than the key. */
i++; /* Skip to get the smallest entry larger than key. */
if (i >= NUMKEYS(mp))
/* There is no entry larger or equal to the key. */
return NULL;
}
if (exactp)
*exactp = (rc == 0);
if (kip) /* Store the key index if requested. */
*kip = i;
return NODEPTR(mp, i);
}
static void
cursor_pop_page(MDB_cursor *cursor)
{
MDB_ppage *top;
top = CURSOR_TOP(cursor);
CURSOR_POP(cursor);
DPRINTF("popped page %lu off cursor %p", top->mp_page->mp_pgno, cursor);
free(top);
}
static MDB_ppage *
cursor_push_page(MDB_cursor *cursor, MDB_page *mp)
{
MDB_ppage *ppage;
DPRINTF("pushing page %lu on cursor %p", mp->mp_pgno, cursor);
if ((ppage = calloc(1, sizeof(*ppage))) == NULL)
return NULL;
ppage->mp_page = mp;
CURSOR_PUSH(cursor, ppage);
return ppage;
}
static MDB_page *
mdbenv_get_page(MDB_env *env, pgno_t pgno)
{
MDB_page *p = NULL;
MDB_txn *txn = env->me_txn;
if (txn && pgno >= txn->mt_first_pgno &&
!SIMPLEQ_EMPTY(txn->mt_dirty_queue)) {
MDB_dpage *dp;
SIMPLEQ_FOREACH(dp, txn->mt_dirty_queue, h.md_next) {
if (dp->p.mp_pgno == pgno) {
p = &dp->p;
break;
}
}
} else {
p = (MDB_page *)(env->me_map + env->me_head.mh_psize * pgno);
}
return p;
}
static int
mdb_search_page_root(MDB_db *bt, MDB_val *key,
MDB_cursor *cursor, int modify, MDB_pageparent *mpp)
{
MDB_page *mp = mpp->mp_page;
int rc;
if (cursor && cursor_push_page(cursor, mp) == NULL)
return MDB_FAIL;
while (IS_BRANCH(mp)) {
unsigned int i = 0;
MDB_node *node;
DPRINTF("branch page %lu has %lu keys", mp->mp_pgno, NUMKEYS(mp));
assert(NUMKEYS(mp) > 1);
DPRINTF("found index 0 to page %lu", NODEPGNO(NODEPTR(mp, 0)));
if (key == NULL) /* Initialize cursor to first page. */
i = 0;
else {
int exact;
node = mdb_search_node(bt, mp, key, &exact, &i);
if (node == NULL)
i = NUMKEYS(mp) - 1;
else if (!exact) {
assert(i > 0);
i--;
}
}
if (key)
DPRINTF("following index %u for key %.*s",
i, (int)key->mv_size, (char *)key->mv_data);
assert(i >= 0 && i < NUMKEYS(mp));
node = NODEPTR(mp, i);
if (cursor)
CURSOR_TOP(cursor)->mp_ki = i;
mpp->mp_parent = mp;
if ((mp = mdbenv_get_page(bt->md_env, NODEPGNO(node))) == NULL)
return MDB_FAIL;
mpp->mp_pi = i;
mpp->mp_page = mp;
if (cursor && cursor_push_page(cursor, mp) == NULL)
return MDB_FAIL;
if (modify && (rc = mdb_touch(bt->md_env->me_txn, mpp)))
return rc;
mp = mpp->mp_page;
}
if (!IS_LEAF(mp)) {
DPRINTF("internal error, index points to a %02X page!?",
mp->mp_flags);
return MDB_FAIL;
}
DPRINTF("found leaf page %lu for key %.*s", mp->mp_pgno,
key ? (int)key->mv_size : 0, key ? (char *)key->mv_data : NULL);
return MDB_SUCCESS;
}
/* Search for the page a given key should be in.
* Stores a pointer to the found page in *mpp.
* If key is NULL, search for the lowest page (used by mdb_cursor_first).
* If cursor is non-null, pushes parent pages on the cursor stack.
* If modify is true, visited pages are updated with new page numbers.
*/
static int
mdb_search_page(MDB_db *db, MDB_txn *txn, MDB_val *key,
MDB_cursor *cursor, int modify, MDB_pageparent *mpp)
{
int rc;
pgno_t root;
/* Can't modify pages outside a transaction. */
if (txn == NULL && modify) {
return EINVAL;
}
/* Choose which root page to start with. If a transaction is given
* use the root page from the transaction, otherwise read the last
* committed root page.
*/
if (txn == NULL) {
if ((rc = mdbenv_read_meta(db->md_env, NULL)) != MDB_SUCCESS)
return rc;
root = db->md_env->me_meta.mm_root;
} else if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) {
DPRINTF("transaction has failed, must abort");
return EINVAL;
} else
root = txn->mt_root;
if (root == P_INVALID) { /* Tree is empty. */
DPRINTF("tree is empty");
return ENOENT;
}
if ((mpp->mp_page = mdbenv_get_page(db->md_env, root)) == NULL)
return MDB_FAIL;
DPRINTF("root page has flags 0x%X", mpp->mp_page->mp_flags);
if (modify && !F_ISSET(mpp->mp_page->mp_flags, P_DIRTY)) {
mpp->mp_parent = NULL;
mpp->mp_pi = 0;
if ((rc = mdb_touch(txn, mpp)))
return rc;
txn->mt_root = mpp->mp_page->mp_pgno;
}
return mdb_search_page_root(db, key, cursor, modify, mpp);
}
static int
mdb_read_data(MDB_db *db, MDB_page *mp, MDB_node *leaf,
MDB_val *data)
{
MDB_page *omp; /* overflow mpage */
size_t psz;
size_t max;
size_t sz = 0;
pgno_t pgno;
bzero(data, sizeof(*data));
max = db->md_env->me_head.mh_psize - PAGEHDRSZ;
if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) {
data->mv_size = leaf->mn_dsize;
data->mv_data = NODEDATA(leaf);
return MDB_SUCCESS;
}
/* Read overflow data.
*/
data->mv_size = leaf->mn_dsize;
bcopy(NODEDATA(leaf), &pgno, sizeof(pgno));
if ((omp = mdbenv_get_page(db->md_env, pgno)) == NULL) {
DPRINTF("read overflow page %lu failed", pgno);
return MDB_FAIL;
}
data->mv_data = omp;
return MDB_SUCCESS;
}
int
mdb_get(MDB_db *db, MDB_txn *txn,
MDB_val *key, MDB_val *data)
{
int rc, exact;
MDB_node *leaf;
MDB_pageparent mpp;
assert(key);
assert(data);
DPRINTF("===> get key [%.*s]", (int)key->mv_size, (char *)key->mv_data);
if (db == NULL)
return EINVAL;
if (txn != NULL && db->md_env != txn->mt_env) {
return EINVAL;
}
if (key->mv_size == 0 || key->mv_size > MAXKEYSIZE) {
return EINVAL;
}
if ((rc = mdb_search_page(db, txn, key, NULL, 0, &mpp)) != MDB_SUCCESS)
return rc;
leaf = mdb_search_node(db, mpp.mp_page, key, &exact, NULL);
if (leaf && exact)
rc = mdb_read_data(db, mpp.mp_page, leaf, data);
else {
rc = ENOENT;
}
return rc;
}
static int
mdb_sibling(MDB_cursor *cursor, int move_right)
{
int rc;
MDB_node *indx;
MDB_ppage *parent, *top;
MDB_page *mp;
top = CURSOR_TOP(cursor);
if ((parent = SLIST_NEXT(top, mp_entry)) == NULL) {
return ENOENT; /* root has no siblings */
}
DPRINTF("parent page is page %lu, index %u",
parent->mp_page->mp_pgno, parent->mp_ki);
cursor_pop_page(cursor);
if (move_right ? (parent->mp_ki + 1 >= NUMKEYS(parent->mp_page))
: (parent->mp_ki == 0)) {
DPRINTF("no more keys left, moving to %s sibling",
move_right ? "right" : "left");
if ((rc = mdb_sibling(cursor, move_right)) != MDB_SUCCESS)
return rc;
parent = CURSOR_TOP(cursor);
} else {
if (move_right)
parent->mp_ki++;
else
parent->mp_ki--;
DPRINTF("just moving to %s index key %u",
move_right ? "right" : "left", parent->mp_ki);
}
assert(IS_BRANCH(parent->mp_page));
#if 0
indx = NODEPTR(parent->mp_page, parent->mp_ki);
if ((mp = mdbenv_get_page(cursor->bt, indx->n_pgno)) == NULL)
return MDB_FAIL;
mp->parent = parent->mp_page;
mp->parent_index = parent->mp_ki;
#endif
cursor_push_page(cursor, mp);
return MDB_SUCCESS;
}
static int
mdb_set_key(MDB_db *bt, MDB_page *mp, MDB_node *node,
MDB_val *key)
{
if (key == NULL)
return 0;
key->mv_size = node->mn_ksize;
key->mv_data = NODEKEY(node);
return 0;
}
static int
mdb_cursor_next(MDB_cursor *cursor, MDB_val *key, MDB_val *data)
{
MDB_ppage *top;
MDB_page *mp;
MDB_node *leaf;
if (cursor->mc_eof) {
return ENOENT;
}
assert(cursor->mc_initialized);
top = CURSOR_TOP(cursor);
mp = top->mp_page;
DPRINTF("cursor_next: top page is %lu in cursor %p", mp->mp_pgno, cursor);
if (top->mp_ki + 1 >= NUMKEYS(mp)) {
DPRINTF("=====> move to next sibling page");
if (mdb_sibling(cursor, 1) != MDB_SUCCESS) {
cursor->mc_eof = 1;
return ENOENT;
}
top = CURSOR_TOP(cursor);
mp = top->mp_page;
DPRINTF("next page is %lu, key index %u", mp->mp_pgno, top->mp_ki);
} else
top->mp_ki++;
DPRINTF("==> cursor points to page %lu with %lu keys, key index %u",
mp->mp_pgno, NUMKEYS(mp), top->mp_ki);
assert(IS_LEAF(mp));
leaf = NODEPTR(mp, top->mp_ki);
if (data && mdb_read_data(cursor->mc_db, mp, leaf, data) != MDB_SUCCESS)
return MDB_FAIL;
return mdb_set_key(cursor->mc_db, mp, leaf, key);
}
static int
mdb_cursor_set(MDB_cursor *cursor, MDB_val *key, MDB_val *data,
int *exactp)
{
int rc;
MDB_node *leaf;
MDB_ppage *top;
MDB_pageparent mpp;
assert(cursor);
assert(key);
assert(key->mv_size > 0);
rc = mdb_search_page(cursor->mc_db, cursor->mc_txn, key, cursor, 0, &mpp);
if (rc != MDB_SUCCESS)
return rc;
assert(IS_LEAF(mpp.mp_page));
top = CURSOR_TOP(cursor);
leaf = mdb_search_node(cursor->mc_db, mpp.mp_page, key, exactp, &top->mp_ki);
if (exactp != NULL && !*exactp) {
/* MDB_CURSOR_EXACT specified and not an exact match. */
return ENOENT;
}
if (leaf == NULL) {
DPRINTF("===> inexact leaf not found, goto sibling");
if ((rc = mdb_sibling(cursor, 1)) != MDB_SUCCESS)
return rc; /* no entries matched */
top = CURSOR_TOP(cursor);
top->mp_ki = 0;
mpp.mp_page = top->mp_page;
assert(IS_LEAF(mpp.mp_page));
leaf = NODEPTR(mpp.mp_page, 0);
}
cursor->mc_initialized = 1;
cursor->mc_eof = 0;
if (data && (rc = mdb_read_data(cursor->mc_db, mpp.mp_page, leaf, data)) != MDB_SUCCESS)
return rc;
rc = mdb_set_key(cursor->mc_db, mpp.mp_page, leaf, key);
if (rc == MDB_SUCCESS) {
DPRINTF("==> cursor placed on key %.*s",
(int)key->mv_size, (char *)key->mv_data);
;
}
return rc;
}
static int
mdb_cursor_first(MDB_cursor *cursor, MDB_val *key, MDB_val *data)
{
int rc;
MDB_pageparent mpp;
MDB_node *leaf;
rc = mdb_search_page(cursor->mc_db, cursor->mc_txn, NULL, cursor, 0, &mpp);
if (rc != MDB_SUCCESS)
return rc;
assert(IS_LEAF(mpp.mp_page));
leaf = NODEPTR(mpp.mp_page, 0);
cursor->mc_initialized = 1;
cursor->mc_eof = 0;
if (data && (rc = mdb_read_data(cursor->mc_db, mpp.mp_page, leaf, data)) != MDB_SUCCESS)
return rc;
return mdb_set_key(cursor->mc_db, mpp.mp_page, leaf, key);
}
int
mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data,
MDB_cursor_op op)
{
int rc;
int exact = 0;
assert(cursor);
switch (op) {
case MDB_CURSOR:
case MDB_CURSOR_EXACT:
while (CURSOR_TOP(cursor) != NULL)
cursor_pop_page(cursor);
if (key == NULL || key->mv_size == 0 || key->mv_size > MAXKEYSIZE) {
rc = EINVAL;
} else if (op == MDB_CURSOR_EXACT)
rc = mdb_cursor_set(cursor, key, data, &exact);
else
rc = mdb_cursor_set(cursor, key, data, NULL);
break;
case MDB_NEXT:
if (!cursor->mc_initialized)
rc = mdb_cursor_first(cursor, key, data);
else
rc = mdb_cursor_next(cursor, key, data);
break;
case MDB_FIRST:
while (CURSOR_TOP(cursor) != NULL)
cursor_pop_page(cursor);
rc = mdb_cursor_first(cursor, key, data);
break;
default:
DPRINTF("unhandled/unimplemented cursor operation %u", op);
rc = EINVAL;
break;
}
return rc;
}
/* Allocate a page and initialize it
*/
static MDB_dpage *
mdbenv_new_page(MDB_env *env, uint32_t flags, int num)
{
MDB_dpage *dp;
assert(env != NULL);
assert(env->me_txn != NULL);
DPRINTF("allocating new mpage %lu, page size %u",
env->me_txn->mt_next_pgno, env->me_head.mh_psize);
if ((dp = mdb_newpage(env->me_txn, NULL, 0, num)) == NULL)
return NULL;
dp->p.mp_flags = flags;
dp->p.mp_lower = PAGEHDRSZ;
dp->p.mp_upper = env->me_head.mh_psize;
if (IS_BRANCH(&dp->p))
env->me_meta.mm_stat.ms_branch_pages++;
else if (IS_LEAF(&dp->p))
env->me_meta.mm_stat.ms_leaf_pages++;
else if (IS_OVERFLOW(&dp->p)) {
env->me_meta.mm_stat.ms_overflow_pages += num;
dp->p.mp_pages = num;
}
return dp;
}
static size_t
mdb_leaf_size(MDB_db *db, MDB_val *key, MDB_val *data)
{
size_t sz;
sz = LEAFSIZE(key, data);
if (data->mv_size >= db->md_env->me_head.mh_psize / MDB_MINKEYS) {
/* put on overflow page */
sz -= data->mv_size - sizeof(pgno_t);
}
return sz + sizeof(indx_t);
}
static size_t
mdb_branch_size(MDB_db *db, MDB_val *key)
{
size_t sz;
sz = INDXSIZE(key);
if (sz >= db->md_env->me_head.mh_psize / MDB_MINKEYS) {
/* put on overflow page */
/* not implemented */
/* sz -= key->size - sizeof(pgno_t); */
}
return sz + sizeof(indx_t);
}
static int
mdb_add_node(MDB_db *db, MDB_page *mp, indx_t indx,
MDB_val *key, MDB_val *data, pgno_t pgno, uint8_t flags)
{
unsigned int i;
size_t node_size = NODESIZE;
indx_t ofs;
MDB_node *node;
MDB_dpage *ofp = NULL; /* overflow page */
assert(mp->mp_upper >= mp->mp_lower);
DPRINTF("add node [%.*s] to %s page %lu at index %i, key size %zu",
key ? (int)key->mv_size : 0, key ? (char *)key->mv_data : NULL,
IS_LEAF(mp) ? "leaf" : "branch",
mp->mp_pgno, indx, key ? key->mv_size : 0);
if (key != NULL)
node_size += key->mv_size;
if (IS_LEAF(mp)) {
assert(data);
node_size += data->mv_size;
if (F_ISSET(flags, F_BIGDATA)) {
/* Data already on overflow page. */
node_size -= data->mv_size - sizeof(pgno_t);
} else if (data->mv_size >= db->md_env->me_head.mh_psize / MDB_MINKEYS) {
int ovpages = PAGEHDRSZ + data->mv_size + db->md_env->me_head.mh_psize - 1;
ovpages /= db->md_env->me_head.mh_psize;
/* Put data on overflow page. */
DPRINTF("data size is %zu, put on overflow page",
data->mv_size);
node_size -= data->mv_size - sizeof(pgno_t);
if ((ofp = mdbenv_new_page(db->md_env, P_OVERFLOW, ovpages)) == NULL)
return MDB_FAIL;
DPRINTF("allocated overflow page %lu", ofp->p.mp_pgno);
flags |= F_BIGDATA;
}
}
if (node_size + sizeof(indx_t) > SIZELEFT(mp)) {
DPRINTF("not enough room in page %lu, got %lu ptrs",
mp->mp_pgno, NUMKEYS(mp));
DPRINTF("upper - lower = %u - %u = %u", mp->mp_upper, mp->mp_lower,
mp->mp_upper - mp->mp_lower);
DPRINTF("node size = %zu", node_size);
return ENOSPC;
}
/* Move higher pointers up one slot. */
for (i = NUMKEYS(mp); i > indx; i--)
mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];
/* Adjust free space offsets. */
ofs = mp->mp_upper - node_size;
assert(ofs >= mp->mp_lower + sizeof(indx_t));
mp->mp_ptrs[indx] = ofs;
mp->mp_upper = ofs;
mp->mp_lower += sizeof(indx_t);
/* Write the node data. */
node = NODEPTR(mp, indx);
node->mn_ksize = (key == NULL) ? 0 : key->mv_size;
node->mn_flags = flags;
if (IS_LEAF(mp))
node->mn_dsize = data->mv_size;
else
node->mn_pgno = pgno;
if (key)
bcopy(key->mv_data, NODEKEY(node), key->mv_size);
if (IS_LEAF(mp)) {
assert(key);
if (ofp == NULL) {
if (F_ISSET(flags, F_BIGDATA))
bcopy(data->mv_data, node->mn_data + key->mv_size,
sizeof(pgno_t));
else
bcopy(data->mv_data, node->mn_data + key->mv_size,
data->mv_size);
} else {
bcopy(&ofp->p.mp_pgno, node->mn_data + key->mv_size,
sizeof(pgno_t));
bcopy(data->mv_data, METADATA(&ofp->p), data->mv_size);
}
}
return MDB_SUCCESS;
}
static void
mdb_del_node(MDB_db *db, MDB_page *mp, indx_t indx)
{
unsigned int sz;
indx_t i, j, numkeys, ptr;
MDB_node *node;
char *base;
DPRINTF("delete node %u on %s page %lu", indx,
IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno);
assert(indx < NUMKEYS(mp));
node = NODEPTR(mp, indx);
sz = NODESIZE + node->mn_ksize;
if (IS_LEAF(mp)) {
if (F_ISSET(node->mn_flags, F_BIGDATA))
sz += sizeof(pgno_t);
else
sz += NODEDSZ(node);
}
ptr = mp->mp_ptrs[indx];
numkeys = NUMKEYS(mp);
for (i = j = 0; i < numkeys; i++) {
if (i != indx) {
mp->mp_ptrs[j] = mp->mp_ptrs[i];
if (mp->mp_ptrs[i] < ptr)
mp->mp_ptrs[j] += sz;
j++;
}
}
base = (char *)mp + mp->mp_upper;
bcopy(base, base + sz, ptr - mp->mp_upper);
mp->mp_lower -= sizeof(indx_t);
mp->mp_upper += sz;
}
int
mdb_cursor_open(MDB_db *db, MDB_txn *txn, MDB_cursor **ret)
{
MDB_cursor *cursor;
if (db == NULL || ret == NULL)
return EINVAL;
if (txn != NULL && db->md_env != txn->mt_env) {
return EINVAL;
}
if ((cursor = calloc(1, sizeof(*cursor))) != NULL) {
SLIST_INIT(&cursor->mc_stack);
cursor->mc_db = db;
cursor->mc_txn = txn;
}
*ret = cursor;
return MDB_SUCCESS;
}
void
mdb_cursor_close(MDB_cursor *cursor)
{
if (cursor != NULL) {
while (!CURSOR_EMPTY(cursor))
cursor_pop_page(cursor);
/* btree_close(cursor->bt); */
free(cursor);
}
}
static int
mdb_update_key(MDB_db *db, MDB_page *mp, indx_t indx,
MDB_val *key)
{
indx_t ptr, i, numkeys;
int delta;
size_t len;
MDB_node *node;
char *base;
node = NODEPTR(mp, indx);
ptr = mp->mp_ptrs[indx];
DPRINTF("update key %u (ofs %u) [%.*s] to [%.*s] on page %lu",
indx, ptr,
(int)node->mn_ksize, (char *)NODEKEY(node),
(int)key->mv_size, (char *)key->mv_data,
mp->mp_pgno);
delta = key->mv_size - node->mn_ksize;
if (delta) {
if (delta > 0 && SIZELEFT(mp) < delta) {
DPRINTF("OUCH! Not enough room, delta = %d", delta);
return ENOSPC;
}
numkeys = NUMKEYS(mp);
for (i = 0; i < numkeys; i++) {
if (mp->mp_ptrs[i] <= ptr)
mp->mp_ptrs[i] -= delta;
}
base = (char *)mp + mp->mp_upper;
len = ptr - mp->mp_upper + NODESIZE;
bcopy(base, base - delta, len);
mp->mp_upper -= delta;
node = NODEPTR(mp, indx);
node->mn_ksize = key->mv_size;
}
bcopy(key->mv_data, NODEKEY(node), key->mv_size);
return MDB_SUCCESS;
}
/* Move a node from src to dst.
*/
static int
mdb_move_node(MDB_db *bt, MDB_pageparent *src, indx_t srcindx,
MDB_pageparent *dst, indx_t dstindx)
{
int rc;
MDB_node *srcnode;
MDB_val key, data;
srcnode = NODEPTR(src->mp_page, srcindx);
DPRINTF("moving %s node %u [%.*s] on page %lu to node %u on page %lu",
IS_LEAF(src->mp_page) ? "leaf" : "branch",
srcindx,
(int)srcnode->mn_ksize, (char *)NODEKEY(srcnode),
src->mp_page->mp_pgno,
dstindx, dst->mp_page->mp_pgno);
/* Mark src and dst as dirty. */
if ((rc = mdb_touch(bt->md_env->me_txn, src)) ||
(rc = mdb_touch(bt->md_env->me_txn, dst)))
return rc;;
/* Add the node to the destination page.
*/
key.mv_size = srcnode->mn_ksize;
key.mv_data = NODEKEY(srcnode);
data.mv_size = NODEDSZ(srcnode);
data.mv_data = NODEDATA(srcnode);
rc = mdb_add_node(bt, dst->mp_page, dstindx, &key, &data, NODEPGNO(srcnode),
srcnode->mn_flags);
if (rc != MDB_SUCCESS)
return rc;
/* Delete the node from the source page.
*/
mdb_del_node(bt, src->mp_page, srcindx);
/* Update the parent separators.
*/
if (srcindx == 0 && src->mp_pi != 0) {
DPRINTF("update separator for source page %lu to [%.*s]",
src->mp_page->mp_pgno, (int)key.mv_size, (char *)key.mv_data);
if ((rc = mdb_update_key(bt, src->mp_parent, src->mp_pi,
&key)) != MDB_SUCCESS)
return rc;
}
if (srcindx == 0 && IS_BRANCH(src->mp_page)) {
MDB_val nullkey;
nullkey.mv_size = 0;
assert(mdb_update_key(bt, src->mp_page, 0, &nullkey) == MDB_SUCCESS);
}
if (dstindx == 0 && dst->mp_pi != 0) {
DPRINTF("update separator for destination page %lu to [%.*s]",
dst->mp_page->mp_pgno, (int)key.mv_size, (char *)key.mv_data);
if ((rc = mdb_update_key(bt, dst->mp_parent, dst->mp_pi,
&key)) != MDB_SUCCESS)
return rc;
}
if (dstindx == 0 && IS_BRANCH(dst->mp_page)) {
MDB_val nullkey;
nullkey.mv_size = 0;
assert(mdb_update_key(bt, dst->mp_page, 0, &nullkey) == MDB_SUCCESS);
}
return MDB_SUCCESS;
}
static int
mdb_merge(MDB_db *bt, MDB_pageparent *src, MDB_pageparent *dst)
{
int rc;
indx_t i;
MDB_node *srcnode;
MDB_val key, data;
MDB_pageparent mpp;
MDB_dhead *dh;
DPRINTF("merging page %lu and %lu", src->mp_page->mp_pgno, dst->mp_page->mp_pgno);
assert(src->mp_parent); /* can't merge root page */
assert(dst->mp_parent);
assert(bt->md_env->me_txn != NULL);
/* Mark src and dst as dirty. */
if ((rc = mdb_touch(bt->md_env->me_txn, src)) ||
(rc = mdb_touch(bt->md_env->me_txn, dst)))
return rc;
/* Move all nodes from src to dst.
*/
for (i = 0; i < NUMKEYS(src->mp_page); i++) {
srcnode = NODEPTR(src->mp_page, i);
key.mv_size = srcnode->mn_ksize;
key.mv_data = NODEKEY(srcnode);
data.mv_size = NODEDSZ(srcnode);
data.mv_data = NODEDATA(srcnode);
rc = mdb_add_node(bt, dst->mp_page, NUMKEYS(dst->mp_page), &key,
&data, NODEPGNO(srcnode), srcnode->mn_flags);
if (rc != MDB_SUCCESS)
return rc;
}
DPRINTF("dst page %lu now has %lu keys (%.1f%% filled)",
dst->mp_page->mp_pgno, NUMKEYS(dst->mp_page), (float)PAGEFILL(bt->md_env, dst->mp_page) / 10);
/* Unlink the src page from parent.
*/
mdb_del_node(bt, src->mp_parent, src->mp_pi);
if (src->mp_pi == 0) {
key.mv_size = 0;
if ((rc = mdb_update_key(bt, src->mp_parent, 0, &key)) != MDB_SUCCESS)
return rc;
}
if (IS_LEAF(src->mp_page))
bt->md_env->me_meta.mm_stat.ms_leaf_pages--;
else
bt->md_env->me_meta.mm_stat.ms_branch_pages--;
mpp.mp_page = src->mp_parent;
dh = (MDB_dhead *)src->mp_parent;
dh--;
mpp.mp_parent = dh->md_parent;
mpp.mp_pi = dh->md_pi;
return mdb_rebalance(bt, &mpp);
}
#define FILL_THRESHOLD 250
static int
mdb_rebalance(MDB_db *db, MDB_pageparent *mpp)
{
MDB_node *node;
MDB_page *root;
MDB_pageparent npp;
indx_t si = 0, di = 0;
assert(db != NULL);
assert(db->md_env->me_txn != NULL);
assert(mpp != NULL);
DPRINTF("rebalancing %s page %lu (has %lu keys, %.1f%% full)",
IS_LEAF(mpp->mp_page) ? "leaf" : "branch",
mpp->mp_page->mp_pgno, NUMKEYS(mpp->mp_page), (float)PAGEFILL(db->md_env, mpp->mp_page) / 10);
if (PAGEFILL(db->md_env, mpp->mp_page) >= FILL_THRESHOLD) {
DPRINTF("no need to rebalance page %lu, above fill threshold",
mpp->mp_page->mp_pgno);
return MDB_SUCCESS;
}
if (mpp->mp_parent == NULL) {
if (NUMKEYS(mpp->mp_page) == 0) {
DPRINTF("tree is completely empty");
db->md_env->me_txn->mt_root = P_INVALID;
db->md_env->me_meta.mm_stat.ms_depth--;
db->md_env->me_meta.mm_stat.ms_leaf_pages--;
} else if (IS_BRANCH(mpp->mp_page) && NUMKEYS(mpp->mp_page) == 1) {
DPRINTF("collapsing root page!");
db->md_env->me_txn->mt_root = NODEPGNO(NODEPTR(mpp->mp_page, 0));
if ((root = mdbenv_get_page(db->md_env, db->md_env->me_txn->mt_root)) == NULL)
return MDB_FAIL;
db->md_env->me_meta.mm_stat.ms_depth--;
db->md_env->me_meta.mm_stat.ms_branch_pages--;
} else
DPRINTF("root page doesn't need rebalancing");
return MDB_SUCCESS;
}
/* The parent (branch page) must have at least 2 pointers,
* otherwise the tree is invalid.
*/
assert(NUMKEYS(mpp->mp_parent) > 1);
/* Leaf page fill factor is below the threshold.
* Try to move keys from left or right neighbor, or
* merge with a neighbor page.
*/
/* Find neighbors.
*/
if (mpp->mp_pi == 0) {
/* We're the leftmost leaf in our parent.
*/
DPRINTF("reading right neighbor");
node = NODEPTR(mpp->mp_parent, mpp->mp_pi + 1);
if ((npp.mp_page = mdbenv_get_page(db->md_env, NODEPGNO(node))) == NULL)
return MDB_FAIL;
npp.mp_pi = mpp->mp_pi + 1;
si = 0;
di = NUMKEYS(mpp->mp_page);
} else {
/* There is at least one neighbor to the left.
*/
DPRINTF("reading left neighbor");
node = NODEPTR(mpp->mp_parent, mpp->mp_pi - 1);
if ((npp.mp_page = mdbenv_get_page(db->md_env, NODEPGNO(node))) == NULL)
return MDB_FAIL;
npp.mp_pi = mpp->mp_pi - 1;
si = NUMKEYS(npp.mp_page) - 1;
di = 0;
}
npp.mp_parent = mpp->mp_parent;
DPRINTF("found neighbor page %lu (%lu keys, %.1f%% full)",
npp.mp_page->mp_pgno, NUMKEYS(npp.mp_page), (float)PAGEFILL(db->md_env, npp.mp_page) / 10);
/* If the neighbor page is above threshold and has at least two
* keys, move one key from it.
*
* Otherwise we should try to merge them.
*/
if (PAGEFILL(db->md_env, npp.mp_page) >= FILL_THRESHOLD && NUMKEYS(npp.mp_page) >= 2)
return mdb_move_node(db, &npp, si, mpp, di);
else { /* FIXME: if (has_enough_room()) */
if (mpp->mp_pi == 0)
return mdb_merge(db, &npp, mpp);
else
return mdb_merge(db, mpp, &npp);
}
}
int
mdb_del(MDB_db *bt, MDB_txn *txn,
MDB_val *key, MDB_val *data)
{
int rc, exact;
unsigned int ki;
MDB_node *leaf;
MDB_pageparent mpp;
DPRINTF("========> delete key %.*s", (int)key->mv_size, (char *)key->mv_data);
assert(key != NULL);
if (bt == NULL || txn == NULL)
return EINVAL;
if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
return EINVAL;
}
if (bt->md_env->me_txn != txn) {
return EINVAL;
}
if (key->mv_size == 0 || key->mv_size > MAXKEYSIZE) {
return EINVAL;
}
if ((rc = mdb_search_page(bt, txn, key, NULL, 1, &mpp)) != MDB_SUCCESS)
return rc;
leaf = mdb_search_node(bt, mpp.mp_page, key, &exact, &ki);
if (leaf == NULL || !exact) {
return ENOENT;
}
if (data && (rc = mdb_read_data(bt, NULL, leaf, data)) != MDB_SUCCESS)
return rc;
mdb_del_node(bt, mpp.mp_page, ki);
bt->md_env->me_meta.mm_stat.ms_entries--;
rc = mdb_rebalance(bt, &mpp);
if (rc != MDB_SUCCESS)
txn->mt_flags |= MDB_TXN_ERROR;
return rc;
}
/* Split page <*mpp>, and insert <key,(data|newpgno)> in either left or
* right sibling, at index <*newindxp> (as if unsplit). Updates *mpp and
* *newindxp with the actual values after split, ie if *mpp and *newindxp
* refer to a node in the new right sibling page.
*/
static int
mdb_split(MDB_db *bt, MDB_page **mpp, unsigned int *newindxp,
MDB_val *newkey, MDB_val *newdata, pgno_t newpgno)
{
uint8_t flags;
int rc = MDB_SUCCESS, ins_new = 0;
indx_t newindx;
pgno_t pgno = 0;
unsigned int i, j, split_indx;
MDB_node *node;
MDB_page *pright, *p;
MDB_val sepkey, rkey, rdata;
MDB_page *copy;
MDB_dpage *mdp, *rdp, *pdp;
MDB_dhead *dh;
assert(bt != NULL);
assert(bt->md_env != NULL);
dh = ((MDB_dhead *)*mpp) - 1;
mdp = (MDB_dpage *)dh;
newindx = *newindxp;
DPRINTF("-----> splitting %s page %lu and adding [%.*s] at index %i",
IS_LEAF(&mdp->p) ? "leaf" : "branch", mdp->p.mp_pgno,
(int)newkey->mv_size, (char *)newkey->mv_data, *newindxp);
if (mdp->h.md_parent == NULL) {
if ((pdp = mdbenv_new_page(bt->md_env, P_BRANCH, 1)) == NULL)
return MDB_FAIL;
mdp->h.md_pi = 0;
mdp->h.md_parent = &pdp->p;
bt->md_env->me_txn->mt_root = pdp->p.mp_pgno;
DPRINTF("root split! new root = %lu", pdp->p.mp_pgno);
bt->md_env->me_meta.mm_stat.ms_depth++;
/* Add left (implicit) pointer. */
if (mdb_add_node(bt, &pdp->p, 0, NULL, NULL,
mdp->p.mp_pgno, 0) != MDB_SUCCESS)
return MDB_FAIL;
} else {
DPRINTF("parent branch page is %lu", mdp->h.md_parent->mp_pgno);
}
/* Create a right sibling. */
if ((rdp = mdbenv_new_page(bt->md_env, mdp->p.mp_flags, 1)) == NULL)
return MDB_FAIL;
rdp->h.md_parent = mdp->h.md_parent;
rdp->h.md_pi = mdp->h.md_pi + 1;
DPRINTF("new right sibling: page %lu", rdp->p.mp_pgno);
/* Move half of the keys to the right sibling. */
if ((copy = malloc(bt->md_env->me_head.mh_psize)) == NULL)
return MDB_FAIL;
bcopy(&mdp->p, copy, bt->md_env->me_head.mh_psize);
bzero(&mdp->p.mp_ptrs, bt->md_env->me_head.mh_psize - PAGEHDRSZ);
mdp->p.mp_lower = PAGEHDRSZ;
mdp->p.mp_upper = bt->md_env->me_head.mh_psize;
split_indx = NUMKEYS(copy) / 2 + 1;
/* First find the separating key between the split pages.
*/
bzero(&sepkey, sizeof(sepkey));
if (newindx == split_indx) {
sepkey.mv_size = newkey->mv_size;
sepkey.mv_data = newkey->mv_data;
} else {
node = NODEPTR(copy, split_indx);
sepkey.mv_size = node->mn_ksize;
sepkey.mv_data = NODEKEY(node);
}
DPRINTF("separator is [%.*s]", (int)sepkey.mv_size, (char *)sepkey.mv_data);
/* Copy separator key to the parent.
*/
if (SIZELEFT(rdp->h.md_parent) < mdb_branch_size(bt, &sepkey)) {
rc = mdb_split(bt, &rdp->h.md_parent, &rdp->h.md_pi,
&sepkey, NULL, rdp->p.mp_pgno);
/* Right page might now have changed parent.
* Check if left page also changed parent.
*/
if (rdp->h.md_parent != mdp->h.md_parent &&
mdp->h.md_pi >= NUMKEYS(mdp->h.md_parent)) {
mdp->h.md_parent = rdp->h.md_parent;
mdp->h.md_pi = rdp->h.md_pi - 1;
}
} else {
rc = mdb_add_node(bt, rdp->h.md_parent, rdp->h.md_pi,
&sepkey, NULL, rdp->p.mp_pgno, 0);
}
if (rc != MDB_SUCCESS) {
free(copy);
return MDB_FAIL;
}
for (i = j = 0; i <= NUMKEYS(copy); j++) {
if (i < split_indx) {
/* Re-insert in left sibling. */
pdp = mdp;
} else {
/* Insert in right sibling. */
if (i == split_indx)
/* Reset insert index for right sibling. */
j = (i == newindx && ins_new);
pdp = rdp;
}
if (i == newindx && !ins_new) {
/* Insert the original entry that caused the split. */
rkey.mv_data = newkey->mv_data;
rkey.mv_size = newkey->mv_size;
if (IS_LEAF(&mdp->p)) {
rdata.mv_data = newdata->mv_data;
rdata.mv_size = newdata->mv_size;
} else
pgno = newpgno;
flags = 0;
ins_new = 1;
/* Update page and index for the new key. */
*newindxp = j;
*mpp = &pdp->p;
} else if (i == NUMKEYS(copy)) {
break;
} else {
node = NODEPTR(copy, i);
rkey.mv_data = NODEKEY(node);
rkey.mv_size = node->mn_ksize;
if (IS_LEAF(&mdp->p)) {
rdata.mv_data = NODEDATA(node);
rdata.mv_size = node->mn_dsize;
} else
pgno = node->mn_pgno;
flags = node->mn_flags;
i++;
}
if (!IS_LEAF(&mdp->p) && j == 0) {
/* First branch index doesn't need key data. */
rkey.mv_size = 0;
}
rc = mdb_add_node(bt, &pdp->p, j, &rkey, &rdata, pgno,flags);
}
free(copy);
return rc;
}
int
mdb_put(MDB_db *bt, MDB_txn *txn,
MDB_val *key, MDB_val *data, unsigned int flags)
{
int rc = MDB_SUCCESS, exact;
unsigned int ki;
MDB_node *leaf;
MDB_pageparent mpp;
assert(key != NULL);
assert(data != NULL);
if (bt == NULL || txn == NULL)
return EINVAL;
if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
return EINVAL;
}
if (bt->md_env->me_txn != txn) {
return EINVAL;
}
if (key->mv_size == 0 || key->mv_size > MAXKEYSIZE) {
return EINVAL;
}
DPRINTF("==> put key %.*s, size %zu, data size %zu",
(int)key->mv_size, (char *)key->mv_data, key->mv_size, data->mv_size);
rc = mdb_search_page(bt, txn, key, NULL, 1, &mpp);
if (rc == MDB_SUCCESS) {
leaf = mdb_search_node(bt, mpp.mp_page, key, &exact, &ki);
if (leaf && exact) {
if (F_ISSET(flags, MDB_NOOVERWRITE)) {
DPRINTF("duplicate key %.*s",
(int)key->mv_size, (char *)key->mv_data);
return EEXIST;
}
mdb_del_node(bt, mpp.mp_page, ki);
}
if (leaf == NULL) { /* append if not found */
ki = NUMKEYS(mpp.mp_page);
DPRINTF("appending key at index %i", ki);
}
} else if (rc == ENOENT) {
MDB_dpage *dp;
/* new file, just write a root leaf page */
DPRINTF("allocating new root leaf page");
if ((dp = mdbenv_new_page(bt->md_env, P_LEAF, 1)) == NULL) {
return ENOMEM;
}
mpp.mp_page = &dp->p;
txn->mt_root = mpp.mp_page->mp_pgno;
bt->md_env->me_meta.mm_stat.ms_depth++;
ki = 0;
}
else
goto done;
assert(IS_LEAF(mpp.mp_page));
DPRINTF("there are %lu keys, should insert new key at index %i",
NUMKEYS(mpp.mp_page), ki);
if (SIZELEFT(mpp.mp_page) < mdb_leaf_size(bt, key, data)) {
rc = mdb_split(bt, &mpp.mp_page, &ki, key, data, P_INVALID);
} else {
/* There is room already in this leaf page. */
rc = mdb_add_node(bt, mpp.mp_page, ki, key, data, 0, 0);
}
if (rc != MDB_SUCCESS)
txn->mt_flags |= MDB_TXN_ERROR;
else
bt->md_env->me_meta.mm_stat.ms_entries++;
done:
return rc;
}
static pgno_t
mdbenv_compact_tree(MDB_env *env, pgno_t pgno, MDB_env *envc)
{
ssize_t rc;
indx_t i;
pgno_t *pnext, next;
MDB_node *node;
MDB_page *p;
MDB_page *mp;
/* Get the page and make a copy of it.
*/
if ((mp = mdbenv_get_page(env, pgno)) == NULL)
return P_INVALID;
if ((p = malloc(env->me_head.mh_psize)) == NULL)
return P_INVALID;
bcopy(mp, p, env->me_head.mh_psize);
/* Go through all nodes in the (copied) page and update the
* page pointers.
*/
if (F_ISSET(p->mp_flags, P_BRANCH)) {
for (i = 0; i < NUMKEYS(p); i++) {
node = NODEPTR(p, i);
node->mn_pgno = mdbenv_compact_tree(env, node->mn_pgno, envc);
if (node->mn_pgno == P_INVALID) {
free(p);
return P_INVALID;
}
}
} else if (F_ISSET(p->mp_flags, P_LEAF)) {
for (i = 0; i < NUMKEYS(p); i++) {
node = NODEPTR(p, i);
if (F_ISSET(node->mn_flags, F_BIGDATA)) {
bcopy(NODEDATA(node), &next, sizeof(next));
next = mdbenv_compact_tree(env, next, envc);
if (next == P_INVALID) {
free(p);
return P_INVALID;
}
bcopy(&next, NODEDATA(node), sizeof(next));
}
}
} else if (F_ISSET(p->mp_flags, P_OVERFLOW)) {
; /* handled below */
} else
assert(0);
pgno = p->mp_pgno = envc->me_txn->mt_next_pgno++;
rc = write(envc->me_fd, p, env->me_head.mh_psize);
free(p);
if (rc != (ssize_t)env->me_head.mh_psize)
return P_INVALID;
if (F_ISSET(mp->mp_flags, P_OVERFLOW)) {
if (mp->mp_pages > 1) {
size_t len = (mp->mp_pages-1) * env->me_head.mh_psize;
envc->me_txn->mt_next_pgno += mp->mp_pages-1;
rc = write(envc->me_fd, mp+1, len);
if (rc != len)
return P_INVALID;
}
}
return pgno;
}
int
mdbenv_compact(MDB_env *env)
{
char *compact_path = NULL;
MDB_env *envc;
MDB_txn *txn, *txnc = NULL;
int fd, rc;
pgno_t root;
assert(env != NULL);
if (env->me_path == NULL) {
return EINVAL;
}
DPRINTF("compacting mdbenv %p with path %s", env, env->me_path);
rc = mdb_txn_begin(env, 0, &txn);
if (rc) return rc;
asprintf(&compact_path, "%s.compact.XXXXXX", env->me_path);
fd = mkstemp(compact_path);
if (fd == -1) {
rc = errno;
free(compact_path);
mdb_txn_abort(txn);
return rc;
}
rc = mdbenv_create(&envc, env->me_mapsize);
if (rc) goto failed;
envc->me_fd = fd;
rc = mdbenv_open2(envc, env->me_flags);
if (rc) goto failed;
bcopy(&env->me_meta, &envc->me_meta, sizeof(env->me_meta));
envc->me_meta.mm_stat.ms_revisions = 0;
rc = mdb_txn_begin(envc, 0, &txnc);
if (rc) goto failed;
if (env->me_meta.mm_root != P_INVALID) {
root = mdbenv_compact_tree(env, env->me_meta.mm_root, envc);
if (root == P_INVALID)
goto failed;
if ((rc = mdbenv_write_meta(envc, root, 0)) != MDB_SUCCESS)
goto failed;
}
fsync(fd);
DPRINTF("renaming %s to %s", compact_path, env->me_path);
if (rename(compact_path, env->me_path) != 0) {
rc = errno;
goto failed;
}
/* Write a "tombstone" meta page so other processes can pick up
* the change and re-open the file.
*/
if (mdbenv_write_meta(env, P_INVALID, MDB_TOMBSTONE) != MDB_SUCCESS)
goto failed;
mdb_txn_abort(txn);
mdb_txn_abort(txnc);
free(compact_path);
mdbenv_close(envc);
return 0;
failed:
mdb_txn_abort(txn);
mdb_txn_abort(txnc);
unlink(compact_path);
free(compact_path);
mdbenv_close(envc);
return rc;
}
int
mdbenv_get_flags(MDB_env *env, unsigned int *arg)
{
if (!env || !arg)
return EINVAL;
*arg = (env->me_flags & ~MDB_FIXPADDING);
return MDB_SUCCESS;
}
int
mdbenv_get_path(MDB_env *env, const char **arg)
{
if (!env || !arg)
return EINVAL;
*arg = env->me_path;
return MDB_SUCCESS;
}
int
mdbenv_stat(MDB_env *env, MDB_stat **arg)
{
if (env == NULL || arg == NULL)
return EINVAL;
*arg = &env->me_meta.mm_stat;
return MDB_SUCCESS;
}
int mdb_open(MDB_env *env, MDB_txn *txn, const char *name, unsigned int flags, MDB_db **db)
{
if (!name) {
*db = (MDB_db *)&env->me_db;
return MDB_SUCCESS;
}
return EINVAL;
}
int mdb_stat(MDB_db *db, MDB_stat **arg)
{
if (db == NULL || arg == NULL)
return EINVAL;
*arg = &db->md_stat;
return MDB_SUCCESS;
}
void mdb_close(MDB_db *db)
{
}