Error cleanup

Replace MDB_FAIL with more specific codes
Return MDB_PANIC after failure to write meta page
vmware
Howard Chu 13 years ago
parent 5ef6c43acf
commit 5e3a3db1f2
  1. 94
      libraries/libmdb/mdb.c
  2. 11
      libraries/libmdb/mdb.h

@ -110,6 +110,7 @@ typedef struct MDB_txbody {
pthread_mutex_t mtb_mutex; pthread_mutex_t mtb_mutex;
ULONG mtb_txnid; ULONG mtb_txnid;
uint32_t mtb_numreaders; uint32_t mtb_numreaders;
uint32_t mtb_me_toggle;
} MDB_txbody; } MDB_txbody;
typedef struct MDB_txninfo { typedef struct MDB_txninfo {
@ -120,6 +121,7 @@ typedef struct MDB_txninfo {
#define mti_mutex mt1.mtb.mtb_mutex #define mti_mutex mt1.mtb.mtb_mutex
#define mti_txnid mt1.mtb.mtb_txnid #define mti_txnid mt1.mtb.mtb_txnid
#define mti_numreaders mt1.mtb.mtb_numreaders #define mti_numreaders mt1.mtb.mtb_numreaders
#define mti_me_toggle mt1.mtb.mtb_me_toggle
char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)]; char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)];
} mt1; } mt1;
union { union {
@ -311,9 +313,9 @@ struct MDB_env {
int me_fd; int me_fd;
int me_lfd; int me_lfd;
int me_mfd; /* just for writing the meta pages */ int me_mfd; /* just for writing the meta pages */
uint16_t me_flags; #define MDB_FATAL_ERROR 0x80000000U
uint16_t me_db_toggle; uint32_t me_flags;
unsigned int me_psize; uint32_t me_extrapad; /* unused for now */
unsigned int me_maxreaders; unsigned int me_maxreaders;
unsigned int me_numdbs; unsigned int me_numdbs;
unsigned int me_maxdbs; unsigned int me_maxdbs;
@ -325,6 +327,9 @@ struct MDB_env {
MDB_txn *me_txn; /* current write transaction */ MDB_txn *me_txn; /* current write transaction */
size_t me_mapsize; size_t me_mapsize;
off_t me_size; /* current file size */ off_t me_size; /* current file size */
pgno_t me_maxpg; /* me_mapsize / me_psize */
unsigned int me_psize;
unsigned int me_db_toggle;
MDB_dbx *me_dbxs; /* array */ MDB_dbx *me_dbxs; /* array */
MDB_db *me_dbs[2]; MDB_db *me_dbs[2];
MDB_oldpages *me_pghead; MDB_oldpages *me_pghead;
@ -568,6 +573,11 @@ mdb_alloc_page(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num)
} }
} }
if (pgno == P_INVALID) {
/* DB size is maxed out */
if (txn->mt_next_pgno + num >= txn->mt_env->me_maxpg)
return NULL;
}
if ((dp = malloc(txn->mt_env->me_psize * num + sizeof(MDB_dhead))) == NULL) if ((dp = malloc(txn->mt_env->me_psize * num + sizeof(MDB_dhead))) == NULL)
return NULL; return NULL;
dp->h.md_num = num; dp->h.md_num = num;
@ -633,6 +643,10 @@ mdb_txn_begin(MDB_env *env, int rdonly, MDB_txn **ret)
MDB_txn *txn; MDB_txn *txn;
int rc, toggle; int rc, toggle;
if (env->me_flags & MDB_FATAL_ERROR) {
DPRINTF("mdb_txn_begin: environment had fatal error, must shutdown!");
return MDB_PANIC;
}
if ((txn = calloc(1, sizeof(MDB_txn))) == NULL) { if ((txn = calloc(1, sizeof(MDB_txn))) == NULL) {
DPRINTF("calloc: %s", strerror(errno)); DPRINTF("calloc: %s", strerror(errno));
return ENOMEM; return ENOMEM;
@ -679,6 +693,7 @@ mdb_txn_begin(MDB_env *env, int rdonly, MDB_txn **ret)
txn->mt_env = env; txn->mt_env = env;
toggle = env->me_txns->mti_me_toggle;
if ((rc = mdb_env_read_meta(env, &toggle)) != MDB_SUCCESS) { if ((rc = mdb_env_read_meta(env, &toggle)) != MDB_SUCCESS) {
mdb_txn_abort(txn); mdb_txn_abort(txn);
return rc; return rc;
@ -1036,19 +1051,23 @@ static int
mdb_env_write_meta(MDB_txn *txn) mdb_env_write_meta(MDB_txn *txn)
{ {
MDB_env *env; MDB_env *env;
MDB_meta meta; MDB_meta meta, metab;
off_t off; off_t off;
int rc, len; int rc, len, toggle;
char *ptr; char *ptr;
assert(txn != NULL); assert(txn != NULL);
assert(txn->mt_env != NULL); assert(txn->mt_env != NULL);
toggle = !F_ISSET(txn->mt_flags, MDB_TXN_METOGGLE);
DPRINTF("writing meta page %d for root page %lu", DPRINTF("writing meta page %d for root page %lu",
!F_ISSET(txn->mt_flags, MDB_TXN_METOGGLE), txn->mt_dbs[MAIN_DBI].md_root); toggle, txn->mt_dbs[MAIN_DBI].md_root);
env = txn->mt_env; env = txn->mt_env;
metab.mm_txnid = env->me_metas[toggle]->mm_txnid;
metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg;
ptr = (char *)&meta; ptr = (char *)&meta;
off = offsetof(MDB_meta, mm_dbs[0].md_depth); off = offsetof(MDB_meta, mm_dbs[0].md_depth);
len = sizeof(MDB_meta) - off; len = sizeof(MDB_meta) - off;
@ -1059,15 +1078,25 @@ mdb_env_write_meta(MDB_txn *txn)
meta.mm_last_pg = txn->mt_next_pgno - 1; meta.mm_last_pg = txn->mt_next_pgno - 1;
meta.mm_txnid = txn->mt_txnid; meta.mm_txnid = txn->mt_txnid;
if (!F_ISSET(txn->mt_flags, MDB_TXN_METOGGLE)) if (toggle)
off += env->me_psize; off += env->me_psize;
off += PAGEHDRSZ; off += PAGEHDRSZ;
rc = pwrite(env->me_fd, ptr, len, off); /* Write to the SYNC fd */
rc = pwrite(env->me_mfd, ptr, len, off);
if (rc != len) { if (rc != len) {
DPRINTF("write failed, disk error?"); DPRINTF("write failed, disk error?");
/* On a failure, the pagecache still contains the new data.
* Write some old data back, to prevent it from being used.
* Use the non-SYNC fd; we know it will fail anyway.
*/
meta.mm_last_pg = metab.mm_last_pg;
meta.mm_txnid = metab.mm_txnid;
rc = pwrite(env->me_fd, ptr, len, off);
env->me_flags |= MDB_FATAL_ERROR;
return errno; return errno;
} }
txn->mt_env->me_txns->mti_me_toggle = toggle;
return MDB_SUCCESS; return MDB_SUCCESS;
} }
@ -1079,13 +1108,13 @@ mdb_env_read_meta(MDB_env *env, int *which)
assert(env != NULL); assert(env != NULL);
if (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid) if (which)
toggle = *which;
else if (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid)
toggle = 1; toggle = 1;
if (env->me_meta != env->me_metas[toggle]) if (env->me_meta != env->me_metas[toggle])
env->me_meta = env->me_metas[toggle]; env->me_meta = env->me_metas[toggle];
if (which)
*which = toggle;
DPRINTF("Using meta page %d", toggle); DPRINTF("Using meta page %d", toggle);
@ -1183,7 +1212,9 @@ mdb_env_open2(MDB_env *env, unsigned int flags)
} }
env->me_psize = meta.mm_psize; env->me_psize = meta.mm_psize;
p = (MDB_page *)(MDB_page *)(MDB_page *)(MDB_page *)(MDB_page *)(MDB_page *)(MDB_page *)(MDB_page *)(MDB_page *)env->me_map; env->me_maxpg = env->me_mapsize / env->me_psize;
p = (MDB_page *)env->me_map;
env->me_metas[0] = METADATA(p); env->me_metas[0] = METADATA(p);
env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + meta.mm_psize); env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + meta.mm_psize);
@ -1219,6 +1250,8 @@ mdb_env_share_locks(MDB_env *env)
struct flock lock_info; struct flock lock_info;
env->me_txns->mti_txnid = env->me_meta->mm_txnid; env->me_txns->mti_txnid = env->me_meta->mm_txnid;
if (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid)
env->me_txns->mti_me_toggle = 1;
memset((void *)&lock_info, 0, sizeof(lock_info)); memset((void *)&lock_info, 0, sizeof(lock_info));
lock_info.l_type = F_RDLCK; lock_info.l_type = F_RDLCK;
@ -1292,6 +1325,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
env->me_txns->mti_magic = MDB_MAGIC; env->me_txns->mti_magic = MDB_MAGIC;
env->me_txns->mti_txnid = 0; env->me_txns->mti_txnid = 0;
env->me_txns->mti_numreaders = 0; env->me_txns->mti_numreaders = 0;
env->me_txns->mti_me_toggle = 0;
} else { } else {
if (env->me_txns->mti_magic != MDB_MAGIC) { if (env->me_txns->mti_magic != MDB_MAGIC) {
@ -1546,7 +1580,7 @@ mdb_search_page_root(MDB_txn *txn, MDB_dbi dbi, MDB_val *key,
int rc; int rc;
if (cursor && cursor_push_page(cursor, mp) == NULL) if (cursor && cursor_push_page(cursor, mp) == NULL)
return MDB_FAIL; return ENOMEM;
while (IS_BRANCH(mp)) { while (IS_BRANCH(mp)) {
unsigned int i = 0; unsigned int i = 0;
@ -1583,12 +1617,12 @@ mdb_search_page_root(MDB_txn *txn, MDB_dbi dbi, MDB_val *key,
mpp->mp_parent = mp; mpp->mp_parent = mp;
if ((mp = mdb_get_page(txn, NODEPGNO(node))) == NULL) if ((mp = mdb_get_page(txn, NODEPGNO(node))) == NULL)
return MDB_FAIL; return MDB_PAGE_NOTFOUND;
mpp->mp_pi = i; mpp->mp_pi = i;
mpp->mp_page = mp; mpp->mp_page = mp;
if (cursor && cursor_push_page(cursor, mp) == NULL) if (cursor && cursor_push_page(cursor, mp) == NULL)
return MDB_FAIL; return ENOMEM;
if (modify) { if (modify) {
MDB_dhead *dh = ((MDB_dhead *)mp)-1; MDB_dhead *dh = ((MDB_dhead *)mp)-1;
@ -1605,7 +1639,7 @@ mdb_search_page_root(MDB_txn *txn, MDB_dbi dbi, MDB_val *key,
if (!IS_LEAF(mp)) { if (!IS_LEAF(mp)) {
DPRINTF("internal error, index points to a %02X page!?", DPRINTF("internal error, index points to a %02X page!?",
mp->mp_flags); mp->mp_flags);
return MDB_FAIL; return MDB_CORRUPTED;
} }
DPRINTF("found leaf page %lu for key %.*s", mp->mp_pgno, DPRINTF("found leaf page %lu for key %.*s", mp->mp_pgno,
@ -1643,7 +1677,7 @@ mdb_search_page(MDB_txn *txn, MDB_dbi dbi, MDB_val *key,
} }
if ((mpp->mp_page = mdb_get_page(txn, root)) == NULL) if ((mpp->mp_page = mdb_get_page(txn, root)) == NULL)
return MDB_FAIL; return MDB_PAGE_NOTFOUND;
DPRINTF("root page has flags 0x%X", mpp->mp_page->mp_flags); DPRINTF("root page has flags 0x%X", mpp->mp_page->mp_flags);
@ -1687,7 +1721,7 @@ mdb_read_data(MDB_txn *txn, MDB_node *leaf, MDB_val *data)
memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); memcpy(&pgno, NODEDATA(leaf), sizeof(pgno));
if ((omp = mdb_get_page(txn, pgno)) == NULL) { if ((omp = mdb_get_page(txn, pgno)) == NULL) {
DPRINTF("read overflow page %lu failed", pgno); DPRINTF("read overflow page %lu failed", pgno);
return MDB_FAIL; return MDB_PAGE_NOTFOUND;
} }
data->mv_data = omp; data->mv_data = omp;
@ -1773,7 +1807,7 @@ mdb_sibling(MDB_cursor *cursor, int move_right)
indx = NODEPTR(parent->mp_page, parent->mp_ki); indx = NODEPTR(parent->mp_page, parent->mp_ki);
if ((mp = mdb_get_page(cursor->mc_txn, indx->mn_pgno)) == NULL) if ((mp = mdb_get_page(cursor->mc_txn, indx->mn_pgno)) == NULL)
return MDB_FAIL; return MDB_PAGE_NOTFOUND;
#if 0 #if 0
mp->parent = parent->mp_page; mp->parent = parent->mp_page;
mp->parent_index = parent->mp_ki; mp->parent_index = parent->mp_ki;
@ -2211,7 +2245,7 @@ mdb_add_node(MDB_txn *txn, MDB_dbi dbi, MDB_page *mp, indx_t indx,
data->mv_size); data->mv_size);
node_size += sizeof(pgno_t); node_size += sizeof(pgno_t);
if ((ofp = mdb_new_page(txn, dbi, P_OVERFLOW, ovpages)) == NULL) if ((ofp = mdb_new_page(txn, dbi, P_OVERFLOW, ovpages)) == NULL)
return MDB_FAIL; return ENOMEM;
DPRINTF("allocated overflow page %lu", ofp->p.mp_pgno); DPRINTF("allocated overflow page %lu", ofp->p.mp_pgno);
flags |= F_BIGDATA; flags |= F_BIGDATA;
} else { } else {
@ -2646,7 +2680,7 @@ mdb_rebalance(MDB_txn *txn, MDB_dbi dbi, MDB_pageparent *mpp)
DPRINTF("collapsing root page!"); DPRINTF("collapsing root page!");
txn->mt_dbs[dbi].md_root = NODEPGNO(NODEPTR(mpp->mp_page, 0)); txn->mt_dbs[dbi].md_root = NODEPGNO(NODEPTR(mpp->mp_page, 0));
if ((root = mdb_get_page(txn, txn->mt_dbs[dbi].md_root)) == NULL) if ((root = mdb_get_page(txn, txn->mt_dbs[dbi].md_root)) == NULL)
return MDB_FAIL; return MDB_PAGE_NOTFOUND;
txn->mt_dbs[dbi].md_depth--; txn->mt_dbs[dbi].md_depth--;
txn->mt_dbs[dbi].md_branch_pages--; txn->mt_dbs[dbi].md_branch_pages--;
} else } else
@ -2672,7 +2706,7 @@ mdb_rebalance(MDB_txn *txn, MDB_dbi dbi, MDB_pageparent *mpp)
DPRINTF("reading right neighbor"); DPRINTF("reading right neighbor");
node = NODEPTR(mpp->mp_parent, mpp->mp_pi + 1); node = NODEPTR(mpp->mp_parent, mpp->mp_pi + 1);
if ((npp.mp_page = mdb_get_page(txn, NODEPGNO(node))) == NULL) if ((npp.mp_page = mdb_get_page(txn, NODEPGNO(node))) == NULL)
return MDB_FAIL; return MDB_PAGE_NOTFOUND;
npp.mp_pi = mpp->mp_pi + 1; npp.mp_pi = mpp->mp_pi + 1;
si = 0; si = 0;
di = NUMKEYS(mpp->mp_page); di = NUMKEYS(mpp->mp_page);
@ -2682,7 +2716,7 @@ mdb_rebalance(MDB_txn *txn, MDB_dbi dbi, MDB_pageparent *mpp)
DPRINTF("reading left neighbor"); DPRINTF("reading left neighbor");
node = NODEPTR(mpp->mp_parent, mpp->mp_pi - 1); node = NODEPTR(mpp->mp_parent, mpp->mp_pi - 1);
if ((npp.mp_page = mdb_get_page(txn, NODEPGNO(node))) == NULL) if ((npp.mp_page = mdb_get_page(txn, NODEPGNO(node))) == NULL)
return MDB_FAIL; return MDB_PAGE_NOTFOUND;
npp.mp_pi = mpp->mp_pi - 1; npp.mp_pi = mpp->mp_pi - 1;
si = NUMKEYS(npp.mp_page) - 1; si = NUMKEYS(npp.mp_page) - 1;
di = 0; di = 0;
@ -2861,7 +2895,7 @@ mdb_split(MDB_txn *txn, MDB_dbi dbi, MDB_page **mpp, unsigned int *newindxp,
if (mdp->h.md_parent == NULL) { if (mdp->h.md_parent == NULL) {
if ((pdp = mdb_new_page(txn, dbi, P_BRANCH, 1)) == NULL) if ((pdp = mdb_new_page(txn, dbi, P_BRANCH, 1)) == NULL)
return MDB_FAIL; return ENOMEM;
mdp->h.md_pi = 0; mdp->h.md_pi = 0;
mdp->h.md_parent = &pdp->p; mdp->h.md_parent = &pdp->p;
txn->mt_dbs[dbi].md_root = pdp->p.mp_pgno; txn->mt_dbs[dbi].md_root = pdp->p.mp_pgno;
@ -2869,23 +2903,23 @@ mdb_split(MDB_txn *txn, MDB_dbi dbi, MDB_page **mpp, unsigned int *newindxp,
txn->mt_dbs[dbi].md_depth++; txn->mt_dbs[dbi].md_depth++;
/* Add left (implicit) pointer. */ /* Add left (implicit) pointer. */
if (mdb_add_node(txn, dbi, &pdp->p, 0, NULL, NULL, if ((rc = mdb_add_node(txn, dbi, &pdp->p, 0, NULL, NULL,
mdp->p.mp_pgno, 0) != MDB_SUCCESS) mdp->p.mp_pgno, 0)) != MDB_SUCCESS)
return MDB_FAIL; return rc;
} else { } else {
DPRINTF("parent branch page is %lu", mdp->h.md_parent->mp_pgno); DPRINTF("parent branch page is %lu", mdp->h.md_parent->mp_pgno);
} }
/* Create a right sibling. */ /* Create a right sibling. */
if ((rdp = mdb_new_page(txn, dbi, mdp->p.mp_flags, 1)) == NULL) if ((rdp = mdb_new_page(txn, dbi, mdp->p.mp_flags, 1)) == NULL)
return MDB_FAIL; return ENOMEM;
rdp->h.md_parent = mdp->h.md_parent; rdp->h.md_parent = mdp->h.md_parent;
rdp->h.md_pi = mdp->h.md_pi + 1; rdp->h.md_pi = mdp->h.md_pi + 1;
DPRINTF("new right sibling: page %lu", rdp->p.mp_pgno); DPRINTF("new right sibling: page %lu", rdp->p.mp_pgno);
/* Move half of the keys to the right sibling. */ /* Move half of the keys to the right sibling. */
if ((copy = malloc(txn->mt_env->me_psize)) == NULL) if ((copy = malloc(txn->mt_env->me_psize)) == NULL)
return MDB_FAIL; return ENOMEM;
memcpy(copy, &mdp->p, txn->mt_env->me_psize); memcpy(copy, &mdp->p, txn->mt_env->me_psize);
memset(&mdp->p.mp_ptrs, 0, txn->mt_env->me_psize - PAGEHDRSZ); memset(&mdp->p.mp_ptrs, 0, txn->mt_env->me_psize - PAGEHDRSZ);
mdp->p.mp_lower = PAGEHDRSZ; mdp->p.mp_lower = PAGEHDRSZ;
@ -2927,7 +2961,7 @@ mdb_split(MDB_txn *txn, MDB_dbi dbi, MDB_page **mpp, unsigned int *newindxp,
} }
if (rc != MDB_SUCCESS) { if (rc != MDB_SUCCESS) {
free(copy); free(copy);
return MDB_FAIL; return rc;
} }
for (i = j = 0; i <= NUMKEYS(copy); j++) { for (i = j = 0; i <= NUMKEYS(copy); j++) {

@ -82,11 +82,14 @@ typedef enum MDB_cursor_op { /* cursor operations */
} MDB_cursor_op; } MDB_cursor_op;
/* return codes */ /* return codes */
/* BerkeleyDB uses -30800 to -30999, we'll go under them */
#define MDB_SUCCESS 0 #define MDB_SUCCESS 0
#define MDB_FAIL -1 #define MDB_KEYEXIST -30799 /* key/data pair already exists */
#define MDB_KEYEXIST -2 #define MDB_NOTFOUND -30798 /* key/data pair not found (EOF) */
#define MDB_NOTFOUND -3 #define MDB_PAGE_NOTFOUND -30797 /* Requested page not found */
#define MDB_VERSION_MISMATCH -4 #define MDB_CORRUPTED -30796 /* Located page was wrong type */
#define MDB_PANIC -30795 /* Update of meta page failed, probably I/O error */
#define MDB_VERSION_MISMATCH -30794 /* Environment version mismatch */
/* DB flags */ /* DB flags */
#define MDB_REVERSEKEY 0x02 /* use reverse string keys */ #define MDB_REVERSEKEY 0x02 /* use reverse string keys */

Loading…
Cancel
Save