From 5e3a3db1f2dac9f47123a4032bd5435c70a45c1a Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Mon, 15 Aug 2011 18:37:30 -0700 Subject: [PATCH] Error cleanup Replace MDB_FAIL with more specific codes Return MDB_PANIC after failure to write meta page --- libraries/libmdb/mdb.c | 94 ++++++++++++++++++++++++++++-------------- libraries/libmdb/mdb.h | 11 +++-- 2 files changed, 71 insertions(+), 34 deletions(-) diff --git a/libraries/libmdb/mdb.c b/libraries/libmdb/mdb.c index 6faa1f5..48e1ebb 100644 --- a/libraries/libmdb/mdb.c +++ b/libraries/libmdb/mdb.c @@ -110,6 +110,7 @@ typedef struct MDB_txbody { pthread_mutex_t mtb_mutex; ULONG mtb_txnid; uint32_t mtb_numreaders; + uint32_t mtb_me_toggle; } MDB_txbody; typedef struct MDB_txninfo { @@ -120,6 +121,7 @@ typedef struct MDB_txninfo { #define mti_mutex mt1.mtb.mtb_mutex #define mti_txnid mt1.mtb.mtb_txnid #define mti_numreaders mt1.mtb.mtb_numreaders +#define mti_me_toggle mt1.mtb.mtb_me_toggle char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)]; } mt1; union { @@ -311,9 +313,9 @@ struct MDB_env { int me_fd; int me_lfd; int me_mfd; /* just for writing the meta pages */ - uint16_t me_flags; - uint16_t me_db_toggle; - unsigned int me_psize; +#define MDB_FATAL_ERROR 0x80000000U + uint32_t me_flags; + uint32_t me_extrapad; /* unused for now */ unsigned int me_maxreaders; unsigned int me_numdbs; unsigned int me_maxdbs; @@ -325,6 +327,9 @@ struct MDB_env { MDB_txn *me_txn; /* current write transaction */ size_t me_mapsize; off_t me_size; /* current file size */ + pgno_t me_maxpg; /* me_mapsize / me_psize */ + unsigned int me_psize; + unsigned int me_db_toggle; MDB_dbx *me_dbxs; /* array */ MDB_db *me_dbs[2]; MDB_oldpages *me_pghead; @@ -568,6 +573,11 @@ mdb_alloc_page(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num) } } + if (pgno == P_INVALID) { + /* DB size is maxed out */ + if (txn->mt_next_pgno + num >= txn->mt_env->me_maxpg) + return NULL; + } if ((dp = malloc(txn->mt_env->me_psize * num + sizeof(MDB_dhead))) == NULL) return NULL; dp->h.md_num = num; @@ -633,6 +643,10 @@ mdb_txn_begin(MDB_env *env, int rdonly, MDB_txn **ret) MDB_txn *txn; int rc, toggle; + if (env->me_flags & MDB_FATAL_ERROR) { + DPRINTF("mdb_txn_begin: environment had fatal error, must shutdown!"); + return MDB_PANIC; + } if ((txn = calloc(1, sizeof(MDB_txn))) == NULL) { DPRINTF("calloc: %s", strerror(errno)); return ENOMEM; @@ -679,6 +693,7 @@ mdb_txn_begin(MDB_env *env, int rdonly, MDB_txn **ret) txn->mt_env = env; + toggle = env->me_txns->mti_me_toggle; if ((rc = mdb_env_read_meta(env, &toggle)) != MDB_SUCCESS) { mdb_txn_abort(txn); return rc; @@ -1036,19 +1051,23 @@ static int mdb_env_write_meta(MDB_txn *txn) { MDB_env *env; - MDB_meta meta; + MDB_meta meta, metab; off_t off; - int rc, len; + int rc, len, toggle; char *ptr; assert(txn != NULL); assert(txn->mt_env != NULL); + toggle = !F_ISSET(txn->mt_flags, MDB_TXN_METOGGLE); DPRINTF("writing meta page %d for root page %lu", - !F_ISSET(txn->mt_flags, MDB_TXN_METOGGLE), txn->mt_dbs[MAIN_DBI].md_root); + toggle, txn->mt_dbs[MAIN_DBI].md_root); env = txn->mt_env; + metab.mm_txnid = env->me_metas[toggle]->mm_txnid; + metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg; + ptr = (char *)&meta; off = offsetof(MDB_meta, mm_dbs[0].md_depth); len = sizeof(MDB_meta) - off; @@ -1059,15 +1078,25 @@ mdb_env_write_meta(MDB_txn *txn) meta.mm_last_pg = txn->mt_next_pgno - 1; meta.mm_txnid = txn->mt_txnid; - if (!F_ISSET(txn->mt_flags, MDB_TXN_METOGGLE)) + if (toggle) off += env->me_psize; off += PAGEHDRSZ; - rc = pwrite(env->me_fd, ptr, len, off); + /* Write to the SYNC fd */ + rc = pwrite(env->me_mfd, ptr, len, off); if (rc != len) { DPRINTF("write failed, disk error?"); + /* On a failure, the pagecache still contains the new data. + * Write some old data back, to prevent it from being used. + * Use the non-SYNC fd; we know it will fail anyway. + */ + meta.mm_last_pg = metab.mm_last_pg; + meta.mm_txnid = metab.mm_txnid; + rc = pwrite(env->me_fd, ptr, len, off); + env->me_flags |= MDB_FATAL_ERROR; return errno; } + txn->mt_env->me_txns->mti_me_toggle = toggle; return MDB_SUCCESS; } @@ -1079,13 +1108,13 @@ mdb_env_read_meta(MDB_env *env, int *which) assert(env != NULL); - if (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid) + if (which) + toggle = *which; + else if (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid) toggle = 1; if (env->me_meta != env->me_metas[toggle]) env->me_meta = env->me_metas[toggle]; - if (which) - *which = toggle; DPRINTF("Using meta page %d", toggle); @@ -1183,7 +1212,9 @@ mdb_env_open2(MDB_env *env, unsigned int flags) } env->me_psize = meta.mm_psize; - p = (MDB_page *)(MDB_page *)(MDB_page *)(MDB_page *)(MDB_page *)(MDB_page *)(MDB_page *)(MDB_page *)(MDB_page *)env->me_map; + env->me_maxpg = env->me_mapsize / env->me_psize; + + p = (MDB_page *)env->me_map; env->me_metas[0] = METADATA(p); env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + meta.mm_psize); @@ -1219,6 +1250,8 @@ mdb_env_share_locks(MDB_env *env) struct flock lock_info; env->me_txns->mti_txnid = env->me_meta->mm_txnid; + if (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid) + env->me_txns->mti_me_toggle = 1; memset((void *)&lock_info, 0, sizeof(lock_info)); lock_info.l_type = F_RDLCK; @@ -1292,6 +1325,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) env->me_txns->mti_magic = MDB_MAGIC; env->me_txns->mti_txnid = 0; env->me_txns->mti_numreaders = 0; + env->me_txns->mti_me_toggle = 0; } else { if (env->me_txns->mti_magic != MDB_MAGIC) { @@ -1546,7 +1580,7 @@ mdb_search_page_root(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, int rc; if (cursor && cursor_push_page(cursor, mp) == NULL) - return MDB_FAIL; + return ENOMEM; while (IS_BRANCH(mp)) { unsigned int i = 0; @@ -1583,12 +1617,12 @@ mdb_search_page_root(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, mpp->mp_parent = mp; if ((mp = mdb_get_page(txn, NODEPGNO(node))) == NULL) - return MDB_FAIL; + return MDB_PAGE_NOTFOUND; mpp->mp_pi = i; mpp->mp_page = mp; if (cursor && cursor_push_page(cursor, mp) == NULL) - return MDB_FAIL; + return ENOMEM; if (modify) { MDB_dhead *dh = ((MDB_dhead *)mp)-1; @@ -1605,7 +1639,7 @@ mdb_search_page_root(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, if (!IS_LEAF(mp)) { DPRINTF("internal error, index points to a %02X page!?", mp->mp_flags); - return MDB_FAIL; + return MDB_CORRUPTED; } DPRINTF("found leaf page %lu for key %.*s", mp->mp_pgno, @@ -1643,7 +1677,7 @@ mdb_search_page(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, } if ((mpp->mp_page = mdb_get_page(txn, root)) == NULL) - return MDB_FAIL; + return MDB_PAGE_NOTFOUND; DPRINTF("root page has flags 0x%X", mpp->mp_page->mp_flags); @@ -1687,7 +1721,7 @@ mdb_read_data(MDB_txn *txn, MDB_node *leaf, MDB_val *data) memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); if ((omp = mdb_get_page(txn, pgno)) == NULL) { DPRINTF("read overflow page %lu failed", pgno); - return MDB_FAIL; + return MDB_PAGE_NOTFOUND; } data->mv_data = omp; @@ -1773,7 +1807,7 @@ mdb_sibling(MDB_cursor *cursor, int move_right) indx = NODEPTR(parent->mp_page, parent->mp_ki); if ((mp = mdb_get_page(cursor->mc_txn, indx->mn_pgno)) == NULL) - return MDB_FAIL; + return MDB_PAGE_NOTFOUND; #if 0 mp->parent = parent->mp_page; mp->parent_index = parent->mp_ki; @@ -2211,7 +2245,7 @@ mdb_add_node(MDB_txn *txn, MDB_dbi dbi, MDB_page *mp, indx_t indx, data->mv_size); node_size += sizeof(pgno_t); if ((ofp = mdb_new_page(txn, dbi, P_OVERFLOW, ovpages)) == NULL) - return MDB_FAIL; + return ENOMEM; DPRINTF("allocated overflow page %lu", ofp->p.mp_pgno); flags |= F_BIGDATA; } else { @@ -2646,7 +2680,7 @@ mdb_rebalance(MDB_txn *txn, MDB_dbi dbi, MDB_pageparent *mpp) DPRINTF("collapsing root page!"); txn->mt_dbs[dbi].md_root = NODEPGNO(NODEPTR(mpp->mp_page, 0)); if ((root = mdb_get_page(txn, txn->mt_dbs[dbi].md_root)) == NULL) - return MDB_FAIL; + return MDB_PAGE_NOTFOUND; txn->mt_dbs[dbi].md_depth--; txn->mt_dbs[dbi].md_branch_pages--; } else @@ -2672,7 +2706,7 @@ mdb_rebalance(MDB_txn *txn, MDB_dbi dbi, MDB_pageparent *mpp) DPRINTF("reading right neighbor"); node = NODEPTR(mpp->mp_parent, mpp->mp_pi + 1); if ((npp.mp_page = mdb_get_page(txn, NODEPGNO(node))) == NULL) - return MDB_FAIL; + return MDB_PAGE_NOTFOUND; npp.mp_pi = mpp->mp_pi + 1; si = 0; di = NUMKEYS(mpp->mp_page); @@ -2682,7 +2716,7 @@ mdb_rebalance(MDB_txn *txn, MDB_dbi dbi, MDB_pageparent *mpp) DPRINTF("reading left neighbor"); node = NODEPTR(mpp->mp_parent, mpp->mp_pi - 1); if ((npp.mp_page = mdb_get_page(txn, NODEPGNO(node))) == NULL) - return MDB_FAIL; + return MDB_PAGE_NOTFOUND; npp.mp_pi = mpp->mp_pi - 1; si = NUMKEYS(npp.mp_page) - 1; di = 0; @@ -2861,7 +2895,7 @@ mdb_split(MDB_txn *txn, MDB_dbi dbi, MDB_page **mpp, unsigned int *newindxp, if (mdp->h.md_parent == NULL) { if ((pdp = mdb_new_page(txn, dbi, P_BRANCH, 1)) == NULL) - return MDB_FAIL; + return ENOMEM; mdp->h.md_pi = 0; mdp->h.md_parent = &pdp->p; txn->mt_dbs[dbi].md_root = pdp->p.mp_pgno; @@ -2869,23 +2903,23 @@ mdb_split(MDB_txn *txn, MDB_dbi dbi, MDB_page **mpp, unsigned int *newindxp, txn->mt_dbs[dbi].md_depth++; /* Add left (implicit) pointer. */ - if (mdb_add_node(txn, dbi, &pdp->p, 0, NULL, NULL, - mdp->p.mp_pgno, 0) != MDB_SUCCESS) - return MDB_FAIL; + if ((rc = mdb_add_node(txn, dbi, &pdp->p, 0, NULL, NULL, + mdp->p.mp_pgno, 0)) != MDB_SUCCESS) + return rc; } else { DPRINTF("parent branch page is %lu", mdp->h.md_parent->mp_pgno); } /* Create a right sibling. */ if ((rdp = mdb_new_page(txn, dbi, mdp->p.mp_flags, 1)) == NULL) - return MDB_FAIL; + return ENOMEM; rdp->h.md_parent = mdp->h.md_parent; rdp->h.md_pi = mdp->h.md_pi + 1; DPRINTF("new right sibling: page %lu", rdp->p.mp_pgno); /* Move half of the keys to the right sibling. */ if ((copy = malloc(txn->mt_env->me_psize)) == NULL) - return MDB_FAIL; + return ENOMEM; memcpy(copy, &mdp->p, txn->mt_env->me_psize); memset(&mdp->p.mp_ptrs, 0, txn->mt_env->me_psize - PAGEHDRSZ); mdp->p.mp_lower = PAGEHDRSZ; @@ -2927,7 +2961,7 @@ mdb_split(MDB_txn *txn, MDB_dbi dbi, MDB_page **mpp, unsigned int *newindxp, } if (rc != MDB_SUCCESS) { free(copy); - return MDB_FAIL; + return rc; } for (i = j = 0; i <= NUMKEYS(copy); j++) { diff --git a/libraries/libmdb/mdb.h b/libraries/libmdb/mdb.h index dfd35a0..39d9b6f 100644 --- a/libraries/libmdb/mdb.h +++ b/libraries/libmdb/mdb.h @@ -82,11 +82,14 @@ typedef enum MDB_cursor_op { /* cursor operations */ } MDB_cursor_op; /* return codes */ +/* BerkeleyDB uses -30800 to -30999, we'll go under them */ #define MDB_SUCCESS 0 -#define MDB_FAIL -1 -#define MDB_KEYEXIST -2 -#define MDB_NOTFOUND -3 -#define MDB_VERSION_MISMATCH -4 +#define MDB_KEYEXIST -30799 /* key/data pair already exists */ +#define MDB_NOTFOUND -30798 /* key/data pair not found (EOF) */ +#define MDB_PAGE_NOTFOUND -30797 /* Requested page not found */ +#define MDB_CORRUPTED -30796 /* Located page was wrong type */ +#define MDB_PANIC -30795 /* Update of meta page failed, probably I/O error */ +#define MDB_VERSION_MISMATCH -30794 /* Environment version mismatch */ /* DB flags */ #define MDB_REVERSEKEY 0x02 /* use reverse string keys */