Drop P_DIRTY, and MDB_WRITEMAP dirty/spill lists

With MDB_WRITEMAP, mt_workid = mt_txnid, so dirty pages are indistinguishable
from "spilled" pages, and mdb_page_flush() has nothing to do.
mdb.master3
Hallvard Furuseth 7 years ago committed by Howard Chu
parent f1db84d332
commit adfa8f758a
  1. 109
      libraries/liblmdb/mdb.c

@ -975,7 +975,10 @@ typedef struct MDB_page_header {
* Thus an #MDB_txn can write to pages with mp_txnid >= txn.mt_workid.
* A page with smaller mp_txnid is dirty in an ancestor txn or clean.
*
* txn.mt_workid > txn.mt_txnid, to tell apart spilled and dirty pages.
* Non-#MDB_WRITEMAP sets txn.mt_workid > txn.mt_txnid, to tell apart
* spilled and dirty pages. WRITEMAP sets mt_workid = mt_txnid, since
* it does not copy/spill pages. Thus (page.mp_txnid == txn.mt_txnid)
* says "spilled page" without WRITEMAP, "dirty page" with WRITEMAP.
*
* Finally, ((dirty page).mp_txnid & #MDB_PGTXNID_FLAGMASK) can be used
* for flags without WRITEMAP, since those low bits of workid are then kept 0.
@ -992,14 +995,13 @@ typedef struct MDB_page_header {
#define P_LEAF 0x02 /**< leaf page */
#define P_OVERFLOW 0x04 /**< overflow page */
#define P_META 0x08 /**< meta page */
#define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */
#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */
#define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */
#define P_DIRTY_OVF 0x2000 /**< page has dirty overflow nodes */
#define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */
#define P_KEEP 0x8000 /**< leave this page alone during spill */
/** Persistent flags for page administration rather than page contents */
#define P_ADM_FLAGS (P_DIRTY)
#define P_ADM_FLAGS 0 /* later... */
/** @} */
uint16_t mh_flags; /**< @ref mdb_page */
#define mp_lower mp_pb.pb.pb_lower
@ -1058,6 +1060,8 @@ typedef struct MDB_page {
/** Test if a page is a sub page */
#define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP)
/** Test if (this non-sub page is dirty && env is non-#MDB_WRITEMAP) */
#define IS_DIRTY_NW(txn, p) ((p)->mp_txnid > (txn)->mt_txnid)
/** Test if this non-sub page belongs to the current snapshot */
#define IS_MUTABLE(txn, p) ((p)->mp_txnid >= (txn)->mt_txnid)
/** Test if this non-sub page is writable in this txn (not an ancestor) */
@ -1331,7 +1335,7 @@ struct MDB_txn {
/** Written to mp_txnid of dirty pages, to be fixed by #mdb_page_flush().
*
* Value >= 1 + (parent ? parent.last_workid : txnid).
* Value >= (WRITEMAP ? txnid : 1 + (parent ? parent.last_workid : txnid)).
* See #MDB_page.%mp_txnid.
*
* An MDB_txn can write to a page when page.mp_txnid >= txn.mt_workid.
@ -1358,10 +1362,11 @@ struct MDB_txn {
/** The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are
* shifted left by 1, deleted slots have the LSB set.
* Unused with #MDB_WRITEMAP, which does not use a dirty list.
*/
MDB_IDL mt_spill_pgs;
union {
/** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */
/** For write txns: Modified pages, sorted. Unused when MDB_WRITEMAP. */
MDB_ID2L dirty_list;
/** For read txns: This thread/txn's reader table slot, or NULL. */
MDB_reader *reader;
@ -1434,6 +1439,7 @@ struct MDB_txn {
* Includes ancestor txns' dirty pages not hidden by other txns'
* dirty/spilled pages. Thus commit(nested txn) has room to merge
* dirty_list into mt_parent after freeing hidden mt_parent pages.
* When #MDB_WRITEMAP, it is nonzero but otherwise irrelevant.
*/
unsigned int mt_dirty_room;
};
@ -1586,7 +1592,9 @@ struct MDB_env {
MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */
/** IDL of pages that became unused in a write txn */
MDB_IDL me_free_pgs;
/** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */
/** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE.
* Unused except for a dummy element when #MDB_WRITEMAP.
*/
MDB_ID2L me_dirty_list;
/** Max number of freelist items that can fit in a single overflow page */
int me_maxfree_1pg;
@ -1919,7 +1927,7 @@ void
mdb_page_list(MDB_page *mp)
{
pgno_t pgno = mdb_dbg_pgno(mp);
const char *type, *state = (mp->mp_flags & P_DIRTY) ? ", dirty" : "";
const char *type;
MDB_node *node;
unsigned int i, nkeys, nsize, total = 0;
MDB_val key;
@ -1932,8 +1940,7 @@ mdb_page_list(MDB_page *mp)
case P_LEAF|P_LEAF2: type = "LEAF2 page"; break;
case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break;
case P_OVERFLOW:
fprintf(stderr, "Overflow page %"Yu" pages %u%s\n",
pgno, mp->mp_pages, state);
fprintf(stderr, "Overflow page %"Yu" pages %u\n", pgno, mp->mp_pages);
return;
case P_META:
fprintf(stderr, "Meta-page %"Yu" txnid %"Yu"\n",
@ -1945,7 +1952,7 @@ mdb_page_list(MDB_page *mp)
}
nkeys = NUMKEYS(mp);
fprintf(stderr, "%s %"Yu" numkeys %d%s\n", type, pgno, nkeys, state);
fprintf(stderr, "%s %"Yu" numkeys %d\n", type, pgno, nkeys);
for (i=0; i<nkeys; i++) {
if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */
@ -2188,7 +2195,7 @@ mdb_page_unref(MDB_txn *txn, MDB_page *mp)
pgno_t pgno;
MDB_ID3L tl = txn->mt_rpages;
unsigned x, rem;
if (mp->mp_flags & (P_SUBP|P_DIRTY))
if (IS_SUBP(mp) || IS_DIRTY_NW(txn, mp))
return;
rem = mp->mp_pgno & (MDB_RPAGE_CHUNK-1);
pgno = mp->mp_pgno ^ rem;
@ -2264,14 +2271,14 @@ mdb_page_loose(MDB_cursor *mc, MDB_page *mp)
/** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
* @param[in] mc A cursor handle for the current operation.
* @param[in] pflags Flags of the pages to update:
* P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it.
* 0 to set P_KEEP, P_KEEP to clear it.
* @param[in] all No shortcuts. Needed except after a full #mdb_page_flush().
* @return 0 on success, non-zero on failure.
*/
static int
mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
{
enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP };
enum { Mask = P_SUBP|P_LOOSE|P_KEEP };
MDB_txn *txn = mc->mc_txn;
MDB_cursor *m3, *m0 = mc;
MDB_xcursor *mx;
@ -2288,6 +2295,7 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
for (j=0; j<m3->mc_snum; j++) {
mp = m3->mc_pg[j];
if ((mp->mp_flags & Mask) == pflags)
if (IS_DIRTY_NW(txn, mp))
mp->mp_flags ^= P_KEEP;
}
mx = m3->mc_xcursor;
@ -2343,7 +2351,7 @@ static int mdb_page_flush(MDB_txn *txn, int keep);
*
* Otherwise, if not using nested txns, it is expected that apps will
* not run into #MDB_TXN_FULL any more. The pages are flushed to disk
* the same way as for a txn commit, e.g. their P_DIRTY flag is cleared.
* the same way as for a txn commit.
* If the txn never references them again, they can be left alone.
* If the txn only reads them, they can be used without any fuss.
* If the txn writes them again, they can be dirtied immediately without
@ -2372,7 +2380,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data)
unsigned int i, j, need;
int rc;
if (m0->mc_flags & C_SUB)
if (m0->mc_flags & (C_SUB|C_WRITEMAP))
return MDB_SUCCESS;
/* Estimate how much space this op will take */
@ -2406,7 +2414,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data)
}
/* Preserve pages which may soon be dirtied again */
if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS)
if ((rc = mdb_pages_xkeep(m0, 0, 1)) != MDB_SUCCESS)
goto done;
/* Less aggressive spill - we originally spilled the entire dirty list,
@ -2454,7 +2462,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data)
goto done;
/* Reset any dirty pages we kept that page_flush didn't see */
rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i);
rc = mdb_pages_xkeep(m0, P_KEEP, i);
done:
txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS;
@ -2480,23 +2488,20 @@ mdb_find_oldest(MDB_txn *txn)
return oldest;
}
/** Mark a page as dirty and add it to the txn's dirty list */
/** Add a page to the txn's dirty list, if there is one */
static void
mdb_page_dirty(MDB_txn *txn, MDB_page *mp)
{
MDB_ID2 mid;
int rc, (*insert)(MDB_ID2L, MDB_ID2 *);
mp->mp_flags |= P_DIRTY;
int rc;
if (txn->mt_flags & MDB_TXN_WRITEMAP) {
insert = mdb_mid2l_append;
} else {
insert = mdb_mid2l_insert;
txn->mt_flags |= MDB_TXN_DIRTY;
return;
}
mid.mid = mp->mp_pgno;
mid.mptr = mp;
rc = insert(txn->mt_u.dirty_list, &mid);
rc = mdb_mid2l_insert(txn->mt_u.dirty_list, &mid);
mdb_tassert(txn, rc == 0);
txn->mt_dirty_room--;
}
@ -2806,9 +2811,7 @@ mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret)
num = mp->mp_pages;
else
num = 1;
if (env->me_flags & MDB_WRITEMAP) {
np = mp;
} else {
{
np = mdb_page_malloc(txn, num, 1);
if (!np)
return ENOMEM;
@ -2873,7 +2876,7 @@ mdb_page_touch(MDB_cursor *mc)
} else {
mc->mc_db->md_root = pgno;
}
} else if (!F_ISSET(mp->mp_flags, P_DIRTY)) {
} else if (!IS_DIRTY_NW(txn, mp)) {
rc = mdb_page_unspill(txn, mp, &np);
if (rc)
goto fail;
@ -2896,7 +2899,6 @@ mdb_page_touch(MDB_cursor *mc)
mid.mptr = np;
rc = mdb_mid2l_insert(dl, &mid);
mdb_cassert(mc, rc == 0);
np->mp_flags |= P_DIRTY;
}
np_flags = np->mp_flags; /* P_ADM_FLAGS */
@ -3201,9 +3203,14 @@ mdb_txn_renew0(MDB_txn *txn)
txn->mt_child = NULL;
txn->mt_loose_pgs = NULL;
txn->mt_loose_count = 0;
txn->mt_workid = (txn->mt_txnid | MDB_PGTXNID_FLAGMASK) + 1;
if (env->me_flags & MDB_WRITEMAP) {
txn->mt_workid = txn->mt_txnid;
txn->mt_dirty_room = 1;
} else {
txn->mt_workid = (txn->mt_txnid | MDB_PGTXNID_FLAGMASK) + 1;
txn->mt_dirty_room = MDB_IDL_UM_MAX;
}
txn->mt_last_workid = txn->mt_workid;
txn->mt_dirty_room = MDB_IDL_UM_MAX;
txn->mt_u.dirty_list = env->me_dirty_list;
txn->mt_u.dirty_list[0].mid = 0;
txn->mt_free_pgs = env->me_free_pgs;
@ -3845,18 +3852,6 @@ mdb_page_flush(MDB_txn *txn, int keep)
j = i = keep;
if (env->me_flags & MDB_WRITEMAP) {
/* Mark the pages as clean */
while (++i <= pagecount) {
dp = dl[i].mptr;
/* Don't flush this page yet */
if (dp->mp_flags & (P_LOOSE|P_KEEP)) {
dp->mp_flags &= ~P_KEEP;
dl[++j] = dl[i];
continue;
}
dp->mp_txnid = txn->mt_txnid;
dp->mp_flags &= ~P_DIRTY;
}
goto done;
}
@ -3873,7 +3868,6 @@ mdb_page_flush(MDB_txn *txn, int keep)
pgno = dl[i].mid;
/* Mark the page as clean */
dp->mp_txnid = txn->mt_txnid;
dp->mp_flags &= ~P_DIRTY;
pos = pgno * psize;
size = psize;
#if MDB_RPAGE_CACHE
@ -4023,11 +4017,11 @@ retry_seek:
}
mdb_dpage_free(env, dp);
}
done:
i--;
txn->mt_dirty_room += i - j;
dl[0].mid = j;
done:
return MDB_SUCCESS;
}
@ -5765,8 +5759,10 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
/* silently ignore WRITEMAP when we're only getting read access */
flags &= ~MDB_WRITEMAP;
} else {
/* WRITEMAP has a dummy element to match dirty_room = 1 */
size_t dl_size = (flags & MDB_WRITEMAP) ? 2 : MDB_IDL_UM_SIZE;
if (!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) &&
(env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2)))))
(env->me_dirty_list = calloc(dl_size, sizeof(MDB_ID2)))))
rc = ENOMEM;
}
@ -6956,18 +6952,19 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp)
rc = mdb_midl_need(&env->me_pghead, ovpages);
if (rc)
return rc;
if (!(mp->mp_flags & P_DIRTY)) {
MDB_IDL sl = txn->mt_spill_pgs;
if (sl)
x = mdb_midl_search(sl, pn);
if (! (sl && x <= sl[0] && sl[x] == pn))
if (!IS_DIRTY_NW(txn, mp)) { /* spilled or WRITEMAP */
MDB_IDL sl = txn->mt_spill_pgs;
if (sl) {
x = mdb_midl_search(sl, pn);
if (! (x <= sl[0] && sl[x] == pn))
return MDB_PROBLEM;
/* This page is no longer spilled */
if (x == sl[0])
sl[0]--;
else
sl[x] |= 1;
goto release;
}
goto release;
}
/* Remove from dirty list */
dl = txn->mt_u.dirty_list;
@ -8037,7 +8034,7 @@ more:
dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size);
/* Make sub-page header for the dup items, with dummy body */
fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
fp->mp_flags = P_LEAF|P_SUBP;
fp->mp_lower = (PAGEHDRSZ-PAGEBASE);
xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
if (mc->mc_db->md_flags & MDB_DUPFIXED) {
@ -8153,7 +8150,7 @@ current:
* is smaller than the overflow threshold.
*/
if (!IS_WRITABLE(mc->mc_txn, omp)) {
if (!(omp->mp_flags & P_DIRTY)) {
if (!IS_DIRTY_NW(mc->mc_txn, omp)) {
rc = mdb_page_unspill(mc->mc_txn, omp, &omp);
if (rc)
return rc;

Loading…
Cancel
Save