From 1b8bfc575694fe4ee4e40eefd444873da5655086 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Wed, 23 Jan 2013 15:47:35 +0100 Subject: [PATCH] Freelist cleanup/streamlining Drop unneeded definitions, redundant code. --- libraries/liblmdb/mdb.c | 202 +++++++++++++++++----------------------- 1 file changed, 83 insertions(+), 119 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 68f6083..ef41458 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -911,18 +911,6 @@ typedef struct MDB_xcursor { unsigned char mx_dbflag; } MDB_xcursor; - /** A set of pages freed by an earlier transaction. */ -typedef struct MDB_oldpages { - /** Usually we only read one record from the FREEDB at a time, but - * in case we read more, this will chain them together. - */ - struct MDB_oldpages *mo_next; - /** The ID of the transaction in which these pages were freed. */ - txnid_t mo_txnid; - /** An #MDB_IDL of the pages */ - pgno_t mo_pages[1]; /* dynamic */ -} MDB_oldpages; - /** The database environment. */ struct MDB_env { HANDLE me_fd; /**< The main data file */ @@ -949,12 +937,10 @@ struct MDB_env { size_t me_mapsize; /**< size of the data memory map */ off_t me_size; /**< current file size */ pgno_t me_maxpg; /**< me_mapsize / me_psize */ - txnid_t me_pgfirst; /**< ID of first old page record we used */ txnid_t me_pglast; /**< ID of last old page record we used */ MDB_dbx *me_dbxs; /**< array of static DB info */ uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ - MDB_oldpages *me_pghead; /**< list of old page records */ - MDB_oldpages *me_pgfree; /**< list of page records to free */ + pgno_t *me_pghead; /**< old pages reclaimed from freelist */ pthread_key_t me_txkey; /**< thread-key for readers */ MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ /** IDL of pages that became unused in a write txn */ @@ -1287,7 +1273,6 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) * after txn 3 commits, and so will be safe to re-use in txn 4. */ if (txn->mt_txnid > 3) { - if (!txn->mt_env->me_pghead && txn->mt_dbs[FREE_DBI].md_root != P_INVALID) { /* See if there's anything in the free DB */ @@ -1298,7 +1283,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) txnid_t *kptr; mdb_cursor_init(&m2, txn, FREE_DBI, NULL); - if (!txn->mt_env->me_pgfirst) { + if (!txn->mt_env->me_pglast) { mdb_page_search(&m2, NULL, 0); leaf = NODEPTR(m2.mc_pg[m2.mc_top], 0); kptr = (txnid_t *)NODEKEY(leaf); @@ -1335,10 +1320,9 @@ again: if (oldest > last) { /* It's usable, grab it. */ - MDB_oldpages *mop; - pgno_t *idl; + pgno_t *idl, *mop; - if (!txn->mt_env->me_pgfirst) { + if (!txn->mt_env->me_pglast) { mdb_node_read(txn, leaf, &data); } idl = (MDB_ID *) data.mv_data; @@ -1347,26 +1331,20 @@ again: */ if (!idl[0]) { txn->mt_env->me_pglast = last; - if (!txn->mt_env->me_pgfirst) - txn->mt_env->me_pgfirst = last; goto again; } - mop = malloc(sizeof(MDB_oldpages) + MDB_IDL_SIZEOF(idl) - sizeof(pgno_t)); + mop = malloc(MDB_IDL_SIZEOF(idl)); if (!mop) return ENOMEM; - mop->mo_next = txn->mt_env->me_pghead; - mop->mo_txnid = last; txn->mt_env->me_pglast = last; - if (!txn->mt_env->me_pgfirst) - txn->mt_env->me_pgfirst = last; txn->mt_env->me_pghead = mop; - memcpy(mop->mo_pages, idl, MDB_IDL_SIZEOF(idl)); + memcpy(mop, idl, MDB_IDL_SIZEOF(idl)); #if MDB_DEBUG > 1 { unsigned int i; DPRINTF("IDL read txn %zu root %zu num %zu", - mop->mo_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]); + last, txn->mt_dbs[FREE_DBI].md_root, idl[0]); for (i=0; imt_env->me_pghead) { - MDB_oldpages *mop = txn->mt_env->me_pghead; + pgno_t *mop = txn->mt_env->me_pghead; if (num > 1) { MDB_cursor m2; int retry = 500, readit = 0, n2 = num-1; unsigned int i, j, k; /* If current list is too short, must fetch more and coalesce */ - if (mop->mo_pages[0] < (unsigned)num) + if (mop[0] < (unsigned)num) readit = 1; mdb_cursor_init(&m2, txn, FREE_DBI, NULL); @@ -1398,11 +1376,10 @@ none: } if (readit) { MDB_val key, data; - MDB_oldpages *mop2; - pgno_t *idl; + pgno_t *idl, *mop2; int exact; - last = mop->mo_txnid + 1; + last = txn->mt_env->me_pglast + 1; /* We haven't hit the readers list yet? */ if (!oldest) { @@ -1432,39 +1409,37 @@ none: if (rc) return rc; idl = (MDB_ID *) data.mv_data; - mop2 = malloc(sizeof(MDB_oldpages) + MDB_IDL_SIZEOF(idl) - 2*sizeof(pgno_t) + MDB_IDL_SIZEOF(mop->mo_pages)); + mop2 = malloc(MDB_IDL_SIZEOF(idl) + MDB_IDL_SIZEOF(mop)); if (!mop2) return ENOMEM; /* merge in sorted order */ - i = idl[0]; j = mop->mo_pages[0]; mop2->mo_pages[0] = k = i+j; - mop->mo_pages[0] = P_INVALID; + i = idl[0]; j = mop[0]; mop2[0] = k = i+j; + mop[0] = P_INVALID; while (i>0 || j>0) { - if (i && idl[i] < mop->mo_pages[j]) - mop2->mo_pages[k--] = idl[i--]; + if (i && idl[i] < mop[j]) + mop2[k--] = idl[i--]; else - mop2->mo_pages[k--] = mop->mo_pages[j--]; + mop2[k--] = mop[j--]; } txn->mt_env->me_pglast = last; - mop2->mo_txnid = last; - mop2->mo_next = mop->mo_next; txn->mt_env->me_pghead = mop2; free(mop); mop = mop2; /* Keep trying to read until we have enough */ - if (mop->mo_pages[0] < (unsigned)num) { + if (mop[0] < (unsigned)num) { continue; } } /* current list has enough pages, but are they contiguous? */ - for (i=mop->mo_pages[0]; i>=(unsigned)num; i--) { - if (mop->mo_pages[i-n2] == mop->mo_pages[i] + n2) { - pgno = mop->mo_pages[i]; + for (i=mop[0]; i>=(unsigned)num; i--) { + if (mop[i-n2] == mop[i] + n2) { + pgno = mop[i]; i -= n2; /* move any stragglers down */ - for (j=i+num; j<=mop->mo_pages[0]; j++) - mop->mo_pages[i++] = mop->mo_pages[j]; - mop->mo_pages[0] -= num; + for (j=i+num; j<=mop[0]; j++) + mop[i++] = mop[j]; + mop[0] -= num; break; } } @@ -1478,17 +1453,12 @@ none: } while (1); } else { /* peel pages off tail, so we only have to truncate the list */ - pgno = MDB_IDL_LAST(mop->mo_pages); - mop->mo_pages[0]--; + pgno = MDB_IDL_LAST(mop); + mop[0]--; } - if (MDB_IDL_IS_ZERO(mop->mo_pages)) { - txn->mt_env->me_pghead = mop->mo_next; - if (mc->mc_dbi == FREE_DBI) { - mop->mo_next = txn->mt_env->me_pgfree; - txn->mt_env->me_pgfree = mop; - } else { - free(mop); - } + if (MDB_IDL_IS_ZERO(mop)) { + txn->mt_env->me_pghead = NULL; + free(mop); } } } @@ -1961,7 +1931,7 @@ mdb_txn_reset0(MDB_txn *txn) if (!(env->me_flags & MDB_ROFS)) txn->mt_u.reader->mr_txnid = (txnid_t)-1; } else { - MDB_oldpages *mop; + pgno_t *mop; MDB_page *dp; unsigned int i; @@ -2001,11 +1971,10 @@ mdb_txn_reset0(MDB_txn *txn) env->me_free_pgs = txn->mt_free_pgs; } - while ((mop = txn->mt_env->me_pghead)) { - txn->mt_env->me_pghead = mop->mo_next; + if ((mop = txn->mt_env->me_pghead) != NULL) { + txn->mt_env->me_pghead = NULL; free(mop); } - txn->mt_env->me_pgfirst = 0; txn->mt_env->me_pglast = 0; env->me_txn = NULL; @@ -2054,6 +2023,7 @@ mdb_txn_commit(MDB_txn *txn) MDB_page *dp; MDB_env *env; pgno_t next, freecnt; + txnid_t oldpg_txnid, id; MDB_cursor mc; assert(txn != NULL); @@ -2165,10 +2135,21 @@ mdb_txn_commit(MDB_txn *txn) } } + /* Save the freelist as of this transaction to the freeDB. This + * can change the freelist, so keep trying until it stabilizes. + * + * env->me_pglast and the length of txn->mt_free_pgs cannot decrease. + * Page numbers cannot disappear from txn->mt_free_pgs. New pages + * can only appear in env->me_pghead when env->me_pglast increases. + * Until then, the me_pghead pointer won't move but can become NULL. + */ + mdb_cursor_init(&mc, txn, FREE_DBI, NULL); + oldpg_txnid = id = 0; + freecnt = 0; /* should only be one record now */ - if (env->me_pghead || env->me_pgfirst) { + if (env->me_pghead || env->me_pglast) { /* make sure first page of freeDB is touched and on freelist */ rc = mdb_page_search(&mc, NULL, MDB_PS_MODIFY); if (rc && rc != MDB_NOTFOUND) { @@ -2179,28 +2160,27 @@ fail: } /* Delete IDLs we used from the free list */ - if (env->me_pgfirst) { - txnid_t cur; + if (env->me_pglast) { MDB_val key; - int exact = 0; - - key.mv_size = sizeof(cur); - for (cur = env->me_pgfirst; cur <= env->me_pglast; cur++) { - key.mv_data = &cur; - mdb_cursor_set(&mc, &key, NULL, MDB_SET, &exact); + do { +free_pgfirst: + rc = mdb_cursor_first(&mc, &key, NULL); + if (rc) + goto fail; + oldpg_txnid = *(txnid_t *)key.mv_data; +again: + assert(oldpg_txnid <= env->me_pglast); + id = 0; rc = mdb_cursor_del(&mc, 0); if (rc) goto fail; - } - env->me_pgfirst = 0; - env->me_pglast = 0; + } while (oldpg_txnid < env->me_pglast); } - /* save to free list */ + /* Save IDL of pages freed by this txn, to freeDB */ free2: - freecnt = txn->mt_free_pgs[0]; - if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) { + if (freecnt != txn->mt_free_pgs[0]) { MDB_val key, data; /* make sure last page of freeDB is touched and on freelist */ @@ -2225,61 +2205,50 @@ free2: /* write to last page of freeDB */ key.mv_size = sizeof(pgno_t); key.mv_data = &txn->mt_txnid; - data.mv_data = txn->mt_free_pgs; /* The free list can still grow during this call, - * despite the pre-emptive touches above. So check - * and make sure the entire thing got written. + * despite the pre-emptive touches above. So retry + * until the reserved space remains big enough. */ do { + assert(freecnt < txn->mt_free_pgs[0]); freecnt = txn->mt_free_pgs[0]; data.mv_size = MDB_IDL_SIZEOF(txn->mt_free_pgs); - mdb_midl_sort(txn->mt_free_pgs); - rc = mdb_cursor_put(&mc, &key, &data, 0); + rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); if (rc) goto fail; } while (freecnt != txn->mt_free_pgs[0]); + mdb_midl_sort(txn->mt_free_pgs); + memcpy(data.mv_data, txn->mt_free_pgs, data.mv_size); + if (oldpg_txnid < env->me_pglast || (!env->me_pghead && id)) + goto free_pgfirst; /* used up freeDB[oldpg_txnid] */ } - /* should only be one record now */ -again: + + /* Put back page numbers we took from freeDB but did not use */ if (env->me_pghead) { MDB_val key, data; - MDB_oldpages *mop; - pgno_t orig; - txnid_t id; + pgno_t orig, *mop; mop = env->me_pghead; - id = mop->mo_txnid; + id = env->me_pglast; key.mv_size = sizeof(id); key.mv_data = &id; - data.mv_size = MDB_IDL_SIZEOF(mop->mo_pages); - data.mv_data = mop->mo_pages; - orig = mop->mo_pages[0]; /* These steps may grow the freelist again * due to freed overflow pages... */ - rc = mdb_cursor_put(&mc, &key, &data, 0); - if (rc) - goto fail; - if (mop == env->me_pghead && env->me_pghead->mo_txnid == id) { - /* could have been used again here */ - if (mop->mo_pages[0] != orig) { - data.mv_size = MDB_IDL_SIZEOF(mop->mo_pages); - data.mv_data = mop->mo_pages; - id = mop->mo_txnid; - rc = mdb_cursor_put(&mc, &key, &data, 0); - if (rc) - goto fail; - } - } else { - /* was completely used up */ - rc = mdb_cursor_del(&mc, 0); + i = 2; + do { + orig = mop[0]; + data.mv_size = MDB_IDL_SIZEOF(mop); + rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); if (rc) goto fail; - if (env->me_pghead) - goto again; - } - env->me_pgfirst = 0; - env->me_pglast = 0; + assert(!env->me_pghead || env->me_pglast); + /* mop could have been used again here */ + if (id != env->me_pglast || env->me_pghead == NULL) + goto again; /* was completely used up */ + assert(mop == env->me_pghead && mop[0] <= orig); + } while (mop[0] != orig && --i); + memcpy(data.mv_data, mop, data.mv_size); } /* Check for growth of freelist again */ @@ -2291,12 +2260,6 @@ again: env->me_pghead = NULL; } - while (env->me_pgfree) { - MDB_oldpages *mop = env->me_pgfree; - env->me_pgfree = mop->mo_next; - free(mop); - } - if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) { if (mdb_midl_shrink(&txn->mt_free_pgs)) env->me_free_pgs = txn->mt_free_pgs; @@ -2431,6 +2394,7 @@ sync: } done: + env->me_pglast = 0; env->me_txn = NULL; if (txn->mt_numdbs > env->me_numdbs) { /* update the DB flags */