Rearrange MDB dirty page code.

Split out mdb_dpage_free(), mdb_page_flush() and clean up.
vmware
Hallvard Furuseth 12 years ago
parent c6f9323b3d
commit 9e6ef6c88f
  1. 265
      libraries/liblmdb/mdb.c

@ -1289,16 +1289,10 @@ mdb_page_free(MDB_env *env, MDB_page *mp)
env->me_dpages = mp; env->me_dpages = mp;
} }
/* Return all dirty pages to dpage list */ /* Free a dirty page */
static void static void
mdb_dlist_free(MDB_txn *txn) mdb_dpage_free(MDB_env *env, MDB_page *dp)
{ {
MDB_env *env = txn->mt_env;
MDB_ID2L dl = txn->mt_u.dirty_list;
unsigned i, n = dl[0].mid;
for (i = 1; i <= n; i++) {
MDB_page *dp = dl[i].mptr;
if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) {
mdb_page_free(env, dp); mdb_page_free(env, dp);
} else { } else {
@ -1307,6 +1301,18 @@ mdb_dlist_free(MDB_txn *txn)
free(dp); free(dp);
} }
} }
/* Return all dirty pages to dpage list */
static void
mdb_dlist_free(MDB_txn *txn)
{
MDB_env *env = txn->mt_env;
MDB_ID2L dl = txn->mt_u.dirty_list;
unsigned i, n = dl[0].mid;
for (i = 1; i <= n; i++) {
mdb_dpage_free(env, dl[i].mptr);
}
dl[0].mid = 0; dl[0].mid = 0;
} }
@ -2191,16 +2197,111 @@ mdb_freelist_save(MDB_txn *txn)
return rc; return rc;
} }
/** Flush dirty pages to the map, after clearing their dirty flag.
 *
 * Walks the transaction's dirty list (txn->mt_u.dirty_list; slot 0 holds
 * the page count, slots 1..count hold the page number and page pointer).
 *
 * With MDB_WRITEMAP set in the environment flags nothing is written here:
 * the P_DIRTY flag is cleared on every listed page, the list is emptied,
 * and the pages themselves are not freed.
 *
 * Otherwise every page is written to env->me_fd at offset pgno * psize
 * (overflow pages span mp_pages pages), and on success the dirty pages
 * are released with mdb_dlist_free().
 *
 * @param[in] txn the transaction whose dirty pages are flushed.
 * @return MDB_SUCCESS on success, otherwise a non-zero error code: the
 * value of ErrCode() for a failed system call, or EIO for a short write.
 * On error the dirty list is left as it was (not freed); flags of pages
 * already visited stay cleared.
 */
static int
mdb_page_flush(MDB_txn *txn)
{
	MDB_env		*env = txn->mt_env;
	MDB_ID2L	dl = txn->mt_u.dirty_list;
	unsigned	psize = env->me_psize;
	int			i, pagecount = dl[0].mid, rc;
	size_t		size = 0, pos = 0;
	pgno_t		pgno = 0;
	MDB_page	*dp = NULL;
#ifdef _WIN32
	OVERLAPPED	ov;
	memset(&ov, 0, sizeof(ov));
#else
	struct iovec iov[MDB_COMMIT_PAGES];
	ssize_t		wsize = 0, wres;
	off_t		wpos = 0;		/* file offset of the first page in iov[] */
	size_t		next_pos = 1;	/* impossible pos, so pos != next_pos */
	int			n = 0;			/* number of entries queued in iov[] */
#endif

	if (env->me_flags & MDB_WRITEMAP) {
		/* Clear dirty flags */
		for (i = pagecount; i; i--) {
			dp = dl[i].mptr;
			dp->mp_flags &= ~P_DIRTY;
		}
		dl[0].mid = 0;
		return MDB_SUCCESS;
	}

	/* Write the pages.  The loop deliberately runs one step past the
	 * last page (i == pagecount + 1) so that the non-Windows branch can
	 * flush whatever is still queued in iov[] before breaking out.
	 */
	for (i = 1;; i++) {
		if (i <= pagecount) {
			dp = dl[i].mptr;
			pgno = dl[i].mid;
			/* clear dirty flag */
			dp->mp_flags &= ~P_DIRTY;
			pos = pgno * psize;
			size = psize;
			if (IS_OVERFLOW(dp)) size *= dp->mp_pages;
		}
#ifdef _WIN32
		else break;

		/* Windows actually supports scatter/gather I/O, but only on
		 * unbuffered file handles. Since we're relying on the OS page
		 * cache for all our data, that's self-defeating. So we just
		 * write pages one at a time. We use the ov structure to set
		 * the write offset, to at least save the overhead of a Seek
		 * system call.
		 */
		DPRINTF("committing page %zu", pgno);
		ov.Offset = pos & 0xffffffff;
		/* Two 16-bit shifts: a single shift by 32 would be undefined
		 * when pos is only 32 bits wide.
		 */
		ov.OffsetHigh = pos >> 16;
		ov.OffsetHigh >>= 16;
		if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) {
			rc = ErrCode();
			DPRINTF("WriteFile: %d", rc);
			return rc;
		}
#else
		/* Write up to MDB_COMMIT_PAGES dirty pages at a time.
		 * Pages are batched while they are contiguous in the file;
		 * a gap, a full iov[], or the end of the list flushes the batch.
		 */
		if (pos != next_pos || n == MDB_COMMIT_PAGES) {
			if (n) {
				/* Write previous page(s) */
				if (lseek(env->me_fd, wpos, SEEK_SET) == (off_t)-1) {
					/* Writing after a failed seek would put the
					 * pages at the wrong offset.
					 */
					rc = ErrCode();
					DPRINTF("lseek: %s", strerror(rc));
					return rc;
				}
				wres = writev(env->me_fd, iov, n);
				if (wres != wsize) {
					if (wres < 0) {
						rc = ErrCode();
						DPRINTF("writev: %s", strerror(rc));
					} else {
						/* A short write does not set errno, so
						 * ErrCode() could be stale or even 0 here,
						 * which the caller would take for success.
						 */
						rc = EIO;
						DPUTS("short write, filesystem full?");
					}
					return rc;
				}
				n = 0;
			}
			if (i > pagecount)
				break;
			wpos = (off_t)pos;
			wsize = 0;
		}
		DPRINTF("committing page %zu", pgno);
		next_pos = pos + size;
		iov[n].iov_len = size;
		iov[n].iov_base = (char *)dp;
		wsize += size;
		n++;
#endif	/* _WIN32 */
	}

	mdb_dlist_free(txn);

	return MDB_SUCCESS;
}
int int
mdb_txn_commit(MDB_txn *txn) mdb_txn_commit(MDB_txn *txn)
{ {
int n, done; int rc;
unsigned int i; unsigned int i;
ssize_t rc;
off_t size;
MDB_page *dp;
MDB_env *env; MDB_env *env;
pgno_t next;
assert(txn != NULL); assert(txn != NULL);
assert(txn->mt_env != NULL); assert(txn->mt_env != NULL);
@ -2208,10 +2309,8 @@ mdb_txn_commit(MDB_txn *txn)
if (txn->mt_child) { if (txn->mt_child) {
rc = mdb_txn_commit(txn->mt_child); rc = mdb_txn_commit(txn->mt_child);
txn->mt_child = NULL; txn->mt_child = NULL;
if (rc) { if (rc)
mdb_txn_abort(txn); goto fail;
return rc;
}
} }
env = txn->mt_env; env = txn->mt_env;
@ -2227,8 +2326,8 @@ mdb_txn_commit(MDB_txn *txn)
DPUTS("error flag is set, can't commit"); DPUTS("error flag is set, can't commit");
if (txn->mt_parent) if (txn->mt_parent)
txn->mt_parent->mt_flags |= MDB_TXN_ERROR; txn->mt_parent->mt_flags |= MDB_TXN_ERROR;
mdb_txn_abort(txn); rc = EINVAL;
return EINVAL; goto fail;
} }
if (txn->mt_parent) { if (txn->mt_parent) {
@ -2237,10 +2336,9 @@ mdb_txn_commit(MDB_txn *txn)
MDB_ID2L dst, src; MDB_ID2L dst, src;
/* Append our free list to parent's */ /* Append our free list to parent's */
if (mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs)) { rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs);
mdb_txn_abort(txn); if (rc)
return ENOMEM; goto fail;
}
mdb_midl_free(txn->mt_free_pgs); mdb_midl_free(txn->mt_free_pgs);
parent->mt_next_pgno = txn->mt_next_pgno; parent->mt_next_pgno = txn->mt_next_pgno;
@ -2302,8 +2400,8 @@ mdb_txn_commit(MDB_txn *txn)
if (txn != env->me_txn) { if (txn != env->me_txn) {
DPUTS("attempt to commit unknown transaction"); DPUTS("attempt to commit unknown transaction");
mdb_txn_abort(txn); rc = EINVAL;
return EINVAL; goto fail;
} }
mdb_cursors_close(txn, 0); mdb_cursors_close(txn, 0);
@ -2338,125 +2436,17 @@ mdb_txn_commit(MDB_txn *txn)
mdb_midl_free(env->me_pghead); mdb_midl_free(env->me_pghead);
env->me_pghead = NULL; env->me_pghead = NULL;
if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) {
if (mdb_midl_shrink(&txn->mt_free_pgs)) if (mdb_midl_shrink(&txn->mt_free_pgs))
env->me_free_pgs = txn->mt_free_pgs; env->me_free_pgs = txn->mt_free_pgs;
}
#if MDB_DEBUG > 2 #if MDB_DEBUG > 2
mdb_audit(txn); mdb_audit(txn);
#endif #endif
if (env->me_flags & MDB_WRITEMAP) { if ((rc = mdb_page_flush(txn)) ||
for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++) { (rc = mdb_env_sync(env, 0)) ||
dp = txn->mt_u.dirty_list[i].mptr; (rc = mdb_env_write_meta(txn)))
/* clear dirty flag */ goto fail;
dp->mp_flags &= ~P_DIRTY;
}
txn->mt_u.dirty_list[0].mid = 0;
goto sync;
}
/* Commit up to MDB_COMMIT_PAGES dirty pages to disk until done.
*/
next = 0;
i = 1;
do {
#ifdef _WIN32
/* Windows actually supports scatter/gather I/O, but only on
* unbuffered file handles. Since we're relying on the OS page
* cache for all our data, that's self-defeating. So we just
* write pages one at a time. We use the ov structure to set
* the write offset, to at least save the overhead of a Seek
* system call.
*/
OVERLAPPED ov;
memset(&ov, 0, sizeof(ov));
for (; i<=txn->mt_u.dirty_list[0].mid; i++) {
size_t wsize;
dp = txn->mt_u.dirty_list[i].mptr;
DPRINTF("committing page %zu", dp->mp_pgno);
size = dp->mp_pgno * env->me_psize;
ov.Offset = size & 0xffffffff;
ov.OffsetHigh = size >> 16;
ov.OffsetHigh >>= 16;
/* clear dirty flag */
dp->mp_flags &= ~P_DIRTY;
wsize = env->me_psize;
if (IS_OVERFLOW(dp)) wsize *= dp->mp_pages;
rc = WriteFile(env->me_fd, dp, wsize, NULL, &ov);
if (!rc) {
n = ErrCode();
DPRINTF("WriteFile: %d", n);
mdb_txn_abort(txn);
return n;
}
}
done = 1;
#else
struct iovec iov[MDB_COMMIT_PAGES];
n = 0;
done = 1;
size = 0;
for (; i<=txn->mt_u.dirty_list[0].mid; i++) {
dp = txn->mt_u.dirty_list[i].mptr;
if (dp->mp_pgno != next) {
if (n) {
rc = writev(env->me_fd, iov, n);
if (rc != size) {
n = ErrCode();
if (rc > 0)
DPUTS("short write, filesystem full?");
else
DPRINTF("writev: %s", strerror(n));
mdb_txn_abort(txn);
return n;
}
n = 0;
size = 0;
}
lseek(env->me_fd, dp->mp_pgno * env->me_psize, SEEK_SET);
next = dp->mp_pgno;
}
DPRINTF("committing page %zu", dp->mp_pgno);
iov[n].iov_len = env->me_psize;
if (IS_OVERFLOW(dp)) iov[n].iov_len *= dp->mp_pages;
iov[n].iov_base = (char *)dp;
size += iov[n].iov_len;
next = dp->mp_pgno + (IS_OVERFLOW(dp) ? dp->mp_pages : 1);
/* clear dirty flag */
dp->mp_flags &= ~P_DIRTY;
if (++n >= MDB_COMMIT_PAGES) {
done = 0;
i++;
break;
}
}
if (n == 0)
break;
rc = writev(env->me_fd, iov, n);
if (rc != size) {
n = ErrCode();
if (rc > 0)
DPUTS("short write, filesystem full?");
else
DPRINTF("writev: %s", strerror(n));
mdb_txn_abort(txn);
return n;
}
#endif
} while (!done);
mdb_dlist_free(txn);
sync:
if ((n = mdb_env_sync(env, 0)) != 0 ||
(n = mdb_env_write_meta(txn)) != MDB_SUCCESS) {
mdb_txn_abort(txn);
return n;
}
done: done:
env->me_pglast = 0; env->me_pglast = 0;
@ -3499,7 +3489,6 @@ mdb_env_close0(MDB_env *env, int excl)
free(env->me_dbxs); free(env->me_dbxs);
free(env->me_path); free(env->me_path);
free(env->me_dirty_list); free(env->me_dirty_list);
if (env->me_free_pgs)
mdb_midl_free(env->me_free_pgs); mdb_midl_free(env->me_free_pgs);
if (env->me_flags & MDB_ENV_TXKEY) { if (env->me_flags & MDB_ENV_TXKEY) {

Loading…
Cancel
Save