Fixes for overflow pages

Branch: vl32b
Howard Chu, 9 years ago
parent 2e0733899f
commit 69ca43c761

  libraries/liblmdb/mdb.c   | 237
  libraries/liblmdb/midl.h  |   4

libraries/liblmdb/mdb.c
@@ -1159,6 +1159,9 @@ struct MDB_txn {
 	/** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */
 	MDB_txn		*mt_child;
 	pgno_t		mt_next_pgno;	/**< next unallocated page */
+#ifdef MDB_VL32
+	pgno_t		mt_last_pgno;	/**< last written page */
+#endif
 	/** The ID of this transaction. IDs are integers incrementing from 1.
 	 *	Only committed write transactions increment the ID. If a transaction
 	 *	aborts, the ID may be re-used by the next writer.
@@ -1206,8 +1209,14 @@ struct MDB_txn {
 	/** Array of flags for each DB */
 	unsigned char	*mt_dbflags;
 #ifdef MDB_VL32
-	/** List of read-only pages */
+	/** List of read-only pages (actually chunks) */
 	MDB_ID3L	mt_rpages;
+	/** We map chunks of 16 pages. Even though Windows uses 4KB pages, all
+	 * mappings must begin on 64KB boundaries. So we round off all pgnos to
+	 * a chunk boundary. We do the same on Linux for symmetry, and also to
+	 * reduce the frequency of mmap/munmap calls.
+	 */
+#define MDB_RPAGE_CHUNK	16
 #endif
 	/** Number of DB records in use, or 0 when the txn is finished.
 	 *	This number only ever increments until the txn finishes; we
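
For reference, a minimal standalone sketch (not part of the patch) of the chunk rounding that the rest of this commit relies on, assuming the 16-page MDB_RPAGE_CHUNK defined above and 4KB pages:

#include <stdio.h>

#define MDB_RPAGE_CHUNK 16	/* power of two, as in the patch */

int main(void)
{
	unsigned long pg0 = 12345;	/* hypothetical requested page number */
	unsigned long rem = pg0 & (MDB_RPAGE_CHUNK-1);	/* offset within its chunk */
	unsigned long pgno = pg0 ^ rem;	/* chunk-aligned base page, as in mdb_rpage_get */

	/* With 4KB pages a 16-page chunk is 64KB, matching the Windows
	 * allocation granularity that motivates the rounding. */
	printf("pg0=%lu base=%lu offset=%lu\n", pg0, pgno, rem);
	return 0;
}
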
@@ -1917,7 +1926,8 @@ mdb_page_unref(MDB_txn *txn, MDB_page *mp)
 	if (mp->mp_flags & (P_SUBP|P_DIRTY))
 		return;
 	x = mdb_mid3l_search(txn->mt_rpages, mp->mp_pgno);
-	txn->mt_rpages[x].mref--;
+	if (txn->mt_rpages[x].mref)
+		txn->mt_rpages[x].mref--;
 }
 #define MDB_PAGE_UNREF(txn, mp)	mdb_page_unref(txn, mp)
@@ -2899,6 +2909,9 @@ mdb_txn_renew0(MDB_txn *txn)
 	/* Moved to here to avoid a data race in read TXNs */
 	txn->mt_next_pgno = meta->mm_last_pg+1;
+#ifdef MDB_VL32
+	txn->mt_last_pgno = txn->mt_next_pgno - 1;
+#endif
 	txn->mt_flags = flags;
@@ -3173,16 +3186,22 @@ mdb_txn_end(MDB_txn *txn, unsigned mode)
 	}
 #ifdef MDB_VL32
 	if (!txn->mt_parent) {
-		unsigned i, x, n = txn->mt_rpages[0].mid;
+		MDB_ID3L el = env->me_rpages, rl = txn->mt_rpages;
+		unsigned i, x, n = rl[0].mid;
 		LOCK_MUTEX0(env->me_rpmutex);
 		for (i = 1; i <= n; i++) {
-			x = mdb_mid3l_search(env->me_rpages, txn->mt_rpages[i].mid);
-			env->me_rpages[x].mref--;
+			if (rl[i].mid & (MDB_RPAGE_CHUNK-1)) {
+				/* tmp overflow pages that we didn't share in env */
+				munmap(rl[i].mptr, rl[i].mcnt * env->me_psize);
+			} else {
+				x = mdb_mid3l_search(el, rl[i].mid);
+				el[x].mref--;
+			}
 		}
 		UNLOCK_MUTEX(env->me_rpmutex);
-		txn->mt_rpages[0].mid = 0;
+		rl[0].mid = 0;
 		if (mode & MDB_END_FREE)
-			free(txn->mt_rpages);
+			free(rl);
 	}
 #endif
 	if (mode & MDB_END_FREE) {
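
The loop above applies one cleanup rule per cached entry; a condensed sketch of that rule with simplified, illustrative types and names (struct id3, release_entry, psize are not LMDB APIs):

#include <sys/mman.h>

#define MDB_RPAGE_CHUNK 16

struct id3 { unsigned long mid; void *mptr; unsigned mcnt, mref; };

/* Chunk-aligned entries are shared with the env-wide cache and only drop
 * a reference there; entries keyed by a raw overflow pgno (mid not
 * chunk-aligned) are txn-private temporary mappings and are unmapped. */
static void release_entry(struct id3 *txn_entry, struct id3 *env_entry, unsigned psize)
{
	if (txn_entry->mid & (MDB_RPAGE_CHUNK-1))
		munmap(txn_entry->mptr, txn_entry->mcnt * psize);
	else
		env_entry->mref--;	/* matching env entry, found by search in the real code */
}
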
@@ -3535,6 +3554,10 @@ retry_seek:
 			n++;
 #endif	/* _WIN32 */
 		}
+#ifdef MDB_VL32
+	if (pgno > txn->mt_last_pgno)
+		txn->mt_last_pgno = pgno;
+#endif
 	/* MIPS has cache coherency issues, this is a no-op everywhere else
 	 * Note: for any size >= on-chip cache size, entire on-chip cache is
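
The mt_last_pgno tracking added here is what lets the read path know how much of a freshly mapped chunk is safe to walk; a one-line restatement of the check used later in mdb_rpage_get (chunk_is_fully_written is an illustrative name):

#define MDB_RPAGE_CHUNK 16

static int chunk_is_fully_written(unsigned long pgno, unsigned long last_pgno)
{
	/* the whole chunk lies at or below the last written page */
	return pgno + MDB_RPAGE_CHUNK <= last_pgno;
}
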
@@ -5545,6 +5568,13 @@ mdb_cursor_push(MDB_cursor *mc, MDB_page *mp)
  * at the end of the transaction. This guarantees that no stale references
  * linger in the per-env list.
  *
+ * Usually we map chunks of 16 pages at a time, but if an overflow page begins
+ * at the tail of the chunk we extend the chunk to include the entire overflow
+ * page. Unfortunately, pages can be turned into overflow pages after their
+ * chunk was already mapped. In that case we must remap the chunk if the
+ * overflow page is referenced. If the chunk's refcnt is 0 we can just remap
+ * it, otherwise we temporarily map a new chunk just for the overflow page.
+ *
  * @param[in] txn the transaction for this access.
  * @param[in] pgno the page number for the page to retrieve.
  * @param[out] ret address of a pointer where the page's address will be stored.
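
A sketch of the decision just described, with hypothetical helper names (remap_chunk, map_temp_overflow) standing in for the inline MAP/munmap calls that mdb_rpage_get actually performs:

#define MDB_RPAGE_CHUNK 16

struct chunk { unsigned long pgno; void *addr; unsigned npages, refs; };

/* illustrative helpers, not LMDB APIs */
extern int remap_chunk(struct chunk *c, unsigned npages);
extern int map_temp_overflow(unsigned long pg0, unsigned npages);

static int cover_overflow(struct chunk *c, unsigned long pg0,
	unsigned rem, unsigned ovpages)
{
	if (rem + ovpages <= c->npages)
		return 0;				/* already fully covered */
	if (c->refs == 0)
		return remap_chunk(c, rem + ovpages);	/* safe to grow in place */
	return map_temp_overflow(pg0, ovpages);		/* txn-private mapping for pg0 */
}
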
@@ -5557,37 +5587,81 @@ mdb_rpage_get(MDB_txn *txn, pgno_t pg0, MDB_page **ret)
 	MDB_page *p;
 	MDB_ID3L rl = txn->mt_rpages;
 	MDB_ID3L el = env->me_rpages;
-	unsigned x;
+	MDB_ID3 id3;
+	unsigned x, rem;
 	pgno_t pgno;
+	int rc;
 #ifdef _WIN32
-	/* Even though Windows uses 4KB pages, all mappings must begin
-	 * on 64KB boundaries. So we round off all pgnos to the
-	 * appropriate boundary and then offset the pointer just
-	 * before returning.
-	 *
-	 * FIXME: we need to do special handling for overflow pages.
-	 * Most likely by keeping a separate list for them.
-	 */
-	pgno = pg0 / 16;
+	LARGE_INTEGER off;
+	SIZE_T len;
+#define SET_OFF(off,val)	off.QuadPart = val
+#define MAP(rc,env,addr,len,off)	\
+	addr = NULL; \
+	rc = NtMapViewOfSection(env->me_fmh, GetCurrentProcess(), &addr, 0, \
+		len, &off, &len, ViewUnmap, MEM_RESERVE, PAGE_READONLY)
 #else
-	pgno = pg0;
+	off_t off;
+	size_t len;
+#define SET_OFF(off,val)	off = val
+#define MAP(rc,env,addr,len,off)	\
+	addr = mmap(NULL, len, PROT_READ, MAP_SHARED, env->me_fd, off); \
+	rc = (addr == MAP_FAILED) ? errno : 0
 #endif
+	/* remember the offset of the actual page number, so we can
+	 * return the correct pointer at the end.
+	 */
+	rem = pg0 & (MDB_RPAGE_CHUNK-1);
+	pgno = pg0 ^ rem;
+	id3.mid = 0;
 	x = mdb_mid3l_search(rl, pgno);
 	if (x <= rl[0].mid && rl[x].mid == pgno) {
-		p = rl[x].mptr;
+		if (x != rl[0].mid && rl[x+1].mid == pg0)
+			x++;
+		/* check for overflow size */
+		p = (MDB_page *)((char *)rl[x].mptr + rem * env->me_psize);
+		if (IS_OVERFLOW(p) && p->mp_pages + rem > rl[x].mcnt) {
+			id3.mcnt = p->mp_pages + rem;
+			len = id3.mcnt * env->me_psize;
+			SET_OFF(off, pgno * env->me_psize);
+			MAP(rc, env, id3.mptr, len, off);
+			if (rc)
+				return rc;
+			if (!rl[x].mref) {
+				munmap(rl[x].mptr, rl[x].mcnt);
+				rl[x].mptr = id3.mptr;
+				rl[x].mcnt = id3.mcnt;
+			} else {
+				/* hope there's room to insert this locally.
+				 * setting mid here tells later code to just insert
+				 * this id3 instead of searching for a match.
+				 */
+				id3.mid = pg0;
+				goto notlocal;
+			}
+		}
+		id3.mptr = rl[x].mptr;
+		id3.mcnt = rl[x].mcnt;
 		rl[x].mref++;
 		goto ok;
 	}
+notlocal:
 	if (rl[0].mid >= MDB_IDL_UM_MAX) {
 		unsigned i, y = 0;
 		/* purge unref'd pages from our list and unref in env */
 		LOCK_MUTEX0(env->me_rpmutex);
 		for (i=1; i<rl[0].mid; i++) {
 			if (!rl[i].mref) {
+				if (!y) y = i;
+				/* tmp overflow pages don't go to env */
+				if (rl[i].mid & (MDB_RPAGE_CHUNK-1)) {
+					munmap(rl[i].mptr, rl[i].mcnt * env->me_psize);
+					continue;
+				}
 				x = mdb_mid3l_search(el, rl[i].mid);
 				el[x].mref--;
-				if (!y) y = i;
 			}
 		}
 		UNLOCK_MUTEX(env->me_rpmutex);
@@ -5599,14 +5673,38 @@ mdb_rpage_get(MDB_txn *txn, pgno_t pg0, MDB_page **ret)
 		rl[0].mid = y-1;
 	}
 	if (rl[0].mid < MDB_IDL_UM_SIZE) {
-		MDB_ID3 id3;
-		size_t len = env->me_psize;
-		int np = 1;
+		id3.mref = 1;
+		if (id3.mid)
+			goto found;
+		len = env->me_psize * MDB_RPAGE_CHUNK;
+		id3.mid = pgno;
+		id3.mcnt = MDB_RPAGE_CHUNK;
 		/* search for page in env */
 		LOCK_MUTEX0(env->me_rpmutex);
 		x = mdb_mid3l_search(el, pgno);
 		if (x <= el[0].mid && el[x].mid == pgno) {
-			p = el[x].mptr;
+			id3.mptr = el[x].mptr;
+			id3.mcnt = el[x].mcnt;
+			/* check for overflow size */
+			p = (MDB_page *)((char *)id3.mptr + rem * env->me_psize);
+			if (IS_OVERFLOW(p) && p->mp_pages + rem > id3.mcnt) {
+				id3.mcnt = p->mp_pages + rem;
+				len = id3.mcnt * env->me_psize;
+				SET_OFF(off, pgno * env->me_psize);
+				MAP(rc, env, id3.mptr, len, off);
+				if (rc)
+					goto fail;
+				if (!el[x].mref) {
+					munmap(el[x].mptr, el[x].mcnt);
+					el[x].mptr = id3.mptr;
+					el[x].mcnt = id3.mcnt;
+				} else {
+					id3.mid = pg0;
+					UNLOCK_MUTEX(env->me_rpmutex);
+					goto found;
+				}
+			}
 			el[x].mref++;
 			UNLOCK_MUTEX(env->me_rpmutex);
 			goto found;
@@ -5617,11 +5715,7 @@ mdb_rpage_get(MDB_txn *txn, pgno_t pg0, MDB_page **ret)
 			for (i=1; i<el[0].mid; i++) {
 				if (!el[i].mref) {
 					if (!y) y = i;
-#ifdef _WIN32
-					UnmapViewOfFile(el[i].mptr);
-#else
 					munmap(el[i].mptr, env->me_psize * el[i].mcnt);
-#endif
 				}
 			}
 			if (!y) {
@@ -5633,74 +5727,55 @@ mdb_rpage_get(MDB_txn *txn, pgno_t pg0, MDB_page **ret)
 				el[y++] = el[i];
 			el[0].mid = y-1;
 		}
-#ifdef _WIN32
-		LARGE_INTEGER off;
-		SIZE_T mlen;
-		int rc;
-		off.QuadPart = pgno * env->me_psize * 16;
-		p = NULL;
-		np = 16;
-		len *= 16;
-		mlen = len;
-		rc = NtMapViewOfSection(env->me_fmh, GetCurrentProcess(), (void **)&p, 0,
-			mlen, &off, &mlen, ViewUnmap, MEM_RESERVE, PAGE_READONLY);
+		SET_OFF(off, pgno * env->me_psize);
+		MAP(rc, env, id3.mptr, len, off);
 		if (rc) {
 fail:
 			UNLOCK_MUTEX(env->me_rpmutex);
 			return rc;
 		}
-#if 0
-		if (IS_OVERFLOW(p)) {
-			np = p->mp_pages;
-			UnmapViewOfFile(p);
-			len *= np;
-			p = MapViewOfFile(env->me_fmh, FILE_MAP_READ, hi, lo, len);
-			if (p == NULL)
-				goto fail;
-		}
-#endif
-#else
-		off_t off = pgno * env->me_psize;
-		p = mmap(NULL, len, PROT_READ, MAP_SHARED, env->me_fd, off);
-		if (p == MAP_FAILED) {
-fail:
-			UNLOCK_MUTEX(env->me_rpmutex);
-			return errno;
-		}
-		if (IS_OVERFLOW(p)) {
-			np = p->mp_pages;
-			munmap(p, len);
-			len *= np;
-			p = mmap(NULL, len, PROT_READ, MAP_SHARED, env->me_fd, off);
-			if (p == MAP_FAILED)
-				goto fail;
+		/* If this page is far enough from the end of the env, scan for
+		 * any overflow pages that would spill onto another block.
+		 */
+		if (pgno + MDB_RPAGE_CHUNK <= txn->mt_last_pgno) {
+			int i;
+			char *cp = (char *)id3.mptr + rem * env->me_psize;
+			for (i=rem; i<MDB_RPAGE_CHUNK;) {
+				p = (MDB_page *)cp;
+				if (IS_OVERFLOW(p)) {
+					int nop = p->mp_pages;
+					if (nop + i > MDB_RPAGE_CHUNK) {
+						munmap(id3.mptr, len);
+						id3.mcnt = nop + i;
+						len = id3.mcnt * env->me_psize;
+						MAP(rc, env, id3.mptr, len, off);
+						if (rc)
+							goto fail;
+						break;
+					}
+					i += nop;
+					cp += nop * env->me_psize;
+				} else {
+					i++;
+					cp += env->me_psize;
+				}
+			}
 		}
-#endif
-		id3.mid = pgno;
-		id3.mptr = p;
-		id3.mcnt = np;
-		id3.mref = 1;
 		mdb_mid3l_insert(el, &id3);
 		UNLOCK_MUTEX(env->me_rpmutex);
 found:
-		id3.mid = pgno;
-		id3.mptr = p;
-		id3.mcnt = np;
-		id3.mref = 1;
 		mdb_mid3l_insert(rl, &id3);
 	} else {
 		return MDB_TXN_FULL;
 	}
 ok:
-#ifdef _WIN32
-	{
-		char *v = (char *)p;
-		v += (pg0 & 0x0f) * env->me_psize;
-		*ret = (MDB_page *)v;
-	}
-#else
-	*ret = p;
+	p = (MDB_page *)((char *)id3.mptr + rem * env->me_psize);
+#if 1	/* temporary */
+	if (IS_OVERFLOW(p)) {
+		mdb_tassert(txn, p->mp_pages + rem <= id3.mcnt);
+	}
 #endif
+	*ret = p;
 	return MDB_SUCCESS;
 }
 #endif
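
The scan added at the end of the mapping path walks the new chunk looking for an overflow run that spills past it (the real code only does this when the whole chunk lies at or below mt_last_pgno, so the walk never touches pages beyond the last written one). A self-contained sketch of that walk, where the span[] array is a stand-in for reading mp_pages from each page header:

#include <stdio.h>

#define MDB_RPAGE_CHUNK 16

/* span[i] is 1 for a normal page and N for the first page of an N-page
 * overflow run. Returns how many pages the mapping must cover. */
static unsigned needed_pages(const unsigned *span, unsigned rem)
{
	unsigned i = rem;
	while (i < MDB_RPAGE_CHUNK) {
		unsigned nop = span[i];
		if (i + nop > MDB_RPAGE_CHUNK)
			return i + nop;		/* overflow spills past the chunk */
		i += nop;
	}
	return MDB_RPAGE_CHUNK;			/* the default chunk is enough */
}

int main(void)
{
	unsigned span[MDB_RPAGE_CHUNK] =
		{1,1,1,1,1,1,1,1,1,1,1,1,1,1,6,1};
	printf("map %u pages\n", needed_pages(span, 0));	/* prints 20 */
	return 0;
}
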
@@ -5715,7 +5790,9 @@ ok:
 static int
 mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret, int *lvl)
 {
+#ifndef MDB_VL32
 	MDB_env *env = txn->mt_env;
+#endif
 	MDB_page *p = NULL;
 	int level;

libraries/liblmdb/midl.h
@@ -190,8 +190,8 @@ int mdb_mid2l_append( MDB_ID2L ids, MDB_ID2 *id );
 typedef struct MDB_ID3 {
 	MDB_ID mid;		/**< The ID */
 	void *mptr;		/**< The pointer */
-	int mcnt;		/**< Number of pages */
-	int mref;		/**< Refcounter */
+	unsigned int mcnt;	/**< Number of pages */
+	unsigned int mref;	/**< Refcounter */
 } MDB_ID3;
 typedef MDB_ID3 *MDB_ID3L;
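
For context on how these lists are consumed in mdb.c above: element 0's mid field holds the number of live entries, and with mcnt/mref now unsigned the guarded decrement in mdb_page_unref keeps a zero refcount from wrapping. A small standalone illustration:

#include <stdio.h>

typedef unsigned long MDB_ID;
typedef struct MDB_ID3 {
	MDB_ID mid;		/**< The ID */
	void *mptr;		/**< The pointer */
	unsigned int mcnt;	/**< Number of pages */
	unsigned int mref;	/**< Refcounter */
} MDB_ID3, *MDB_ID3L;

int main(void)
{
	/* entry 0 carries the count, as rl[0].mid / el[0].mid do in mdb.c */
	MDB_ID3 list[3] = { { 2, NULL, 0, 0 },
			    { 16, NULL, 16, 1 },
			    { 32, NULL, 16, 0 } };
	unsigned i, n = (unsigned)list[0].mid;
	for (i = 1; i <= n; i++)
		if (list[i].mref)	/* guard as in mdb_page_unref */
			list[i].mref--;
	printf("refs: %u %u\n", list[1].mref, list[2].mref);
	return 0;
}
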
