Add MDB_ROBUST: Robust mutexes, when supported.

The flag is silently ignored when unsupported, e.g. on BSD systems.

Also generalize error code MDB_PANIC.
robust
Hallvard Furuseth 10 years ago
parent 644802d090
commit cf8c8fc024
  1. 15
      libraries/liblmdb/lmdb.h
  2. 125
      libraries/liblmdb/mdb.c

@ -49,7 +49,9 @@
* stale locks can block further operation. * stale locks can block further operation.
* *
* Fix: Check for stale readers periodically, using the * Fix: Check for stale readers periodically, using the
* #mdb_reader_check function or the \ref mdb_stat_1 "mdb_stat" tool. Or just * #mdb_reader_check function or the \ref mdb_stat_1 "mdb_stat" tool.
* Catch stale
* locks with option MDB_ROBUST if supported (non-BSD). Or just
* make all programs using the database close it; the lockfile * make all programs using the database close it; the lockfile
* is always reset on first open of the environment. * is always reset on first open of the environment.
* *
@ -105,6 +107,7 @@
* The transaction becomes "long-lived" as above until a check * The transaction becomes "long-lived" as above until a check
* for stale readers is performed or the lockfile is reset, * for stale readers is performed or the lockfile is reset,
* since the process may not remove it from the lockfile. * since the process may not remove it from the lockfile.
* Except write-transactions on Unix with MDB_ROBUST or on Windows.
* *
* - If you do that anyway, do a periodic check for stale readers. Or * - If you do that anyway, do a periodic check for stale readers. Or
* close the environment once in a while, so the lockfile can get reset. * close the environment once in a while, so the lockfile can get reset.
@ -287,6 +290,8 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel
#define MDB_NORDAHEAD 0x800000 #define MDB_NORDAHEAD 0x800000
/** don't initialize malloc'd memory before writing to datafile */ /** don't initialize malloc'd memory before writing to datafile */
#define MDB_NOMEMINIT 0x1000000 #define MDB_NOMEMINIT 0x1000000
/** catch stale locks if supported (not on BSD, needs robust mutexes) */
#define MDB_ROBUST 0x2000000
/** @} */ /** @} */
/** @defgroup mdb_dbi_open Database Flags /** @defgroup mdb_dbi_open Database Flags
@ -391,7 +396,7 @@ typedef enum MDB_cursor_op {
#define MDB_PAGE_NOTFOUND (-30797) #define MDB_PAGE_NOTFOUND (-30797)
/** Located page was wrong type */ /** Located page was wrong type */
#define MDB_CORRUPTED (-30796) #define MDB_CORRUPTED (-30796)
/** Update of meta page failed, probably I/O error */ /** Update of meta page failed or environment had fatal error */
#define MDB_PANIC (-30795) #define MDB_PANIC (-30795)
/** Environment version mismatch */ /** Environment version mismatch */
#define MDB_VERSION_MISMATCH (-30794) #define MDB_VERSION_MISMATCH (-30794)
@ -511,6 +516,12 @@ int mdb_env_create(MDB_env **env);
* Open the environment in read-only mode. No write operations will be * Open the environment in read-only mode. No write operations will be
* allowed. LMDB will still modify the lock file - except on read-only * allowed. LMDB will still modify the lock file - except on read-only
* filesystems, where LMDB does not use locks. * filesystems, where LMDB does not use locks.
* <li>#MDB_ROBUST
* Initialize the lockfile to catch stale locks if robust mutexes
* are supported, so aborted processes will not block others.
* Ignored when another process has the environment open, and
* by liblmdb built with MDB_USE_POSIX_SEM (such as BSD systems).
* Enabled by default on Windows. Some locking slowdown on Unix.
* <li>#MDB_WRITEMAP * <li>#MDB_WRITEMAP
* Use a writeable memory map unless MDB_RDONLY is set. This is faster * Use a writeable memory map unless MDB_RDONLY is set. This is faster
* and uses fewer mallocs, but loses protection from application bugs * and uses fewer mallocs, but loses protection from application bugs

@ -175,6 +175,10 @@
/** Features under development */ /** Features under development */
#ifndef MDB_DEVEL #ifndef MDB_DEVEL
#define MDB_DEVEL 0 #define MDB_DEVEL 0
#endif
#if MDB_DEVEL && (defined(_WIN32) || (defined(EOWNERDEAD) && !defined(MDB_USE_POSIX_SEM)))
#define MDB_ROBUST_SUPPORTED 1
#endif #endif
/** Wrapper around __func__, which is a C99 feature */ /** Wrapper around __func__, which is a C99 feature */
@ -202,6 +206,7 @@ typedef HANDLE mdb_mutex_t;
#define pthread_key_delete(x) TlsFree(x) #define pthread_key_delete(x) TlsFree(x)
#define pthread_getspecific(x) TlsGetValue(x) #define pthread_getspecific(x) TlsGetValue(x)
#define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode()) #define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode())
#define pthread_mutex_consistent(mutex) 0
#define pthread_mutex_unlock(x) ReleaseMutex(*x) #define pthread_mutex_unlock(x) ReleaseMutex(*x)
#define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE) #define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE)
#define pthread_cond_signal(x) SetEvent(*x) #define pthread_cond_signal(x) SetEvent(*x)
@ -256,6 +261,7 @@ typedef pthread_mutex_t *mdb_mutex_t;
*/ */
#define MDB_MUTEX(env, rw) (&(env)->me_txns->mti_##rw##mutex) #define MDB_MUTEX(env, rw) (&(env)->me_txns->mti_##rw##mutex)
/** Lock the reader or writer mutex. /** Lock the reader or writer mutex.
* Returns 0 or a code to give #mdb_mutex_failed(), as in #LOCK_MUTEX().
*/ */
#define LOCK_MUTEX0(mutex) pthread_mutex_lock(mutex) #define LOCK_MUTEX0(mutex) pthread_mutex_lock(mutex)
/** Unlock the reader or writer mutex. /** Unlock the reader or writer mutex.
@ -294,7 +300,18 @@ typedef pthread_mutex_t *mdb_mutex_t;
/** @} */ /** @} */
#ifdef MDB_ROBUST_SUPPORTED
/** Lock mutex, handle any error, set rc = result.
* Return 0 on success, nonzero (not rc) on error.
*/
#define LOCK_MUTEX(rc, env, mutex) \
(((rc) = LOCK_MUTEX0(mutex)) && \
((rc) = mdb_mutex_failed(env, mutex, rc)))
static int mdb_mutex_failed(MDB_env *env, mdb_mutex_t mutex, int rc);
#else
#define LOCK_MUTEX(rc, env, mutex) ((rc) = LOCK_MUTEX0(mutex)) #define LOCK_MUTEX(rc, env, mutex) ((rc) = LOCK_MUTEX0(mutex))
#define mdb_mutex_failed(env, mutex, rc) (rc)
#endif
#ifndef _WIN32 #ifndef _WIN32
/** A flag for opening a file and requesting synchronous data writes. /** A flag for opening a file and requesting synchronous data writes.
@ -1211,6 +1228,7 @@ static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node);
static int mdb_drop0(MDB_cursor *mc, int subs); static int mdb_drop0(MDB_cursor *mc, int subs);
static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi); static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi);
static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead);
/** @cond */ /** @cond */
static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long; static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long;
@ -1238,7 +1256,7 @@ static char *const mdb_errstr[] = {
"MDB_NOTFOUND: No matching key/data pair found", "MDB_NOTFOUND: No matching key/data pair found",
"MDB_PAGE_NOTFOUND: Requested page not found", "MDB_PAGE_NOTFOUND: Requested page not found",
"MDB_CORRUPTED: Located page was wrong type", "MDB_CORRUPTED: Located page was wrong type",
"MDB_PANIC: Update of meta page failed", "MDB_PANIC: Update of meta page failed or environment had fatal error",
"MDB_VERSION_MISMATCH: Database environment version mismatch", "MDB_VERSION_MISMATCH: Database environment version mismatch",
"MDB_INVALID: File is not an LMDB file", "MDB_INVALID: File is not an LMDB file",
"MDB_MAP_FULL: Environment mapsize limit reached", "MDB_MAP_FULL: Environment mapsize limit reached",
@ -4304,6 +4322,10 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
if ((rc = pthread_mutexattr_init(&mattr)) if ((rc = pthread_mutexattr_init(&mattr))
|| (rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED)) || (rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED))
#ifdef MDB_ROBUST_SUPPORTED
|| ((env->me_flags & MDB_ROBUST) &&
(rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST)))
#endif
|| (rc = pthread_mutex_init(&env->me_txns->mti_rmutex, &mattr)) || (rc = pthread_mutex_init(&env->me_txns->mti_rmutex, &mattr))
|| (rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr))) || (rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr)))
goto fail; goto fail;
@ -4362,8 +4384,8 @@ fail:
* environment and re-opening it with the new flags. * environment and re-opening it with the new flags.
*/ */
#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT) #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT)
#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP| \ #define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_ROBUST| \
MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD) MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD)
#if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS) #if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS)
# error "Persistent DB flags & env flags overlap, but both go in mm_flags" # error "Persistent DB flags & env flags overlap, but both go in mm_flags"
@ -9273,17 +9295,22 @@ mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid)
int ESECT int ESECT
mdb_reader_check(MDB_env *env, int *dead) mdb_reader_check(MDB_env *env, int *dead)
{ {
if (!env)
return EINVAL;
if (dead)
*dead = 0;
return env->me_txns ? mdb_reader_check0(env, 0, dead) : MDB_SUCCESS;
}
/** As #mdb_reader_check(). rlocked = <caller locked the reader mutex>. */
static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead)
{
mdb_mutex_t rmutex = rlocked ? NULL : MDB_MUTEX(env, r);
unsigned int i, j, rdrs; unsigned int i, j, rdrs;
MDB_reader *mr; MDB_reader *mr;
MDB_PID_T *pids, pid; MDB_PID_T *pids, pid;
int rc = MDB_SUCCESS, count = 0; int rc = MDB_SUCCESS, count = 0;
if (!env)
return EINVAL;
if (dead)
*dead = 0;
if (!env->me_txns)
return MDB_SUCCESS;
rdrs = env->me_txns->mti_numreaders; rdrs = env->me_txns->mti_numreaders;
pids = malloc((rdrs+1) * sizeof(MDB_PID_T)); pids = malloc((rdrs+1) * sizeof(MDB_PID_T));
if (!pids) if (!pids)
@ -9291,24 +9318,32 @@ mdb_reader_check(MDB_env *env, int *dead)
pids[0] = 0; pids[0] = 0;
mr = env->me_txns->mti_readers; mr = env->me_txns->mti_readers;
for (i=0; i<rdrs; i++) { for (i=0; i<rdrs; i++) {
if (mr[i].mr_pid && mr[i].mr_pid != env->me_pid) { pid = mr[i].mr_pid;
pid = mr[i].mr_pid; if (pid && pid != env->me_pid) {
if (mdb_pid_insert(pids, pid) == 0) { if (mdb_pid_insert(pids, pid) == 0) {
if (!mdb_reader_pid(env, Pidcheck, pid)) { if (!mdb_reader_pid(env, Pidcheck, pid)) {
mdb_mutex_t rmutex = MDB_MUTEX(env, r); /* Stale reader found */
if (LOCK_MUTEX(rc, env, rmutex)) j = i;
break; if (rmutex) {
/* Recheck, a new process may have reused pid */ if ((rc = LOCK_MUTEX0(rmutex)) != 0) {
if (!mdb_reader_pid(env, Pidcheck, pid)) { if ((rc = mdb_mutex_failed(env, rmutex, rc)))
for (j=i; j<rdrs; j++) break;
rdrs = 0; /* the above checked all readers */
} else {
/* Recheck, a new process may have reused pid */
if (mdb_reader_pid(env, Pidcheck, pid))
j = rdrs;
}
}
for (; j<rdrs; j++)
if (mr[j].mr_pid == pid) { if (mr[j].mr_pid == pid) {
DPRINTF(("clear stale reader pid %u txn %"Z"d", DPRINTF(("clear stale reader pid %u txn %"Z"d",
(unsigned) pid, mr[j].mr_txnid)); (unsigned) pid, mr[j].mr_txnid));
mr[j].mr_pid = 0; mr[j].mr_pid = 0;
count++; count++;
} }
} if (rmutex)
UNLOCK_MUTEX(rmutex); UNLOCK_MUTEX(rmutex);
} }
} }
} }
@ -9318,4 +9353,56 @@ mdb_reader_check(MDB_env *env, int *dead)
*dead = count; *dead = count;
return rc; return rc;
} }
#ifdef MDB_ROBUST_SUPPORTED
/** Deal with a nonzero result from #LOCK_MUTEX0().
 * With #MDB_ROBUST, attempt to repair the lock file when the previous
 * owner of the mutex died while holding it.
 *
 * If \b rc is the "owner died" code (WAIT_ABANDONED, which is aliased to
 * EOWNERDEAD when not building for _WIN32), the caller now holds the mutex
 * and recovery is attempted:
 *  - for the writer mutex, mti_txnid is refreshed from the meta page chosen
 *    by mdb_env_pick_meta(); if this environment has a write txn recorded
 *    in me_txn, the environment is flagged #MDB_FATAL_ERROR, me_txn is
 *    cleared and the result becomes #MDB_PANIC;
 *  - mdb_reader_check0() is run, and if it succeeds the mutex is marked
 *    consistent with pthread_mutex_consistent().
 * If any of this fails, the mutex is unlocked again before returning.
 *
 * For any other \b rc the failure is only logged (on _WIN32 the code is
 * first replaced by ErrCode()).
 *
 * @param[in] env the environment handle
 * @param[in] mutex the mutex that was passed to LOCK_MUTEX0()
 * @param[in] rc the nonzero LOCK_MUTEX0() result
 * @return 0 on success with the mutex locked, or an error code on failure.
 */
static int mdb_mutex_failed(MDB_env *env, mdb_mutex_t mutex, int rc)
{
	int meta_idx, is_rmutex, repair_rc;
#ifndef _WIN32
	enum { WAIT_ABANDONED = EOWNERDEAD };
#endif

	/* Plain lock failure: nothing to repair, just report it. */
	if (rc != (int) WAIT_ABANDONED) {
#ifdef _WIN32
		rc = ErrCode();
#endif
		DPRINTF(("LOCK_MUTEX failed, %s", mdb_strerror(rc)));
		return rc;
	}

	/* The previous owner died, so the mutex is ours. Tidy up after it. */
	rc = MDB_SUCCESS;
	is_rmutex = (mutex == MDB_MUTEX(env, r));
	if (!is_rmutex) {
		/* mti_txnid must track the latest meta page, or the next
		 * writer could overwrite data that page still refers to.
		 */
		meta_idx = mdb_env_pick_meta(env);
		env->me_txns->mti_txnid = env->me_metas[meta_idx]->mm_txnid;
		/* A dead writer thread of our own leaves this env unusable */
		if (env->me_txn != NULL) {
			env->me_flags |= MDB_FATAL_ERROR;
			env->me_txn = NULL;
			rc = MDB_PANIC;
		}
	}
	DPRINTF(("%cmutex owner died, %s", (is_rmutex ? 'r' : 'w'),
		(rc ? "this process' env is hosed" : "recovering")));

	/* Run the reader check, then mark the mutex consistent if it passed */
	repair_rc = mdb_reader_check0(env, is_rmutex, NULL);
	if (repair_rc == 0)
		repair_rc = pthread_mutex_consistent(mutex);

	/* An earlier MDB_PANIC takes precedence over the repair result */
	if (rc == MDB_SUCCESS)
		rc = repair_rc;
	if (rc != MDB_SUCCESS) {
		DPRINTF(("LOCK_MUTEX recovery failed, %s", mdb_strerror(rc)));
		UNLOCK_MUTEX(mutex);
	}
	return rc;
}
#endif /* MDB_ROBUST_SUPPORTED */
/** @} */ /** @} */

Loading…
Cancel
Save