From 7fdf6720416d5eaadf5fd811c4513c87a2899a0f Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Wed, 16 Jan 2013 18:42:57 +0100 Subject: [PATCH] Update MDB comments: Caveats, flags, etc. --- libraries/liblmdb/lmdb.h | 70 ++++++++++++++++++++++++++-------------- libraries/liblmdb/mdb.c | 14 ++++---- 2 files changed, 53 insertions(+), 31 deletions(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index bd10bb6..7719e32 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -78,10 +78,11 @@ * database can grow quickly. Write transactions prevent * other write transactions, since writes are serialized. * - * ...when several processes can use a database concurrently: - * * - Avoid suspending a process with active transactions. These - * would then be "long-lived" as above. + * would then be "long-lived" as above. Also read transactions + * suspended when writers commit could sometimes see wrong data. + * + * ...when several processes can use a database concurrently: * * - Avoid aborting a process with an active transaction. * The transaction becomes "long-lived" as above until the lockfile @@ -221,7 +222,7 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel * Values do not overlap Database Flags. * @{ */ - /** mmap at a fixed address */ + /** mmap at a fixed address (experimental) */ #define MDB_FIXEDMAP 0x01 /** no environment directory */ #define MDB_NOSUBDIR 0x4000 @@ -233,7 +234,7 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel #define MDB_NOMETASYNC 0x40000 /** use writable mmap */ #define MDB_WRITEMAP 0x80000 - /** use asynchronous msync */ + /** use asynchronous msync when MDB_WRITEMAP is used */ #define MDB_MAPASYNC 0x100000 /** @} */ @@ -435,24 +436,43 @@ int mdb_env_create(MDB_env **env); * under that directory. With this option, \b path is used as-is for * the database main data file. The database lock file is the \b path * with "-lock" appended. - *
  • #MDB_NOSYNC - * Don't perform a synchronous flush after committing a transaction. This means - * transactions will exhibit the ACI (atomicity, consistency, and isolation) - * properties, but not D (durability); that is database integrity will be - * maintained but it is possible some number of the most recently committed - * transactions may be undone after a system crash. The number of transactions - * at risk is governed by how often the system flushes dirty buffers to disk - * and how often #mdb_env_sync() is called. This flag may be changed - * at any time using #mdb_env_set_flags(). - *
  • #MDB_NOMETASYNC - * Don't perform a synchronous flush of the meta page after committing - * a transaction. This is similar to the #MDB_NOSYNC case, but safer - * because the transaction data is still flushed. The meta page for any - * transaction N will be flushed by the data flush of transaction N+1. - * In case of a system crash, the last committed transaction may be - * lost. This flag may be changed at any time using #mdb_env_set_flags(). *
  • #MDB_RDONLY - * Open the environment in read-only mode. No write operations will be allowed. + * Open the environment in read-only mode. No write operations will be + * allowed. MDB will still modify the lock file - except on read-only + * filesystems, where MDB does not use locks. + *
  • #MDB_WRITEMAP + * Use a writable memory map unless #MDB_RDONLY is set. This is faster + * and uses fewer mallocs, but loses protection from application bugs + * like wild pointer writes and other bad updates into the database. + * Incompatible with nested transactions. + *
  • #MDB_NOMETASYNC + * Flush system buffers to disk only once per transaction, omitting the + * metadata flush. Defer that until the system flushes files to disk, + * or the next non-MDB_RDONLY commit or #mdb_env_sync(). This optimization + * maintains database integrity, but a system crash may undo the last + * committed transaction. I.e. it preserves the ACI (atomicity, + * consistency, isolation) but not D (durability) database property. + * This flag may be changed at any time using #mdb_env_set_flags(). + *
  • #MDB_NOSYNC + * Don't flush system buffers to disk when committing a transaction. + * This optimization means a system crash can corrupt the database or + * lose the last transactions if buffers are not yet flushed to disk. + * The risk is governed by how often the system flushes dirty buffers + * to disk and how often #mdb_env_sync() is called. However, if the + * filesystem preserves write order and the #MDB_WRITEMAP flag is not + * used, transactions exhibit ACI (atomicity, consistency, isolation) + * properties and only lose D (durability). I.e. database integrity + * is maintained, but a system crash may undo the final transactions. + * Note that (#MDB_NOSYNC | #MDB_WRITEMAP) leaves the system with no + * hint for when to write transactions to disk, unless #mdb_env_sync() + * is called. (#MDB_MAPASYNC | #MDB_WRITEMAP) may be preferable. + * This flag may be changed at any time using #mdb_env_set_flags(). + *
  • #MDB_MAPASYNC + * When using #MDB_WRITEMAP, use asynchronous flushes to disk. + * As with #MDB_NOSYNC, a system crash can then corrupt the + * database or lose the last transactions. Calling #mdb_env_sync() + * ensures on-disk database integrity until the next commit. + * This flag may be changed at any time using #mdb_env_set_flags(). * * @param[in] mode The UNIX permissions to set on created files. This parameter * is ignored on Windows. @@ -502,7 +522,7 @@ int mdb_env_info(MDB_env *env, MDB_envinfo *stat); * Data is always written to disk when #mdb_txn_commit() is called, * but the operating system may keep it buffered. MDB always flushes * the OS buffers upon commit as well, unless the environment was - * opened with #MDB_NOSYNC. + * opened with #MDB_NOSYNC or, in part, #MDB_NOMETASYNC. * @param[in] env An environment handle returned by #mdb_env_create() * @param[in] force If non-zero, force a synchronous flush. Otherwise * if the environment has the #MDB_NOSYNC flag set the flushes @@ -731,7 +751,7 @@ int mdb_txn_renew(MDB_txn *txn); * by the given transaction. Only one thread should call this function; * it is not mutex-protected in a read-only transaction. * To use named databases (with name != NULL), #mdb_env_set_maxdbs() - * must be called before opening the enviorment. + * must be called before opening the environment. * @param[in] txn A transaction handle returned by #mdb_txn_begin() * @param[in] name The name of the database to open. If only a single * database is needed in the environment, this value may be NULL. @@ -796,7 +816,7 @@ int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *stat); * * This call is not mutex protected. Handles should only be closed by * a single thread, and only if no other threads are going to reference - * the database handle any further. + * the database handle or one of its cursors any further. 
* @param[in] env An environment handle returned by #mdb_env_create() * @param[in] dbi A database handle returned by #mdb_dbi_open() */ diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 1a443ce..e5013d6 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -382,7 +382,7 @@ static txnid_t mdb_debug_start; */ #define P_INVALID (~(pgno_t)0) - /** Test if a flag \b f is set in a flag word \b w. */ + /** Test if the flags \b f are set in a flag word \b w. */ #define F_ISSET(w, f) (((w) & (f)) == (f)) /** Used for offsets within a single page. @@ -404,6 +404,8 @@ typedef uint16_t indx_t; * slot's address is saved in thread-specific data so that subsequent read * transactions started by the same thread need no further locking to proceed. * + * No reader table is used if the database is on a read-only filesystem. + * * Since the database uses multi-version concurrency control, readers don't * actually need any locking. This table is used to keep track of which * readers are using data from which old transactions, so that we'll know @@ -810,8 +812,8 @@ struct MDB_txn { */ MDB_IDL mt_free_pgs; union { - MDB_ID2L dirty_list; /**< modified pages */ - MDB_reader *reader; /**< this thread's slot in the reader table */ + MDB_ID2L dirty_list; /**< for write txns: modified pages */ + MDB_reader *reader; /**< this thread's reader table slot or NULL */ } mt_u; /** Array of records for each DB known in the environment. 
*/ MDB_dbx *mt_dbxs; @@ -824,7 +826,7 @@ struct MDB_txn { #define DB_DIRTY 0x01 /**< DB was written in this txn */ #define DB_STALE 0x02 /**< DB record is older than txnID */ /** @} */ - /** Array of cursors for each DB */ + /** In write txns, array of cursors for each DB */ MDB_cursor **mt_cursors; /** Array of flags for each DB */ unsigned char *mt_dbflags; @@ -941,7 +943,7 @@ struct MDB_env { pid_t me_pid; /**< process ID of this env */ char *me_path; /**< path to the DB files */ char *me_map; /**< the memory map of the data file */ - MDB_txninfo *me_txns; /**< the memory map of the lock file */ + MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */ MDB_meta *me_metas[2]; /**< pointers to the two meta pages */ MDB_txn *me_txn; /**< current write transaction */ size_t me_mapsize; /**< size of the data memory map */ @@ -950,7 +952,7 @@ struct MDB_env { txnid_t me_pgfirst; /**< ID of first old page record we used */ txnid_t me_pglast; /**< ID of last old page record we used */ MDB_dbx *me_dbxs; /**< array of static DB info */ - uint16_t *me_dbflags; /**< array of DB flags */ + uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ MDB_oldpages *me_pghead; /**< list of old page records */ MDB_oldpages *me_pgfree; /**< list of page records to free */ pthread_key_t me_txkey; /**< thread-key for readers */