ITS#9017 LMDB: improve Windows sync commit perf

mdb.master3
Kris Zyp 5 years ago committed by Howard Chu
parent b485f2869c
commit dfb3bbed65
  1. 233
      libraries/liblmdb/mdb.c

@ -1565,9 +1565,12 @@ struct MDB_env {
HANDLE me_fd; /**< The main data file */ HANDLE me_fd; /**< The main data file */
HANDLE me_lfd; /**< The lock file */ HANDLE me_lfd; /**< The lock file */
HANDLE me_mfd; /**< For writing and syncing the meta pages */ HANDLE me_mfd; /**< For writing and syncing the meta pages */
#if MDB_RPAGE_CACHE && defined(_WIN32) #ifdef _WIN32
#ifdef MDB_RPAGE_CACHE
HANDLE me_fmh; /**< File Mapping handle */ HANDLE me_fmh; /**< File Mapping handle */
#endif #endif
HANDLE me_ovfd; /**< Overlapped/async with write-through file handle */
#endif /* _WIN32 */
/** Failed to update the meta page. Probably an I/O error. */ /** Failed to update the meta page. Probably an I/O error. */
#define MDB_FATAL_ERROR 0x80000000U #define MDB_FATAL_ERROR 0x80000000U
/** Some fields are initialized. */ /** Some fields are initialized. */
@ -1620,6 +1623,8 @@ struct MDB_env {
int me_live_reader; /**< have liveness lock in reader table */ int me_live_reader; /**< have liveness lock in reader table */
#ifdef _WIN32 #ifdef _WIN32
int me_pidquery; /**< Used in OpenProcess */ int me_pidquery; /**< Used in OpenProcess */
OVERLAPPED *ov; /**< Used for overlapping I/O requests */
int ovs; /**< Count of OVERLAPPEDs */
#endif #endif
#ifdef MDB_USE_POSIX_MUTEX /* Posix mutexes reside in shared mem */ #ifdef MDB_USE_POSIX_MUTEX /* Posix mutexes reside in shared mem */
# define me_rmutex me_txns->mti_rmutex /**< Shared reader lock */ # define me_rmutex me_txns->mti_rmutex /**< Shared reader lock */
@ -2977,7 +2982,11 @@ mdb_env_sync0(MDB_env *env, int force, pgno_t numpgs)
int rc = 0; int rc = 0;
if (env->me_flags & MDB_RDONLY) if (env->me_flags & MDB_RDONLY)
return EACCES; return EACCES;
if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { if (force
#ifndef _WIN32 /* Sync is normally achieved in Windows by doing WRITE_THROUGH writes */
|| !(env->me_flags & MDB_NOSYNC)
#endif
) {
if (env->me_flags & MDB_WRITEMAP) { if (env->me_flags & MDB_WRITEMAP) {
int flags = ((env->me_flags & MDB_MAPASYNC) && !force) int flags = ((env->me_flags & MDB_MAPASYNC) && !force)
? MS_ASYNC : MS_SYNC; ? MS_ASYNC : MS_SYNC;
@ -3907,19 +3916,50 @@ mdb_page_flush(MDB_txn *txn, int keep)
#endif #endif
#ifdef _WIN32 #ifdef _WIN32
OVERLAPPED ov; OVERLAPPED ov;
MDB_page *wdp;
int async_i = 0;
HANDLE fd = (env->me_flags & MDB_NOSYNC) ? env->me_fd : env->me_ovfd;
#else #else
struct iovec iov[MDB_COMMIT_PAGES]; struct iovec iov[MDB_COMMIT_PAGES];
HANDLE fd = env->me_fd;
#endif
ssize_t wsize = 0, wres; ssize_t wsize = 0, wres;
off_t wpos = 0, next_pos = 1; /* impossible pos, so pos != next_pos */ off_t wpos = 0, next_pos = 1; /* impossible pos, so pos != next_pos */
int n = 0; int n = 0;
#endif
j = i = keep; j = i = keep;
if (env->me_flags & MDB_WRITEMAP) { if (env->me_flags & MDB_WRITEMAP
#ifdef _WIN32
/* On Windows, we still write to the file (with write-through enabled in sync mode),
* as this is faster than FlushViewOfFile/FlushFileBuffers */
&& (env->me_flags & MDB_NOSYNC)
#endif
) {
goto done; goto done;
} }
#ifdef _WIN32
if (pagecount - keep >= env->ovs) {
/* Ran out of room in the ov array: allocate a larger one, copy the existing entries and free the previous array */
int ovs = (pagecount - keep) * 1.5; /* provide extra padding to reduce number of re-allocations */
int new_size = ovs * sizeof(OVERLAPPED);
ov = malloc(new_size);
if (ov == NULL)
return ENOMEM;
int previous_size = env->ovs * sizeof(OVERLAPPED);
memcpy(ov, env->ov, previous_size); /* Copy previous OVERLAPPED data to retain event handles */
/* And clear rest of memory */
memset(&ov[env->ovs], 0, new_size - previous_size);
if (env->ovs > 0) {
free(env->ov); /* release previous allocation */
}
env->ov = ov;
env->ovs = ovs;
}
#endif
/* Write the pages */ /* Write the pages */
for (;;) { for (;;) {
if (++i <= pagecount) { if (++i <= pagecount) {
@ -3945,60 +3985,58 @@ mdb_page_flush(MDB_txn *txn, int keep)
if (IS_OVERFLOW(dp)) size *= dp->mp_pages; if (IS_OVERFLOW(dp)) size *= dp->mp_pages;
#endif #endif
} }
/* Write up to MDB_COMMIT_PAGES dirty pages at a time. */
if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE
#ifdef _WIN32 #ifdef _WIN32
else break; /* Windows actually supports scatter/gather I/O, but only on
* unbuffered file handles. Since we're relying on the OS page
#if MDB_RPAGE_CACHE * cache for all our data, that's self-defeating. So we just
if (env->me_encfunc) { * write pages one at a time. We use the ov structure to set
MDB_val in, out; * the write offset, to at least save the overhead of a Seek
encp = mdb_page_malloc(txn, nump. 0); * system call.
if (!encp) * If writemap is enabled, consecutive page positions infer
return ENOMEM; * contiguous (mapped) memory.
in.mv_size = size; * Otherwise force write pages one at a time.
in.mv_data = dp; */
out.mv_size = size; || !(env->me_flags & MDB_WRITEMAP)
out.mv_data = encp;
env->me_encfunc(&in, &out, env->me_enckey, 1);
dp = encp;
}
#endif
/* Windows actually supports scatter/gather I/O, but only on
* unbuffered file handles. Since we're relying on the OS page
* cache for all our data, that's self-defeating. So we just
* write pages one at a time. We use the ov structure to set
* the write offset, to at least save the overhead of a Seek
* system call.
*/
DPRINTF(("committing page %"Yu, pgno));
memset(&ov, 0, sizeof(ov));
ov.Offset = pos & 0xffffffff;
ov.OffsetHigh = pos >> 16 >> 16;
rc = 0;
if (!WriteFile(env->me_fd, dp, size, NULL, &ov))
rc = ErrCode();
#if MDB_RPAGE_CACHE
if (env->me_encfunc)
mdb_dpage_free_n(env, dp, nump);
#endif #endif
if (rc) { ) {
DPRINTF(("WriteFile: %d", rc));
return rc;
}
#else
/* Write up to MDB_COMMIT_PAGES dirty pages at a time. */
if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) {
if (n) { if (n) {
retry_write: retry_write:
rc = 0; rc = 0;
/* Write previous page(s) */ /* Write previous page(s) */
#ifdef _WIN32
OVERLAPPED *this_ov = &ov[async_i];
/* Clear status, and keep hEvent, we reuse that */
this_ov->Internal = 0;
this_ov->Offset = wpos & 0xffffffff;
this_ov->OffsetHigh = wpos >> 16 >> 16;
if (!F_ISSET(env->me_flags, MDB_NOSYNC) && !this_ov->hEvent) {
HANDLE event = CreateEvent(NULL, FALSE, FALSE, NULL);
if (!event) {
rc = ErrCode();
DPRINTF(("CreateEvent: %s", strerror(rc)));
return rc;
}
this_ov->hEvent = event;
}
if (!WriteFile(fd, wdp, wsize, NULL, this_ov)) {
rc = ErrCode();
if (rc != ERROR_IO_PENDING) {
DPRINTF(("WriteFile: %d", rc));
return rc;
}
}
async_i++;
#else /* _WIN32 */
#ifdef MDB_USE_PWRITEV #ifdef MDB_USE_PWRITEV
wres = pwritev(env->me_fd, iov, n, wpos); wres = pwritev(fd, iov, n, wpos);
#else #else
if (n == 1) { if (n == 1) {
wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos); wres = pwrite(fd, iov[0].iov_base, wsize, wpos);
} else { } else {
retry_seek: retry_seek:
if (lseek(env->me_fd, wpos, SEEK_SET) == -1) { if (lseek(fd, wpos, SEEK_SET) == -1) {
rc = ErrCode(); rc = ErrCode();
if (rc == EINTR) if (rc == EINTR)
goto retry_seek; goto retry_seek;
@ -4006,7 +4044,7 @@ retry_seek:
wres = wsize; wres = wsize;
} else { } else {
rc = 0; rc = 0;
wres = writev(env->me_fd, iov, n); wres = writev(fd, iov, n);
} }
} }
#endif #endif
@ -4021,6 +4059,7 @@ retry_seek:
DPUTS("short write, filesystem full?"); DPUTS("short write, filesystem full?");
} }
} }
#endif /* _WIN32 */
#if MDB_RPAGE_CACHE #if MDB_RPAGE_CACHE
if (env->me_encfunc) { if (env->me_encfunc) {
int j, num1; int j, num1;
@ -4053,13 +4092,16 @@ retry_seek:
dp = encp; dp = encp;
} }
#endif #endif
DPRINTF(("committing page %"Yu, pgno)); #ifdef _WIN32
next_pos = pos + size; wdp = dp;
#else
iov[n].iov_len = size; iov[n].iov_len = size;
iov[n].iov_base = (char *)dp; iov[n].iov_base = (char *)dp;
#endif
DPRINTF(("committing page %"Yu, pgno));
next_pos = pos + size;
wsize += size; wsize += size;
n++; n++;
#endif /* _WIN32 */
} }
#if MDB_RPAGE_CACHE #if MDB_RPAGE_CACHE
if (MDB_REMAPPING(env->me_flags) && pgno > txn->mt_last_pgno) if (MDB_REMAPPING(env->me_flags) && pgno > txn->mt_last_pgno)
@ -4072,21 +4114,49 @@ retry_seek:
*/ */
CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE); CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE);
for (i = keep; ++i <= pagecount; ) { #ifdef _WIN32
dp = dl[i].mptr; if (!F_ISSET(env->me_flags, MDB_NOSYNC)) {
/* This is a page we skipped above */ /* Now wait for all the asynchronous/overlapped sync/write-through writes to complete.
if (!dl[i].mid) { * We start with the last one so that all the others should already be complete and
dl[++j] = dl[i]; * we reduce thread suspend/resuming (in practice, typically about 99.5% of writes are
dl[j].mid = dp->mp_pgno; * done after the last write is done) */
continue; rc = 0;
while (--async_i >= 0) {
if (ov[async_i].hEvent) {
if (!GetOverlappedResult(fd, &ov[async_i], &wres, TRUE)) {
rc = ErrCode(); /* Continue on so that all the event signals are reset */
}
}
}
if (rc) { /* any error on GetOverlappedResult, exit now */
return rc;
}
}
#endif /* _WIN32 */
if (!(env->me_flags & MDB_WRITEMAP)) {
/* Don't free pages when using writemap (can only get here in NOSYNC mode in Windows)
* MIPS has cache coherency issues, this is a no-op everywhere else
* Note: for any size >= on-chip cache size, entire on-chip cache is
* flushed.
*/
for (i = keep; ++i <= pagecount; ) {
dp = dl[i].mptr;
/* This is a page we skipped above */
if (!dl[i].mid) {
dl[++j] = dl[i];
dl[j].mid = dp->mp_pgno;
continue;
}
mdb_dpage_free(env, dp);
} }
mdb_dpage_free(env, dp);
} }
done:
i--; i--;
txn->mt_dirty_room += i - j; txn->mt_dirty_room += i - j;
dl[0].mid = j; dl[0].mid = j;
done:
return MDB_SUCCESS; return MDB_SUCCESS;
} }
@ -4443,7 +4513,6 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
if (len == -1 && ErrCode() == EINTR) continue; \ if (len == -1 && ErrCode() == EINTR) continue; \
rc = (len >= 0); break; } while(1) rc = (len >= 0); break; } while(1)
#endif #endif
DPUTS("writing new meta page"); DPUTS("writing new meta page");
psize = env->me_psize; psize = env->me_psize;
@ -4513,6 +4582,7 @@ mdb_env_write_meta(MDB_txn *txn)
if (mapsize < env->me_mapsize) if (mapsize < env->me_mapsize)
mapsize = env->me_mapsize; mapsize = env->me_mapsize;
#ifndef _WIN32 /* We don't want to ever use MSYNC/FlushViewOfFile in Windows */
if (flags & MDB_WRITEMAP) { if (flags & MDB_WRITEMAP) {
mp->mm_mapsize = mapsize; mp->mm_mapsize = mapsize;
mp->mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; mp->mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
@ -4528,11 +4598,10 @@ mdb_env_write_meta(MDB_txn *txn)
unsigned meta_size = env->me_psize; unsigned meta_size = env->me_psize;
rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC;
ptr = (char *)mp - PAGEHDRSZ; ptr = (char *)mp - PAGEHDRSZ;
#ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */ /* POSIX msync() requires ptr = start of OS page */
r2 = (ptr - env->me_map) & (env->me_os_psize - 1); r2 = (ptr - env->me_map) & (env->me_os_psize - 1);
ptr -= r2; ptr -= r2;
meta_size += r2; meta_size += r2;
#endif
if (MDB_MSYNC(ptr, meta_size, rc)) { if (MDB_MSYNC(ptr, meta_size, rc)) {
rc = ErrCode(); rc = ErrCode();
goto fail; goto fail;
@ -4540,6 +4609,7 @@ mdb_env_write_meta(MDB_txn *txn)
} }
goto done; goto done;
} }
#endif
metab.mm_txnid = mp->mm_txnid; metab.mm_txnid = mp->mm_txnid;
metab.mm_last_pg = mp->mm_last_pg; metab.mm_last_pg = mp->mm_last_pg;
@ -4921,7 +4991,7 @@ mdb_fname_init(const char *path, unsigned envflags, MDB_name *fname)
/** File type, access mode etc. for #mdb_fopen() */ /** File type, access mode etc. for #mdb_fopen() */
enum mdb_fopen_type { enum mdb_fopen_type {
#ifdef _WIN32 #ifdef _WIN32
MDB_O_RDONLY, MDB_O_RDWR, MDB_O_META, MDB_O_COPY, MDB_O_LOCKS MDB_O_RDONLY, MDB_O_RDWR, MDB_O_OVERLAPPED, MDB_O_META, MDB_O_COPY, MDB_O_LOCKS
#else #else
/* A comment in mdb_fopen() explains some O_* flag choices. */ /* A comment in mdb_fopen() explains some O_* flag choices. */
MDB_O_RDONLY= O_RDONLY, /**< for RDONLY me_fd */ MDB_O_RDONLY= O_RDONLY, /**< for RDONLY me_fd */
@ -4982,6 +5052,11 @@ mdb_fopen(const MDB_env *env, MDB_name *fname,
disp = OPEN_ALWAYS; disp = OPEN_ALWAYS;
attrs = FILE_ATTRIBUTE_NORMAL; attrs = FILE_ATTRIBUTE_NORMAL;
switch (which) { switch (which) {
case MDB_O_OVERLAPPED: /* for asynchronous (overlapped) write-through writes */
acc = GENERIC_WRITE;
disp = OPEN_EXISTING;
attrs = FILE_FLAG_OVERLAPPED|FILE_FLAG_WRITE_THROUGH;
break;
case MDB_O_RDONLY: /* read-only datafile */ case MDB_O_RDONLY: /* read-only datafile */
acc = GENERIC_READ; acc = GENERIC_READ;
disp = OPEN_EXISTING; disp = OPEN_EXISTING;
@ -5071,6 +5146,7 @@ mdb_env_open2(MDB_env *env, int prev)
if (!NtCreateSection) if (!NtCreateSection)
return MDB_PROBLEM; return MDB_PROBLEM;
} }
env->ovs = 0;
#endif /* _WIN32 */ #endif /* _WIN32 */
#ifdef BROKEN_FDATASYNC #ifdef BROKEN_FDATASYNC
@ -5909,6 +5985,11 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
mode, &env->me_fd); mode, &env->me_fd);
if (rc) if (rc)
goto leave; goto leave;
#ifdef _WIN32
rc = mdb_fopen(env, &fname, MDB_O_OVERLAPPED, mode, &env->me_ovfd);
if (rc)
goto leave;
#endif
if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) {
rc = mdb_env_setup_locks(env, &fname, mode, &excl); rc = mdb_env_setup_locks(env, &fname, mode, &excl);
@ -5917,14 +5998,12 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode
} }
if ((rc = mdb_env_open2(env, flags & MDB_PREVSNAPSHOT)) == MDB_SUCCESS) { if ((rc = mdb_env_open2(env, flags & MDB_PREVSNAPSHOT)) == MDB_SUCCESS) {
if (!(flags & (MDB_RDONLY|MDB_WRITEMAP))) { /* Synchronous fd for meta writes. Needed even with
/* Synchronous fd for meta writes. Needed even with * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset.
* MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. */
*/ rc = mdb_fopen(env, &fname, MDB_O_META, mode, &env->me_mfd);
rc = mdb_fopen(env, &fname, MDB_O_META, mode, &env->me_mfd); if (rc)
if (rc) goto leave;
goto leave;
}
DPRINTF(("opened dbenv %p", (void *) env)); DPRINTF(("opened dbenv %p", (void *) env));
if (excl > 0 && !(flags & MDB_PREVSNAPSHOT)) { if (excl > 0 && !(flags & MDB_PREVSNAPSHOT)) {
rc = mdb_env_share_locks(env, &excl); rc = mdb_env_share_locks(env, &excl);
@ -6030,6 +6109,16 @@ mdb_env_close_active(MDB_env *env, int excl)
} }
if (env->me_mfd != INVALID_HANDLE_VALUE) if (env->me_mfd != INVALID_HANDLE_VALUE)
(void) close(env->me_mfd); (void) close(env->me_mfd);
#ifdef _WIN32
if (env->ovs > 0) {
for (i = 0; i < env->ovs; i++) {
CloseHandle(env->ov[i].hEvent);
}
free(env->ov);
}
if (env->me_ovfd != INVALID_HANDLE_VALUE)
(void) close(env->me_ovfd);
#endif
if (env->me_fd != INVALID_HANDLE_VALUE) if (env->me_fd != INVALID_HANDLE_VALUE)
(void) close(env->me_fd); (void) close(env->me_fd);
if (env->me_txns) { if (env->me_txns) {

Loading…
Cancel
Save