|
|
|
@ -1565,9 +1565,12 @@ struct MDB_env { |
|
|
|
|
HANDLE me_fd; /**< The main data file */ |
|
|
|
|
HANDLE me_lfd; /**< The lock file */ |
|
|
|
|
HANDLE me_mfd; /**< For writing and syncing the meta pages */ |
|
|
|
|
#if MDB_RPAGE_CACHE && defined(_WIN32) |
|
|
|
|
#ifdef _WIN32 |
|
|
|
|
#ifdef MDB_RPAGE_CACHE |
|
|
|
|
HANDLE me_fmh; /**< File Mapping handle */ |
|
|
|
|
#endif |
|
|
|
|
HANDLE me_ovfd; /**< Overlapped/async with write-through file handle */ |
|
|
|
|
#endif /* _WIN32 */ |
|
|
|
|
/** Failed to update the meta page. Probably an I/O error. */ |
|
|
|
|
#define MDB_FATAL_ERROR 0x80000000U |
|
|
|
|
/** Some fields are initialized. */ |
|
|
|
@ -1620,6 +1623,8 @@ struct MDB_env { |
|
|
|
|
int me_live_reader; /**< have liveness lock in reader table */ |
|
|
|
|
#ifdef _WIN32 |
|
|
|
|
int me_pidquery; /**< Used in OpenProcess */ |
|
|
|
|
OVERLAPPED *ov; /**< Used for for overlapping I/O requests */ |
|
|
|
|
int ovs; /**< Count of OVERLAPPEDs */ |
|
|
|
|
#endif |
|
|
|
|
#ifdef MDB_USE_POSIX_MUTEX /* Posix mutexes reside in shared mem */ |
|
|
|
|
# define me_rmutex me_txns->mti_rmutex /**< Shared reader lock */ |
|
|
|
@ -2977,7 +2982,11 @@ mdb_env_sync0(MDB_env *env, int force, pgno_t numpgs) |
|
|
|
|
int rc = 0; |
|
|
|
|
if (env->me_flags & MDB_RDONLY) |
|
|
|
|
return EACCES; |
|
|
|
|
if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { |
|
|
|
|
if (force |
|
|
|
|
#ifndef _WIN32 /* Sync is normally achieved in Windows by doing WRITE_THROUGH writes */ |
|
|
|
|
|| !(env->me_flags & MDB_NOSYNC) |
|
|
|
|
#endif |
|
|
|
|
) { |
|
|
|
|
if (env->me_flags & MDB_WRITEMAP) { |
|
|
|
|
int flags = ((env->me_flags & MDB_MAPASYNC) && !force) |
|
|
|
|
? MS_ASYNC : MS_SYNC; |
|
|
|
@ -3907,19 +3916,50 @@ mdb_page_flush(MDB_txn *txn, int keep) |
|
|
|
|
#endif |
|
|
|
|
#ifdef _WIN32 |
|
|
|
|
OVERLAPPED ov; |
|
|
|
|
MDB_page *wdp; |
|
|
|
|
int async_i = 0; |
|
|
|
|
HANDLE fd = (env->me_flags & MDB_NOSYNC) ? env->me_fd : env->me_ovfd; |
|
|
|
|
#else |
|
|
|
|
struct iovec iov[MDB_COMMIT_PAGES]; |
|
|
|
|
HANDLE fd = env->me_fd; |
|
|
|
|
#endif |
|
|
|
|
ssize_t wsize = 0, wres; |
|
|
|
|
off_t wpos = 0, next_pos = 1; /* impossible pos, so pos != next_pos */ |
|
|
|
|
int n = 0; |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
j = i = keep; |
|
|
|
|
|
|
|
|
|
if (env->me_flags & MDB_WRITEMAP) { |
|
|
|
|
if (env->me_flags & MDB_WRITEMAP |
|
|
|
|
#ifdef _WIN32 |
|
|
|
|
/* In windows, we still do writes to the file (with write-through enabled in sync mode),
|
|
|
|
|
* as this is faster than FlushViewOfFile/FlushFileBuffers */ |
|
|
|
|
&& (env->me_flags & MDB_NOSYNC) |
|
|
|
|
#endif |
|
|
|
|
) { |
|
|
|
|
goto done; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#ifdef _WIN32 |
|
|
|
|
if (pagecount - keep >= env->ovs) { |
|
|
|
|
/* ran out of room in ov array, and re-malloc, copy handles and free previous */ |
|
|
|
|
int ovs = (pagecount - keep) * 1.5; /* provide extra padding to reduce number of re-allocations */ |
|
|
|
|
int new_size = ovs * sizeof(OVERLAPPED); |
|
|
|
|
ov = malloc(new_size); |
|
|
|
|
if (ov == NULL) |
|
|
|
|
return ENOMEM; |
|
|
|
|
int previous_size = env->ovs * sizeof(OVERLAPPED); |
|
|
|
|
memcpy(ov, env->ov, previous_size); /* Copy previous OVERLAPPED data to retain event handles */ |
|
|
|
|
/* And clear rest of memory */ |
|
|
|
|
memset(&ov[env->ovs], 0, new_size - previous_size); |
|
|
|
|
if (env->ovs > 0) { |
|
|
|
|
free(env->ov); /* release previous allocation */ |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
env->ov = ov; |
|
|
|
|
env->ovs = ovs; |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
/* Write the pages */ |
|
|
|
|
for (;;) { |
|
|
|
|
if (++i <= pagecount) { |
|
|
|
@ -3945,60 +3985,58 @@ mdb_page_flush(MDB_txn *txn, int keep) |
|
|
|
|
if (IS_OVERFLOW(dp)) size *= dp->mp_pages; |
|
|
|
|
#endif |
|
|
|
|
} |
|
|
|
|
/* Write up to MDB_COMMIT_PAGES dirty pages at a time. */ |
|
|
|
|
if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE |
|
|
|
|
#ifdef _WIN32 |
|
|
|
|
else break; |
|
|
|
|
|
|
|
|
|
#if MDB_RPAGE_CACHE |
|
|
|
|
if (env->me_encfunc) { |
|
|
|
|
MDB_val in, out; |
|
|
|
|
encp = mdb_page_malloc(txn, nump. 0); |
|
|
|
|
if (!encp) |
|
|
|
|
return ENOMEM; |
|
|
|
|
in.mv_size = size; |
|
|
|
|
in.mv_data = dp; |
|
|
|
|
out.mv_size = size; |
|
|
|
|
out.mv_data = encp; |
|
|
|
|
env->me_encfunc(&in, &out, env->me_enckey, 1); |
|
|
|
|
dp = encp; |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
/* Windows actually supports scatter/gather I/O, but only on
|
|
|
|
|
* unbuffered file handles. Since we're relying on the OS page |
|
|
|
|
* cache for all our data, that's self-defeating. So we just |
|
|
|
|
* write pages one at a time. We use the ov structure to set |
|
|
|
|
* the write offset, to at least save the overhead of a Seek |
|
|
|
|
* system call. |
|
|
|
|
* If writemap is enabled, consecutive page positions infer |
|
|
|
|
* contiguous (mapped) memory. |
|
|
|
|
* Otherwise force write pages one at a time. |
|
|
|
|
*/ |
|
|
|
|
DPRINTF(("committing page %"Yu, pgno)); |
|
|
|
|
memset(&ov, 0, sizeof(ov)); |
|
|
|
|
ov.Offset = pos & 0xffffffff; |
|
|
|
|
ov.OffsetHigh = pos >> 16 >> 16; |
|
|
|
|
rc = 0; |
|
|
|
|
if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) |
|
|
|
|
rc = ErrCode(); |
|
|
|
|
#if MDB_RPAGE_CACHE |
|
|
|
|
if (env->me_encfunc) |
|
|
|
|
mdb_dpage_free_n(env, dp, nump); |
|
|
|
|
|| !(env->me_flags & MDB_WRITEMAP) |
|
|
|
|
#endif |
|
|
|
|
if (rc) { |
|
|
|
|
DPRINTF(("WriteFile: %d", rc)); |
|
|
|
|
return rc; |
|
|
|
|
} |
|
|
|
|
#else |
|
|
|
|
/* Write up to MDB_COMMIT_PAGES dirty pages at a time. */ |
|
|
|
|
if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) { |
|
|
|
|
) { |
|
|
|
|
if (n) { |
|
|
|
|
retry_write: |
|
|
|
|
rc = 0; |
|
|
|
|
/* Write previous page(s) */ |
|
|
|
|
#ifdef _WIN32 |
|
|
|
|
OVERLAPPED *this_ov = &ov[async_i]; |
|
|
|
|
/* Clear status, and keep hEvent, we reuse that */ |
|
|
|
|
this_ov->Internal = 0; |
|
|
|
|
this_ov->Offset = wpos & 0xffffffff; |
|
|
|
|
this_ov->OffsetHigh = wpos >> 16 >> 16; |
|
|
|
|
if (!F_ISSET(env->me_flags, MDB_NOSYNC) && !this_ov->hEvent) { |
|
|
|
|
HANDLE event = CreateEvent(NULL, FALSE, FALSE, NULL); |
|
|
|
|
if (!event) { |
|
|
|
|
rc = ErrCode(); |
|
|
|
|
DPRINTF(("CreateEvent: %s", strerror(rc))); |
|
|
|
|
return rc; |
|
|
|
|
} |
|
|
|
|
this_ov->hEvent = event; |
|
|
|
|
} |
|
|
|
|
if (!WriteFile(fd, wdp, wsize, NULL, this_ov)) { |
|
|
|
|
rc = ErrCode(); |
|
|
|
|
if (rc != ERROR_IO_PENDING) { |
|
|
|
|
DPRINTF(("WriteFile: %d", rc)); |
|
|
|
|
return rc; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
async_i++; |
|
|
|
|
#else /* _WIN32 */ |
|
|
|
|
#ifdef MDB_USE_PWRITEV |
|
|
|
|
wres = pwritev(env->me_fd, iov, n, wpos); |
|
|
|
|
wres = pwritev(fd, iov, n, wpos); |
|
|
|
|
#else |
|
|
|
|
if (n == 1) { |
|
|
|
|
wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos); |
|
|
|
|
wres = pwrite(fd, iov[0].iov_base, wsize, wpos); |
|
|
|
|
} else { |
|
|
|
|
retry_seek: |
|
|
|
|
if (lseek(env->me_fd, wpos, SEEK_SET) == -1) { |
|
|
|
|
if (lseek(fd, wpos, SEEK_SET) == -1) { |
|
|
|
|
rc = ErrCode(); |
|
|
|
|
if (rc == EINTR) |
|
|
|
|
goto retry_seek; |
|
|
|
@ -4006,7 +4044,7 @@ retry_seek: |
|
|
|
|
wres = wsize; |
|
|
|
|
} else { |
|
|
|
|
rc = 0; |
|
|
|
|
wres = writev(env->me_fd, iov, n); |
|
|
|
|
wres = writev(fd, iov, n); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
@ -4021,6 +4059,7 @@ retry_seek: |
|
|
|
|
DPUTS("short write, filesystem full?"); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
#endif /* _WIN32 */ |
|
|
|
|
#if MDB_RPAGE_CACHE |
|
|
|
|
if (env->me_encfunc) { |
|
|
|
|
int j, num1; |
|
|
|
@ -4053,13 +4092,16 @@ retry_seek: |
|
|
|
|
dp = encp; |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
DPRINTF(("committing page %"Yu, pgno)); |
|
|
|
|
next_pos = pos + size; |
|
|
|
|
#ifdef _WIN32 |
|
|
|
|
wdp = dp; |
|
|
|
|
#else |
|
|
|
|
iov[n].iov_len = size; |
|
|
|
|
iov[n].iov_base = (char *)dp; |
|
|
|
|
#endif |
|
|
|
|
DPRINTF(("committing page %"Yu, pgno)); |
|
|
|
|
next_pos = pos + size; |
|
|
|
|
wsize += size; |
|
|
|
|
n++; |
|
|
|
|
#endif /* _WIN32 */ |
|
|
|
|
} |
|
|
|
|
#if MDB_RPAGE_CACHE |
|
|
|
|
if (MDB_REMAPPING(env->me_flags) && pgno > txn->mt_last_pgno) |
|
|
|
@ -4072,6 +4114,32 @@ retry_seek: |
|
|
|
|
*/ |
|
|
|
|
CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE); |
|
|
|
|
|
|
|
|
|
#ifdef _WIN32 |
|
|
|
|
if (!F_ISSET(env->me_flags, MDB_NOSYNC)) { |
|
|
|
|
/* Now wait for all the asynchronous/overlapped sync/write-through writes to complete.
|
|
|
|
|
* We start with the last one so that all the others should already be complete and |
|
|
|
|
* we reduce thread suspend/resuming (in practice, typically about 99.5% of writes are |
|
|
|
|
* done after the last write is done) */ |
|
|
|
|
rc = 0; |
|
|
|
|
while (--async_i >= 0) { |
|
|
|
|
if (ov[async_i].hEvent) { |
|
|
|
|
if (!GetOverlappedResult(fd, &ov[async_i], &wres, TRUE)) { |
|
|
|
|
rc = ErrCode(); /* Continue on so that all the event signals are reset */ |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
if (rc) { /* any error on GetOverlappedResult, exit now */ |
|
|
|
|
return rc; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
#endif /* _WIN32 */ |
|
|
|
|
|
|
|
|
|
if (!(env->me_flags & MDB_WRITEMAP)) { |
|
|
|
|
/* Don't free pages when using writemap (can only get here in NOSYNC mode in Windows)
|
|
|
|
|
* MIPS has cache coherency issues, this is a no-op everywhere else |
|
|
|
|
* Note: for any size >= on-chip cache size, entire on-chip cache is |
|
|
|
|
* flushed. |
|
|
|
|
*/ |
|
|
|
|
for (i = keep; ++i <= pagecount; ) { |
|
|
|
|
dp = dl[i].mptr; |
|
|
|
|
/* This is a page we skipped above */ |
|
|
|
@ -4082,11 +4150,13 @@ retry_seek: |
|
|
|
|
} |
|
|
|
|
mdb_dpage_free(env, dp); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
done: |
|
|
|
|
i--; |
|
|
|
|
txn->mt_dirty_room += i - j; |
|
|
|
|
dl[0].mid = j; |
|
|
|
|
|
|
|
|
|
done: |
|
|
|
|
return MDB_SUCCESS; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -4443,7 +4513,6 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta) |
|
|
|
|
if (len == -1 && ErrCode() == EINTR) continue; \
|
|
|
|
|
rc = (len >= 0); break; } while(1) |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
DPUTS("writing new meta page"); |
|
|
|
|
|
|
|
|
|
psize = env->me_psize; |
|
|
|
@ -4513,6 +4582,7 @@ mdb_env_write_meta(MDB_txn *txn) |
|
|
|
|
if (mapsize < env->me_mapsize) |
|
|
|
|
mapsize = env->me_mapsize; |
|
|
|
|
|
|
|
|
|
#ifndef _WIN32 /* We don't want to ever use MSYNC/FlushViewOfFile in Windows */ |
|
|
|
|
if (flags & MDB_WRITEMAP) { |
|
|
|
|
mp->mm_mapsize = mapsize; |
|
|
|
|
mp->mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; |
|
|
|
@ -4528,11 +4598,10 @@ mdb_env_write_meta(MDB_txn *txn) |
|
|
|
|
unsigned meta_size = env->me_psize; |
|
|
|
|
rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; |
|
|
|
|
ptr = (char *)mp - PAGEHDRSZ; |
|
|
|
|
#ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */ |
|
|
|
|
/* POSIX msync() requires ptr = start of OS page */ |
|
|
|
|
r2 = (ptr - env->me_map) & (env->me_os_psize - 1); |
|
|
|
|
ptr -= r2; |
|
|
|
|
meta_size += r2; |
|
|
|
|
#endif |
|
|
|
|
if (MDB_MSYNC(ptr, meta_size, rc)) { |
|
|
|
|
rc = ErrCode(); |
|
|
|
|
goto fail; |
|
|
|
@ -4540,6 +4609,7 @@ mdb_env_write_meta(MDB_txn *txn) |
|
|
|
|
} |
|
|
|
|
goto done; |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
metab.mm_txnid = mp->mm_txnid; |
|
|
|
|
metab.mm_last_pg = mp->mm_last_pg; |
|
|
|
|
|
|
|
|
@ -4921,7 +4991,7 @@ mdb_fname_init(const char *path, unsigned envflags, MDB_name *fname) |
|
|
|
|
/** File type, access mode etc. for #mdb_fopen() */ |
|
|
|
|
enum mdb_fopen_type { |
|
|
|
|
#ifdef _WIN32 |
|
|
|
|
MDB_O_RDONLY, MDB_O_RDWR, MDB_O_META, MDB_O_COPY, MDB_O_LOCKS |
|
|
|
|
MDB_O_RDONLY, MDB_O_RDWR, MDB_O_OVERLAPPED, MDB_O_META, MDB_O_COPY, MDB_O_LOCKS |
|
|
|
|
#else |
|
|
|
|
/* A comment in mdb_fopen() explains some O_* flag choices. */ |
|
|
|
|
MDB_O_RDONLY= O_RDONLY, /**< for RDONLY me_fd */ |
|
|
|
@ -4982,6 +5052,11 @@ mdb_fopen(const MDB_env *env, MDB_name *fname, |
|
|
|
|
disp = OPEN_ALWAYS; |
|
|
|
|
attrs = FILE_ATTRIBUTE_NORMAL; |
|
|
|
|
switch (which) { |
|
|
|
|
case MDB_O_OVERLAPPED: /* for unbuffered asynchronous writes (write-through mode)*/ |
|
|
|
|
acc = GENERIC_WRITE; |
|
|
|
|
disp = OPEN_EXISTING; |
|
|
|
|
attrs = FILE_FLAG_OVERLAPPED|FILE_FLAG_WRITE_THROUGH; |
|
|
|
|
break; |
|
|
|
|
case MDB_O_RDONLY: /* read-only datafile */ |
|
|
|
|
acc = GENERIC_READ; |
|
|
|
|
disp = OPEN_EXISTING; |
|
|
|
@ -5071,6 +5146,7 @@ mdb_env_open2(MDB_env *env, int prev) |
|
|
|
|
if (!NtCreateSection) |
|
|
|
|
return MDB_PROBLEM; |
|
|
|
|
} |
|
|
|
|
env->ovs = 0; |
|
|
|
|
#endif /* _WIN32 */ |
|
|
|
|
|
|
|
|
|
#ifdef BROKEN_FDATASYNC |
|
|
|
@ -5909,6 +5985,11 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode |
|
|
|
|
mode, &env->me_fd); |
|
|
|
|
if (rc) |
|
|
|
|
goto leave; |
|
|
|
|
#ifdef _WIN32 |
|
|
|
|
rc = mdb_fopen(env, &fname, MDB_O_OVERLAPPED, mode, &env->me_ovfd); |
|
|
|
|
if (rc) |
|
|
|
|
goto leave; |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { |
|
|
|
|
rc = mdb_env_setup_locks(env, &fname, mode, &excl); |
|
|
|
@ -5917,14 +5998,12 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if ((rc = mdb_env_open2(env, flags & MDB_PREVSNAPSHOT)) == MDB_SUCCESS) { |
|
|
|
|
if (!(flags & (MDB_RDONLY|MDB_WRITEMAP))) { |
|
|
|
|
/* Synchronous fd for meta writes. Needed even with
|
|
|
|
|
* MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. |
|
|
|
|
*/ |
|
|
|
|
rc = mdb_fopen(env, &fname, MDB_O_META, mode, &env->me_mfd); |
|
|
|
|
if (rc) |
|
|
|
|
goto leave; |
|
|
|
|
} |
|
|
|
|
DPRINTF(("opened dbenv %p", (void *) env)); |
|
|
|
|
if (excl > 0 && !(flags & MDB_PREVSNAPSHOT)) { |
|
|
|
|
rc = mdb_env_share_locks(env, &excl); |
|
|
|
@ -6030,6 +6109,16 @@ mdb_env_close_active(MDB_env *env, int excl) |
|
|
|
|
} |
|
|
|
|
if (env->me_mfd != INVALID_HANDLE_VALUE) |
|
|
|
|
(void) close(env->me_mfd); |
|
|
|
|
#ifdef _WIN32 |
|
|
|
|
if (env->ovs > 0) { |
|
|
|
|
for (i = 0; i < env->ovs; i++) { |
|
|
|
|
CloseHandle(env->ov[i].hEvent); |
|
|
|
|
} |
|
|
|
|
free(env->ov); |
|
|
|
|
} |
|
|
|
|
if (env->me_ovfd != INVALID_HANDLE_VALUE) |
|
|
|
|
(void) close(env->me_ovfd); |
|
|
|
|
#endif |
|
|
|
|
if (env->me_fd != INVALID_HANDLE_VALUE) |
|
|
|
|
(void) close(env->me_fd); |
|
|
|
|
if (env->me_txns) { |
|
|
|
|