From b00761eea6f3c3413dab76ebd99f7a48911e4bb9 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Fri, 11 Oct 2019 18:18:14 -0700 Subject: [PATCH] Fix block cache ID uniqueness for Windows builds (#5844) Summary: Since we do not evict a file's blocks from block cache before that file is deleted, we require a file's cache ID prefix is both unique and non-reusable. However, the Windows functionality we were relying on only guaranteed uniqueness. That meant a newly created file could be assigned the same cache ID prefix as a deleted file. If the newly created file had block offsets matching the deleted file, full cache keys could be exactly the same, resulting in obsolete data blocks returned from cache when trying to read from the new file. We noticed this when running on FAT32 where compaction was writing out of order keys due to reading obsolete blocks from its input files. The functionality is documented as behaving the same on NTFS, although I wasn't able to repro it there. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5844 Test Plan: we had a reliable repro of out-of-order keys on FAT32 that was fixed by this change Differential Revision: D17752442 fbshipit-source-id: 95d983f9196cf415f269e19293b97341edbf7e00 --- db/db_test2.cc | 7 +- env/env_test.cc | 11 +++- port/win/io_win.cc | 66 ++++--------------- table/block_based/block_based_table_reader.cc | 4 +- .../persistent_cache/persistent_cache_test.cc | 9 ++- 5 files changed, 33 insertions(+), 64 deletions(-) diff --git a/db/db_test2.cc b/db/db_test2.cc index c50d55cb9..8d8b2171b 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -1942,7 +1942,10 @@ TEST_F(DBTest2, TestPerfContextIterCpuTime) { } #endif // OS_LINUX -#ifndef OS_SOLARIS // GetUniqueIdFromFile is not implemented +// GetUniqueIdFromFile is not implemented on these platforms. Persistent cache +// breaks when that function is not implemented and no regular block cache is +// provided. +#if !defined(OS_SOLARIS) && !defined(OS_WIN) TEST_F(DBTest2, PersistentCache) { int num_iter = 80; @@ -2006,7 +2009,7 @@ TEST_F(DBTest2, PersistentCache) { } } } -#endif // !OS_SOLARIS +#endif // !defined(OS_SOLARIS) && !defined(OS_WIN) namespace { void CountSyncPoint() { diff --git a/env/env_test.cc b/env/env_test.cc index fe42b8520..004adf26e 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -883,7 +883,9 @@ TEST_F(EnvPosixTest, PositionedAppend) { } #endif // !ROCKSDB_LITE -// Only works in linux platforms +// `GetUniqueId()` temporarily returns zero on Windows. `BlockBasedTable` can +// handle a return value of zero but this test case cannot. +#ifndef OS_WIN TEST_P(EnvPosixTestWithParam, RandomAccessUniqueID) { // Create file. if (env_ == Env::Default()) { @@ -926,6 +928,7 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueID) { env_->DeleteFile(fname); } } +#endif // !defined(OS_WIN) // only works in linux platforms #ifdef ROCKSDB_FALLOCATE_PRESENT @@ -1016,7 +1019,9 @@ bool HasPrefix(const std::unordered_set& ss) { return false; } -// Only works in linux and WIN platforms +// `GetUniqueId()` temporarily returns zero on Windows. `BlockBasedTable` can +// handle a return value of zero but this test case cannot. +#ifndef OS_WIN TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDConcurrent) { if (env_ == Env::Default()) { // Check whether a bunch of concurrently existing files have unique IDs. @@ -1058,7 +1063,6 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDConcurrent) { } } -// Only works in linux and WIN platforms TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDDeletes) { if (env_ == Env::Default()) { EnvOptions soptions; @@ -1098,6 +1102,7 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDDeletes) { ASSERT_TRUE(!HasPrefix(ids)); } } +#endif // !defined(OS_WIN) TEST_P(EnvPosixTestWithParam, MultiRead) { EnvOptions soptions; diff --git a/port/win/io_win.cc b/port/win/io_win.cc index 6fbf6fc63..e050d69dd 100644 --- a/port/win/io_win.cc +++ b/port/win/io_win.cc @@ -175,60 +175,18 @@ Status ftruncate(const std::string& filename, HANDLE hFile, return status; } -size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size) { - - if (max_size < kMaxVarint64Length * 3) { - return 0; - } -#if (_WIN32_WINNT == _WIN32_WINNT_VISTA) - // MINGGW as defined by CMake file. - // yuslepukhin: I hate the guts of the above macros. - // This impl does not guarantee uniqueness everywhere - // is reasonably good - BY_HANDLE_FILE_INFORMATION FileInfo; - - BOOL result = GetFileInformationByHandle(hFile, &FileInfo); - - TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result); - - if (!result) { - return 0; - } - - char* rid = id; - rid = EncodeVarint64(rid, uint64_t(FileInfo.dwVolumeSerialNumber)); - rid = EncodeVarint64(rid, uint64_t(FileInfo.nFileIndexHigh)); - rid = EncodeVarint64(rid, uint64_t(FileInfo.nFileIndexLow)); - - assert(rid >= id); - return static_cast(rid - id); -#else - FILE_ID_INFO FileInfo; - BOOL result = GetFileInformationByHandleEx(hFile, FileIdInfo, &FileInfo, - sizeof(FileInfo)); - - TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result); - - if (!result) { - return 0; - } - - static_assert(sizeof(uint64_t) == sizeof(FileInfo.VolumeSerialNumber), - "Wrong sizeof expectations"); - // FileId.Identifier is an array of 16 BYTEs, we encode them as two uint64_t - static_assert(sizeof(uint64_t) * 2 == sizeof(FileInfo.FileId.Identifier), - "Wrong sizeof expectations"); - - char* rid = id; - rid = EncodeVarint64(rid, uint64_t(FileInfo.VolumeSerialNumber)); - uint64_t* file_id = reinterpret_cast(&FileInfo.FileId.Identifier[0]); - rid = EncodeVarint64(rid, *file_id); - ++file_id; - rid = EncodeVarint64(rid, *file_id); - - assert(rid >= id); - return static_cast(rid - id); -#endif +size_t GetUniqueIdFromFile(HANDLE /*hFile*/, char* /*id*/, + size_t /*max_size*/) { + // Returning 0 is safe as it causes the table reader to generate a unique ID. + // This is suboptimal for performance as it prevents multiple table readers + // for the same file from sharing cached blocks. For example, if users have + // a low value for `max_open_files`, there can be many table readers opened + // for the same file. + // + // TODO: this is a temporarily solution as it is safe but not optimal for + // performance. For more details see discussion in + // https://github.com/facebook/rocksdb/pull/5844. + return 0; } //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index acab50aaf..8baf39987 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -981,7 +981,7 @@ void BlockBasedTable::GenerateCachePrefix(Cache* cc, RandomAccessFile* file, // If the prefix wasn't generated or was too long, // create one from the cache. - if (cc && *size == 0) { + if (cc != nullptr && *size == 0) { char* end = EncodeVarint64(buffer, cc->NewId()); *size = static_cast(end - buffer); } @@ -994,7 +994,7 @@ void BlockBasedTable::GenerateCachePrefix(Cache* cc, WritableFile* file, // If the prefix wasn't generated or was too long, // create one from the cache. - if (*size == 0) { + if (cc != nullptr && *size == 0) { char* end = EncodeVarint64(buffer, cc->NewId()); *size = static_cast(end - buffer); } diff --git a/utilities/persistent_cache/persistent_cache_test.cc b/utilities/persistent_cache/persistent_cache_test.cc index e3b1e39e0..e52cf9bbc 100644 --- a/utilities/persistent_cache/persistent_cache_test.cc +++ b/utilities/persistent_cache/persistent_cache_test.cc @@ -6,7 +6,10 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef ROCKSDB_LITE + +// GetUniqueIdFromFile is not implemented on Windows. Persistent cache +// breaks when that function is not implemented +#if !defined(ROCKSDB_LITE) && !defined(OS_WIN) #include "utilities/persistent_cache/persistent_cache_test.h" @@ -466,6 +469,6 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } -#else +#else // !defined(ROCKSDB_LITE) && !defined(OS_WIN) int main() { return 0; } -#endif +#endif // !defined(ROCKSDB_LITE) && !defined(OS_WIN)