From 4dcc0c89f440d69fcab80e06f9ff62844873b4c2 Mon Sep 17 00:00:00 2001
From: Kosie van der Merwe
Date: Thu, 31 Jan 2013 15:20:24 -0800
Subject: [PATCH] Fixed cache key for block cache

Summary:
Added a function to `RandomAccessFile` to generate a unique ID for that file.
Currently only `PosixRandomAccessFile` implements this behaviour, and only on
Linux.

Changed how the cache key is generated in `Table::BlockReader`.

Added tests to check that the unique ID is stable, unique and not a prefix of
another unique ID. Added tests to check that `Table` uses the cache more
effectively.

Test Plan: make check

Reviewers: chip, vamsi, dhruba

Reviewed By: chip

CC: leveldb

Differential Revision: https://reviews.facebook.net/D8145
---
 include/leveldb/env.h   |  20 ++++++
 include/leveldb/table.h |   6 ++
 table/table.cc          |  53 +++++++++++++--
 table/table_test.cc     |  87 ++++++++++++++++++++----
 util/coding.h           |   4 ++
 util/env_posix.cc       |  34 ++++++++++
 util/env_test.cc        | 142 ++++++++++++++++++++++++++++++++++++++++
 7 files changed, 329 insertions(+), 17 deletions(-)

diff --git a/include/leveldb/env.h b/include/leveldb/env.h
index ad770305a..b03228cb0 100644
--- a/include/leveldb/env.h
+++ b/include/leveldb/env.h
@@ -223,6 +223,26 @@ class RandomAccessFile {
   // Safe for concurrent use by multiple threads.
   virtual Status Read(uint64_t offset, size_t n, Slice* result,
                       char* scratch) const = 0;
+
+  // Tries to get a unique ID for this file that will be the same each time
+  // the file is opened (and will stay the same while the file is open).
+  // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+  // ID can be created, this function returns the length of the ID and places
+  // it in "id"; otherwise, this function returns 0, in which case "id" may or
+  // may not have been modified.
+  //
+  // This function guarantees that, for IDs from a given environment, two
+  // unique IDs cannot be made equal to each other by appending arbitrary
+  // bytes to one of them. That is, no unique ID is a prefix of another.
+  //
+  // This function guarantees that the returned ID will not be interpretable as
+  // a single varint.
+  //
+  // Note: these IDs are only valid for the duration of the process.
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    return 0;  // Default implementation to prevent issues with backwards
+               // compatibility.
+  }
 };
 
 // A file abstraction for sequential writing.  The implementation
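
The prefix-freedom guarantee above exists because callers append further bytes
to the ID when building cache keys (later in this patch, `Table::BlockReader`
appends a varint-encoded block offset). A minimal sketch of the collision that
rule prevents, using leveldb's `PutVarint64()` and two hypothetical IDs where
one is a prefix of the other:

    #include <cassert>
    #include <string>
    #include "util/coding.h"  // leveldb::PutVarint64()

    void PrefixCollisionExample() {
      // Hypothetical IDs: file A's ID is a prefix of file B's ID.
      std::string key_a("\xff", 1);       // A's unique ID
      std::string key_b("\xff\x81", 2);   // B's unique ID
      leveldb::PutVarint64(&key_a, 129);  // A, block offset 129 -> "\x81\x01"
      leveldb::PutVarint64(&key_b, 1);    // B, block offset 1   -> "\x01"
      assert(key_a == key_b);             // both keys are "\xff\x81\x01"
    }
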
diff --git a/include/leveldb/table.h b/include/leveldb/table.h
index c9aa1e78f..c1526beec 100644
--- a/include/leveldb/table.h
+++ b/include/leveldb/table.h
@@ -58,6 +58,10 @@ class Table {
   // be close to the file length.
   uint64_t ApproximateOffsetOf(const Slice& key) const;
 
+  // Returns true if the block for the specified key is in cache.
+  // REQUIRES: key is in this table.
+  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);
+
  private:
   struct Rep;
   Rep* rep_;
@@ -80,6 +84,8 @@ class Table {
   void ReadMeta(const Footer& footer);
   void ReadFilter(const Slice& filter_handle_value);
 
+  static void SetupCacheKeyPrefix(Rep* rep);
+
   // No copying allowed
   Table(const Table&);
   void operator=(const Table&);

diff --git a/table/table.cc b/table/table.cc
index b3910cdcb..fb72b4a28 100644
--- a/table/table.cc
+++ b/table/table.cc
@@ -18,6 +18,11 @@
 
 namespace leveldb {
 
+// The maximum length of the prefix of the cache key used to identify blocks.
+// We rely on the fact that the unique ID of a Posix file is at most three
+// varints.
+const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1;
+
 struct Table::Rep {
   ~Rep() {
     delete filter;
@@ -28,7 +33,8 @@ struct Table::Rep {
   Options options;
   Status status;
   unique_ptr<RandomAccessFile> file;
-  uint64_t cache_id;
+  char cache_key_prefix[kMaxCacheKeyPrefixSize];
+  size_t cache_key_prefix_size;
   FilterBlockReader* filter;
   const char* filter_data;
 
@@ -36,6 +42,25 @@
   Block* index_block;
 };
 
+// Helper function to set up the cache key prefix for the Table.
+void Table::SetupCacheKeyPrefix(Rep* rep) {
+  assert(kMaxCacheKeyPrefixSize >= 10);
+  rep->cache_key_prefix_size = 0;
+  if (rep->options.block_cache) {
+    rep->cache_key_prefix_size = rep->file->GetUniqueId(rep->cache_key_prefix,
+                                                        kMaxCacheKeyPrefixSize);
+
+    if (rep->cache_key_prefix_size == 0) {
+      // If the prefix wasn't generated or was too long, fall back to an ID
+      // generated by the block cache.
+      char* end = EncodeVarint64(rep->cache_key_prefix,
+                                 rep->options.block_cache->NewId());
+      rep->cache_key_prefix_size =
+          static_cast<size_t>(end - rep->cache_key_prefix);
+    }
+  }
+}
+
 Status Table::Open(const Options& options,
                    unique_ptr<RandomAccessFile>&& file,
                    uint64_t size,
@@ -80,7 +105,7 @@ Status Table::Open(const Options& options,
     rep->file = std::move(file);
     rep->metaindex_handle = footer.metaindex_handle();
     rep->index_block = index_block;
-    rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0);
+    SetupCacheKeyPrefix(rep);
     rep->filter_data = NULL;
     rep->filter = NULL;
     table->reset(new Table(rep));
@@ -179,10 +204,15 @@ Iterator* Table::BlockReader(void* arg,
   if (s.ok()) {
     BlockContents contents;
     if (block_cache != NULL) {
-      char cache_key_buffer[16];
-      EncodeFixed64(cache_key_buffer, table->rep_->cache_id);
-      EncodeFixed64(cache_key_buffer+8, handle.offset());
-      Slice key(cache_key_buffer, sizeof(cache_key_buffer));
+      char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+      const size_t cache_key_prefix_size = table->rep_->cache_key_prefix_size;
+      assert(cache_key_prefix_size != 0);
+      assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize);
+      memcpy(cache_key, table->rep_->cache_key_prefix,
+             cache_key_prefix_size);
+      char* end = EncodeVarint64(cache_key + cache_key_prefix_size,
+                                 handle.offset());
+      Slice key(cache_key, static_cast<size_t>(end - cache_key));
       cache_handle = block_cache->Lookup(key);
       if (cache_handle != NULL) {
         block = reinterpret_cast<Block*>(block_cache->Value(cache_handle));
@@ -274,6 +304,17 @@ Status Table::InternalGet(const ReadOptions& options, const Slice& k,
   return s;
 }
 
+void SaveDidIO(void* arg, const Slice& key, const Slice& value, bool didIO) {
+  *reinterpret_cast<bool*>(arg) = didIO;
+}
+bool Table::TEST_KeyInCache(const ReadOptions& options, const Slice& key) {
+  // We use InternalGet() as it has logic that checks whether we read the
+  // block from disk or not.
+  bool didIO = false;
+  Status s = InternalGet(options, key, &didIO, SaveDidIO);
+  assert(s.ok());
+  return !didIO;
+}
 
 uint64_t Table::ApproximateOffsetOf(const Slice& key) const {
   Iterator* index_iter =
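
For reference, the cache key assembled in `Table::BlockReader` above is just
the file's unique-ID prefix followed by the block offset encoded as a varint;
when `GetUniqueId()` returns 0 the prefix is instead a single varint taken from
`block_cache->NewId()`, which is why the `RandomAccessFile` contract forbids
IDs that parse as a single varint. A standalone sketch of that key layout (an
illustrative helper, not a function the patch adds; prefix and offset come
from the caller):

    #include <stdint.h>
    #include <string>
    #include "util/coding.h"  // leveldb::EncodeVarint64(), kMaxVarint64Length

    // Key layout: <unique file ID (or fallback cache ID)><varint64 offset>.
    std::string MakeBlockCacheKey(const std::string& prefix, uint64_t offset) {
      char buf[leveldb::kMaxVarint64Length];
      char* end = leveldb::EncodeVarint64(buf, offset);
      return prefix + std::string(buf, static_cast<size_t>(end - buf));
    }
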
diff --git a/table/table_test.cc b/table/table_test.cc
index 3ab612046..449464696 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -109,8 +109,8 @@ class StringSink: public WritableFile {
 class StringSource: public RandomAccessFile {
  public:
-  StringSource(const Slice& contents)
-      : contents_(contents.data(), contents.size()) {
+  StringSource(const Slice& contents, uint64_t uniq_id)
+      : contents_(contents.data(), contents.size()), uniq_id_(uniq_id) {
   }
 
   virtual ~StringSource() { }
@@ -130,8 +130,20 @@
     return Status::OK();
   }
 
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    if (max_size < 20) {
+      return 0;
+    }
+
+    char* rid = id;
+    rid = EncodeVarint64(rid, uniq_id_);
+    rid = EncodeVarint64(rid, 0);
+    return static_cast<size_t>(rid - id);
+  }
+
  private:
   std::string contents_;
+  uint64_t uniq_id_;
 };
 
 typedef std::map<std::string, std::string, STLLessThan> KVMap;
@@ -228,8 +240,8 @@
   }
 
   virtual Status FinishImpl(const Options& options, const KVMap& data) {
     Reset();
-    StringSink sink;
-    TableBuilder builder(options, &sink);
+    sink_.reset(new StringSink());
+    TableBuilder builder(options, sink_.get());
     for (KVMap::const_iterator it = data.begin();
          it != data.end();
@@ -240,15 +252,13 @@
 
     Status s = builder.Finish();
     ASSERT_TRUE(s.ok()) << s.ToString();
-    ASSERT_EQ(sink.contents().size(), builder.FileSize());
+    ASSERT_EQ(sink_->contents().size(), builder.FileSize());
 
     // Open the table
-    source_.reset(new StringSource(sink.contents()));
-    Options table_options;
-    table_options.comparator = options.comparator;
-    table_options.compression_opts = options.compression_opts;
-    return Table::Open(table_options, std::move(source_),
-                       sink.contents().size(), &table_);
+    uniq_id_ = cur_uniq_id_++;
+    source_.reset(new StringSource(sink_->contents(), uniq_id_));
+    return Table::Open(options, std::move(source_),
+                       sink_->contents().size(), &table_);
   }
 
   virtual Iterator* NewIterator() const {
@@ -259,17 +269,34 @@
     return table_->ApproximateOffsetOf(key);
   }
 
+  virtual Status Reopen(const Options& options) {
+    source_.reset(new StringSource(sink_->contents(), uniq_id_));
+    return Table::Open(options, std::move(source_),
+                       sink_->contents().size(), &table_);
+  }
+
+  virtual Table* table() {
+    return table_.get();
+  }
+
  private:
   void Reset() {
+    uniq_id_ = 0;
     table_.reset();
+    sink_.reset();
     source_.reset();
   }
 
+  uint64_t uniq_id_;
+  unique_ptr<StringSink> sink_;
   unique_ptr<StringSource> source_;
   unique_ptr<Table> table_;
 
   TableConstructor();
+
+  static uint64_t cur_uniq_id_;
 };
+uint64_t TableConstructor::cur_uniq_id_ = 1;
 
 // A helper class that converts internal format keys into user keys
 class KeyConvertingIterator: public Iterator {
@@ -892,6 +919,44 @@ TEST(TableTest, ApproximateOffsetOfCompressed) {
 }
 
+TEST(TableTest, BlockCacheLeak) {
+  // Check that when we reopen a table we don't lose access to blocks already
+  // in the cache. This test checks whether the Table actually makes use of the
+  // unique ID from the file.
+
+  Options opt;
+  opt.block_size = 1024;
+  opt.compression = kNoCompression;
+  opt.block_cache = NewLRUCache(16*1024*1024);  // big enough so we don't ever
+                                                // lose cached values.
+
+  TableConstructor c(BytewiseComparator());
+  c.Add("k01", "hello");
+  c.Add("k02", "hello2");
+  c.Add("k03", std::string(10000, 'x'));
+  c.Add("k04", std::string(200000, 'x'));
+  c.Add("k05", std::string(300000, 'x'));
+  c.Add("k06", "hello3");
+  c.Add("k07", std::string(100000, 'x'));
+  std::vector<std::string> keys;
+  KVMap kvmap;
+  c.Finish(opt, &keys, &kvmap);
+
+  unique_ptr<Iterator> iter(c.NewIterator());
+  iter->SeekToFirst();
+  while (iter->Valid()) {
+    iter->key();
+    iter->value();
+    iter->Next();
+  }
+  ASSERT_OK(iter->status());
+
+  ASSERT_OK(c.Reopen(opt));
+  for (const std::string& key: keys) {
+    ASSERT_TRUE(c.table()->TEST_KeyInCache(ReadOptions(), key));
+  }
+}
+
 }  // namespace leveldb
 
 int main(int argc, char** argv) {

diff --git a/util/coding.h b/util/coding.h
index d70bab7b6..19fa4acb1 100644
--- a/util/coding.h
+++ b/util/coding.h
@@ -18,6 +18,10 @@
 
 namespace leveldb {
 
+// The maximum length of a varint in bytes for 32 and 64 bits respectively.
+const unsigned int kMaxVarint32Length = 5;
+const unsigned int kMaxVarint64Length = 10;
+
 // Standard Put... routines append to a string
 extern void PutFixed32(std::string* dst, uint32_t value);
 extern void PutFixed64(std::string* dst, uint64_t value);
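
The two length constants above follow from the varint format: every encoded
byte carries 7 payload bits, so a 32-bit value needs at most ceil(32/7) = 5
bytes and a 64-bit value at most ceil(64/7) = 10. A small sketch that exercises
the 64-bit bound (the check function is hypothetical; `EncodeVarint64()` is the
existing helper declared in this header):

    #include <assert.h>
    #include <stdint.h>
    #include "util/coding.h"  // leveldb::EncodeVarint64(), kMaxVarint64Length

    void CheckVarint64Bound() {
      // The all-ones 64-bit value needs ceil(64 / 7) = 10 varint bytes.
      char buf[leveldb::kMaxVarint64Length];
      char* end = leveldb::EncodeVarint64(buf, ~static_cast<uint64_t>(0));
      assert(static_cast<size_t>(end - buf) == leveldb::kMaxVarint64Length);
    }
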
diff --git a/util/env_posix.cc b/util/env_posix.cc
index eebf2e66b..a33acfe07 100644
--- a/util/env_posix.cc
+++ b/util/env_posix.cc
@@ -11,18 +11,23 @@
 #include
 #include
 #include
+#include <sys/ioctl.h>
 #include
 #include
 #include
 #include
 #include
 #include
+#if defined(OS_LINUX)
+#include <linux/fs.h>
+#endif
 #if defined(LEVELDB_PLATFORM_ANDROID)
 #include
 #endif
 #include "leveldb/env.h"
 #include "leveldb/slice.h"
 #include "port/port.h"
+#include "util/coding.h"
 #include "util/logging.h"
 #include "util/posix_logger.h"
@@ -108,6 +113,35 @@
     }
     return s;
   }
+
+#if defined(OS_LINUX)
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    // TODO: possibly allow this function to handle tighter bounds.
+    if (max_size < kMaxVarint64Length*3) {
+      return 0;
+    }
+
+    struct stat buf;
+    int result = fstat(fd_, &buf);
+    if (result == -1) {
+      return 0;
+    }
+
+    long version = 0;
+    result = ioctl(fd_, FS_IOC_GETVERSION, &version);
+    if (result == -1) {
+      return 0;
+    }
+    uint64_t uversion = (uint64_t)version;
+
+    char* rid = id;
+    rid = EncodeVarint64(rid, buf.st_dev);
+    rid = EncodeVarint64(rid, buf.st_ino);
+    rid = EncodeVarint64(rid, uversion);
+    assert(rid >= id);
+    return static_cast<size_t>(rid - id);
+  }
+#endif
 };
 
 // mmap() based random-access
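
On Linux the ID above is built from three values that stay fixed for the
lifetime of an inode: the device number, the inode number, and the inode
generation returned by the `FS_IOC_GETVERSION` ioctl; the generation is what
distinguishes a recycled inode number from its previous use, which the
`RandomAccessUniqueIDDeletes` test below exercises. A Linux-only diagnostic
sketch, separate from the class above, that prints those three values for a
path; `FS_IOC_GETVERSION` can fail (e.g. `ENOTTY`) on filesystems that don't
support it, in which case `GetUniqueId()` returns 0 and `Table` falls back to
`block_cache->NewId()`:

    // Linux-only sketch: prints the fields that
    // PosixRandomAccessFile::GetUniqueId() encodes for a given path.
    #include <fcntl.h>
    #include <linux/fs.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(int argc, char** argv) {
      if (argc < 2) {
        fprintf(stderr, "usage: %s <file>\n", argv[0]);
        return 1;
      }
      int fd = open(argv[1], O_RDONLY);
      if (fd < 0) { perror("open"); return 1; }

      struct stat buf;
      if (fstat(fd, &buf) != 0) { perror("fstat"); close(fd); return 1; }

      long version = 0;
      if (ioctl(fd, FS_IOC_GETVERSION, &version) != 0) {
        perror("ioctl(FS_IOC_GETVERSION)");  // not supported everywhere
        close(fd);
        return 1;
      }

      printf("st_dev=%llu st_ino=%llu generation=%ld\n",
             (unsigned long long)buf.st_dev,
             (unsigned long long)buf.st_ino, version);
      close(fd);
      return 0;
    }
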
diff --git a/util/env_test.cc b/util/env_test.cc
index dcc1457e7..fa6483da9 100644
--- a/util/env_test.cc
+++ b/util/env_test.cc
@@ -4,7 +4,9 @@
 
 #include "leveldb/env.h"
 
+#include <unordered_set>
 #include "port/port.h"
+#include "util/coding.h"
 #include "util/testharness.h"
 
 namespace leveldb {
@@ -97,6 +99,146 @@ TEST(EnvPosixTest, StartThread) {
   ASSERT_EQ(state.val, 3);
 }
 
+bool IsSingleVarint(const std::string& s) {
+  Slice slice(s);
+
+  uint64_t v;
+  if (!GetVarint64(&slice, &v)) {
+    return false;
+  }
+
+  return slice.size() == 0;
+}
+
+bool IsUniqueIDValid(const std::string& s) {
+  return !s.empty() && !IsSingleVarint(s);
+}
+
+const size_t MAX_ID_SIZE = 100;
+char temp_id[MAX_ID_SIZE];
+
+TEST(EnvPosixTest, RandomAccessUniqueID) {
+  // Create file.
+  std::string fname = test::TmpDir() + "/" + "testfile";
+  unique_ptr<WritableFile> wfile;
+  ASSERT_OK(env_->NewWritableFile(fname, &wfile));
+
+  unique_ptr<RandomAccessFile> file;
+
+  // Get Unique ID
+  ASSERT_OK(env_->NewRandomAccessFile(fname, &file));
+  size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+  ASSERT_TRUE(id_size > 0);
+  std::string unique_id1(temp_id, id_size);
+  ASSERT_TRUE(IsUniqueIDValid(unique_id1));
+
+  // Get Unique ID again
+  ASSERT_OK(env_->NewRandomAccessFile(fname, &file));
+  id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+  ASSERT_TRUE(id_size > 0);
+  std::string unique_id2(temp_id, id_size);
+  ASSERT_TRUE(IsUniqueIDValid(unique_id2));
+
+  // Get Unique ID again after waiting some time.
+  env_->SleepForMicroseconds(1000000);
+  ASSERT_OK(env_->NewRandomAccessFile(fname, &file));
+  id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+  ASSERT_TRUE(id_size > 0);
+  std::string unique_id3(temp_id, id_size);
+  ASSERT_TRUE(IsUniqueIDValid(unique_id3));
+
+  // Check IDs are the same.
+  ASSERT_EQ(unique_id1, unique_id2);
+  ASSERT_EQ(unique_id2, unique_id3);
+
+  // Delete the file
+  env_->DeleteFile(fname);
+}
+
+// Returns true if any string in ss is a prefix of another string in ss.
+bool HasPrefix(const std::unordered_set<std::string>& ss) {
+  for (const std::string& s: ss) {
+    if (s.empty()) {
+      return true;
+    }
+    for (size_t i = 1; i < s.size(); ++i) {
+      if (ss.count(s.substr(0, i)) != 0) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+TEST(EnvPosixTest, RandomAccessUniqueIDConcurrent) {
+  // Check whether a bunch of concurrently existing files have unique IDs.
+
+  // Create the files
+  std::vector<std::string> fnames;
+  for (int i = 0; i < 1000; ++i) {
+    fnames.push_back(test::TmpDir() + "/" + "testfile" + std::to_string(i));
+
+    // Create file.
+    unique_ptr<WritableFile> wfile;
+    ASSERT_OK(env_->NewWritableFile(fnames[i], &wfile));
+  }
+
+  // Collect and check whether the IDs are unique.
+  std::unordered_set<std::string> ids;
+  for (const std::string fname: fnames) {
+    unique_ptr<RandomAccessFile> file;
+    std::string unique_id;
+    ASSERT_OK(env_->NewRandomAccessFile(fname, &file));
+    size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+    ASSERT_TRUE(id_size > 0);
+    unique_id = std::string(temp_id, id_size);
+    ASSERT_TRUE(IsUniqueIDValid(unique_id));
+
+    ASSERT_TRUE(ids.count(unique_id) == 0);
+    ids.insert(unique_id);
+  }
+
+  // Delete the files
+  for (const std::string fname: fnames) {
+    ASSERT_OK(env_->DeleteFile(fname));
+  }
+
+  ASSERT_TRUE(!HasPrefix(ids));
+}
+
+TEST(EnvPosixTest, RandomAccessUniqueIDDeletes) {
+  std::string fname = test::TmpDir() + "/" + "testfile";
+
+  // Check that after a file is deleted we don't get the same ID for a new file.
+  std::unordered_set<std::string> ids;
+  for (int i = 0; i < 1000; ++i) {
+    // Create file.
+    {
+      unique_ptr<WritableFile> wfile;
+      ASSERT_OK(env_->NewWritableFile(fname, &wfile));
+    }
+
+    // Get Unique ID
+    std::string unique_id;
+    {
+      unique_ptr<RandomAccessFile> file;
+      ASSERT_OK(env_->NewRandomAccessFile(fname, &file));
+      size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+      ASSERT_TRUE(id_size > 0);
+      unique_id = std::string(temp_id, id_size);
+    }
+
+    ASSERT_TRUE(IsUniqueIDValid(unique_id));
+    ASSERT_TRUE(ids.count(unique_id) == 0);
+    ids.insert(unique_id);
+
+    // Delete the file
+    ASSERT_OK(env_->DeleteFile(fname));
+  }
+
+  ASSERT_TRUE(!HasPrefix(ids));
+}
+
 }  // namespace leveldb
 
 int main(int argc, char** argv) {