diff --git a/HISTORY.md b/HISTORY.md
index b9d0f7413..efd49f642 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -33,6 +33,7 @@
 * Reduce iterator key comparision for upper/lower bound check.
 * Log Writer will flush after finishing the whole record, rather than a fragment.
 * Lower MultiGet batching API latency by reading data blocks from disk in parallel
+* Improve row_cache performance: reads with a snapshot newer than all data in an SST file now share the same cache key as reads without a snapshot, except in some transaction cases.
 
 ### General Improvements
 * Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress.
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 109a7a377..3664b3a24 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -3771,6 +3771,52 @@ TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
   delete db_;
   db_ = nullptr;
 }
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, RowCacheSnapshot) {
+  Options options = CurrentOptions();
+  options.statistics = rocksdb::CreateDBStatistics();
+  options.row_cache = NewLRUCache(8192);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo", "bar1"));
+
+  const Snapshot* s1 = db_->GetSnapshot();
+
+  ASSERT_OK(Put("foo", "bar2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("foo2", "bar"));
+  const Snapshot* s2 = db_->GetSnapshot();
+  ASSERT_OK(Put("foo3", "bar"));
+  const Snapshot* s3 = db_->GetSnapshot();
+
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+  ASSERT_EQ(Get("foo"), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo"), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo", s1), "bar1");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s2), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s1), "bar1");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s3), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 4);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+
+  db_->ReleaseSnapshot(s1);
+  db_->ReleaseSnapshot(s2);
+  db_->ReleaseSnapshot(s3);
+}
+#endif  // ROCKSDB_LITE
 
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 121d4941f..2290b5939 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -11,6 +11,7 @@
 
 #include "db/dbformat.h"
 #include "db/range_tombstone_fragmenter.h"
+#include "db/snapshot_impl.h"
 #include "db/version_edit.h"
 #include "file/filename.h"
 
@@ -24,6 +25,7 @@
 #include "table/table_builder.h"
 #include "table/table_reader.h"
 #include "test_util/sync_point.h"
+#include "util/cast_util.h"
 #include "util/coding.h"
 #include "util/file_reader_writer.h"
 #include "util/stop_watch.h"
@@ -277,8 +279,23 @@ Status TableCache::Get(const ReadOptions& options,
     // sequence key increases. However, to support caching snapshot
     // reads, we append the sequence number (incremented by 1 to
    // distinguish from 0) only in this case.
-    uint64_t seq_no =
-        options.snapshot == nullptr ? 0 : 1 + GetInternalKeySeqno(k);
+    // If the snapshot is larger than the largest seqno in the file,
+    // all of the file's data is visible to the snapshot, so we treat
+    // it the same as if there were no snapshot. The exception is that
+    // if a seq-checking callback is registered, some internal keys
+    // may still be filtered out.
+    uint64_t seq_no = 0;
+    // Maybe we can include the whole file if snapshot == fd.largest_seqno.
+    if (options.snapshot != nullptr &&
+        (get_context->has_callback() ||
+         static_cast_with_check<const SnapshotImpl, const Snapshot>(
+             options.snapshot)
+             ->GetSequenceNumber() <= fd.largest_seqno)) {
+      // We should consider using options.snapshot->GetSequenceNumber()
+      // instead of GetInternalKeySeqno(k), which would make the code
+      // easier to understand.
+      seq_no = 1 + GetInternalKeySeqno(k);
+    }
 
     // Compute row cache key.
     row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(),
diff --git a/table/get_context.h b/table/get_context.h
index 7a37beb2d..7110ceae8 100644
--- a/table/get_context.h
+++ b/table/get_context.h
@@ -136,6 +136,8 @@ class GetContext {
 
   void ReportCounters();
 
+  bool has_callback() const { return callback_ != nullptr; }
+
   uint64_t get_tracing_get_id() const { return tracing_get_id_; }
 
  private:
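
For intuition, here is a minimal standalone sketch of the rule the table_cache.cc hunk implements. It is illustration only, not RocksDB internals; `RowCacheKeySuffix`, `FileMeta`, and the parameters are hypothetical names: a snapshot whose sequence number is above the file's largest seqno sees every key in the file, so it can reuse the same row-cache entry as a read with no snapshot, unless a sequence-checking callback may still filter keys.

```cpp
#include <cstdint>
#include <iostream>

// Hypothetical stand-in for the per-file metadata used here.
struct FileMeta {
  uint64_t largest_seqno;  // largest sequence number stored in the SST file
};

// Returns the sequence-number suffix appended to the row cache key.
// 0 means "no suffix": the entry is shared by all reads that can see the
// whole file. A non-zero value (lookup seqno + 1, to distinguish it from 0)
// pins the entry to a particular snapshot view.
uint64_t RowCacheKeySuffix(bool has_snapshot, uint64_t snapshot_seqno,
                           bool has_seq_callback, const FileMeta& file,
                           uint64_t lookup_seqno) {
  if (!has_snapshot) {
    return 0;  // non-snapshot read: one shared entry per key/file
  }
  if (!has_seq_callback && snapshot_seqno > file.largest_seqno) {
    // Snapshot is newer than everything in the file, so it sees the same
    // data as a read without a snapshot: reuse the shared entry.
    return 0;
  }
  // The snapshot may hide some versions in this file (or a callback may
  // filter keys), so the cache entry must be keyed by sequence number.
  return 1 + lookup_seqno;
}

int main() {
  FileMeta file{/*largest_seqno=*/100};
  // Snapshot taken after the file was flushed: shares the key (suffix 0).
  std::cout << RowCacheKeySuffix(true, 150, false, file, 150) << "\n";  // 0
  // Older snapshot: gets its own sequence-number suffix.
  std::cout << RowCacheKeySuffix(true, 50, false, file, 50) << "\n";    // 51
  // Seq-checking callback registered: conservatively keep the suffix.
  std::cout << RowCacheKeySuffix(true, 150, true, file, 150) << "\n";   // 151
  return 0;
}
```

This is the behavior the RowCacheSnapshot test asserts through the tickers: reads under s2 and s3 (snapshots taken after the flush) reuse the entry populated by the plain Get("foo") and count as ROW_CACHE_HIT, while reads under s1 (taken before "bar2" was written) use their own sequence-suffixed entry and miss once before hitting.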