From 6e8d6f429dd28324002f143ae593435d4a80e26f Mon Sep 17 00:00:00 2001 From: Aaron Gao Date: Fri, 14 Apr 2017 18:43:32 -0700 Subject: [PATCH] readahead backwards from sst end Summary: prefetch some data from the end of the file for each compaction to reduce IO. This adds RandomAccessFile::Prefetch (a no-op by default, implemented by ReadaheadRandomAccessFile to fill its readahead buffer), and BlockBasedTable::Open now requests a prefetch of the last 512 KB of the file (or from offset 0 if the file is smaller) before reading the footer. Closes https://github.com/facebook/rocksdb/pull/2149 Differential Revision: D4880576 Pulled By: lightmark fbshipit-source-id: aa767cd1afc84c541837fbf1ad6c0d45b34d3932 --- include/rocksdb/env.h | 6 +++ table/block_based_table_reader.cc | 10 +++-- util/file_reader_writer.cc | 73 ++++++++++++++++++------------- util/file_reader_writer.h | 4 ++ 4 files changed, 60 insertions(+), 33 deletions(-) diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index c74f4bb28..6370e33d3 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -472,6 +472,7 @@ class SequentialFile { // A file abstraction for randomly reading the contents of a file. class RandomAccessFile { public: + RandomAccessFile() { } virtual ~RandomAccessFile(); @@ -488,6 +489,11 @@ class RandomAccessFile { virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const = 0; + // Readahead the file starting from offset by n bytes for caching. + virtual Status Prefetch(uint64_t offset, size_t n) { + return Status::OK(); + } + // Used by the file_reader_writer to decide if the ReadAhead wrapper // should simply forward the call and do not enact buffering or locking. virtual bool ShouldForwardRawRequest() const { diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 75d272068..abb476578 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -530,7 +530,12 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, table_reader->reset(); Footer footer; - auto s = ReadFooterFromFile(file.get(), file_size, &footer, + + // Before read footer, readahead backwards to prefetch data + Status s = + file->Prefetch((file_size < 512 * 1024 ? 
0 : file_size - 512 * 1024), + 512 * 1024 /* 512 KB prefetching */); + s = ReadFooterFromFile(file.get(), file_size, &footer, kBlockBasedTableMagicNumber); if (!s.ok()) { return s; @@ -541,8 +546,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, "version of RocksDB?"); } - // We've successfully read the footer and the index block: we're - // ready to serve requests. + // We've successfully read the footer. We are ready to serve requests. // Better not mutate rep_ after the creation. eg. internal_prefix_transform // raw pointer will be used to create HashIndexReader, whose reset may // access a dangling pointer. diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index b19149692..1088ae3f4 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -472,7 +472,7 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { // complitely or partially in the buffer // If it's completely cached, including end of file case when offset + n is // greater than EOF, return - if (TryReadFromCache_(offset, n, &cached_len, scratch) && + if (TryReadFromCache(offset, n, &cached_len, scratch) && (cached_len == n || // End of file buffer_len_ < readahead_size_)) { @@ -484,34 +484,34 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { // chunk_offset equals to advanced_offset size_t chunk_offset = TruncateToPageBoundary(alignment_, advanced_offset); Slice readahead_result; - Status s = file_->Read(chunk_offset, readahead_size_, &readahead_result, - buffer_.BufferStart()); - if (!s.ok()) { - return s; - } - // In the case of cache miss, i.e. 
when cached_len equals 0, an offset can - // exceed the file end position, so the following check is required - if (advanced_offset < chunk_offset + readahead_result.size()) { - // In the case of cache miss, the first chunk_padding bytes in buffer_ are - // stored for alignment only and must be skipped - size_t chunk_padding = advanced_offset - chunk_offset; - auto remaining_len = - std::min(readahead_result.size() - chunk_padding, n - cached_len); - memcpy(scratch + cached_len, readahead_result.data() + chunk_padding, - remaining_len); - *result = Slice(scratch, cached_len + remaining_len); - } else { - *result = Slice(scratch, cached_len); - } - if (readahead_result.data() == buffer_.BufferStart()) { - buffer_offset_ = chunk_offset; - buffer_len_ = readahead_result.size(); - } else { - buffer_len_ = 0; + Status s = ReadIntoBuffer(chunk_offset, readahead_size_); + if (s.ok()) { + // In the case of cache miss, i.e. when cached_len equals 0, an offset can + // exceed the file end position, so the following check is required + if (advanced_offset < chunk_offset + buffer_len_) { + // In the case of cache miss, the first chunk_padding bytes in buffer_ + // are + // stored for alignment only and must be skipped + size_t chunk_padding = advanced_offset - chunk_offset; + auto remaining_len = + std::min(buffer_len_ - chunk_padding, n - cached_len); + memcpy(scratch + cached_len, buffer_.BufferStart() + chunk_padding, + remaining_len); + *result = Slice(scratch, cached_len + remaining_len); + } else { + *result = Slice(scratch, cached_len); + } } + return s; + } - return Status::OK(); + virtual Status Prefetch(uint64_t offset, size_t n) override { + size_t prefetch_offset = TruncateToPageBoundary(alignment_, offset); + if (prefetch_offset == buffer_offset_) { + return Status::OK(); + } + return ReadIntoBuffer(prefetch_offset, offset - prefetch_offset + n); } virtual size_t GetUniqueId(char* id, size_t max_size) const override { @@ -529,7 +529,7 @@ class 
ReadaheadRandomAccessFile : public RandomAccessFile { } private: - bool TryReadFromCache_(uint64_t offset, size_t n, size_t* cached_len, + bool TryReadFromCache(uint64_t offset, size_t n, size_t* cached_len, char* scratch) const { if (offset < buffer_offset_ || offset >= buffer_offset_ + buffer_len_) { *cached_len = 0; @@ -542,15 +542,28 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { return true; } + Status ReadIntoBuffer(uint64_t offset, size_t n) const { + if (n > buffer_.Capacity()) { + n = buffer_.Capacity(); + } + Slice result; + Status s = file_->Read(offset, n, &result, buffer_.BufferStart()); + if (s.ok()) { + buffer_offset_ = offset; + buffer_len_ = result.size(); + } + return s; + } + std::unique_ptr file_; const size_t alignment_; size_t readahead_size_; const bool forward_calls_; - mutable std::mutex lock_; + mutable std::mutex lock_; mutable AlignedBuffer buffer_; - mutable uint64_t buffer_offset_; - mutable size_t buffer_len_; + mutable uint64_t buffer_offset_; + mutable size_t buffer_len_; }; } // namespace diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index 30b11e839..39d475f01 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -92,6 +92,10 @@ class RandomAccessFileReader { Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const; + Status Prefetch(uint64_t offset, size_t n) const { + return file_->Prefetch(offset, n); + } + RandomAccessFile* file() { return file_.get(); } bool use_direct_io() const { return file_->use_direct_io(); }