readahead backwards from sst end

Summary:
prefetch some data from the end of the file for each compaction to reduce IO.
Closes https://github.com/facebook/rocksdb/pull/2149

Differential Revision: D4880576

Pulled By: lightmark

fbshipit-source-id: aa767cd1afc84c541837fbf1ad6c0d45b34d3932
main
Aaron Gao 7 years ago committed by Facebook Github Bot
parent ca96654d85
commit 6e8d6f429d
  1. 6
      include/rocksdb/env.h
  2. 10
      table/block_based_table_reader.cc
  3. 73
      util/file_reader_writer.cc
  4. 4
      util/file_reader_writer.h

@ -472,6 +472,7 @@ class SequentialFile {
// A file abstraction for randomly reading the contents of a file.
class RandomAccessFile {
public:
RandomAccessFile() { }
virtual ~RandomAccessFile();
@ -488,6 +489,11 @@ class RandomAccessFile {
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const = 0;
// Readahead the file starting from offset by n bytes for caching.
virtual Status Prefetch(uint64_t offset, size_t n) {
return Status::OK();
}
// Used by the file_reader_writer to decide if the ReadAhead wrapper
// should simply forward the call and do not enact buffering or locking.
virtual bool ShouldForwardRawRequest() const {

@ -530,7 +530,12 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
table_reader->reset();
Footer footer;
auto s = ReadFooterFromFile(file.get(), file_size, &footer,
// Before read footer, readahead backwards to prefetch data
Status s =
file->Prefetch((file_size < 512 * 1024 ? 0 : file_size - 512 * 1024),
512 * 1024 /* 512 KB prefetching */);
s = ReadFooterFromFile(file.get(), file_size, &footer,
kBlockBasedTableMagicNumber);
if (!s.ok()) {
return s;
@ -541,8 +546,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
"version of RocksDB?");
}
// We've successfully read the footer and the index block: we're
// ready to serve requests.
// We've successfully read the footer. We are ready to serve requests.
// Better not mutate rep_ after the creation. eg. internal_prefix_transform
// raw pointer will be used to create HashIndexReader, whose reset may
// access a dangling pointer.

@ -472,7 +472,7 @@ class ReadaheadRandomAccessFile : public RandomAccessFile {
// complitely or partially in the buffer
// If it's completely cached, including end of file case when offset + n is
// greater than EOF, return
if (TryReadFromCache_(offset, n, &cached_len, scratch) &&
if (TryReadFromCache(offset, n, &cached_len, scratch) &&
(cached_len == n ||
// End of file
buffer_len_ < readahead_size_)) {
@ -484,34 +484,34 @@ class ReadaheadRandomAccessFile : public RandomAccessFile {
// chunk_offset equals to advanced_offset
size_t chunk_offset = TruncateToPageBoundary(alignment_, advanced_offset);
Slice readahead_result;
Status s = file_->Read(chunk_offset, readahead_size_, &readahead_result,
buffer_.BufferStart());
if (!s.ok()) {
return s;
}
// In the case of cache miss, i.e. when cached_len equals 0, an offset can
// exceed the file end position, so the following check is required
if (advanced_offset < chunk_offset + readahead_result.size()) {
// In the case of cache miss, the first chunk_padding bytes in buffer_ are
// stored for alignment only and must be skipped
size_t chunk_padding = advanced_offset - chunk_offset;
auto remaining_len =
std::min(readahead_result.size() - chunk_padding, n - cached_len);
memcpy(scratch + cached_len, readahead_result.data() + chunk_padding,
remaining_len);
*result = Slice(scratch, cached_len + remaining_len);
} else {
*result = Slice(scratch, cached_len);
}
if (readahead_result.data() == buffer_.BufferStart()) {
buffer_offset_ = chunk_offset;
buffer_len_ = readahead_result.size();
} else {
buffer_len_ = 0;
Status s = ReadIntoBuffer(chunk_offset, readahead_size_);
if (s.ok()) {
// In the case of cache miss, i.e. when cached_len equals 0, an offset can
// exceed the file end position, so the following check is required
if (advanced_offset < chunk_offset + buffer_len_) {
// In the case of cache miss, the first chunk_padding bytes in buffer_
// are
// stored for alignment only and must be skipped
size_t chunk_padding = advanced_offset - chunk_offset;
auto remaining_len =
std::min(buffer_len_ - chunk_padding, n - cached_len);
memcpy(scratch + cached_len, buffer_.BufferStart() + chunk_padding,
remaining_len);
*result = Slice(scratch, cached_len + remaining_len);
} else {
*result = Slice(scratch, cached_len);
}
}
return s;
}
return Status::OK();
virtual Status Prefetch(uint64_t offset, size_t n) override {
size_t prefetch_offset = TruncateToPageBoundary(alignment_, offset);
if (prefetch_offset == buffer_offset_) {
return Status::OK();
}
return ReadIntoBuffer(prefetch_offset, offset - prefetch_offset + n);
}
virtual size_t GetUniqueId(char* id, size_t max_size) const override {
@ -529,7 +529,7 @@ class ReadaheadRandomAccessFile : public RandomAccessFile {
}
private:
bool TryReadFromCache_(uint64_t offset, size_t n, size_t* cached_len,
bool TryReadFromCache(uint64_t offset, size_t n, size_t* cached_len,
char* scratch) const {
if (offset < buffer_offset_ || offset >= buffer_offset_ + buffer_len_) {
*cached_len = 0;
@ -542,15 +542,28 @@ class ReadaheadRandomAccessFile : public RandomAccessFile {
return true;
}
Status ReadIntoBuffer(uint64_t offset, size_t n) const {
if (n > buffer_.Capacity()) {
n = buffer_.Capacity();
}
Slice result;
Status s = file_->Read(offset, n, &result, buffer_.BufferStart());
if (s.ok()) {
buffer_offset_ = offset;
buffer_len_ = result.size();
}
return s;
}
std::unique_ptr<RandomAccessFile> file_;
const size_t alignment_;
size_t readahead_size_;
const bool forward_calls_;
mutable std::mutex lock_;
mutable std::mutex lock_;
mutable AlignedBuffer buffer_;
mutable uint64_t buffer_offset_;
mutable size_t buffer_len_;
mutable uint64_t buffer_offset_;
mutable size_t buffer_len_;
};
} // namespace

@ -92,6 +92,10 @@ class RandomAccessFileReader {
Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const;
Status Prefetch(uint64_t offset, size_t n) const {
return file_->Prefetch(offset, n);
}
RandomAccessFile* file() { return file_.get(); }
bool use_direct_io() const { return file_->use_direct_io(); }

Loading…
Cancel
Save