From 30a017fecae60aa7b87c4a1e283b6ac027724a92 Mon Sep 17 00:00:00 2001 From: Yi Wu Date: Fri, 5 Jan 2018 16:35:49 -0800 Subject: [PATCH] Blob DB: avoid having a separate read of checksum Summary: Previously on a blob db read, we are making a read of the blob value, and then make another read to get CRC checksum. I'm combining the two read into one. readrandom db_bench with 1G database with base db size of 13M, value size 1k: `./db_bench --db=/home/yiwu/tmp/db_bench --use_blob_db --value_size=1024 --num=1000000 --benchmarks=readrandom --use_existing_db --cache_size=32000000` master: throughput 234MB/s, get micros p50 5.984 p95 9.998 p99 20.817 p100 787 this PR: throughput 261MB/s, get micros p50 5.157 p95 9.928 p99 20.724 p100 190 Closes https://github.com/facebook/rocksdb/pull/3301 Differential Revision: D6615950 Pulled By: yiwu-arbug fbshipit-source-id: 052410c6d8539ec0cc305d53793bbc8f3616baa3 --- utilities/blob_db/blob_db_impl.cc | 94 +++++++++++++++---------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 6277c4314..999967c44 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -1055,58 +1055,58 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry, std::shared_ptr reader = GetOrOpenRandomAccessReader(bfile, env_, env_options_); - std::string* valueptr = value->GetSelf(); - std::string value_c; - if (bdb_options_.compression != kNoCompression) { - valueptr = &value_c; - } + assert(blob_index.offset() > key.size() + sizeof(uint32_t)); + uint64_t record_offset = blob_index.offset() - key.size() - sizeof(uint32_t); + uint64_t record_size = sizeof(uint32_t) + key.size() + blob_index.size(); // Allocate the buffer. This is safe in C++11 - // Note that std::string::reserved() does not work, since previous value - // of the buffer can be larger than blob_index.size(). - valueptr->resize(blob_index.size()); - char* buffer = &(*valueptr)[0]; + std::string buffer_str(record_size, static_cast(0)); + char* buffer = &buffer_str[0]; - Slice blob_value; + // A partial blob record contain checksum, key and value. + Slice blob_record; { StopWatch read_sw(env_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS); - s = reader->Read(blob_index.offset(), blob_index.size(), &blob_value, - buffer); - RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, blob_value.size()); + s = reader->Read(record_offset, record_size, &blob_record, buffer); + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, blob_record.size()); } - if (!s.ok() || blob_value.size() != blob_index.size()) { - if (debug_level_ >= 2) { - ROCKS_LOG_ERROR(db_options_.info_log, - "Failed to read blob from file: %s blob_offset: %" PRIu64 - " blob_size: %" PRIu64 " read: %d key: %s status: '%s'", - bfile->PathName().c_str(), blob_index.offset(), - blob_index.size(), static_cast(blob_value.size()), - key.data(), s.ToString().c_str()); - } - return Status::NotFound("Blob Not Found as couldnt retrieve Blob"); + if (!s.ok()) { + ROCKS_LOG_DEBUG(db_options_.info_log, + "Failed to read blob from blob file %" PRIu64 + ", blob_offset: %" PRIu64 ", blob_size: %" PRIu64 + ", key_size: " PRIu64 ", read " PRIu64 + "bytes, status: '%s'", + bfile->BlobFileNumber(), blob_index.offset(), + blob_index.size(), key.size(), s.ToString().c_str()); + return s; } + if (blob_record.size() != record_size) { + ROCKS_LOG_DEBUG(db_options_.info_log, + "Failed to read blob from blob file %" PRIu64 + ", blob_offset: %" PRIu64 ", blob_size: %" PRIu64 + ", key_size: " PRIu64 ", read " PRIu64 + "bytes, status: '%s'", + bfile->BlobFileNumber(), blob_index.offset(), + blob_index.size(), key.size(), s.ToString().c_str()); - // TODO(yiwu): Add an option to skip crc checking. - Slice crc_slice; - uint32_t crc_exp; - std::string crc_str; - crc_str.resize(sizeof(uint32_t)); - char* crc_buffer = &(crc_str[0]); - s = reader->Read(blob_index.offset() - (key.size() + sizeof(uint32_t)), - sizeof(uint32_t), &crc_slice, crc_buffer); - if (!s.ok() || !GetFixed32(&crc_slice, &crc_exp)) { - if (debug_level_ >= 2) { - ROCKS_LOG_ERROR(db_options_.info_log, - "Failed to fetch blob crc file: %s blob_offset: %" PRIu64 - " blob_size: %" PRIu64 " key: %s status: '%s'", - bfile->PathName().c_str(), blob_index.offset(), - blob_index.size(), key.data(), s.ToString().c_str()); - } - return Status::NotFound("Blob Not Found as couldnt retrieve CRC"); + return Status::Corruption("Failed to retrieve blob from blob index."); } - - uint32_t crc = crc32c::Value(key.data(), key.size()); - crc = crc32c::Extend(crc, blob_value.data(), blob_value.size()); + Slice crc_slice(blob_record.data(), sizeof(uint32_t)); + Slice blob_value(blob_record.data() + sizeof(uint32_t) + key.size(), + blob_index.size()); + uint32_t crc_exp; + if (!GetFixed32(&crc_slice, &crc_exp)) { + ROCKS_LOG_DEBUG(db_options_.info_log, + "Unable to decode CRC from blob file %" PRIu64 + ", blob_offset: %" PRIu64 ", blob_size: %" PRIu64 + ", key size: %" PRIu64 ", status: '%s'", + bfile->BlobFileNumber(), blob_index.offset(), + blob_index.size(), key.size(), s.ToString().c_str()); + return Status::Corruption("Unable to decode checksum."); + } + + uint32_t crc = crc32c::Value(blob_record.data() + sizeof(uint32_t), + blob_record.size() - sizeof(uint32_t)); crc = crc32c::Mask(crc); // Adjust for storage if (crc != crc_exp) { if (debug_level_ >= 2) { @@ -1119,7 +1119,9 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry, return Status::Corruption("Corruption. Blob CRC mismatch"); } - if (bfile->compression() != kNoCompression) { + if (bfile->compression() == kNoCompression) { + value->PinSelf(blob_value); + } else { BlockContents contents; auto cfh = reinterpret_cast(DefaultColumnFamily()); { @@ -1130,11 +1132,9 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry, kBlockBasedTableVersionFormat, Slice(), bfile->compression(), *(cfh->cfd()->ioptions())); } - *(value->GetSelf()) = contents.data.ToString(); + value->PinSelf(contents.data); } - value->PinSelf(); - return s; }