Merge remote-tracking branch 'upstream/master'

main
wankai 10 years ago
commit 5d25a46936
  1. 6
      db/column_family_test.cc
  2. 134
      db/db_impl.cc
  3. 4
      db/db_impl_readonly.h
  4. 23
      db/db_iter.cc
  5. 2
      db/db_test.cc
  6. 20
      db/dbformat.cc
  7. 13
      db/dbformat.h
  8. 77
      db/forward_iterator.cc
  9. 1
      db/forward_iterator.h
  10. 2
      db/internal_stats.cc
  11. 3
      db/memtable.cc
  12. 2
      db/simple_table_db_test.cc
  13. 44
      db/write_batch.cc
  14. 18
      db/write_batch_internal.h
  15. 2
      include/rocksdb/iostats_context.h
  16. 12
      include/rocksdb/options.h
  17. 2
      include/rocksdb/status.h
  18. 39
      include/rocksdb/table.h
  19. 2
      table/adaptive_table_factory.h
  20. 6
      table/block_based_table_builder.cc
  21. 4
      table/block_based_table_factory.cc
  22. 2
      table/block_based_table_factory.h
  23. 10
      table/block_based_table_reader.cc
  24. 6
      table/block_builder.cc
  25. 3
      table/block_builder.h
  26. 5
      table/block_test.cc
  27. 127
      table/cuckoo_table_builder.cc
  28. 15
      table/cuckoo_table_builder.h
  29. 155
      table/cuckoo_table_builder_test.cc
  30. 35
      table/cuckoo_table_factory.cc
  31. 25
      table/cuckoo_table_factory.h
  32. 55
      table/cuckoo_table_reader.cc
  33. 6
      table/cuckoo_table_reader.h
  34. 70
      table/cuckoo_table_reader_test.cc
  35. 14
      table/filter_block.cc
  36. 14
      table/format.cc
  37. 15
      table/merger.cc
  38. 8
      table/meta_blocks.cc
  39. 2
      table/plain_table_factory.h
  40. 5
      table/table_test.cc
  41. 5
      table/two_level_iterator.cc
  42. 30
      tools/db_stress.cc
  43. 48
      util/env_posix.cc
  44. 2
      util/iostats_context.cc
  45. 14
      util/iostats_context_imp.h
  46. 44
      util/perf_context_imp.h
  47. 30
      utilities/spatialdb/spatial_db.cc

@ -408,9 +408,15 @@ TEST(ColumnFamilyTest, WriteBatchFailure) {
Open(); Open();
CreateColumnFamiliesAndReopen({"one", "two"}); CreateColumnFamiliesAndReopen({"one", "two"});
WriteBatch batch; WriteBatch batch;
batch.Put(handles_[0], Slice("existing"), Slice("column-family"));
batch.Put(handles_[1], Slice("non-existing"), Slice("column-family")); batch.Put(handles_[1], Slice("non-existing"), Slice("column-family"));
ASSERT_OK(db_->Write(WriteOptions(), &batch)); ASSERT_OK(db_->Write(WriteOptions(), &batch));
DropColumnFamilies({1}); DropColumnFamilies({1});
WriteOptions woptions_ignore_missing_cf;
woptions_ignore_missing_cf.ignore_missing_column_families = true;
batch.Put(handles_[0], Slice("still here"), Slice("column-family"));
ASSERT_OK(db_->Write(woptions_ignore_missing_cf, &batch));
ASSERT_EQ("column-family", Get(0, "still here"));
Status s = db_->Write(WriteOptions(), &batch); Status s = db_->Write(WriteOptions(), &batch);
ASSERT_TRUE(s.IsInvalidArgument()); ASSERT_TRUE(s.IsInvalidArgument());
Close(); Close();

@ -290,8 +290,10 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
return result; return result;
} }
namespace {
Status SanitizeDBOptionsByCFOptions( Status SanitizeDBOptionsByCFOptions(
DBOptions* db_opts, const DBOptions* db_opts,
const std::vector<ColumnFamilyDescriptor>& column_families) { const std::vector<ColumnFamilyDescriptor>& column_families) {
Status s; Status s;
for (auto cf : column_families) { for (auto cf : column_families) {
@ -303,7 +305,6 @@ Status SanitizeDBOptionsByCFOptions(
return Status::OK(); return Status::OK();
} }
namespace {
CompressionType GetCompressionFlush(const Options& options) { CompressionType GetCompressionFlush(const Options& options) {
// Compressing memtable flushes might not help unless the sequential load // Compressing memtable flushes might not help unless the sequential load
// optimization is used for leveled compaction. Otherwise the CPU and // optimization is used for leveled compaction. Otherwise the CPU and
@ -631,7 +632,7 @@ bool CompareCandidateFile(const rocksdb::DBImpl::CandidateFileInfo& first,
} else if (first.file_name < second.file_name) { } else if (first.file_name < second.file_name) {
return false; return false;
} else { } else {
return (first.path_id > first.path_id); return (first.path_id > second.path_id);
} }
} }
}; // namespace }; // namespace
@ -1301,14 +1302,20 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
WriteBatch batch; WriteBatch batch;
while (reader.ReadRecord(&record, &scratch)) { while (reader.ReadRecord(&record, &scratch)) {
if (record.size() < 12) { if (record.size() < 12) {
reporter.Corruption( reporter.Corruption(record.size(),
record.size(), Status::Corruption("log record too small")); Status::Corruption("log record too small"));
continue; continue;
} }
WriteBatchInternal::SetContents(&batch, record); WriteBatchInternal::SetContents(&batch, record);
// If column family was not found, it might mean that the WAL write
// batch references to the column family that was dropped after the
// insert. We don't want to fail the whole write batch in that case -- we
// just ignore the update. That's why we set ignore missing column families
// to true
status = WriteBatchInternal::InsertInto( status = WriteBatchInternal::InsertInto(
&batch, column_family_memtables_.get(), true, log_number); &batch, column_family_memtables_.get(),
true /* ignore missing column families */, log_number);
MaybeIgnoreError(&status); MaybeIgnoreError(&status);
if (!status.ok()) { if (!status.ok()) {
@ -1677,6 +1684,13 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family,
} }
LogFlush(options_.info_log); LogFlush(options_.info_log);
{
MutexLock l(&mutex_);
// an automatic compaction that has been scheduled might have been
// preempted by the manual compactions. Need to schedule it back.
MaybeScheduleFlushOrCompaction();
}
return s; return s;
} }
@ -1864,18 +1878,15 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level,
bg_cv_.Wait(); bg_cv_.Wait();
} else { } else {
manual_compaction_ = &manual; manual_compaction_ = &manual;
MaybeScheduleFlushOrCompaction(); assert(bg_compaction_scheduled_ == 0);
bg_compaction_scheduled_++;
env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW);
} }
} }
assert(!manual.in_progress); assert(!manual.in_progress);
assert(bg_manual_only_ > 0); assert(bg_manual_only_ > 0);
--bg_manual_only_; --bg_manual_only_;
if (bg_manual_only_ == 0) {
// an automatic compaction should have been scheduled might have be
// preempted by the manual compactions. Need to schedule it back.
MaybeScheduleFlushOrCompaction();
}
return manual.status; return manual.status;
} }
@ -1963,11 +1974,11 @@ void DBImpl::MaybeScheduleFlushOrCompaction() {
// Schedule BGWorkCompaction if there's a compaction pending (or a memtable // Schedule BGWorkCompaction if there's a compaction pending (or a memtable
// flush, but the HIGH pool is not enabled) // flush, but the HIGH pool is not enabled)
// Do it only if max_background_compactions hasn't been reached and, in case // Do it only if max_background_compactions hasn't been reached and
// bg_manual_only_ > 0, if it's a manual compaction. // bg_manual_only_ == 0
if ((manual_compaction_ || is_compaction_needed || if (!bg_manual_only_ &&
(is_flush_pending && options_.max_background_flushes == 0)) && (is_compaction_needed ||
(!bg_manual_only_ || manual_compaction_)) { (is_flush_pending && options_.max_background_flushes == 0))) {
if (bg_compaction_scheduled_ < options_.max_background_compactions) { if (bg_compaction_scheduled_ < options_.max_background_compactions) {
bg_compaction_scheduled_++; bg_compaction_scheduled_++;
env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW); env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW);
@ -1979,7 +1990,7 @@ void DBImpl::MaybeScheduleFlushOrCompaction() {
} }
void DBImpl::RecordFlushIOStats() { void DBImpl::RecordFlushIOStats() {
RecordTick(stats_, FLUSH_WRITE_BYTES, iostats_context.bytes_written); RecordTick(stats_, FLUSH_WRITE_BYTES, IOSTATS(bytes_written));
IOSTATS_RESET(bytes_written); IOSTATS_RESET(bytes_written);
} }
@ -2194,6 +2205,10 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress,
if (is_manual) { if (is_manual) {
// another thread cannot pick up the same work // another thread cannot pick up the same work
manual_compaction_->in_progress = true; manual_compaction_->in_progress = true;
} else if (manual_compaction_ != nullptr) {
// there should be no automatic compactions running when manual compaction
// is running
return Status::OK();
} }
// FLUSH preempts compaction // FLUSH preempts compaction
@ -2313,7 +2328,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress,
if (status.ok()) { if (status.ok()) {
// Done // Done
} else if (shutting_down_.Acquire_Load()) { } else if (status.IsShutdownInProgress()) {
// Ignore compaction errors found during shutting down // Ignore compaction errors found during shutting down
} else { } else {
Log(InfoLogLevel::WARN_LEVEL, options_.info_log, "Compaction error: %s", Log(InfoLogLevel::WARN_LEVEL, options_.info_log, "Compaction error: %s",
@ -2573,6 +2588,10 @@ inline SequenceNumber DBImpl::findEarliestVisibleSnapshot(
uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd, uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd,
DeletionState& deletion_state, DeletionState& deletion_state,
LogBuffer* log_buffer) { LogBuffer* log_buffer) {
if (options_.max_background_flushes > 0) {
// flush thread will take care of this
return 0;
}
if (cfd->imm()->imm_flush_needed.NoBarrier_Load() != nullptr) { if (cfd->imm()->imm_flush_needed.NoBarrier_Load() != nullptr) {
const uint64_t imm_start = env_->NowMicros(); const uint64_t imm_start = env_->NowMicros();
mutex_.Lock(); mutex_.Lock();
@ -2626,9 +2645,29 @@ Status DBImpl::ProcessKeyValueCompaction(
compaction_filter = compaction_filter_from_factory.get(); compaction_filter = compaction_filter_from_factory.get();
} }
int64_t key_drop_user = 0;
int64_t key_drop_newer_entry = 0;
int64_t key_drop_obsolete = 0;
int64_t loop_cnt = 0;
while (input->Valid() && !shutting_down_.Acquire_Load() && while (input->Valid() && !shutting_down_.Acquire_Load() &&
!cfd->IsDropped()) { !cfd->IsDropped()) {
if (++loop_cnt > 1000) {
if (key_drop_user > 0) {
RecordTick(stats_, COMPACTION_KEY_DROP_USER, key_drop_user);
key_drop_user = 0;
}
if (key_drop_newer_entry > 0) {
RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY,
key_drop_newer_entry);
key_drop_newer_entry = 0;
}
if (key_drop_obsolete > 0) {
RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, key_drop_obsolete);
key_drop_obsolete = 0;
}
RecordCompactionIOStats(); RecordCompactionIOStats();
loop_cnt = 0;
}
// FLUSH preempts compaction // FLUSH preempts compaction
// TODO(icanadi) this currently only checks if flush is necessary on // TODO(icanadi) this currently only checks if flush is necessary on
// compacting column family. we should also check if flush is necessary on // compacting column family. we should also check if flush is necessary on
@ -2709,7 +2748,7 @@ Status DBImpl::ProcessKeyValueCompaction(
ParseInternalKey(key, &ikey); ParseInternalKey(key, &ikey);
// no value associated with delete // no value associated with delete
value.clear(); value.clear();
RecordTick(stats_, COMPACTION_KEY_DROP_USER); ++key_drop_user;
} else if (value_changed) { } else if (value_changed) {
value = compaction_filter_value; value = compaction_filter_value;
} }
@ -2733,7 +2772,7 @@ Status DBImpl::ProcessKeyValueCompaction(
// TODO: why not > ? // TODO: why not > ?
assert(last_sequence_for_key >= ikey.sequence); assert(last_sequence_for_key >= ikey.sequence);
drop = true; // (A) drop = true; // (A)
RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY); ++key_drop_newer_entry;
} else if (ikey.type == kTypeDeletion && } else if (ikey.type == kTypeDeletion &&
ikey.sequence <= earliest_snapshot && ikey.sequence <= earliest_snapshot &&
compact->compaction->KeyNotExistsBeyondOutputLevel(ikey.user_key)) { compact->compaction->KeyNotExistsBeyondOutputLevel(ikey.user_key)) {
@ -2745,7 +2784,7 @@ Status DBImpl::ProcessKeyValueCompaction(
// few iterations of this loop (by rule (A) above). // few iterations of this loop (by rule (A) above).
// Therefore this deletion marker is obsolete and can be dropped. // Therefore this deletion marker is obsolete and can be dropped.
drop = true; drop = true;
RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE); ++key_drop_obsolete;
} else if (ikey.type == kTypeMerge) { } else if (ikey.type == kTypeMerge) {
if (!merge.HasOperator()) { if (!merge.HasOperator()) {
LogToBuffer(log_buffer, "Options::merge_operator is null."); LogToBuffer(log_buffer, "Options::merge_operator is null.");
@ -2892,7 +2931,15 @@ Status DBImpl::ProcessKeyValueCompaction(
input->Next(); input->Next();
} }
} }
if (key_drop_user > 0) {
RecordTick(stats_, COMPACTION_KEY_DROP_USER, key_drop_user);
}
if (key_drop_newer_entry > 0) {
RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY, key_drop_newer_entry);
}
if (key_drop_obsolete > 0) {
RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, key_drop_obsolete);
}
RecordCompactionIOStats(); RecordCompactionIOStats();
return status; return status;
@ -3367,7 +3414,7 @@ Status DBImpl::GetImpl(const ReadOptions& options,
ColumnFamilyHandle* column_family, const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value, bool* value_found) { std::string* value, bool* value_found) {
StopWatch sw(env_, stats_, DB_GET); StopWatch sw(env_, stats_, DB_GET);
PERF_TIMER_AUTO(get_snapshot_time); PERF_TIMER_GUARD(get_snapshot_time);
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family); auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd(); auto cfd = cfh->cfd();
@ -3391,6 +3438,7 @@ Status DBImpl::GetImpl(const ReadOptions& options,
// merge_operands will contain the sequence of merges in the latter case. // merge_operands will contain the sequence of merges in the latter case.
LookupKey lkey(key, snapshot); LookupKey lkey(key, snapshot);
PERF_TIMER_STOP(get_snapshot_time); PERF_TIMER_STOP(get_snapshot_time);
if (sv->mem->Get(lkey, value, &s, merge_context, *cfd->options())) { if (sv->mem->Get(lkey, value, &s, merge_context, *cfd->options())) {
// Done // Done
RecordTick(stats_, MEMTABLE_HIT); RecordTick(stats_, MEMTABLE_HIT);
@ -3398,20 +3446,19 @@ Status DBImpl::GetImpl(const ReadOptions& options,
// Done // Done
RecordTick(stats_, MEMTABLE_HIT); RecordTick(stats_, MEMTABLE_HIT);
} else { } else {
PERF_TIMER_START(get_from_output_files_time); PERF_TIMER_GUARD(get_from_output_files_time);
sv->current->Get(options, lkey, value, &s, &merge_context, value_found); sv->current->Get(options, lkey, value, &s, &merge_context, value_found);
PERF_TIMER_STOP(get_from_output_files_time);
RecordTick(stats_, MEMTABLE_MISS); RecordTick(stats_, MEMTABLE_MISS);
} }
PERF_TIMER_START(get_post_process_time); {
PERF_TIMER_GUARD(get_post_process_time);
ReturnAndCleanupSuperVersion(cfd, sv); ReturnAndCleanupSuperVersion(cfd, sv);
RecordTick(stats_, NUMBER_KEYS_READ); RecordTick(stats_, NUMBER_KEYS_READ);
RecordTick(stats_, BYTES_READ, value->size()); RecordTick(stats_, BYTES_READ, value->size());
PERF_TIMER_STOP(get_post_process_time); }
return s; return s;
} }
@ -3421,7 +3468,7 @@ std::vector<Status> DBImpl::MultiGet(
const std::vector<Slice>& keys, std::vector<std::string>* values) { const std::vector<Slice>& keys, std::vector<std::string>* values) {
StopWatch sw(env_, stats_, DB_MULTIGET); StopWatch sw(env_, stats_, DB_MULTIGET);
PERF_TIMER_AUTO(get_snapshot_time); PERF_TIMER_GUARD(get_snapshot_time);
SequenceNumber snapshot; SequenceNumber snapshot;
@ -3497,7 +3544,7 @@ std::vector<Status> DBImpl::MultiGet(
} }
// Post processing (decrement reference counts and record statistics) // Post processing (decrement reference counts and record statistics)
PERF_TIMER_START(get_post_process_time); PERF_TIMER_GUARD(get_post_process_time);
autovector<SuperVersion*> superversions_to_delete; autovector<SuperVersion*> superversions_to_delete;
// TODO(icanadi) do we need lock here or just around Cleanup()? // TODO(icanadi) do we need lock here or just around Cleanup()?
@ -3870,7 +3917,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
if (my_batch == nullptr) { if (my_batch == nullptr) {
return Status::Corruption("Batch is nullptr!"); return Status::Corruption("Batch is nullptr!");
} }
PERF_TIMER_AUTO(write_pre_and_post_process_time); PERF_TIMER_GUARD(write_pre_and_post_process_time);
Writer w(&mutex_); Writer w(&mutex_);
w.batch = my_batch; w.batch = my_batch;
w.sync = options.sync; w.sync = options.sync;
@ -4003,7 +4050,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
uint64_t log_size = 0; uint64_t log_size = 0;
if (!options.disableWAL) { if (!options.disableWAL) {
PERF_TIMER_START(write_wal_time); PERF_TIMER_GUARD(write_wal_time);
Slice log_entry = WriteBatchInternal::Contents(updates); Slice log_entry = WriteBatchInternal::Contents(updates);
status = log_->AddRecord(log_entry); status = log_->AddRecord(log_entry);
total_log_size_ += log_entry.size(); total_log_size_ += log_entry.size();
@ -4021,13 +4068,13 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
status = log_->file()->Sync(); status = log_->file()->Sync();
} }
} }
PERF_TIMER_STOP(write_wal_time);
} }
if (status.ok()) { if (status.ok()) {
PERF_TIMER_START(write_memtable_time); PERF_TIMER_GUARD(write_memtable_time);
status = WriteBatchInternal::InsertInto( status = WriteBatchInternal::InsertInto(
updates, column_family_memtables_.get(), false, 0, this, false); updates, column_family_memtables_.get(),
options.ignore_missing_column_families, 0, this, false);
// A non-OK status here indicates iteration failure (either in-memory // A non-OK status here indicates iteration failure (either in-memory
// writebatch corruption (very bad), or the client specified invalid // writebatch corruption (very bad), or the client specified invalid
// column family). This will later on trigger bg_error_. // column family). This will later on trigger bg_error_.
@ -4036,8 +4083,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
// into the memtable would result in a state that some write ops might // into the memtable would result in a state that some write ops might
// have succeeded in memtable but Status reports error for all writes. // have succeeded in memtable but Status reports error for all writes.
PERF_TIMER_STOP(write_memtable_time);
SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence); SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence);
} }
PERF_TIMER_START(write_pre_and_post_process_time); PERF_TIMER_START(write_pre_and_post_process_time);
@ -4071,7 +4116,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
RecordTick(stats_, WRITE_TIMEDOUT); RecordTick(stats_, WRITE_TIMEDOUT);
} }
PERF_TIMER_STOP(write_pre_and_post_process_time);
return status; return status;
} }
@ -4759,11 +4803,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
column_families.push_back( column_families.push_back(
ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
std::vector<ColumnFamilyHandle*> handles; std::vector<ColumnFamilyHandle*> handles;
Status s = SanitizeDBOptionsByCFOptions(&db_options, column_families); Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr);
if (!s.ok()) {
return s;
}
s = DB::Open(db_options, dbname, column_families, &handles, dbptr);
if (s.ok()) { if (s.ok()) {
assert(handles.size() == 1); assert(handles.size() == 1);
// i can delete the handle since DBImpl is always holding a reference to // i can delete the handle since DBImpl is always holding a reference to
@ -4776,6 +4816,10 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
Status DB::Open(const DBOptions& db_options, const std::string& dbname, Status DB::Open(const DBOptions& db_options, const std::string& dbname,
const std::vector<ColumnFamilyDescriptor>& column_families, const std::vector<ColumnFamilyDescriptor>& column_families,
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) { std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
Status s = SanitizeDBOptionsByCFOptions(&db_options, column_families);
if (!s.ok()) {
return s;
}
if (db_options.db_paths.size() > 1) { if (db_options.db_paths.size() > 1) {
for (auto& cfd : column_families) { for (auto& cfd : column_families) {
if (cfd.options.compaction_style != kCompactionStyleUniversal) { if (cfd.options.compaction_style != kCompactionStyleUniversal) {
@ -4801,7 +4845,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
} }
DBImpl* impl = new DBImpl(db_options, dbname); DBImpl* impl = new DBImpl(db_options, dbname);
Status s = impl->env_->CreateDirIfMissing(impl->options_.wal_dir); s = impl->env_->CreateDirIfMissing(impl->options_.wal_dir);
if (s.ok()) { if (s.ok()) {
for (auto db_path : impl->options_.db_paths) { for (auto db_path : impl->options_.db_paths) {
s = impl->env_->CreateDirIfMissing(db_path.path); s = impl->env_->CreateDirIfMissing(db_path.path);

@ -74,6 +74,8 @@ class DBImplReadOnly : public DBImpl {
uint32_t target_path_id = 0) override { uint32_t target_path_id = 0) override {
return Status::NotSupported("Not supported operation in read only mode."); return Status::NotSupported("Not supported operation in read only mode.");
} }
#ifndef ROCKSDB_LITE
virtual Status DisableFileDeletions() override { virtual Status DisableFileDeletions() override {
return Status::NotSupported("Not supported operation in read only mode."); return Status::NotSupported("Not supported operation in read only mode.");
} }
@ -85,6 +87,8 @@ class DBImplReadOnly : public DBImpl {
bool flush_memtable = true) override { bool flush_memtable = true) override {
return Status::NotSupported("Not supported operation in read only mode."); return Status::NotSupported("Not supported operation in read only mode.");
} }
#endif // ROCKSDB_LITE
using DBImpl::Flush; using DBImpl::Flush;
virtual Status Flush(const FlushOptions& options, virtual Status Flush(const FlushOptions& options,
ColumnFamilyHandle* column_family) override { ColumnFamilyHandle* column_family) override {

@ -194,9 +194,8 @@ void DBIter::Next() {
// NOTE: In between, saved_key_ can point to a user key that has // NOTE: In between, saved_key_ can point to a user key that has
// a delete marker // a delete marker
inline void DBIter::FindNextUserEntry(bool skipping) { inline void DBIter::FindNextUserEntry(bool skipping) {
PERF_TIMER_AUTO(find_next_user_entry_time); PERF_TIMER_GUARD(find_next_user_entry_time);
FindNextUserEntryInternal(skipping); FindNextUserEntryInternal(skipping);
PERF_TIMER_STOP(find_next_user_entry_time);
} }
// Actual implementation of DBIter::FindNextUserEntry() // Actual implementation of DBIter::FindNextUserEntry()
@ -557,9 +556,12 @@ void DBIter::Seek(const Slice& target) {
saved_key_.Clear(); saved_key_.Clear();
// now savved_key is used to store internal key. // now savved_key is used to store internal key.
saved_key_.SetInternalKey(target, sequence_); saved_key_.SetInternalKey(target, sequence_);
PERF_TIMER_AUTO(seek_internal_seek_time);
{
PERF_TIMER_GUARD(seek_internal_seek_time);
iter_->Seek(saved_key_.GetKey()); iter_->Seek(saved_key_.GetKey());
PERF_TIMER_STOP(seek_internal_seek_time); }
if (iter_->Valid()) { if (iter_->Valid()) {
direction_ = kForward; direction_ = kForward;
ClearSavedValue(); ClearSavedValue();
@ -577,9 +579,12 @@ void DBIter::SeekToFirst() {
} }
direction_ = kForward; direction_ = kForward;
ClearSavedValue(); ClearSavedValue();
PERF_TIMER_AUTO(seek_internal_seek_time);
{
PERF_TIMER_GUARD(seek_internal_seek_time);
iter_->SeekToFirst(); iter_->SeekToFirst();
PERF_TIMER_STOP(seek_internal_seek_time); }
if (iter_->Valid()) { if (iter_->Valid()) {
FindNextUserEntry(false /* not skipping */); FindNextUserEntry(false /* not skipping */);
} else { } else {
@ -595,9 +600,11 @@ void DBIter::SeekToLast() {
} }
direction_ = kReverse; direction_ = kReverse;
ClearSavedValue(); ClearSavedValue();
PERF_TIMER_AUTO(seek_internal_seek_time);
{
PERF_TIMER_GUARD(seek_internal_seek_time);
iter_->SeekToLast(); iter_->SeekToLast();
PERF_TIMER_STOP(seek_internal_seek_time); }
PrevInternal(); PrevInternal();
} }

@ -7554,7 +7554,7 @@ TEST(DBTest, SimpleWriteTimeoutTest) {
ASSERT_OK(Put(Key(2), Key(2) + std::string(100000, 'v'), write_opt)); ASSERT_OK(Put(Key(2), Key(2) + std::string(100000, 'v'), write_opt));
// As the only two write buffers are full in this moment, the third // As the only two write buffers are full in this moment, the third
// Put is expected to be timed-out. // Put is expected to be timed-out.
write_opt.timeout_hint_us = 300; write_opt.timeout_hint_us = 50;
ASSERT_TRUE( ASSERT_TRUE(
Put(Key(3), Key(3) + std::string(100000, 'v'), write_opt).IsTimedOut()); Put(Key(3), Key(3) + std::string(100000, 'v'), write_opt).IsTimedOut());
} }

@ -127,26 +127,6 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
} }
} }
const char* InternalFilterPolicy::Name() const {
return user_policy_->Name();
}
void InternalFilterPolicy::CreateFilter(const Slice* keys, int n,
std::string* dst) const {
// We rely on the fact that the code in table.cc does not mind us
// adjusting keys[].
Slice* mkey = const_cast<Slice*>(keys);
for (int i = 0; i < n; i++) {
mkey[i] = ExtractUserKey(keys[i]);
// TODO(sanjay): Suppress dups?
}
user_policy_->CreateFilter(keys, n, dst);
}
bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
}
LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) { LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
size_t usize = user_key.size(); size_t usize = user_key.size();
size_t needed = usize + 13; // A conservative estimate size_t needed = usize + 13; // A conservative estimate

@ -124,19 +124,6 @@ class InternalKeyComparator : public Comparator {
int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const; int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
}; };
// Filter policy wrapper that converts from internal keys to user keys
class InternalFilterPolicy : public FilterPolicy {
private:
std::shared_ptr<const FilterPolicy> shared_ptr_;
const FilterPolicy* const user_policy_;
public:
explicit InternalFilterPolicy(std::shared_ptr<const FilterPolicy> p)
: shared_ptr_(p), user_policy_(p.get()) {}
virtual const char* Name() const;
virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const;
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
};
// Modules in this directory should keep internal keys wrapped inside // Modules in this directory should keep internal keys wrapped inside
// the following class instead of plain strings so that we do not // the following class instead of plain strings so that we do not
// incorrectly use string comparisons instead of an InternalKeyComparator. // incorrectly use string comparisons instead of an InternalKeyComparator.

@ -6,9 +6,10 @@
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
#include "db/forward_iterator.h" #include "db/forward_iterator.h"
#include <limits>
#include <string> #include <string>
#include <utility> #include <utility>
#include <limits>
#include "db/db_impl.h" #include "db/db_impl.h"
#include "db/db_iter.h" #include "db/db_iter.h"
#include "db/column_family.h" #include "db/column_family.h"
@ -37,12 +38,16 @@ class LevelIterator : public Iterator {
assert(file_index < files_.size()); assert(file_index < files_.size());
if (file_index != file_index_) { if (file_index != file_index_) {
file_index_ = file_index; file_index_ = file_index;
Reset();
}
valid_ = false;
}
void Reset() {
assert(file_index_ < files_.size());
file_iter_.reset(cfd_->table_cache()->NewIterator( file_iter_.reset(cfd_->table_cache()->NewIterator(
read_options_, *(cfd_->soptions()), cfd_->internal_comparator(), read_options_, *(cfd_->soptions()), cfd_->internal_comparator(),
files_[file_index_]->fd, nullptr /* table_reader_ptr */, false)); files_[file_index_]->fd, nullptr /* table_reader_ptr */, false));
} }
valid_ = false;
}
void SeekToLast() override { void SeekToLast() override {
status_ = Status::NotSupported("LevelIterator::SeekToLast()"); status_ = Status::NotSupported("LevelIterator::SeekToLast()");
valid_ = false; valid_ = false;
@ -63,12 +68,15 @@ class LevelIterator : public Iterator {
assert(file_iter_ != nullptr); assert(file_iter_ != nullptr);
file_iter_->Seek(internal_key); file_iter_->Seek(internal_key);
valid_ = file_iter_->Valid(); valid_ = file_iter_->Valid();
assert(valid_);
} }
void Next() override { void Next() override {
assert(valid_); assert(valid_);
file_iter_->Next(); file_iter_->Next();
while (!file_iter_->Valid()) { for (;;) {
if (file_iter_->status().IsIncomplete() || file_iter_->Valid()) {
valid_ = !file_iter_->status().IsIncomplete();
return;
}
if (file_index_ + 1 >= files_.size()) { if (file_index_ + 1 >= files_.size()) {
valid_ = false; valid_ = false;
return; return;
@ -76,7 +84,6 @@ class LevelIterator : public Iterator {
SetFileIndex(file_index_ + 1); SetFileIndex(file_index_ + 1);
file_iter_->SeekToFirst(); file_iter_->SeekToFirst();
} }
valid_ = file_iter_->Valid();
} }
Slice key() const override { Slice key() const override {
assert(valid_); assert(valid_);
@ -160,6 +167,8 @@ void ForwardIterator::SeekToFirst() {
if (sv_ == nullptr || if (sv_ == nullptr ||
sv_ ->version_number != cfd_->GetSuperVersionNumber()) { sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
RebuildIterators(); RebuildIterators();
} else if (status_.IsIncomplete()) {
ResetIncompleteIterators();
} }
SeekInternal(Slice(), true); SeekInternal(Slice(), true);
} }
@ -168,6 +177,8 @@ void ForwardIterator::Seek(const Slice& internal_key) {
if (sv_ == nullptr || if (sv_ == nullptr ||
sv_ ->version_number != cfd_->GetSuperVersionNumber()) { sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
RebuildIterators(); RebuildIterators();
} else if (status_.IsIncomplete()) {
ResetIncompleteIterators();
} }
SeekInternal(internal_key, false); SeekInternal(internal_key, false);
} }
@ -211,7 +222,15 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,
} }
l0_iters_[i]->Seek(internal_key); l0_iters_[i]->Seek(internal_key);
} }
if (l0_iters_[i]->Valid()) {
if (l0_iters_[i]->status().IsIncomplete()) {
// if any of the immutable iterators is incomplete (no-io option was
// used), we are unable to reliably find the smallest key
assert(read_options_.read_tier == kBlockCacheTier);
status_ = l0_iters_[i]->status();
valid_ = false;
return;
} else if (l0_iters_[i]->Valid()) {
immutable_min_heap_.push(l0_iters_[i]); immutable_min_heap_.push(l0_iters_[i]);
} }
} }
@ -280,7 +299,14 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,
level_iters_[level - 1]->SetFileIndex(f_idx); level_iters_[level - 1]->SetFileIndex(f_idx);
seek_to_first ? level_iters_[level - 1]->SeekToFirst() : seek_to_first ? level_iters_[level - 1]->SeekToFirst() :
level_iters_[level - 1]->Seek(internal_key); level_iters_[level - 1]->Seek(internal_key);
if (level_iters_[level - 1]->Valid()) {
if (level_iters_[level - 1]->status().IsIncomplete()) {
// see above
assert(read_options_.read_tier == kBlockCacheTier);
status_ = level_iters_[level - 1]->status();
valid_ = false;
return;
} else if (level_iters_[level - 1]->Valid()) {
immutable_min_heap_.push(level_iters_[level - 1]); immutable_min_heap_.push(level_iters_[level - 1]);
} }
} }
@ -320,9 +346,17 @@ void ForwardIterator::Next() {
} }
current_->Next(); current_->Next();
if (current_->Valid() && current_ != mutable_iter_) { if (current_ != mutable_iter_) {
if (current_->status().IsIncomplete()) {
assert(read_options_.read_tier == kBlockCacheTier);
status_ = current_->status();
valid_ = false;
return;
} else if (current_->Valid()) {
immutable_min_heap_.push(current_); immutable_min_heap_.push(current_);
} }
}
UpdateCurrent(); UpdateCurrent();
} }
@ -389,6 +423,29 @@ void ForwardIterator::RebuildIterators() {
is_prev_set_ = false; is_prev_set_ = false;
} }
// Recreates or resets only those child iterators whose status is Incomplete,
// leaving every other child iterator untouched. It is invoked before
// SeekInternal() when the super version is unchanged but status_ is
// Incomplete.
void ForwardIterator::ResetIncompleteIterators() {
const auto& l0_files = sv_->current->files_[0];
for (uint32_t i = 0; i < l0_iters_.size(); ++i) {
// l0_iters_[i] is rebuilt from l0_files[i], so the index must be in range.
assert(i < l0_files.size());
if (!l0_iters_[i]->status().IsIncomplete()) {
continue;
}
// Replace the incomplete iterator with a fresh one over the same file.
delete l0_iters_[i];
l0_iters_[i] = cfd_->table_cache()->NewIterator(
read_options_, *cfd_->soptions(), cfd_->internal_comparator(),
l0_files[i]->fd);
}
// Entries of level_iters_ may be null; only incomplete ones are reset.
for (auto* level_iter : level_iters_) {
if (level_iter && level_iter->status().IsIncomplete()) {
level_iter->Reset();
}
}
// Clearing is_prev_set_ makes NeedToSeekImmutable() return true.
current_ = nullptr;
is_prev_set_ = false;
}
void ForwardIterator::UpdateCurrent() { void ForwardIterator::UpdateCurrent() {
if (immutable_min_heap_.empty() && !mutable_iter_->Valid()) { if (immutable_min_heap_.empty() && !mutable_iter_->Valid()) {
current_ = nullptr; current_ = nullptr;
@ -417,7 +474,7 @@ void ForwardIterator::UpdateCurrent() {
} }
bool ForwardIterator::NeedToSeekImmutable(const Slice& target) { bool ForwardIterator::NeedToSeekImmutable(const Slice& target) {
if (!is_prev_set_) { if (!valid_ || !is_prev_set_) {
return true; return true;
} }
Slice prev_key = prev_key_.GetKey(); Slice prev_key = prev_key_.GetKey();

@ -73,6 +73,7 @@ class ForwardIterator : public Iterator {
private: private:
void Cleanup(); void Cleanup();
void RebuildIterators(); void RebuildIterators();
void ResetIncompleteIterators();
void SeekInternal(const Slice& internal_key, bool seek_to_first); void SeekInternal(const Slice& internal_key, bool seek_to_first);
void UpdateCurrent(); void UpdateCurrent();
bool NeedToSeekImmutable(const Slice& internal_key); bool NeedToSeekImmutable(const Slice& internal_key);

@ -257,9 +257,11 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type,
cfd_->imm()->current()->GetTotalNumEntries() + cfd_->imm()->current()->GetTotalNumEntries() +
current->GetEstimatedActiveKeys(); current->GetEstimatedActiveKeys();
return true; return true;
#ifndef ROCKSDB_LITE
case kIsFileDeletionEnabled: case kIsFileDeletionEnabled:
*value = db->IsFileDeletionsEnabled(); *value = db->IsFileDeletionsEnabled();
return true; return true;
#endif
default: default:
return false; return false;
} }

@ -422,7 +422,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
// Avoiding recording stats for speed. // Avoiding recording stats for speed.
return false; return false;
} }
PERF_TIMER_AUTO(get_from_memtable_time); PERF_TIMER_GUARD(get_from_memtable_time);
Slice user_key = key.user_key(); Slice user_key = key.user_key();
bool found_final_value = false; bool found_final_value = false;
@ -452,7 +452,6 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
if (!found_final_value && merge_in_progress) { if (!found_final_value && merge_in_progress) {
*s = Status::MergeInProgress(""); *s = Status::MergeInProgress("");
} }
PERF_TIMER_STOP(get_from_memtable_time);
PERF_COUNTER_ADD(get_from_memtable_count, 1); PERF_COUNTER_ADD(get_from_memtable_count, 1);
return found_final_value; return found_final_value;
} }

@ -556,7 +556,7 @@ public:
WritableFile* file, WritableFile* file,
CompressionType compression_type) const; CompressionType compression_type) const;
virtual Status SanitizeDBOptions(DBOptions* db_opts) const override { virtual Status SanitizeDBOptions(const DBOptions* db_opts) const override {
return Status::OK(); return Status::OK();
} }

@ -299,17 +299,17 @@ class MemTableInserter : public WriteBatch::Handler {
public: public:
SequenceNumber sequence_; SequenceNumber sequence_;
ColumnFamilyMemTables* cf_mems_; ColumnFamilyMemTables* cf_mems_;
bool recovery_; bool ignore_missing_column_families_;
uint64_t log_number_; uint64_t log_number_;
DBImpl* db_; DBImpl* db_;
const bool dont_filter_deletes_; const bool dont_filter_deletes_;
MemTableInserter(SequenceNumber sequence, ColumnFamilyMemTables* cf_mems, MemTableInserter(SequenceNumber sequence, ColumnFamilyMemTables* cf_mems,
bool recovery, uint64_t log_number, DB* db, bool ignore_missing_column_families, uint64_t log_number,
const bool dont_filter_deletes) DB* db, const bool dont_filter_deletes)
: sequence_(sequence), : sequence_(sequence),
cf_mems_(cf_mems), cf_mems_(cf_mems),
recovery_(recovery), ignore_missing_column_families_(ignore_missing_column_families),
log_number_(log_number), log_number_(log_number),
db_(reinterpret_cast<DBImpl*>(db)), db_(reinterpret_cast<DBImpl*>(db)),
dont_filter_deletes_(dont_filter_deletes) { dont_filter_deletes_(dont_filter_deletes) {
@ -321,12 +321,18 @@ class MemTableInserter : public WriteBatch::Handler {
bool SeekToColumnFamily(uint32_t column_family_id, Status* s) { bool SeekToColumnFamily(uint32_t column_family_id, Status* s) {
bool found = cf_mems_->Seek(column_family_id); bool found = cf_mems_->Seek(column_family_id);
if (recovery_ && (!found || log_number_ < cf_mems_->GetLogNumber())) { if (!found) {
// if in recovery envoronment: if (ignore_missing_column_families_) {
// * If column family was not found, it might mean that the WAL write *s = Status::OK();
// batch references to the column family that was dropped after the } else {
// insert. We don't want to fail the whole write batch in that case -- we *s = Status::InvalidArgument(
// just ignore the update. "Invalid column family specified in write batch");
}
return false;
}
if (log_number_ != 0 && log_number_ < cf_mems_->GetLogNumber()) {
// This is true only in recovery environment (log_number_ is always 0 in
// non-recovery, regular write code-path)
// * If log_number_ < cf_mems_->GetLogNumber(), this means that column // * If log_number_ < cf_mems_->GetLogNumber(), this means that column
// family already contains updates from this log. We can't apply updates // family already contains updates from this log. We can't apply updates
// twice because of update-in-place or merge workloads -- ignore the // twice because of update-in-place or merge workloads -- ignore the
@ -334,18 +340,8 @@ class MemTableInserter : public WriteBatch::Handler {
*s = Status::OK(); *s = Status::OK();
return false; return false;
} }
if (!found) {
assert(!recovery_);
// If the column family was not found in non-recovery enviornment
// (client's write code-path), we have to fail the write and return
// the failure status to the client.
*s = Status::InvalidArgument(
"Invalid column family specified in write batch");
return false;
}
return true; return true;
} }
virtual Status PutCF(uint32_t column_family_id, const Slice& key, virtual Status PutCF(uint32_t column_family_id, const Slice& key,
const Slice& value) { const Slice& value) {
Status seek_status; Status seek_status;
@ -503,10 +499,12 @@ class MemTableInserter : public WriteBatch::Handler {
Status WriteBatchInternal::InsertInto(const WriteBatch* b, Status WriteBatchInternal::InsertInto(const WriteBatch* b,
ColumnFamilyMemTables* memtables, ColumnFamilyMemTables* memtables,
bool recovery, uint64_t log_number, bool ignore_missing_column_families,
DB* db, const bool dont_filter_deletes) { uint64_t log_number, DB* db,
const bool dont_filter_deletes) {
MemTableInserter inserter(WriteBatchInternal::Sequence(b), memtables, MemTableInserter inserter(WriteBatchInternal::Sequence(b), memtables,
recovery, log_number, db, dont_filter_deletes); ignore_missing_column_families, log_number, db,
dont_filter_deletes);
return b->Iterate(&inserter); return b->Iterate(&inserter);
} }

@ -106,18 +106,18 @@ class WriteBatchInternal {
// Inserts batch entries into memtable // Inserts batch entries into memtable
// If dont_filter_deletes is false AND options.filter_deletes is true, // If dont_filter_deletes is false AND options.filter_deletes is true,
// then --> Drops deletes in batch if db->KeyMayExist returns false // then --> Drops deletes in batch if db->KeyMayExist returns false
// If recovery == true, this means InsertInto is executed on a recovery // If ignore_missing_column_families == true, WriteBatch referencing
// code-path. WriteBatch referencing a dropped column family can be // non-existing column family should be ignored.
// found on a recovery code-path and should be ignored (recovery should not // However, if ignore_missing_column_families == false, any WriteBatch
// fail). Additionally, the memtable will be updated only if // referencing non-existing column family will return an InvalidArgument()
// failure.
//
// If log_number is non-zero, the memtable will be updated only if
// memtables->GetLogNumber() >= log_number // memtables->GetLogNumber() >= log_number
// However, if recovery == false, any WriteBatch referencing
// non-existing column family will return a failure. Also, log_number is
// ignored in that case
static Status InsertInto(const WriteBatch* batch, static Status InsertInto(const WriteBatch* batch,
ColumnFamilyMemTables* memtables, ColumnFamilyMemTables* memtables,
bool recovery = false, uint64_t log_number = 0, bool ignore_missing_column_families = false,
DB* db = nullptr, uint64_t log_number = 0, DB* db = nullptr,
const bool dont_filter_deletes = true); const bool dont_filter_deletes = true);
static void Append(WriteBatch* dst, const WriteBatch* src); static void Append(WriteBatch* dst, const WriteBatch* src);

@ -27,7 +27,9 @@ struct IOStatsContext {
uint64_t bytes_read; uint64_t bytes_read;
}; };
#ifndef IOS_CROSS_COMPILE
extern __thread IOStatsContext iostats_context; extern __thread IOStatsContext iostats_context;
#endif // IOS_CROSS_COMPILE
} // namespace rocksdb } // namespace rocksdb

@ -959,7 +959,17 @@ struct WriteOptions {
// Default: 0 // Default: 0
uint64_t timeout_hint_us; uint64_t timeout_hint_us;
WriteOptions() : sync(false), disableWAL(false), timeout_hint_us(0) {} // If true and if user is trying to write to column families that don't exist
// (they were dropped), ignore the write (don't return an error). If there
// are multiple writes in a WriteBatch, other writes will succeed.
// Default: false
bool ignore_missing_column_families;
// Default options: no sync, WAL enabled, timeout_hint_us of 0, and writes to
// missing column families are not ignored.
WriteOptions()
: sync(false),
disableWAL(false),
timeout_hint_us(0),
ignore_missing_column_families(false) {}
}; };
// Options that control flush operations // Options that control flush operations

@ -96,7 +96,7 @@ class Status {
// Returns true iff the status indicates Incomplete // Returns true iff the status indicates Incomplete
bool IsIncomplete() const { return code() == kIncomplete; } bool IsIncomplete() const { return code() == kIncomplete; }
// Returns true iff the status indicates Incomplete // Returns true iff the status indicates Shutdown In progress
bool IsShutdownInProgress() const { return code() == kShutdownInProgress; } bool IsShutdownInProgress() const { return code() == kShutdownInProgress; }
bool IsTimedOut() const { return code() == kTimedOut; } bool IsTimedOut() const { return code() == kTimedOut; }

@ -227,15 +227,46 @@ extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options =
PlainTableOptions()); PlainTableOptions());
struct CuckooTablePropertyNames { struct CuckooTablePropertyNames {
// The key that is used to fill empty buckets.
static const std::string kEmptyKey; static const std::string kEmptyKey;
// Fixed length of value.
static const std::string kValueLength; static const std::string kValueLength;
static const std::string kNumHashTable; // Number of hash functions used in Cuckoo Hash.
static const std::string kMaxNumBuckets; static const std::string kNumHashFunc;
// It denotes the number of buckets in a Cuckoo Block. Given a key and a
// particular hash function, a Cuckoo Block is a set of consecutive buckets,
// where starting bucket id is given by the hash function on the key. In case
// of a collision during inserting the key, the builder tries to insert the
// key in other locations of the cuckoo block before using the next hash
// function. This reduces cache miss during read operation in case of
// collision.
static const std::string kCuckooBlockSize;
// Size of the hash table. Use this number to compute the modulo of hash
// function. The actual number of buckets will be kHashTableSize +
// kCuckooBlockSize - 1. The last kCuckooBlockSize-1 buckets are used to
// accommodate the Cuckoo Block from end of hash table, due to cache friendly
// implementation.
static const std::string kHashTableSize;
// Denotes if the key stored in the file is Internal Key (if false)
// or User Key only (if true).
static const std::string kIsLastLevel; static const std::string kIsLastLevel;
}; };
// Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo Hashing
// @hash_table_ratio: Determines the utilization of hash tables. Smaller values
// result in larger hash tables with fewer collisions.
// @max_search_depth: A property used by builder to determine the depth to go to
// to search for a path to displace elements in case of
// collision. See Builder.MakeSpaceForKey method. Higher
// values result in more efficient hash tables with fewer
// lookups but take more time to build.
// @cuckoo_block_size: In case of collision while inserting, the builder
// attempts to insert in the next cuckoo_block_size
// locations before skipping over to the next Cuckoo hash
// function. This makes lookups more cache friendly in case
// of collisions.
extern TableFactory* NewCuckooTableFactory(double hash_table_ratio = 0.9, extern TableFactory* NewCuckooTableFactory(double hash_table_ratio = 0.9,
uint32_t max_search_depth = 100); uint32_t max_search_depth = 100, uint32_t cuckoo_block_size = 5);
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE
@ -300,7 +331,7 @@ class TableFactory {
// //
// If the function cannot find a way to sanitize the input DB Options, // If the function cannot find a way to sanitize the input DB Options,
// a non-ok Status will be returned. // a non-ok Status will be returned.
virtual Status SanitizeDBOptions(DBOptions* db_opts) const = 0; virtual Status SanitizeDBOptions(const DBOptions* db_opts) const = 0;
// Return a string that contains printable format of table configurations. // Return a string that contains printable format of table configurations.
// RocksDB prints configurations at DB Open(). // RocksDB prints configurations at DB Open().

@ -43,7 +43,7 @@ class AdaptiveTableFactory : public TableFactory {
override; override;
// Sanitizes the specified DB Options. // Sanitizes the specified DB Options.
Status SanitizeDBOptions(DBOptions* db_opts) const override { Status SanitizeDBOptions(const DBOptions* db_opts) const override {
if (db_opts->allow_mmap_reads == false) { if (db_opts->allow_mmap_reads == false) {
return Status::NotSupported( return Status::NotSupported(
"AdaptiveTable with allow_mmap_reads == false is not supported."); "AdaptiveTable with allow_mmap_reads == false is not supported.");

@ -116,7 +116,7 @@ class ShortenedIndexBuilder : public IndexBuilder {
public: public:
explicit ShortenedIndexBuilder(const Comparator* comparator) explicit ShortenedIndexBuilder(const Comparator* comparator)
: IndexBuilder(comparator), : IndexBuilder(comparator),
index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {} index_block_builder_(1 /* block_restart_interval == 1 */) {}
virtual void AddIndexEntry(std::string* last_key_in_current_block, virtual void AddIndexEntry(std::string* last_key_in_current_block,
const Slice* first_key_in_next_block, const Slice* first_key_in_next_block,
@ -420,7 +420,7 @@ struct BlockBasedTableBuilder::Rep {
table_options(table_opt), table_options(table_opt),
internal_comparator(icomparator), internal_comparator(icomparator),
file(f), file(f),
data_block(table_options.block_restart_interval, &internal_comparator), data_block(table_options.block_restart_interval),
internal_prefix_transform(options.prefix_extractor.get()), internal_prefix_transform(options.prefix_extractor.get()),
index_builder(CreateIndexBuilder( index_builder(CreateIndexBuilder(
table_options.index_type, &internal_comparator, table_options.index_type, &internal_comparator,
@ -492,7 +492,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
} }
if (r->filter_block != nullptr) { if (r->filter_block != nullptr) {
r->filter_block->AddKey(key); r->filter_block->AddKey(ExtractUserKey(key));
} }
r->last_key.assign(key.data(), key.size()); r->last_key.assign(key.data(), key.size());

@ -38,10 +38,6 @@ BlockBasedTableFactory::BlockBasedTableFactory(
table_options_.block_size_deviation > 100) { table_options_.block_size_deviation > 100) {
table_options_.block_size_deviation = 0; table_options_.block_size_deviation = 0;
} }
if (table_options_.filter_policy) {
auto* p = new InternalFilterPolicy(table_options_.filter_policy);
table_options_.filter_policy.reset(p);
}
} }
Status BlockBasedTableFactory::NewTableReader( Status BlockBasedTableFactory::NewTableReader(

@ -45,7 +45,7 @@ class BlockBasedTableFactory : public TableFactory {
WritableFile* file, CompressionType compression_type) const override; WritableFile* file, CompressionType compression_type) const override;
// Sanitizes the specified DB Options. // Sanitizes the specified DB Options.
Status SanitizeDBOptions(DBOptions* db_opts) const override { Status SanitizeDBOptions(const DBOptions* db_opts) const override {
return Status::OK(); return Status::OK();
} }

@ -1067,9 +1067,8 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) {
s = handle.DecodeFrom(&handle_value); s = handle.DecodeFrom(&handle_value);
assert(s.ok()); assert(s.ok());
auto filter_entry = GetFilter(true /* no io */); auto filter_entry = GetFilter(true /* no io */);
may_match = may_match = filter_entry.value == nullptr ||
filter_entry.value == nullptr || filter_entry.value->PrefixMayMatch(handle.offset(), prefix);
filter_entry.value->PrefixMayMatch(handle.offset(), internal_prefix);
filter_entry.Release(rep_->table_options.block_cache.get()); filter_entry.Release(rep_->table_options.block_cache.get());
} }
@ -1105,9 +1104,8 @@ Status BlockBasedTable::Get(
BlockHandle handle; BlockHandle handle;
bool may_not_exist_in_filter = bool may_not_exist_in_filter =
filter != nullptr && filter != nullptr && handle.DecodeFrom(&handle_value).ok() &&
handle.DecodeFrom(&handle_value).ok() && !filter->KeyMayMatch(handle.offset(), ExtractUserKey(key));
!filter->KeyMayMatch(handle.offset(), key);
if (may_not_exist_in_filter) { if (may_not_exist_in_filter) {
// Not found // Not found

@ -41,10 +41,8 @@
namespace rocksdb { namespace rocksdb {
BlockBuilder::BlockBuilder(int block_restart_interval, BlockBuilder::BlockBuilder(int block_restart_interval)
const Comparator* comparator)
: block_restart_interval_(block_restart_interval), : block_restart_interval_(block_restart_interval),
comparator_(comparator),
restarts_(), restarts_(),
counter_(0), counter_(0),
finished_(false) { finished_(false) {
@ -96,8 +94,6 @@ void BlockBuilder::Add(const Slice& key, const Slice& value) {
Slice last_key_piece(last_key_); Slice last_key_piece(last_key_);
assert(!finished_); assert(!finished_);
assert(counter_ <= block_restart_interval_); assert(counter_ <= block_restart_interval_);
assert(buffer_.empty() // No values yet?
|| comparator_->Compare(key, last_key_piece) > 0);
size_t shared = 0; size_t shared = 0;
if (counter_ < block_restart_interval_) { if (counter_ < block_restart_interval_) {
// See how much sharing to do with previous string // See how much sharing to do with previous string

@ -22,7 +22,7 @@ class BlockBuilder {
BlockBuilder(const BlockBuilder&) = delete; BlockBuilder(const BlockBuilder&) = delete;
void operator=(const BlockBuilder&) = delete; void operator=(const BlockBuilder&) = delete;
BlockBuilder(int block_restart_interval, const Comparator* comparator); explicit BlockBuilder(int block_restart_interval);
// Reset the contents as if the BlockBuilder was just constructed. // Reset the contents as if the BlockBuilder was just constructed.
void Reset(); void Reset();
@ -50,7 +50,6 @@ class BlockBuilder {
private: private:
const int block_restart_interval_; const int block_restart_interval_;
const Comparator* comparator_;
std::string buffer_; // Destination buffer std::string buffer_; // Destination buffer
std::vector<uint32_t> restarts_; // Restart points std::vector<uint32_t> restarts_; // Restart points

@ -76,7 +76,7 @@ TEST(BlockTest, SimpleTest) {
std::vector<std::string> keys; std::vector<std::string> keys;
std::vector<std::string> values; std::vector<std::string> values;
BlockBuilder builder(16, ic.get()); BlockBuilder builder(16);
int num_records = 100000; int num_records = 100000;
GenerateRandomKVs(&keys, &values, 0, num_records); GenerateRandomKVs(&keys, &values, 0, num_records);
@ -132,8 +132,7 @@ BlockContents GetBlockContents(std::unique_ptr<BlockBuilder> *builder,
const std::vector<std::string> &keys, const std::vector<std::string> &keys,
const std::vector<std::string> &values, const std::vector<std::string> &values,
const int prefix_group_size = 1) { const int prefix_group_size = 1) {
builder->reset( builder->reset(new BlockBuilder(1 /* restart interval */));
new BlockBuilder(1 /* restart interval */, BytewiseComparator()));
// Add only half of the keys // Add only half of the keys
for (size_t i = 0; i < keys.size(); ++i) { for (size_t i = 0; i < keys.size(); ++i) {

@ -16,6 +16,7 @@
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/table.h" #include "rocksdb/table.h"
#include "table/block_builder.h" #include "table/block_builder.h"
#include "table/cuckoo_table_factory.h"
#include "table/format.h" #include "table/format.h"
#include "table/meta_blocks.h" #include "table/meta_blocks.h"
#include "util/autovector.h" #include "util/autovector.h"
@ -24,28 +25,32 @@
namespace rocksdb { namespace rocksdb {
const std::string CuckooTablePropertyNames::kEmptyKey = const std::string CuckooTablePropertyNames::kEmptyKey =
"rocksdb.cuckoo.bucket.empty.key"; "rocksdb.cuckoo.bucket.empty.key";
const std::string CuckooTablePropertyNames::kNumHashTable = const std::string CuckooTablePropertyNames::kNumHashFunc =
"rocksdb.cuckoo.hash.num"; "rocksdb.cuckoo.hash.num";
const std::string CuckooTablePropertyNames::kMaxNumBuckets = const std::string CuckooTablePropertyNames::kHashTableSize =
"rocksdb.cuckoo.bucket.maxnum"; "rocksdb.cuckoo.hash.size";
const std::string CuckooTablePropertyNames::kValueLength = const std::string CuckooTablePropertyNames::kValueLength =
"rocksdb.cuckoo.value.length"; "rocksdb.cuckoo.value.length";
const std::string CuckooTablePropertyNames::kIsLastLevel = const std::string CuckooTablePropertyNames::kIsLastLevel =
"rocksdb.cuckoo.file.islastlevel"; "rocksdb.cuckoo.file.islastlevel";
const std::string CuckooTablePropertyNames::kCuckooBlockSize =
"rocksdb.cuckoo.hash.cuckooblocksize";
// Obtained by running echo rocksdb.table.cuckoo | sha1sum // Obtained by running echo rocksdb.table.cuckoo | sha1sum
extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull; extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull;
CuckooTableBuilder::CuckooTableBuilder( CuckooTableBuilder::CuckooTableBuilder(
WritableFile* file, double hash_table_ratio, WritableFile* file, double max_hash_table_ratio,
uint32_t max_num_hash_table, uint32_t max_search_depth, uint32_t max_num_hash_table, uint32_t max_search_depth,
const Comparator* user_comparator, const Comparator* user_comparator, uint32_t cuckoo_block_size,
uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t))
: num_hash_table_(2), : num_hash_func_(2),
file_(file), file_(file),
hash_table_ratio_(hash_table_ratio), max_hash_table_ratio_(max_hash_table_ratio),
max_num_hash_table_(max_num_hash_table), max_num_hash_func_(max_num_hash_table),
max_search_depth_(max_search_depth), max_search_depth_(max_search_depth),
cuckoo_block_size_(std::max(1U, cuckoo_block_size)),
hash_table_size_(2),
is_last_level_file_(false), is_last_level_file_(false),
has_seen_first_key_(false), has_seen_first_key_(false),
ucomp_(user_comparator), ucomp_(user_comparator),
@ -86,7 +91,6 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) {
} else { } else {
kvs_.emplace_back(std::make_pair(key.ToString(), value.ToString())); kvs_.emplace_back(std::make_pair(key.ToString(), value.ToString()));
} }
properties_.num_entries++;
// In order to fill the empty buckets in the hash table, we identify a // In order to fill the empty buckets in the hash table, we identify a
// key which is not used so far (unused_user_key). We determine this by // key which is not used so far (unused_user_key). We determine this by
@ -98,11 +102,14 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) {
} else if (ikey.user_key.compare(largest_user_key_) > 0) { } else if (ikey.user_key.compare(largest_user_key_) > 0) {
largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
} }
if (hash_table_size_ < kvs_.size() / max_hash_table_ratio_) {
hash_table_size_ *= 2;
}
} }
Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) { Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) {
uint64_t num_buckets = kvs_.size() / hash_table_ratio_; uint64_t hash_table_size_minus_one = hash_table_size_ - 1;
buckets->resize(num_buckets); buckets->resize(hash_table_size_minus_one + cuckoo_block_size_);
uint64_t make_space_for_key_call_id = 0; uint64_t make_space_for_key_call_id = 0;
for (uint32_t vector_idx = 0; vector_idx < kvs_.size(); vector_idx++) { for (uint32_t vector_idx = 0; vector_idx < kvs_.size(); vector_idx++) {
uint64_t bucket_id; uint64_t bucket_id;
@ -110,8 +117,15 @@ Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) {
autovector<uint64_t> hash_vals; autovector<uint64_t> hash_vals;
Slice user_key = is_last_level_file_ ? kvs_[vector_idx].first : Slice user_key = is_last_level_file_ ? kvs_[vector_idx].first :
ExtractUserKey(kvs_[vector_idx].first); ExtractUserKey(kvs_[vector_idx].first);
for (uint32_t hash_cnt = 0; hash_cnt < num_hash_table_; ++hash_cnt) { for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found;
uint64_t hash_val = get_slice_hash_(user_key, hash_cnt, num_buckets); ++hash_cnt) {
uint64_t hash_val = CuckooHash(user_key, hash_cnt,
hash_table_size_minus_one, get_slice_hash_);
// If there is a collision, check next cuckoo_block_size_ locations for
// empty locations. While checking, if we reach end of the hash table,
// stop searching and proceed for next hash function.
for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
++block_idx, ++hash_val) {
if ((*buckets)[hash_val].vector_idx == kMaxVectorIdx) { if ((*buckets)[hash_val].vector_idx == kMaxVectorIdx) {
bucket_id = hash_val; bucket_id = hash_val;
bucket_found = true; bucket_found = true;
@ -126,17 +140,20 @@ Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) {
hash_vals.push_back(hash_val); hash_vals.push_back(hash_val);
} }
} }
}
while (!bucket_found && !MakeSpaceForKey(hash_vals, while (!bucket_found && !MakeSpaceForKey(hash_vals,
++make_space_for_key_call_id, buckets, &bucket_id)) { ++make_space_for_key_call_id, buckets, &bucket_id)) {
// Rehash by increasing number of hash tables. // Rehash by increasing number of hash tables.
if (num_hash_table_ >= max_num_hash_table_) { if (num_hash_func_ >= max_num_hash_func_) {
return Status::NotSupported("Too many collissions. Unable to hash."); return Status::NotSupported("Too many collisions. Unable to hash.");
} }
// We don't really need to rehash the entire table because old hashes are // We don't really need to rehash the entire table because old hashes are
// still valid and we only increased the number of hash functions. // still valid and we only increased the number of hash functions.
uint64_t hash_val = get_slice_hash_(user_key, uint64_t hash_val = CuckooHash(user_key, num_hash_func_,
num_hash_table_, num_buckets); hash_table_size_minus_one, get_slice_hash_);
++num_hash_table_; ++num_hash_func_;
for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
++block_idx, ++hash_val) {
if ((*buckets)[hash_val].vector_idx == kMaxVectorIdx) { if ((*buckets)[hash_val].vector_idx == kMaxVectorIdx) {
bucket_found = true; bucket_found = true;
bucket_id = hash_val; bucket_id = hash_val;
@ -145,6 +162,7 @@ Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) {
hash_vals.push_back(hash_val); hash_vals.push_back(hash_val);
} }
} }
}
(*buckets)[bucket_id].vector_idx = vector_idx; (*buckets)[bucket_id].vector_idx = vector_idx;
} }
return Status::OK(); return Status::OK();
@ -154,13 +172,14 @@ Status CuckooTableBuilder::Finish() {
assert(!closed_); assert(!closed_);
closed_ = true; closed_ = true;
std::vector<CuckooBucket> buckets; std::vector<CuckooBucket> buckets;
Status s = MakeHashTable(&buckets); Status s;
std::string unused_bucket;
if (!kvs_.empty()) {
s = MakeHashTable(&buckets);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
// Determine unused_user_key to fill empty buckets. // Determine unused_user_key to fill empty buckets.
std::string unused_bucket;
if (!kvs_.empty()) {
std::string unused_user_key = smallest_user_key_; std::string unused_user_key = smallest_user_key_;
int curr_pos = unused_user_key.size() - 1; int curr_pos = unused_user_key.size() - 1;
while (curr_pos >= 0) { while (curr_pos >= 0) {
@ -192,6 +211,7 @@ Status CuckooTableBuilder::Finish() {
AppendInternalKey(&unused_bucket, ikey); AppendInternalKey(&unused_bucket, ikey);
} }
} }
properties_.num_entries = kvs_.size();
properties_.fixed_key_len = unused_bucket.size(); properties_.fixed_key_len = unused_bucket.size();
uint32_t value_length = kvs_.empty() ? 0 : kvs_[0].second.size(); uint32_t value_length = kvs_.empty() ? 0 : kvs_[0].second.size();
uint32_t bucket_size = value_length + properties_.fixed_key_len; uint32_t bucket_size = value_length + properties_.fixed_key_len;
@ -226,16 +246,22 @@ Status CuckooTableBuilder::Finish() {
properties_.user_collected_properties[ properties_.user_collected_properties[
CuckooTablePropertyNames::kEmptyKey] = unused_bucket; CuckooTablePropertyNames::kEmptyKey] = unused_bucket;
properties_.user_collected_properties[ properties_.user_collected_properties[
CuckooTablePropertyNames::kNumHashTable].assign( CuckooTablePropertyNames::kNumHashFunc].assign(
reinterpret_cast<char*>(&num_hash_table_), sizeof(num_hash_table_)); reinterpret_cast<char*>(&num_hash_func_), sizeof(num_hash_func_));
uint64_t num_buckets = buckets.size();
uint64_t hash_table_size = buckets.size() - cuckoo_block_size_ + 1;
properties_.user_collected_properties[ properties_.user_collected_properties[
CuckooTablePropertyNames::kMaxNumBuckets].assign( CuckooTablePropertyNames::kHashTableSize].assign(
reinterpret_cast<const char*>(&num_buckets), sizeof(num_buckets)); reinterpret_cast<const char*>(&hash_table_size),
sizeof(hash_table_size));
properties_.user_collected_properties[ properties_.user_collected_properties[
CuckooTablePropertyNames::kIsLastLevel].assign( CuckooTablePropertyNames::kIsLastLevel].assign(
reinterpret_cast<const char*>(&is_last_level_file_), reinterpret_cast<const char*>(&is_last_level_file_),
sizeof(is_last_level_file_)); sizeof(is_last_level_file_));
properties_.user_collected_properties[
CuckooTablePropertyNames::kCuckooBlockSize].assign(
reinterpret_cast<const char*>(&cuckoo_block_size_),
sizeof(cuckoo_block_size_));
// Write meta blocks. // Write meta blocks.
MetaIndexBuilder meta_index_builder; MetaIndexBuilder meta_index_builder;
@ -279,7 +305,7 @@ void CuckooTableBuilder::Abandon() {
} }
uint64_t CuckooTableBuilder::NumEntries() const { uint64_t CuckooTableBuilder::NumEntries() const {
return properties_.num_entries; return kvs_.size();
} }
uint64_t CuckooTableBuilder::FileSize() const { uint64_t CuckooTableBuilder::FileSize() const {
@ -288,11 +314,17 @@ uint64_t CuckooTableBuilder::FileSize() const {
} else if (properties_.num_entries == 0) { } else if (properties_.num_entries == 0) {
return 0; return 0;
} }
// This is not the actual size of the file as we need to account for
// hash table ratio. This returns the size of filled buckets in the table // Account for buckets being a power of two.
// scaled up by a factor of 1/hash_table_ratio. // As elements are added, file size remains constant for a while and doubles
return ((kvs_[0].first.size() + kvs_[0].second.size()) * // its size. Since compaction algorithm stops adding elements only after it
properties_.num_entries) / hash_table_ratio_; // exceeds the file limit, we account for the extra element being added here.
uint64_t expected_hash_table_size = hash_table_size_;
if (expected_hash_table_size < (kvs_.size() + 1) / max_hash_table_ratio_) {
expected_hash_table_size *= 2;
}
return (kvs_[0].first.size() + kvs_[0].second.size()) *
expected_hash_table_size;
} }
// This method is invoked when there is no place to insert the target key. // This method is invoked when there is no place to insert the target key.
@ -322,17 +354,19 @@ bool CuckooTableBuilder::MakeSpaceForKey(
std::vector<CuckooNode> tree; std::vector<CuckooNode> tree;
// We want to identify already visited buckets in the current method call so // We want to identify already visited buckets in the current method call so
// that we don't add same buckets again for exploration in the tree. // that we don't add same buckets again for exploration in the tree.
// We do this by maintaining a count of current method call, which acts as a // We do this by maintaining a count of current method call in
// unique id for this invocation of the method. We store this number into // make_space_for_key_call_id, which acts as a unique id for this invocation
// the nodes that we explore in current method call. // of the method. We store this number into the nodes that we explore in
// current method call.
// It is unlikely for the increment operation to overflow because the maximum // It is unlikely for the increment operation to overflow because the maximum
// no. of times this will be called is <= max_num_hash_table_ + kvs_.size(). // no. of times this will be called is <= max_num_hash_func_ + kvs_.size().
for (uint32_t hash_cnt = 0; hash_cnt < num_hash_table_; ++hash_cnt) { for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
uint64_t bucket_id = hash_vals[hash_cnt]; uint64_t bucket_id = hash_vals[hash_cnt];
(*buckets)[bucket_id].make_space_for_key_call_id = (*buckets)[bucket_id].make_space_for_key_call_id =
make_space_for_key_call_id; make_space_for_key_call_id;
tree.push_back(CuckooNode(bucket_id, 0, 0)); tree.push_back(CuckooNode(bucket_id, 0, 0));
} }
uint64_t hash_table_size_minus_one = hash_table_size_ - 1;
bool null_found = false; bool null_found = false;
uint32_t curr_pos = 0; uint32_t curr_pos = 0;
while (!null_found && curr_pos < tree.size()) { while (!null_found && curr_pos < tree.size()) {
@ -342,11 +376,15 @@ bool CuckooTableBuilder::MakeSpaceForKey(
break; break;
} }
CuckooBucket& curr_bucket = (*buckets)[curr_node.bucket_id]; CuckooBucket& curr_bucket = (*buckets)[curr_node.bucket_id];
for (uint32_t hash_cnt = 0; hash_cnt < num_hash_table_; ++hash_cnt) { for (uint32_t hash_cnt = 0;
uint64_t child_bucket_id = get_slice_hash_( hash_cnt < num_hash_func_ && !null_found; ++hash_cnt) {
is_last_level_file_ ? kvs_[curr_bucket.vector_idx].first uint64_t child_bucket_id = CuckooHash(
: ExtractUserKey(Slice(kvs_[curr_bucket.vector_idx].first)), (is_last_level_file_ ? kvs_[curr_bucket.vector_idx].first :
hash_cnt, buckets->size()); ExtractUserKey(Slice(kvs_[curr_bucket.vector_idx].first))),
hash_cnt, hash_table_size_minus_one, get_slice_hash_);
// Iterate inside Cuckoo Block.
for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
++block_idx, ++child_bucket_id) {
if ((*buckets)[child_bucket_id].make_space_for_key_call_id == if ((*buckets)[child_bucket_id].make_space_for_key_call_id ==
make_space_for_key_call_id) { make_space_for_key_call_id) {
continue; continue;
@ -360,6 +398,7 @@ bool CuckooTableBuilder::MakeSpaceForKey(
break; break;
} }
} }
}
++curr_pos; ++curr_pos;
} }
@ -367,10 +406,10 @@ bool CuckooTableBuilder::MakeSpaceForKey(
// There is an empty node in tree.back(). Now, traverse the path from this // There is an empty node in tree.back(). Now, traverse the path from this
// empty node to top of the tree and at every node in the path, replace // empty node to top of the tree and at every node in the path, replace
// child with the parent. Stop when first level is reached in the tree // child with the parent. Stop when first level is reached in the tree
// (happens when 0 <= bucket_to_replace_pos < num_hash_table_) and return // (happens when 0 <= bucket_to_replace_pos < num_hash_func_) and return
// this location in first level for target key to be inserted. // this location in first level for target key to be inserted.
uint32_t bucket_to_replace_pos = tree.size()-1; uint32_t bucket_to_replace_pos = tree.size()-1;
while (bucket_to_replace_pos >= num_hash_table_) { while (bucket_to_replace_pos >= num_hash_func_) {
CuckooNode& curr_node = tree[bucket_to_replace_pos]; CuckooNode& curr_node = tree[bucket_to_replace_pos];
(*buckets)[curr_node.bucket_id] = (*buckets)[curr_node.bucket_id] =
(*buckets)[tree[curr_node.parent_pos].bucket_id]; (*buckets)[tree[curr_node.parent_pos].bucket_id];

@ -21,8 +21,9 @@ namespace rocksdb {
class CuckooTableBuilder: public TableBuilder { class CuckooTableBuilder: public TableBuilder {
public: public:
CuckooTableBuilder( CuckooTableBuilder(
WritableFile* file, double hash_table_ratio, uint32_t max_num_hash_table, WritableFile* file, double max_hash_table_ratio,
uint32_t max_search_depth, const Comparator* user_comparator, uint32_t max_num_hash_func, uint32_t max_search_depth,
const Comparator* user_comparator, uint32_t cuckoo_block_size,
uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)); uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t));
// REQUIRES: Either Finish() or Abandon() has been called. // REQUIRES: Either Finish() or Abandon() has been called.
@ -60,7 +61,7 @@ class CuckooTableBuilder: public TableBuilder {
CuckooBucket() CuckooBucket()
: vector_idx(kMaxVectorIdx), make_space_for_key_call_id(0) {} : vector_idx(kMaxVectorIdx), make_space_for_key_call_id(0) {}
uint32_t vector_idx; uint32_t vector_idx;
// This number will not exceed kvs_.size() + max_num_hash_table_. // This number will not exceed kvs_.size() + max_num_hash_func_.
// We assume number of items is <= 2^32. // We assume number of items is <= 2^32.
uint32_t make_space_for_key_call_id; uint32_t make_space_for_key_call_id;
}; };
@ -73,11 +74,13 @@ class CuckooTableBuilder: public TableBuilder {
uint64_t* bucket_id); uint64_t* bucket_id);
Status MakeHashTable(std::vector<CuckooBucket>* buckets); Status MakeHashTable(std::vector<CuckooBucket>* buckets);
uint32_t num_hash_table_; uint32_t num_hash_func_;
WritableFile* file_; WritableFile* file_;
const double hash_table_ratio_; const double max_hash_table_ratio_;
const uint32_t max_num_hash_table_; const uint32_t max_num_hash_func_;
const uint32_t max_search_depth_; const uint32_t max_search_depth_;
const uint32_t cuckoo_block_size_;
uint64_t hash_table_size_;
bool is_last_level_file_; bool is_last_level_file_;
Status status_; Status status_;
std::vector<std::pair<std::string, std::string>> kvs_; std::vector<std::pair<std::string, std::string>> kvs_;

@ -37,8 +37,9 @@ class CuckooBuilderTest {
void CheckFileContents(const std::vector<std::string>& keys, void CheckFileContents(const std::vector<std::string>& keys,
const std::vector<std::string>& values, const std::vector<std::string>& values,
const std::vector<uint64_t>& expected_locations, const std::vector<uint64_t>& expected_locations,
std::string expected_unused_bucket, uint64_t expected_max_buckets, std::string expected_unused_bucket, uint64_t expected_table_size,
uint32_t expected_num_hash_fun, bool expected_is_last_level) { uint32_t expected_num_hash_func, bool expected_is_last_level,
uint32_t expected_cuckoo_block_size = 1) {
// Read file // Read file
unique_ptr<RandomAccessFile> read_file; unique_ptr<RandomAccessFile> read_file;
ASSERT_OK(env_->NewRandomAccessFile(fname, &read_file, env_options_)); ASSERT_OK(env_->NewRandomAccessFile(fname, &read_file, env_options_));
@ -51,7 +52,8 @@ class CuckooBuilderTest {
kCuckooTableMagicNumber, env_, nullptr, &props)); kCuckooTableMagicNumber, env_, nullptr, &props));
ASSERT_EQ(props->num_entries, keys.size()); ASSERT_EQ(props->num_entries, keys.size());
ASSERT_EQ(props->fixed_key_len, keys.empty() ? 0 : keys[0].size()); ASSERT_EQ(props->fixed_key_len, keys.empty() ? 0 : keys[0].size());
ASSERT_EQ(props->data_size, keys.size()*expected_unused_bucket.size()); ASSERT_EQ(props->data_size, expected_unused_bucket.size() *
(expected_table_size + expected_cuckoo_block_size - 1));
ASSERT_EQ(props->raw_key_size, keys.size()*props->fixed_key_len); ASSERT_EQ(props->raw_key_size, keys.size()*props->fixed_key_len);
// Check unused bucket. // Check unused bucket.
@ -65,14 +67,18 @@ class CuckooBuilderTest {
CuckooTablePropertyNames::kValueLength].data()); CuckooTablePropertyNames::kValueLength].data());
ASSERT_EQ(values.empty() ? 0 : values[0].size(), value_len_found); ASSERT_EQ(values.empty() ? 0 : values[0].size(), value_len_found);
ASSERT_EQ(props->raw_value_size, values.size()*value_len_found); ASSERT_EQ(props->raw_value_size, values.size()*value_len_found);
const uint64_t max_buckets = const uint64_t table_size =
*reinterpret_cast<const uint64_t*>(props->user_collected_properties[ *reinterpret_cast<const uint64_t*>(props->user_collected_properties[
CuckooTablePropertyNames::kMaxNumBuckets].data()); CuckooTablePropertyNames::kHashTableSize].data());
ASSERT_EQ(expected_max_buckets, max_buckets); ASSERT_EQ(expected_table_size, table_size);
const uint32_t num_hash_fun_found = const uint32_t num_hash_func_found =
*reinterpret_cast<const uint32_t*>(props->user_collected_properties[ *reinterpret_cast<const uint32_t*>(props->user_collected_properties[
CuckooTablePropertyNames::kNumHashTable].data()); CuckooTablePropertyNames::kNumHashFunc].data());
ASSERT_EQ(expected_num_hash_fun, num_hash_fun_found); ASSERT_EQ(expected_num_hash_func, num_hash_func_found);
const uint32_t cuckoo_block_size =
*reinterpret_cast<const uint32_t*>(props->user_collected_properties[
CuckooTablePropertyNames::kCuckooBlockSize].data());
ASSERT_EQ(expected_cuckoo_block_size, cuckoo_block_size);
const bool is_last_level_found = const bool is_last_level_found =
*reinterpret_cast<const bool*>(props->user_collected_properties[ *reinterpret_cast<const bool*>(props->user_collected_properties[
CuckooTablePropertyNames::kIsLastLevel].data()); CuckooTablePropertyNames::kIsLastLevel].data());
@ -82,7 +88,7 @@ class CuckooBuilderTest {
// Check contents of the bucket. // Check contents of the bucket.
std::vector<bool> keys_found(keys.size(), false); std::vector<bool> keys_found(keys.size(), false);
uint32_t bucket_size = expected_unused_bucket.size(); uint32_t bucket_size = expected_unused_bucket.size();
for (uint32_t i = 0; i < max_buckets; ++i) { for (uint32_t i = 0; i < table_size + cuckoo_block_size - 1; ++i) {
Slice read_slice; Slice read_slice;
ASSERT_OK(read_file->Read(i*bucket_size, bucket_size, ASSERT_OK(read_file->Read(i*bucket_size, bucket_size,
&read_slice, nullptr)); &read_slice, nullptr));
@ -108,6 +114,14 @@ class CuckooBuilderTest {
return ikey.GetKey().ToString(); return ikey.GetKey().ToString();
} }
// Returns the smallest power of two that is strictly greater than num, with
// a floor of 2: both 0 and 1 map to 2, and an exact power of two maps to the
// next power up (e.g. 4 -> 8).
//
// When no such power is representable in 64 bits (num >= 2^63) the result
// saturates at 2^63. Without that bound, doubling 2^63 wraps n around to 0
// and the condition `n <= num` then stays true forever.
uint64_t NextPowOf2(uint64_t num) {
  const uint64_t kLargestPowOf2 = static_cast<uint64_t>(1) << 63;
  uint64_t n = 2;
  // Stop doubling once n exceeds num, or once n can no longer be doubled
  // without overflowing.
  while (n <= num && n < kLargestPowOf2) {
    n *= 2;
  }
  return n;
}
Env* env_; Env* env_;
EnvOptions env_options_; EnvOptions env_options_;
std::string fname; std::string fname;
@ -116,10 +130,10 @@ class CuckooBuilderTest {
TEST(CuckooBuilderTest, SuccessWithEmptyFile) { TEST(CuckooBuilderTest, SuccessWithEmptyFile) {
unique_ptr<WritableFile> writable_file; unique_ptr<WritableFile> writable_file;
fname = test::TmpDir() + "/NoCollisionFullKey"; fname = test::TmpDir() + "/EmptyFile";
ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
4, 100, BytewiseComparator(), GetSliceHash); 4, 100, BytewiseComparator(), 1, GetSliceHash);
ASSERT_OK(builder.status()); ASSERT_OK(builder.status());
ASSERT_OK(builder.Finish()); ASSERT_OK(builder.Finish());
ASSERT_OK(writable_file->Close()); ASSERT_OK(writable_file->Close());
@ -146,7 +160,7 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) {
fname = test::TmpDir() + "/NoCollisionFullKey"; fname = test::TmpDir() + "/NoCollisionFullKey";
ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
num_hash_fun, 100, BytewiseComparator(), GetSliceHash); num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash);
ASSERT_OK(builder.status()); ASSERT_OK(builder.status());
for (uint32_t i = 0; i < user_keys.size(); i++) { for (uint32_t i = 0; i < user_keys.size(); i++) {
builder.Add(Slice(keys[i]), Slice(values[i])); builder.Add(Slice(keys[i]), Slice(values[i]));
@ -156,11 +170,11 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) {
ASSERT_OK(builder.Finish()); ASSERT_OK(builder.Finish());
ASSERT_OK(writable_file->Close()); ASSERT_OK(writable_file->Close());
uint32_t expected_max_buckets = keys.size() / kHashTableRatio; uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio);
std::string expected_unused_bucket = GetInternalKey("key00", true); std::string expected_unused_bucket = GetInternalKey("key00", true);
expected_unused_bucket += std::string(values[0].size(), 'a'); expected_unused_bucket += std::string(values[0].size(), 'a');
CheckFileContents(keys, values, expected_locations, CheckFileContents(keys, values, expected_locations,
expected_unused_bucket, expected_max_buckets, 2, false); expected_unused_bucket, expected_table_size, 2, false);
} }
TEST(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { TEST(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) {
@ -183,7 +197,7 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) {
fname = test::TmpDir() + "/WithCollisionFullKey"; fname = test::TmpDir() + "/WithCollisionFullKey";
ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
num_hash_fun, 100, BytewiseComparator(), GetSliceHash); num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash);
ASSERT_OK(builder.status()); ASSERT_OK(builder.status());
for (uint32_t i = 0; i < user_keys.size(); i++) { for (uint32_t i = 0; i < user_keys.size(); i++) {
builder.Add(Slice(keys[i]), Slice(values[i])); builder.Add(Slice(keys[i]), Slice(values[i]));
@ -193,11 +207,49 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) {
ASSERT_OK(builder.Finish()); ASSERT_OK(builder.Finish());
ASSERT_OK(writable_file->Close()); ASSERT_OK(writable_file->Close());
uint32_t expected_max_buckets = keys.size() / kHashTableRatio; uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio);
std::string expected_unused_bucket = GetInternalKey("key00", true); std::string expected_unused_bucket = GetInternalKey("key00", true);
expected_unused_bucket += std::string(values[0].size(), 'a'); expected_unused_bucket += std::string(values[0].size(), 'a');
CheckFileContents(keys, values, expected_locations, CheckFileContents(keys, values, expected_locations,
expected_unused_bucket, expected_max_buckets, 4, false); expected_unused_bucket, expected_table_size, 4, false);
}
TEST(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) {
uint32_t num_hash_fun = 4;
std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
hash_map = {
{user_keys[0], {0, 1, 2, 3}},
{user_keys[1], {0, 1, 2, 3}},
{user_keys[2], {0, 1, 2, 3}},
{user_keys[3], {0, 1, 2, 3}},
};
std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
std::vector<std::string> keys;
for (auto& user_key : user_keys) {
keys.push_back(GetInternalKey(user_key, false));
}
unique_ptr<WritableFile> writable_file;
uint32_t cuckoo_block_size = 2;
fname = test::TmpDir() + "/WithCollisionFullKey2";
ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
num_hash_fun, 100, BytewiseComparator(), cuckoo_block_size, GetSliceHash);
ASSERT_OK(builder.status());
for (uint32_t i = 0; i < user_keys.size(); i++) {
builder.Add(Slice(keys[i]), Slice(values[i]));
ASSERT_EQ(builder.NumEntries(), i + 1);
ASSERT_OK(builder.status());
}
ASSERT_OK(builder.Finish());
ASSERT_OK(writable_file->Close());
uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio);
std::string expected_unused_bucket = GetInternalKey("key00", true);
expected_unused_bucket += std::string(values[0].size(), 'a');
CheckFileContents(keys, values, expected_locations,
expected_unused_bucket, expected_table_size, 3, false, cuckoo_block_size);
} }
TEST(CuckooBuilderTest, WithCollisionPathFullKey) { TEST(CuckooBuilderTest, WithCollisionPathFullKey) {
@ -225,7 +277,46 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKey) {
fname = test::TmpDir() + "/WithCollisionPathFullKey"; fname = test::TmpDir() + "/WithCollisionPathFullKey";
ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
num_hash_fun, 100, BytewiseComparator(), GetSliceHash); num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash);
ASSERT_OK(builder.status());
for (uint32_t i = 0; i < user_keys.size(); i++) {
builder.Add(Slice(keys[i]), Slice(values[i]));
ASSERT_EQ(builder.NumEntries(), i + 1);
ASSERT_OK(builder.status());
}
ASSERT_OK(builder.Finish());
ASSERT_OK(writable_file->Close());
uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio);
std::string expected_unused_bucket = GetInternalKey("key00", true);
expected_unused_bucket += std::string(values[0].size(), 'a');
CheckFileContents(keys, values, expected_locations,
expected_unused_bucket, expected_table_size, 2, false);
}
TEST(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) {
uint32_t num_hash_fun = 2;
std::vector<std::string> user_keys = {"key01", "key02", "key03",
"key04", "key05"};
std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"};
hash_map = {
{user_keys[0], {0, 1}},
{user_keys[1], {1, 2}},
{user_keys[2], {3, 4}},
{user_keys[3], {4, 5}},
{user_keys[4], {0, 3}},
};
std::vector<uint64_t> expected_locations = {2, 1, 3, 4, 0};
std::vector<std::string> keys;
for (auto& user_key : user_keys) {
keys.push_back(GetInternalKey(user_key, false));
}
unique_ptr<WritableFile> writable_file;
fname = test::TmpDir() + "/WithCollisionPathFullKeyAndCuckooBlock";
ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
num_hash_fun, 100, BytewiseComparator(), 2, GetSliceHash);
ASSERT_OK(builder.status()); ASSERT_OK(builder.status());
for (uint32_t i = 0; i < user_keys.size(); i++) { for (uint32_t i = 0; i < user_keys.size(); i++) {
builder.Add(Slice(keys[i]), Slice(values[i])); builder.Add(Slice(keys[i]), Slice(values[i]));
@ -235,11 +326,11 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKey) {
ASSERT_OK(builder.Finish()); ASSERT_OK(builder.Finish());
ASSERT_OK(writable_file->Close()); ASSERT_OK(writable_file->Close());
uint32_t expected_max_buckets = keys.size() / kHashTableRatio; uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio);
std::string expected_unused_bucket = GetInternalKey("key00", true); std::string expected_unused_bucket = GetInternalKey("key00", true);
expected_unused_bucket += std::string(values[0].size(), 'a'); expected_unused_bucket += std::string(values[0].size(), 'a');
CheckFileContents(keys, values, expected_locations, CheckFileContents(keys, values, expected_locations,
expected_unused_bucket, expected_max_buckets, 2, false); expected_unused_bucket, expected_table_size, 2, false, 2);
} }
TEST(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { TEST(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) {
@ -258,7 +349,7 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) {
fname = test::TmpDir() + "/NoCollisionUserKey"; fname = test::TmpDir() + "/NoCollisionUserKey";
ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
num_hash_fun, 100, BytewiseComparator(), GetSliceHash); num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash);
ASSERT_OK(builder.status()); ASSERT_OK(builder.status());
for (uint32_t i = 0; i < user_keys.size(); i++) { for (uint32_t i = 0; i < user_keys.size(); i++) {
builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i]));
@ -268,11 +359,11 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) {
ASSERT_OK(builder.Finish()); ASSERT_OK(builder.Finish());
ASSERT_OK(writable_file->Close()); ASSERT_OK(writable_file->Close());
uint32_t expected_max_buckets = user_keys.size() / kHashTableRatio; uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio);
std::string expected_unused_bucket = "key00"; std::string expected_unused_bucket = "key00";
expected_unused_bucket += std::string(values[0].size(), 'a'); expected_unused_bucket += std::string(values[0].size(), 'a');
CheckFileContents(user_keys, values, expected_locations, CheckFileContents(user_keys, values, expected_locations,
expected_unused_bucket, expected_max_buckets, 2, true); expected_unused_bucket, expected_table_size, 2, true);
} }
TEST(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { TEST(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) {
@ -291,7 +382,7 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) {
fname = test::TmpDir() + "/WithCollisionUserKey"; fname = test::TmpDir() + "/WithCollisionUserKey";
ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
num_hash_fun, 100, BytewiseComparator(), GetSliceHash); num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash);
ASSERT_OK(builder.status()); ASSERT_OK(builder.status());
for (uint32_t i = 0; i < user_keys.size(); i++) { for (uint32_t i = 0; i < user_keys.size(); i++) {
builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i]));
@ -301,11 +392,11 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) {
ASSERT_OK(builder.Finish()); ASSERT_OK(builder.Finish());
ASSERT_OK(writable_file->Close()); ASSERT_OK(writable_file->Close());
uint32_t expected_max_buckets = user_keys.size() / kHashTableRatio; uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio);
std::string expected_unused_bucket = "key00"; std::string expected_unused_bucket = "key00";
expected_unused_bucket += std::string(values[0].size(), 'a'); expected_unused_bucket += std::string(values[0].size(), 'a');
CheckFileContents(user_keys, values, expected_locations, CheckFileContents(user_keys, values, expected_locations,
expected_unused_bucket, expected_max_buckets, 4, true); expected_unused_bucket, expected_table_size, 4, true);
} }
TEST(CuckooBuilderTest, WithCollisionPathUserKey) { TEST(CuckooBuilderTest, WithCollisionPathUserKey) {
@ -326,7 +417,7 @@ TEST(CuckooBuilderTest, WithCollisionPathUserKey) {
fname = test::TmpDir() + "/WithCollisionPathUserKey"; fname = test::TmpDir() + "/WithCollisionPathUserKey";
ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
num_hash_fun, 2, BytewiseComparator(), GetSliceHash); num_hash_fun, 2, BytewiseComparator(), 1, GetSliceHash);
ASSERT_OK(builder.status()); ASSERT_OK(builder.status());
for (uint32_t i = 0; i < user_keys.size(); i++) { for (uint32_t i = 0; i < user_keys.size(); i++) {
builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i]));
@ -336,11 +427,11 @@ TEST(CuckooBuilderTest, WithCollisionPathUserKey) {
ASSERT_OK(builder.Finish()); ASSERT_OK(builder.Finish());
ASSERT_OK(writable_file->Close()); ASSERT_OK(writable_file->Close());
uint32_t expected_max_buckets = user_keys.size() / kHashTableRatio; uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio);
std::string expected_unused_bucket = "key00"; std::string expected_unused_bucket = "key00";
expected_unused_bucket += std::string(values[0].size(), 'a'); expected_unused_bucket += std::string(values[0].size(), 'a');
CheckFileContents(user_keys, values, expected_locations, CheckFileContents(user_keys, values, expected_locations,
expected_unused_bucket, expected_max_buckets, 2, true); expected_unused_bucket, expected_table_size, 2, true);
} }
TEST(CuckooBuilderTest, FailWhenCollisionPathTooLong) { TEST(CuckooBuilderTest, FailWhenCollisionPathTooLong) {
@ -362,7 +453,7 @@ TEST(CuckooBuilderTest, FailWhenCollisionPathTooLong) {
fname = test::TmpDir() + "/WithCollisionPathUserKey"; fname = test::TmpDir() + "/WithCollisionPathUserKey";
ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
num_hash_fun, 2, BytewiseComparator(), GetSliceHash); num_hash_fun, 2, BytewiseComparator(), 1, GetSliceHash);
ASSERT_OK(builder.status()); ASSERT_OK(builder.status());
for (uint32_t i = 0; i < user_keys.size(); i++) { for (uint32_t i = 0; i < user_keys.size(); i++) {
builder.Add(Slice(GetInternalKey(user_keys[i], false)), Slice("value")); builder.Add(Slice(GetInternalKey(user_keys[i], false)), Slice("value"));
@ -382,7 +473,7 @@ TEST(CuckooBuilderTest, FailWhenSameKeyInserted) {
fname = test::TmpDir() + "/FailWhenSameKeyInserted"; fname = test::TmpDir() + "/FailWhenSameKeyInserted";
ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
num_hash_fun, 100, BytewiseComparator(), GetSliceHash); num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash);
ASSERT_OK(builder.status()); ASSERT_OK(builder.status());
builder.Add(Slice(GetInternalKey(user_key, false)), Slice("value1")); builder.Add(Slice(GetInternalKey(user_key, false)), Slice("value1"));

@ -9,34 +9,14 @@
#include "db/dbformat.h" #include "db/dbformat.h"
#include "table/cuckoo_table_builder.h" #include "table/cuckoo_table_builder.h"
#include "table/cuckoo_table_reader.h" #include "table/cuckoo_table_reader.h"
#include "util/murmurhash.h"
namespace rocksdb { namespace rocksdb {
extern const uint32_t kMaxNumHashTable = 64;
extern uint64_t GetSliceMurmurHash(const Slice& s, uint32_t index,
uint64_t max_num_buckets) {
static constexpr uint32_t seeds[kMaxNumHashTable] = {
816922183, 506425713, 949485004, 22513986, 421427259, 500437285,
888981693, 847587269, 511007211, 722295391, 934013645, 566947683,
193618736, 428277388, 770956674, 819994962, 755946528, 40807421,
263144466, 241420041, 444294464, 731606396, 304158902, 563235655,
968740453, 336996831, 462831574, 407970157, 985877240, 637708754,
736932700, 205026023, 755371467, 729648411, 807744117, 46482135,
847092855, 620960699, 102476362, 314094354, 625838942, 550889395,
639071379, 834567510, 397667304, 151945969, 443634243, 196618243,
421986347, 407218337, 964502417, 327741231, 493359459, 452453139,
692216398, 108161624, 816246924, 234779764, 618949448, 496133787,
156374056, 316589799, 982915425, 553105889 };
return MurmurHash(s.data(), s.size(), seeds[index]) % max_num_buckets;
}
Status CuckooTableFactory::NewTableReader(const Options& options, Status CuckooTableFactory::NewTableReader(const Options& options,
const EnvOptions& soptions, const InternalKeyComparator& icomp, const EnvOptions& soptions, const InternalKeyComparator& icomp,
std::unique_ptr<RandomAccessFile>&& file, uint64_t file_size, std::unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
std::unique_ptr<TableReader>* table) const { std::unique_ptr<TableReader>* table) const {
std::unique_ptr<CuckooTableReader> new_reader(new CuckooTableReader(options, std::unique_ptr<CuckooTableReader> new_reader(new CuckooTableReader(options,
std::move(file), file_size, icomp.user_comparator(), GetSliceMurmurHash)); std::move(file), file_size, icomp.user_comparator(), nullptr));
Status s = new_reader->status(); Status s = new_reader->status();
if (s.ok()) { if (s.ok()) {
*table = std::move(new_reader); *table = std::move(new_reader);
@ -47,9 +27,8 @@ Status CuckooTableFactory::NewTableReader(const Options& options,
TableBuilder* CuckooTableFactory::NewTableBuilder( TableBuilder* CuckooTableFactory::NewTableBuilder(
const Options& options, const InternalKeyComparator& internal_comparator, const Options& options, const InternalKeyComparator& internal_comparator,
WritableFile* file, CompressionType compression_type) const { WritableFile* file, CompressionType compression_type) const {
return new CuckooTableBuilder(file, hash_table_ratio_, kMaxNumHashTable, return new CuckooTableBuilder(file, hash_table_ratio_, 64, max_search_depth_,
max_search_depth_, internal_comparator.user_comparator(), internal_comparator.user_comparator(), cuckoo_block_size_, nullptr);
GetSliceMurmurHash);
} }
std::string CuckooTableFactory::GetPrintableTableOptions() const { std::string CuckooTableFactory::GetPrintableTableOptions() const {
@ -64,12 +43,16 @@ std::string CuckooTableFactory::GetPrintableTableOptions() const {
snprintf(buffer, kBufferSize, " max_search_depth: %u\n", snprintf(buffer, kBufferSize, " max_search_depth: %u\n",
max_search_depth_); max_search_depth_);
ret.append(buffer); ret.append(buffer);
snprintf(buffer, kBufferSize, " cuckoo_block_size: %u\n",
cuckoo_block_size_);
ret.append(buffer);
return ret; return ret;
} }
TableFactory* NewCuckooTableFactory(double hash_table_ratio, TableFactory* NewCuckooTableFactory(double hash_table_ratio,
uint32_t max_search_depth) { uint32_t max_search_depth, uint32_t cuckoo_block_size) {
return new CuckooTableFactory(hash_table_ratio, max_search_depth); return new CuckooTableFactory(
hash_table_ratio, max_search_depth, cuckoo_block_size);
} }
} // namespace rocksdb } // namespace rocksdb

@ -8,11 +8,23 @@
#include <string> #include <string>
#include "rocksdb/table.h" #include "rocksdb/table.h"
#include "util/murmurhash.h"
namespace rocksdb { namespace rocksdb {
extern uint64_t GetSliceMurmurHash(const Slice& s, uint32_t index, const uint32_t kCuckooMurmurSeedMultiplier = 816922183;
uint64_t max_num_buckets); static inline uint64_t CuckooHash(
const Slice& user_key, uint32_t hash_cnt, uint64_t table_size_minus_one,
uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) {
#ifndef NDEBUG
// This part is used only in unit tests.
if (get_slice_hash != nullptr) {
return get_slice_hash(user_key, hash_cnt, table_size_minus_one + 1);
}
#endif
return MurmurHash(user_key.data(), user_key.size(),
kCuckooMurmurSeedMultiplier * hash_cnt) & table_size_minus_one;
}
// Cuckoo Table is designed for applications that require fast point lookups // Cuckoo Table is designed for applications that require fast point lookups
// but not fast range scans. // but not fast range scans.
@ -23,9 +35,11 @@ extern uint64_t GetSliceMurmurHash(const Slice& s, uint32_t index,
// - Does not support Merge operations. // - Does not support Merge operations.
class CuckooTableFactory : public TableFactory { class CuckooTableFactory : public TableFactory {
public: public:
CuckooTableFactory(double hash_table_ratio, uint32_t max_search_depth) CuckooTableFactory(double hash_table_ratio, uint32_t max_search_depth,
uint32_t cuckoo_block_size)
: hash_table_ratio_(hash_table_ratio), : hash_table_ratio_(hash_table_ratio),
max_search_depth_(max_search_depth) {} max_search_depth_(max_search_depth),
cuckoo_block_size_(cuckoo_block_size) {}
~CuckooTableFactory() {} ~CuckooTableFactory() {}
const char* Name() const override { return "CuckooTable"; } const char* Name() const override { return "CuckooTable"; }
@ -41,7 +55,7 @@ class CuckooTableFactory : public TableFactory {
CompressionType compression_type) const override; CompressionType compression_type) const override;
// Sanitizes the specified DB Options. // Sanitizes the specified DB Options.
Status SanitizeDBOptions(DBOptions* db_opts) const override { Status SanitizeDBOptions(const DBOptions* db_opts) const override {
return Status::OK(); return Status::OK();
} }
@ -50,6 +64,7 @@ class CuckooTableFactory : public TableFactory {
private: private:
const double hash_table_ratio_; const double hash_table_ratio_;
const uint32_t max_search_depth_; const uint32_t max_search_depth_;
const uint32_t cuckoo_block_size_;
}; };
} // namespace rocksdb } // namespace rocksdb

@ -17,10 +17,14 @@
#include <vector> #include <vector>
#include "rocksdb/iterator.h" #include "rocksdb/iterator.h"
#include "table/meta_blocks.h" #include "table/meta_blocks.h"
#include "table/cuckoo_table_factory.h"
#include "util/arena.h" #include "util/arena.h"
#include "util/coding.h" #include "util/coding.h"
namespace rocksdb { namespace rocksdb {
namespace {
static const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1);
}
extern const uint64_t kCuckooTableMagicNumber; extern const uint64_t kCuckooTableMagicNumber;
@ -44,12 +48,12 @@ CuckooTableReader::CuckooTableReader(
} }
table_props_.reset(props); table_props_.reset(props);
auto& user_props = props->user_collected_properties; auto& user_props = props->user_collected_properties;
auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashTable); auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc);
if (hash_funs == user_props.end()) { if (hash_funs == user_props.end()) {
status_ = Status::InvalidArgument("Number of hash functions not found"); status_ = Status::InvalidArgument("Number of hash functions not found");
return; return;
} }
num_hash_fun_ = *reinterpret_cast<const uint32_t*>(hash_funs->second.data()); num_hash_func_ = *reinterpret_cast<const uint32_t*>(hash_funs->second.data());
auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey); auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey);
if (unused_key == user_props.end()) { if (unused_key == user_props.end()) {
status_ = Status::InvalidArgument("Empty bucket value not found"); status_ = Status::InvalidArgument("Empty bucket value not found");
@ -67,18 +71,29 @@ CuckooTableReader::CuckooTableReader(
value_length->second.data()); value_length->second.data());
bucket_length_ = key_length_ + value_length_; bucket_length_ = key_length_ + value_length_;
auto num_buckets = user_props.find(CuckooTablePropertyNames::kMaxNumBuckets); auto hash_table_size = user_props.find(
if (num_buckets == user_props.end()) { CuckooTablePropertyNames::kHashTableSize);
status_ = Status::InvalidArgument("Num buckets not found"); if (hash_table_size == user_props.end()) {
status_ = Status::InvalidArgument("Hash table size not found");
return; return;
} }
num_buckets_ = *reinterpret_cast<const uint64_t*>(num_buckets->second.data()); table_size_minus_one_ = *reinterpret_cast<const uint64_t*>(
hash_table_size->second.data()) - 1;
auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel); auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel);
if (is_last_level == user_props.end()) { if (is_last_level == user_props.end()) {
status_ = Status::InvalidArgument("Is last level not found"); status_ = Status::InvalidArgument("Is last level not found");
return; return;
} }
is_last_level_ = *reinterpret_cast<const bool*>(is_last_level->second.data()); is_last_level_ = *reinterpret_cast<const bool*>(is_last_level->second.data());
auto cuckoo_block_size = user_props.find(
CuckooTablePropertyNames::kCuckooBlockSize);
if (cuckoo_block_size == user_props.end()) {
status_ = Status::InvalidArgument("Cuckoo block size not found");
return;
}
cuckoo_block_size_ = *reinterpret_cast<const uint32_t*>(
cuckoo_block_size->second.data());
cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1;
status_ = file_->Read(0, file_size, &file_data_, nullptr); status_ = file_->Read(0, file_size, &file_data_, nullptr);
} }
@ -89,10 +104,12 @@ Status CuckooTableReader::Get(
void (*mark_key_may_exist_handler)(void* handle_context)) { void (*mark_key_may_exist_handler)(void* handle_context)) {
assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0)); assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0));
Slice user_key = ExtractUserKey(key); Slice user_key = ExtractUserKey(key);
for (uint32_t hash_cnt = 0; hash_cnt < num_hash_fun_; ++hash_cnt) { for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
uint64_t hash_val = get_slice_hash_(user_key, hash_cnt, num_buckets_); uint64_t offset = bucket_length_ * CuckooHash(
assert(hash_val < num_buckets_); user_key, hash_cnt, table_size_minus_one_, get_slice_hash_);
const char* bucket = &file_data_.data()[hash_val * bucket_length_]; const char* bucket = &file_data_.data()[offset];
for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
++block_idx, bucket += bucket_length_) {
if (ucomp_->Compare(Slice(unused_key_.data(), user_key.size()), if (ucomp_->Compare(Slice(unused_key_.data(), user_key.size()),
Slice(bucket, user_key.size())) == 0) { Slice(bucket, user_key.size())) == 0) {
return Status::OK(); return Status::OK();
@ -102,7 +119,8 @@ Status CuckooTableReader::Get(
if (ucomp_->Compare(user_key, Slice(bucket, user_key.size())) == 0) { if (ucomp_->Compare(user_key, Slice(bucket, user_key.size())) == 0) {
Slice value = Slice(&bucket[key_length_], value_length_); Slice value = Slice(&bucket[key_length_], value_length_);
if (is_last_level_) { if (is_last_level_) {
ParsedInternalKey found_ikey(Slice(bucket, key_length_), 0, kTypeValue); ParsedInternalKey found_ikey(
Slice(bucket, key_length_), 0, kTypeValue);
result_handler(handle_context, found_ikey, value); result_handler(handle_context, found_ikey, value);
} else { } else {
Slice full_key(bucket, key_length_); Slice full_key(bucket, key_length_);
@ -114,15 +132,18 @@ Status CuckooTableReader::Get(
return Status::OK(); return Status::OK();
} }
} }
}
return Status::OK(); return Status::OK();
} }
void CuckooTableReader::Prepare(const Slice& key) { void CuckooTableReader::Prepare(const Slice& key) {
// Prefetch the first Cuckoo Block.
Slice user_key = ExtractUserKey(key); Slice user_key = ExtractUserKey(key);
// Prefetching first location also helps improve Get performance. uint64_t addr = reinterpret_cast<uint64_t>(file_data_.data()) +
for (uint32_t hash_cnt = 0; hash_cnt < num_hash_fun_; ++hash_cnt) { bucket_length_ * CuckooHash(user_key, 0, table_size_minus_one_, nullptr);
uint64_t hash_val = get_slice_hash_(user_key, hash_cnt, num_buckets_); uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_;
PREFETCH(&file_data_.data()[hash_val * bucket_length_], 0, 3); for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) {
PREFETCH(reinterpret_cast<const char*>(addr), 0, 3);
} }
} }
@ -186,7 +207,9 @@ CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader)
void CuckooTableIterator::LoadKeysFromReader() { void CuckooTableIterator::LoadKeysFromReader() {
key_to_bucket_id_.reserve(reader_->GetTableProperties()->num_entries); key_to_bucket_id_.reserve(reader_->GetTableProperties()->num_entries);
for (uint32_t bucket_id = 0; bucket_id < reader_->num_buckets_; bucket_id++) { uint64_t num_buckets = reader_->table_size_minus_one_ +
reader_->cuckoo_block_size_;
for (uint32_t bucket_id = 0; bucket_id < num_buckets; bucket_id++) {
Slice read_key; Slice read_key;
status_ = reader_->file_->Read(bucket_id * reader_->bucket_length_, status_ = reader_->file_->Read(bucket_id * reader_->bucket_length_,
reader_->key_length_, &read_key, nullptr); reader_->key_length_, &read_key, nullptr);

@ -65,12 +65,14 @@ class CuckooTableReader: public TableReader {
bool is_last_level_; bool is_last_level_;
std::shared_ptr<const TableProperties> table_props_; std::shared_ptr<const TableProperties> table_props_;
Status status_; Status status_;
uint32_t num_hash_fun_; uint32_t num_hash_func_;
std::string unused_key_; std::string unused_key_;
uint32_t key_length_; uint32_t key_length_;
uint32_t value_length_; uint32_t value_length_;
uint32_t bucket_length_; uint32_t bucket_length_;
uint64_t num_buckets_; uint32_t cuckoo_block_size_;
uint32_t cuckoo_block_bytes_minus_one_;
uint64_t table_size_minus_one_;
const Comparator* ucomp_; const Comparator* ucomp_;
uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index, uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index,
uint64_t max_num_buckets); uint64_t max_num_buckets);

@ -38,9 +38,6 @@ DEFINE_bool(write, false,
namespace rocksdb { namespace rocksdb {
extern const uint64_t kCuckooTableMagicNumber;
extern const uint64_t kMaxNumHashTable;
namespace { namespace {
const uint32_t kNumHashFunc = 10; const uint32_t kNumHashFunc = 10;
// Methods, variables related to Hash functions. // Methods, variables related to Hash functions.
@ -109,7 +106,7 @@ class CuckooReaderTest {
std::unique_ptr<WritableFile> writable_file; std::unique_ptr<WritableFile> writable_file;
ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options));
CuckooTableBuilder builder( CuckooTableBuilder builder(
writable_file.get(), 0.9, kNumHashFunc, 100, ucomp, GetSliceHash); writable_file.get(), 0.9, kNumHashFunc, 100, ucomp, 2, GetSliceHash);
ASSERT_OK(builder.status()); ASSERT_OK(builder.status());
for (uint32_t key_idx = 0; key_idx < num_items; ++key_idx) { for (uint32_t key_idx = 0; key_idx < num_items; ++key_idx) {
builder.Add(Slice(keys[key_idx]), Slice(values[key_idx])); builder.Add(Slice(keys[key_idx]), Slice(values[key_idx]));
@ -397,13 +394,12 @@ void GetKeys(uint64_t num, std::vector<std::string>* keys) {
} }
} }
std::string GetFileName(uint64_t num, double hash_ratio) { std::string GetFileName(uint64_t num) {
if (FLAGS_file_dir.empty()) { if (FLAGS_file_dir.empty()) {
FLAGS_file_dir = test::TmpDir(); FLAGS_file_dir = test::TmpDir();
} }
return FLAGS_file_dir + "/cuckoo_read_benchmark" + return FLAGS_file_dir + "/cuckoo_read_benchmark" +
std::to_string(num/1000000) + "Mratio" + std::to_string(num/1000000) + "Mkeys";
std::to_string(static_cast<int>(100*hash_ratio));
} }
// Create last level file as we are interested in measuring performance of // Create last level file as we are interested in measuring performance of
@ -414,13 +410,13 @@ void WriteFile(const std::vector<std::string>& keys,
options.allow_mmap_reads = true; options.allow_mmap_reads = true;
Env* env = options.env; Env* env = options.env;
EnvOptions env_options = EnvOptions(options); EnvOptions env_options = EnvOptions(options);
std::string fname = GetFileName(num, hash_ratio); std::string fname = GetFileName(num);
std::unique_ptr<WritableFile> writable_file; std::unique_ptr<WritableFile> writable_file;
ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options));
CuckooTableBuilder builder( CuckooTableBuilder builder(
writable_file.get(), hash_ratio, writable_file.get(), hash_ratio,
kMaxNumHashTable, 1000, test::Uint64Comparator(), GetSliceMurmurHash); 64, 1000, test::Uint64Comparator(), 5, nullptr);
ASSERT_OK(builder.status()); ASSERT_OK(builder.status());
for (uint64_t key_idx = 0; key_idx < num; ++key_idx) { for (uint64_t key_idx = 0; key_idx < num; ++key_idx) {
// Value is just a part of key. // Value is just a part of key.
@ -439,27 +435,25 @@ void WriteFile(const std::vector<std::string>& keys,
CuckooTableReader reader( CuckooTableReader reader(
options, std::move(read_file), file_size, options, std::move(read_file), file_size,
test::Uint64Comparator(), GetSliceMurmurHash); test::Uint64Comparator(), nullptr);
ASSERT_OK(reader.status()); ASSERT_OK(reader.status());
ReadOptions r_options; ReadOptions r_options;
for (const auto& key : keys) { for (uint64_t i = 0; i < num; ++i) {
int cnt = 0; int cnt = 0;
ASSERT_OK(reader.Get(r_options, Slice(key), &cnt, CheckValue, nullptr)); ASSERT_OK(reader.Get(r_options, Slice(keys[i]), &cnt, CheckValue, nullptr));
if (cnt != 1) { if (cnt != 1) {
fprintf(stderr, "%" PRIx64 " not found.\n", fprintf(stderr, "%" PRIu64 " not found.\n", i);
*reinterpret_cast<const uint64_t*>(key.data()));
ASSERT_EQ(1, cnt); ASSERT_EQ(1, cnt);
} }
} }
} }
void ReadKeys(const std::vector<std::string>& keys, uint64_t num, void ReadKeys(uint64_t num, uint32_t batch_size) {
double hash_ratio, uint32_t batch_size) {
Options options; Options options;
options.allow_mmap_reads = true; options.allow_mmap_reads = true;
Env* env = options.env; Env* env = options.env;
EnvOptions env_options = EnvOptions(options); EnvOptions env_options = EnvOptions(options);
std::string fname = GetFileName(num, hash_ratio); std::string fname = GetFileName(num);
uint64_t file_size; uint64_t file_size;
env->GetFileSize(fname, &file_size); env->GetFileSize(fname, &file_size);
@ -468,29 +462,33 @@ void ReadKeys(const std::vector<std::string>& keys, uint64_t num,
CuckooTableReader reader( CuckooTableReader reader(
options, std::move(read_file), file_size, test::Uint64Comparator(), options, std::move(read_file), file_size, test::Uint64Comparator(),
GetSliceMurmurHash); nullptr);
ASSERT_OK(reader.status()); ASSERT_OK(reader.status());
const UserCollectedProperties user_props = const UserCollectedProperties user_props =
reader.GetTableProperties()->user_collected_properties; reader.GetTableProperties()->user_collected_properties;
const uint32_t num_hash_fun = *reinterpret_cast<const uint32_t*>( const uint32_t num_hash_fun = *reinterpret_cast<const uint32_t*>(
user_props.at(CuckooTablePropertyNames::kNumHashTable).data()); user_props.at(CuckooTablePropertyNames::kNumHashFunc).data());
fprintf(stderr, "With %" PRIu64 " items and hash table ratio %f, number of" const uint64_t table_size = *reinterpret_cast<const uint64_t*>(
" hash functions used: %u.\n", num, hash_ratio, num_hash_fun); user_props.at(CuckooTablePropertyNames::kHashTableSize).data());
fprintf(stderr, "With %" PRIu64 " items, utilization is %.2f%%, number of"
" hash functions: %u.\n", num, num * 100.0 / (table_size), num_hash_fun);
ReadOptions r_options; ReadOptions r_options;
uint64_t start_time = env->NowMicros(); uint64_t start_time = env->NowMicros();
if (batch_size > 0) { if (batch_size > 0) {
for (uint64_t i = 0; i < num; i += batch_size) { for (uint64_t i = 0; i < num; i += batch_size) {
for (uint64_t j = i; j < i+batch_size && j < num; ++j) { for (uint64_t j = i; j < i+batch_size && j < num; ++j) {
reader.Prepare(Slice(keys[j])); reader.Prepare(Slice(reinterpret_cast<char*>(&j), 16));
} }
for (uint64_t j = i; j < i+batch_size && j < num; ++j) { for (uint64_t j = i; j < i+batch_size && j < num; ++j) {
reader.Get(r_options, Slice(keys[j]), nullptr, DoNothing, nullptr); reader.Get(r_options, Slice(reinterpret_cast<char*>(&j), 16),
nullptr, DoNothing, nullptr);
} }
} }
} else { } else {
for (uint64_t i = 0; i < num; i++) { for (uint64_t i = 0; i < num; i++) {
reader.Get(r_options, Slice(keys[i]), nullptr, DoNothing, nullptr); reader.Get(r_options, Slice(reinterpret_cast<char*>(&i), 16), nullptr,
DoNothing, nullptr);
} }
} }
float time_per_op = (env->NowMicros() - start_time) * 1.0 / num; float time_per_op = (env->NowMicros() - start_time) * 1.0 / num;
@ -501,26 +499,30 @@ void ReadKeys(const std::vector<std::string>& keys, uint64_t num,
} // namespace. } // namespace.
TEST(CuckooReaderTest, TestReadPerformance) { TEST(CuckooReaderTest, TestReadPerformance) {
uint64_t num = 1000*1000*100;
if (!FLAGS_enable_perf) { if (!FLAGS_enable_perf) {
return; return;
} }
double hash_ratio = 0.95;
// These numbers are chosen to have a hash utilization % close to
// 0.9, 0.75, 0.6 and 0.5 respectively.
// They all create 128 M buckets.
std::vector<uint64_t> nums = {120*1000*1000, 100*1000*1000, 80*1000*1000,
70*1000*1000};
#ifndef NDEBUG #ifndef NDEBUG
fprintf(stdout, fprintf(stdout,
"WARNING: Not compiled with DNDEBUG. Performance tests may be slow.\n"); "WARNING: Not compiled with DNDEBUG. Performance tests may be slow.\n");
#endif #endif
std::vector<std::string> keys; std::vector<std::string> keys;
GetKeys(num, &keys); GetKeys(*std::max_element(nums.begin(), nums.end()), &keys);
for (double hash_ratio : std::vector<double>({0.5, 0.6, 0.75, 0.9})) { for (uint64_t num : nums) {
if (FLAGS_write || !Env::Default()->FileExists( if (FLAGS_write || !Env::Default()->FileExists(GetFileName(num))) {
GetFileName(num, hash_ratio))) {
WriteFile(keys, num, hash_ratio); WriteFile(keys, num, hash_ratio);
} }
ReadKeys(keys, num, hash_ratio, 0); ReadKeys(num, 0);
ReadKeys(keys, num, hash_ratio, 10); ReadKeys(num, 10);
ReadKeys(keys, num, hash_ratio, 25); ReadKeys(num, 25);
ReadKeys(keys, num, hash_ratio, 50); ReadKeys(num, 50);
ReadKeys(keys, num, hash_ratio, 100); ReadKeys(num, 100);
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
} }

@ -71,20 +71,14 @@ void FilterBlockBuilder::AddKey(const Slice& key) {
} }
// add prefix to filter if needed // add prefix to filter if needed
if (prefix_extractor_ && prefix_extractor_->InDomain(ExtractUserKey(key))) { if (prefix_extractor_ && prefix_extractor_->InDomain(key)) {
// If prefix_extractor_, this filter_block layer assumes we only
// operate on internal keys.
Slice user_key = ExtractUserKey(key);
// this assumes prefix(prefix(key)) == prefix(key), as the last // this assumes prefix(prefix(key)) == prefix(key), as the last
// entry in entries_ may be either a key or prefix, and we use // entry in entries_ may be either a key or prefix, and we use
// prefix(last entry) to get the prefix of the last key. // prefix(last entry) to get the prefix of the last key.
if (prev.size() == 0 || if (prev.size() == 0 || !SamePrefix(key, prev)) {
!SamePrefix(user_key, ExtractUserKey(prev))) { Slice prefix = prefix_extractor_->Transform(key);
Slice prefix = prefix_extractor_->Transform(user_key);
InternalKey internal_prefix_tmp(prefix, 0, kTypeValue);
Slice internal_prefix = internal_prefix_tmp.Encode();
start_.push_back(entries_.size()); start_.push_back(entries_.size());
entries_.append(internal_prefix.data(), internal_prefix.size()); entries_.append(prefix.data(), prefix.size());
} }
} }
} }

@ -211,10 +211,13 @@ Status ReadBlock(RandomAccessFile* file, const Footer& footer,
const ReadOptions& options, const BlockHandle& handle, const ReadOptions& options, const BlockHandle& handle,
Slice* contents, /* result of reading */ char* buf) { Slice* contents, /* result of reading */ char* buf) {
size_t n = static_cast<size_t>(handle.size()); size_t n = static_cast<size_t>(handle.size());
Status s;
{
PERF_TIMER_GUARD(block_read_time);
s = file->Read(handle.offset(), n + kBlockTrailerSize, contents, buf);
}
PERF_TIMER_AUTO(block_read_time);
Status s = file->Read(handle.offset(), n + kBlockTrailerSize, contents, buf);
PERF_TIMER_MEASURE(block_read_time);
PERF_COUNTER_ADD(block_read_count, 1); PERF_COUNTER_ADD(block_read_count, 1);
PERF_COUNTER_ADD(block_read_byte, n + kBlockTrailerSize); PERF_COUNTER_ADD(block_read_byte, n + kBlockTrailerSize);
@ -228,6 +231,7 @@ Status ReadBlock(RandomAccessFile* file, const Footer& footer,
// Check the crc of the type and the block contents // Check the crc of the type and the block contents
const char* data = contents->data(); // Pointer to where Read put the data const char* data = contents->data(); // Pointer to where Read put the data
if (options.verify_checksums) { if (options.verify_checksums) {
PERF_TIMER_GUARD(block_checksum_time);
uint32_t value = DecodeFixed32(data + n + 1); uint32_t value = DecodeFixed32(data + n + 1);
uint32_t actual = 0; uint32_t actual = 0;
switch (footer.checksum()) { switch (footer.checksum()) {
@ -247,7 +251,6 @@ Status ReadBlock(RandomAccessFile* file, const Footer& footer,
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
PERF_TIMER_STOP(block_checksum_time);
} }
return s; return s;
} }
@ -265,7 +268,7 @@ Status DecompressBlock(BlockContents* result, size_t block_size,
result->cachable = false; result->cachable = false;
result->heap_allocated = false; result->heap_allocated = false;
PERF_TIMER_AUTO(block_decompress_time); PERF_TIMER_GUARD(block_decompress_time);
rocksdb::CompressionType compression_type = rocksdb::CompressionType compression_type =
static_cast<rocksdb::CompressionType>(data[n]); static_cast<rocksdb::CompressionType>(data[n]);
// If the caller has requested that the block not be uncompressed // If the caller has requested that the block not be uncompressed
@ -295,7 +298,6 @@ Status DecompressBlock(BlockContents* result, size_t block_size,
} else { } else {
s = UncompressBlockContents(data, n, result); s = UncompressBlockContents(data, n, result);
} }
PERF_TIMER_STOP(block_decompress_time);
return s; return s;
} }

@ -116,12 +116,12 @@ class MergingIterator : public Iterator {
// Invalidate the heap. // Invalidate the heap.
use_heap_ = false; use_heap_ = false;
IteratorWrapper* first_child = nullptr; IteratorWrapper* first_child = nullptr;
PERF_TIMER_DECLARE();
for (auto& child : children_) { for (auto& child : children_) {
PERF_TIMER_START(seek_child_seek_time); {
PERF_TIMER_GUARD(seek_child_seek_time);
child.Seek(target); child.Seek(target);
PERF_TIMER_STOP(seek_child_seek_time); }
PERF_COUNTER_ADD(seek_child_seek_count, 1); PERF_COUNTER_ADD(seek_child_seek_count, 1);
if (child.Valid()) { if (child.Valid()) {
@ -134,24 +134,21 @@ class MergingIterator : public Iterator {
} else { } else {
// We have more than one children with valid keys. Initialize // We have more than one children with valid keys. Initialize
// the heap and put the first child into the heap. // the heap and put the first child into the heap.
PERF_TIMER_START(seek_min_heap_time); PERF_TIMER_GUARD(seek_min_heap_time);
ClearHeaps(); ClearHeaps();
minHeap_.push(first_child); minHeap_.push(first_child);
PERF_TIMER_STOP(seek_min_heap_time);
} }
} }
if (use_heap_) { if (use_heap_) {
PERF_TIMER_START(seek_min_heap_time); PERF_TIMER_GUARD(seek_min_heap_time);
minHeap_.push(&child); minHeap_.push(&child);
PERF_TIMER_STOP(seek_min_heap_time);
} }
} }
} }
if (use_heap_) { if (use_heap_) {
// If heap is valid, need to put the smallest key to current_. // If heap is valid, need to put the smallest key to current_.
PERF_TIMER_START(seek_min_heap_time); PERF_TIMER_GUARD(seek_min_heap_time);
FindSmallest(); FindSmallest();
PERF_TIMER_STOP(seek_min_heap_time);
} else { } else {
// The heap is not valid, then the current_ iterator is the first // The heap is not valid, then the current_ iterator is the first
// one, or null if there is no first child. // one, or null if there is no first child.

@ -16,9 +16,7 @@
namespace rocksdb { namespace rocksdb {
MetaIndexBuilder::MetaIndexBuilder() MetaIndexBuilder::MetaIndexBuilder()
: meta_index_block_( : meta_index_block_(new BlockBuilder(1 /* restart interval */)) {}
new BlockBuilder(1 /* restart interval */, BytewiseComparator())) {
}
void MetaIndexBuilder::Add(const std::string& key, void MetaIndexBuilder::Add(const std::string& key,
const BlockHandle& handle) { const BlockHandle& handle) {
@ -35,9 +33,7 @@ Slice MetaIndexBuilder::Finish() {
} }
PropertyBlockBuilder::PropertyBlockBuilder() PropertyBlockBuilder::PropertyBlockBuilder()
: properties_block_( : properties_block_(new BlockBuilder(1 /* restart interval */)) {}
new BlockBuilder(1 /* restart interval */, BytewiseComparator())) {
}
void PropertyBlockBuilder::Add(const std::string& name, void PropertyBlockBuilder::Add(const std::string& name,
const std::string& val) { const std::string& val) {

@ -169,7 +169,7 @@ class PlainTableFactory : public TableFactory {
static const char kValueTypeSeqId0 = 0xFF; static const char kValueTypeSeqId0 = 0xFF;
// Sanitizes the specified DB Options. // Sanitizes the specified DB Options.
Status SanitizeDBOptions(DBOptions* db_opts) const override { Status SanitizeDBOptions(const DBOptions* db_opts) const override {
if (db_opts->allow_mmap_reads == false) { if (db_opts->allow_mmap_reads == false) {
return Status::NotSupported( return Status::NotSupported(
"PlainTable with allow_mmap_reads == false is not supported."); "PlainTable with allow_mmap_reads == false is not supported.");

@ -244,8 +244,7 @@ class BlockConstructor: public Constructor {
const KVMap& data) { const KVMap& data) {
delete block_; delete block_;
block_ = nullptr; block_ = nullptr;
BlockBuilder builder(table_options.block_restart_interval, BlockBuilder builder(table_options.block_restart_interval);
&internal_comparator);
for (KVMap::const_iterator it = data.begin(); for (KVMap::const_iterator it = data.begin();
it != data.end(); it != data.end();
@ -1054,7 +1053,7 @@ TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) {
ASSERT_EQ("", props.filter_policy_name); // no filter policy is used ASSERT_EQ("", props.filter_policy_name); // no filter policy is used
// Verify data size. // Verify data size.
BlockBuilder block_builder(1, options.comparator); BlockBuilder block_builder(1);
for (const auto& item : kvmap) { for (const auto& item : kvmap) {
block_builder.Add(item.first, item.second); block_builder.Add(item.first, item.second);
} }

@ -172,8 +172,9 @@ void TwoLevelIterator::InitDataBlock() {
SetSecondLevelIterator(nullptr); SetSecondLevelIterator(nullptr);
} else { } else {
Slice handle = first_level_iter_.value(); Slice handle = first_level_iter_.value();
if (second_level_iter_.iter() != nullptr if (second_level_iter_.iter() != nullptr &&
&& handle.compare(data_block_handle_) == 0) { !second_level_iter_.status().IsIncomplete() &&
handle.compare(data_block_handle_) == 0) {
// second_level_iter is already constructed with this iterator, so // second_level_iter is already constructed with this iterator, so
// no need to change anything // no need to change anything
} else { } else {

@ -31,6 +31,7 @@ int main() {
#include <sys/types.h> #include <sys/types.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <exception>
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include "db/db_impl.h" #include "db/db_impl.h"
#include "db/version_set.h" #include "db/version_set.h"
@ -759,7 +760,7 @@ class StressTest {
? NewBloomFilterPolicy(FLAGS_bloom_bits) ? NewBloomFilterPolicy(FLAGS_bloom_bits)
: nullptr), : nullptr),
db_(nullptr), db_(nullptr),
new_column_family_name_(0), new_column_family_name_(1),
num_times_reopened_(0) { num_times_reopened_(0) {
if (FLAGS_destroy_db_initially) { if (FLAGS_destroy_db_initially) {
std::vector<std::string> files; std::vector<std::string> files;
@ -1217,12 +1218,20 @@ class StressTest {
Status s __attribute__((unused)); Status s __attribute__((unused));
s = db_->DropColumnFamily(column_families_[cf]); s = db_->DropColumnFamily(column_families_[cf]);
delete column_families_[cf]; delete column_families_[cf];
assert(s.ok()); if (!s.ok()) {
fprintf(stderr, "dropping column family error: %s\n",
s.ToString().c_str());
std::terminate();
}
s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), new_name, s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), new_name,
&column_families_[cf]); &column_families_[cf]);
column_family_names_[cf] = new_name; column_family_names_[cf] = new_name;
thread->shared->ClearColumnFamily(cf); thread->shared->ClearColumnFamily(cf);
assert(s.ok()); if (!s.ok()) {
fprintf(stderr, "creating column family error: %s\n",
s.ToString().c_str());
std::terminate();
}
thread->shared->UnlockColumnFamily(cf); thread->shared->UnlockColumnFamily(cf);
} }
} }
@ -1297,10 +1306,15 @@ class StressTest {
} }
} }
thread->shared->Put(rand_column_family, rand_key, value_base); thread->shared->Put(rand_column_family, rand_key, value_base);
Status s;
if (FLAGS_use_merge) { if (FLAGS_use_merge) {
db_->Merge(write_opts, column_family, key, v); s = db_->Merge(write_opts, column_family, key, v);
} else { } else {
db_->Put(write_opts, column_family, key, v); s = db_->Put(write_opts, column_family, key, v);
}
if (!s.ok()) {
fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
std::terminate();
} }
thread->stats.AddBytesForWrites(1, sz); thread->stats.AddBytesForWrites(1, sz);
} else { } else {
@ -1311,8 +1325,12 @@ class StressTest {
// OPERATION delete // OPERATION delete
if (!FLAGS_test_batches_snapshots) { if (!FLAGS_test_batches_snapshots) {
thread->shared->Delete(rand_column_family, rand_key); thread->shared->Delete(rand_column_family, rand_key);
db_->Delete(write_opts, column_family, key); Status s = db_->Delete(write_opts, column_family, key);
thread->stats.AddDeletes(1); thread->stats.AddDeletes(1);
if (!s.ok()) {
fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
std::terminate();
}
} else { } else {
MultiDelete(thread, write_opts, column_family, key); MultiDelete(thread, write_opts, column_family, key);
} }

@ -239,11 +239,23 @@ class PosixRandomAccessFile: public RandomAccessFile {
char* scratch) const { char* scratch) const {
Status s; Status s;
ssize_t r = -1; ssize_t r = -1;
do { size_t left = n;
r = pread(fd_, scratch, n, static_cast<off_t>(offset)); char* ptr = scratch;
} while (r < 0 && errno == EINTR); while (left > 0) {
IOSTATS_ADD_IF_POSITIVE(bytes_read, r); r = pread(fd_, ptr, left, static_cast<off_t>(offset));
*result = Slice(scratch, (r < 0) ? 0 : r); if (r <= 0) {
if (errno == EINTR) {
continue;
}
break;
}
ptr += r;
offset += r;
left -= r;
}
IOSTATS_ADD_IF_POSITIVE(bytes_read, n - left);
*result = Slice(scratch, (r < 0) ? 0 : n - left);
if (r < 0) { if (r < 0) {
// An error: return a non-ok status // An error: return a non-ok status
s = IOError(filename_, errno); s = IOError(filename_, errno);
@ -907,9 +919,23 @@ class PosixRandomRWFile : public RandomRWFile {
virtual Status Read(uint64_t offset, size_t n, Slice* result, virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const { char* scratch) const {
Status s; Status s;
ssize_t r = pread(fd_, scratch, n, static_cast<off_t>(offset)); ssize_t r = -1;
IOSTATS_ADD_IF_POSITIVE(bytes_read, r); size_t left = n;
*result = Slice(scratch, (r < 0) ? 0 : r); char* ptr = scratch;
while (left > 0) {
r = pread(fd_, ptr, left, static_cast<off_t>(offset));
if (r <= 0) {
if (errno == EINTR) {
continue;
}
break;
}
ptr += r;
offset += r;
left -= r;
}
IOSTATS_ADD_IF_POSITIVE(bytes_read, n - left);
*result = Slice(scratch, (r < 0) ? 0 : n - left);
if (r < 0) { if (r < 0) {
s = IOError(filename_, errno); s = IOError(filename_, errno);
} }
@ -1018,15 +1044,12 @@ class PosixFileLock : public FileLock {
std::string filename; std::string filename;
}; };
namespace {
void PthreadCall(const char* label, int result) { void PthreadCall(const char* label, int result) {
if (result != 0) { if (result != 0) {
fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
exit(1); exit(1);
} }
} }
}
class PosixEnv : public Env { class PosixEnv : public Env {
public: public:
@ -1724,12 +1747,11 @@ unsigned int PosixEnv::GetThreadPoolQueueLen(Priority pri) const {
return thread_pools_[pri].GetQueueLen(); return thread_pools_[pri].GetQueueLen();
} }
namespace {
struct StartThreadState { struct StartThreadState {
void (*user_function)(void*); void (*user_function)(void*);
void* arg; void* arg;
}; };
}
static void* StartThreadWrapper(void* arg) { static void* StartThreadWrapper(void* arg) {
StartThreadState* state = reinterpret_cast<StartThreadState*>(arg); StartThreadState* state = reinterpret_cast<StartThreadState*>(arg);
state->user_function(state->arg); state->user_function(state->arg);

@ -9,7 +9,9 @@
namespace rocksdb { namespace rocksdb {
#ifndef IOS_CROSS_COMPILE
__thread IOStatsContext iostats_context; __thread IOStatsContext iostats_context;
#endif // IOS_CROSS_COMPILE
void IOStatsContext::Reset() { void IOStatsContext::Reset() {
thread_pool_id = Env::Priority::TOTAL; thread_pool_id = Env::Priority::TOTAL;

@ -6,6 +6,8 @@
#pragma once #pragma once
#include "rocksdb/iostats_context.h" #include "rocksdb/iostats_context.h"
#ifndef IOS_CROSS_COMPILE
// increment a specific counter by the specified value // increment a specific counter by the specified value
#define IOSTATS_ADD(metric, value) \ #define IOSTATS_ADD(metric, value) \
(iostats_context.metric += value) (iostats_context.metric += value)
@ -30,3 +32,15 @@
#define IOSTATS(metric) \ #define IOSTATS(metric) \
(iostats_context.metric) (iostats_context.metric)
#else // IOS_CROSS_COMPILE
#define IOSTATS_ADD(metric, value)
#define IOSTATS_ADD_IF_POSITIVE(metric, value)
#define IOSTATS_RESET(metric)
#define IOSTATS_RESET_ALL()
#define IOSTATS_SET_THREAD_POOL_ID(value)
#define IOSTATS_THREAD_POOL_ID()
#define IOSTATS(metric) 0
#endif // IOS_CROSS_COMPILE

@ -11,11 +11,10 @@ namespace rocksdb {
#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE) #if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
#define PERF_TIMER_DECLARE() #define PERF_TIMER_GUARD(metric)
#define PERF_TIMER_START(metric)
#define PERF_TIMER_AUTO(metric)
#define PERF_TIMER_MEASURE(metric) #define PERF_TIMER_MEASURE(metric)
#define PERF_TIMER_STOP(metric) #define PERF_TIMER_STOP(metric)
#define PERF_TIMER_START(metric)
#define PERF_COUNTER_ADD(metric, value) #define PERF_COUNTER_ADD(metric, value)
#else #else
@ -24,10 +23,15 @@ extern __thread PerfLevel perf_level;
class PerfStepTimer { class PerfStepTimer {
public: public:
PerfStepTimer() PerfStepTimer(uint64_t* metric)
: enabled_(perf_level >= PerfLevel::kEnableTime), : enabled_(perf_level >= PerfLevel::kEnableTime),
env_(enabled_ ? Env::Default() : nullptr), env_(enabled_ ? Env::Default() : nullptr),
start_(0) { start_(0),
metric_(metric) {
}
~PerfStepTimer() {
Stop();
} }
void Start() { void Start() {
@ -36,17 +40,17 @@ class PerfStepTimer {
} }
} }
void Measure(uint64_t* metric) { void Measure() {
if (start_) { if (start_) {
uint64_t now = env_->NowNanos(); uint64_t now = env_->NowNanos();
*metric += now - start_; *metric_ += now - start_;
start_ = now; start_ = now;
} }
} }
void Stop(uint64_t* metric) { void Stop() {
if (start_) { if (start_) {
*metric += env_->NowNanos() - start_; *metric_ += env_->NowNanos() - start_;
start_ = 0; start_ = 0;
} }
} }
@ -55,29 +59,25 @@ class PerfStepTimer {
const bool enabled_; const bool enabled_;
Env* const env_; Env* const env_;
uint64_t start_; uint64_t start_;
uint64_t* metric_;
}; };
// Declare the local timer object to be used later on // Stop the timer and update the metric
#define PERF_TIMER_DECLARE() \ #define PERF_TIMER_STOP(metric) \
PerfStepTimer perf_step_timer; perf_step_timer_ ## metric.Stop();
// Set start time of the timer
#define PERF_TIMER_START(metric) \ #define PERF_TIMER_START(metric) \
perf_step_timer.Start(); perf_step_timer_ ## metric.Start();
// Declare and set start time of the timer // Declare and set start time of the timer
#define PERF_TIMER_AUTO(metric) \ #define PERF_TIMER_GUARD(metric) \
PerfStepTimer perf_step_timer; \ PerfStepTimer perf_step_timer_ ## metric(&(perf_context.metric)); \
perf_step_timer.Start(); perf_step_timer_ ## metric.Start();
// Update metric with time elapsed since last START. start time is reset // Update metric with time elapsed since last START. start time is reset
// to current timestamp. // to current timestamp.
#define PERF_TIMER_MEASURE(metric) \ #define PERF_TIMER_MEASURE(metric) \
perf_step_timer.Measure(&(perf_context.metric)); perf_step_timer_ ## metric.Measure();
// Update metric with time elapsed since last START. But start time is not set.
#define PERF_TIMER_STOP(metric) \
perf_step_timer.Stop(&(perf_context.metric));
// Increase metric value // Increase metric value
#define PERF_COUNTER_ADD(metric, value) \ #define PERF_COUNTER_ADD(metric, value) \

@ -621,10 +621,13 @@ class SpatialDBImpl : public SpatialDB {
namespace { namespace {
DBOptions GetDBOptions(const SpatialDBOptions& options) { DBOptions GetDBOptions(const SpatialDBOptions& options) {
DBOptions db_options; DBOptions db_options;
db_options.max_background_compactions = options.num_threads / 2; db_options.max_background_compactions = 3 * options.num_threads / 4;
db_options.max_background_flushes = options.num_threads / 2; db_options.max_background_flushes =
db_options.env->SetBackgroundThreads(db_options.max_background_compactions, Env::LOW); options.num_threads - db_options.max_background_compactions;
db_options.env->SetBackgroundThreads(db_options.max_background_flushes, Env::HIGH); db_options.env->SetBackgroundThreads(db_options.max_background_compactions,
Env::LOW);
db_options.env->SetBackgroundThreads(db_options.max_background_flushes,
Env::HIGH);
if (options.bulk_load) { if (options.bulk_load) {
db_options.disableDataSync = true; db_options.disableDataSync = true;
} }
@ -635,13 +638,15 @@ ColumnFamilyOptions GetColumnFamilyOptions(const SpatialDBOptions& options,
std::shared_ptr<Cache> block_cache) { std::shared_ptr<Cache> block_cache) {
ColumnFamilyOptions column_family_options; ColumnFamilyOptions column_family_options;
column_family_options.write_buffer_size = 128 * 1024 * 1024; // 128MB column_family_options.write_buffer_size = 128 * 1024 * 1024; // 128MB
column_family_options.max_bytes_for_level_base = 1024 * 1024 * 1024; // 1 GB
column_family_options.max_write_buffer_number = 4; column_family_options.max_write_buffer_number = 4;
// only compress levels >= 1 column_family_options.level0_file_num_compaction_trigger = 2;
column_family_options.level0_slowdown_writes_trigger = 16;
column_family_options.level0_slowdown_writes_trigger = 32;
// only compress levels >= 2
column_family_options.compression_per_level.resize( column_family_options.compression_per_level.resize(
column_family_options.num_levels); column_family_options.num_levels);
for (int i = 0; i < column_family_options.num_levels; ++i) { for (int i = 0; i < column_family_options.num_levels; ++i) {
if (i == 0) { if (i < 2) {
column_family_options.compression_per_level[i] = kNoCompression; column_family_options.compression_per_level[i] = kNoCompression;
} else { } else {
column_family_options.compression_per_level[i] = kLZ4Compression; column_family_options.compression_per_level[i] = kLZ4Compression;
@ -651,17 +656,6 @@ ColumnFamilyOptions GetColumnFamilyOptions(const SpatialDBOptions& options,
table_options.block_cache = block_cache; table_options.block_cache = block_cache;
column_family_options.table_factory.reset( column_family_options.table_factory.reset(
NewBlockBasedTableFactory(table_options)); NewBlockBasedTableFactory(table_options));
if (options.bulk_load) {
column_family_options.level0_file_num_compaction_trigger = (1 << 30);
column_family_options.level0_slowdown_writes_trigger = (1 << 30);
column_family_options.level0_stop_writes_trigger = (1 << 30);
column_family_options.disable_auto_compactions = true;
column_family_options.source_compaction_factor = (1 << 30);
column_family_options.num_levels = 2;
column_family_options.target_file_size_base = 256 * 1024 * 1024;
column_family_options.max_mem_compaction_level = 0;
column_family_options.memtable_factory.reset(new VectorRepFactory());
}
return column_family_options; return column_family_options;
} }

Loading…
Cancel
Save