diff --git a/Makefile b/Makefile index 31720522f..a784046e8 100644 --- a/Makefile +++ b/Makefile @@ -60,6 +60,7 @@ TESTS = \ merge_test \ redis_test \ reduce_levels_test \ + simple_table_db_test \ skiplist_test \ stringappend_test \ table_test \ @@ -236,6 +237,9 @@ crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) diff --git a/db/builder.cc b/db/builder.cc index b9206b4e2..1cd2ea8f9 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -15,12 +15,22 @@ #include "db/table_cache.h" #include "db/version_edit.h" #include "rocksdb/db.h" +#include "rocksdb/table.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" +#include "table/block_based_table_builder.h" #include "util/stop_watch.h" namespace rocksdb { +class TableFactory; + +TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, + int level, const bool enable_compression) { + return options.table_factory->GetTableBuilder(options, file, level, + enable_compression); +} + Status BuildTable(const std::string& dbname, Env* env, const Options& options, @@ -52,8 +62,9 @@ Status BuildTable(const std::string& dbname, if (!s.ok()) { return s; } - TableBuilder* builder = new TableBuilder(options, file.get(), 0, - enable_compression); + + TableBuilder* builder = GetTableBuilder(options, file.get(), 0, + enable_compression); // the first key is the smallest key Slice key = iter->key(); diff --git a/db/builder.h b/db/builder.h index a09033e94..17f4aa380 100644 --- a/db/builder.h +++ b/db/builder.h @@ -20,6 +20,11 @@ class EnvOptions; class Iterator; class TableCache; class VersionEdit; +class TableBuilder; +class WritableFile; + +extern TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, + int level, const bool enable_compression); // Build a Table file from the contents of *iter. The generated file // will be named according to meta->number. On success, the rest of diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 3dd92d173..e7b7b4c8b 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -15,12 +15,12 @@ #include #include "rocksdb/cache.h" #include "rocksdb/env.h" +#include "rocksdb/table.h" #include "rocksdb/write_batch.h" #include "db/db_impl.h" #include "db/filename.h" #include "db/log_format.h" #include "db/version_set.h" -#include "table/table.h" #include "util/logging.h" #include "util/testharness.h" #include "util/testutil.h" diff --git a/db/db_impl.cc b/db/db_impl.cc index fbbf650d8..b4961f31f 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -41,10 +41,10 @@ #include "rocksdb/merge_operator.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" -#include "rocksdb/table_builder.h" +#include "rocksdb/table.h" +#include "port/port.h" #include "table/block.h" #include "table/merger.h" -#include "table/table.h" #include "table/two_level_iterator.h" #include "util/auto_roll_logger.h" #include "util/build_version.h" @@ -1774,9 +1774,10 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { compact->outfile->SetPreallocationBlockSize( 1.1 * versions_->MaxFileSizeForLevel(compact->compaction->output_level())); - compact->builder.reset(new TableBuilder(options_, compact->outfile.get(), - compact->compaction->output_level(), - compact->compaction->enable_compression())); + compact->builder.reset( + GetTableBuilder(options_, compact->outfile.get(), + compact->compaction->output_level(), + compact->compaction->enable_compression())); } return s; } @@ -2026,9 +2027,9 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { compaction_filter_value.clear(); bool to_delete = compaction_filter->Filter(compact->compaction->level(), - ikey.user_key, value, - &compaction_filter_value, - &value_changed); + ikey.user_key, value, + &compaction_filter_value, + &value_changed); if (to_delete) { // make a copy of the original key delete_key.assign(key.data(), key.data() + key.size()); diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index 58da65abf..27d5c31ed 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -29,11 +29,10 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/status.h" -#include "rocksdb/table_builder.h" +#include "rocksdb/table.h" #include "port/port.h" #include "table/block.h" #include "table/merger.h" -#include "table/table.h" #include "table/two_level_iterator.h" #include "util/coding.h" #include "util/logging.h" diff --git a/db/db_test.cc b/db/db_test.cc index d35a9c7da..e53f4882f 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -20,7 +20,7 @@ #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" -#include "table/table.h" +#include "rocksdb/table.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" diff --git a/db/dbformat.h b/db/dbformat.h index a5f1a26a9..64a2c9f05 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -13,7 +13,7 @@ #include "rocksdb/db.h" #include "rocksdb/filter_policy.h" #include "rocksdb/slice.h" -#include "rocksdb/table_builder.h" +#include "rocksdb/table.h" #include "rocksdb/types.h" #include "util/coding.h" #include "util/logging.h" diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc new file mode 100644 index 000000000..bae35ffd2 --- /dev/null +++ b/db/simple_table_db_test.cc @@ -0,0 +1,1118 @@ +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/filter_policy.h" +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "db/db_statistics.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/env.h" +#include "rocksdb/table.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "utilities/merge_operators.h" + +using std::unique_ptr; + +namespace rocksdb { + +// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built +// as production quality. +// SimpleTable requires the input key size to be fixed 16 bytes, value cannot +// be longer than 150000 bytes and stored data on disk in this format: +// +--------------------------------------------+ <= key1 offset +// | key1 | value_size (4 bytes) | | +// +----------------------------------------+ | +// | value1 | +// | | +// +----------------------------------------+---+ <= key2 offset +// | key2 | value_size (4 bytes) | | +// +----------------------------------------+ | +// | value2 | +// | | +// | ...... | +// +-----------------+--------------------------+ <= index_block_offset +// | key1 | key1 offset (8 bytes) | +// +-----------------+--------------------------+ +// | key2 | key2 offset (8 bytes) | +// +-----------------+--------------------------+ +// | key3 | key3 offset (8 bytes) | +// +-----------------+--------------------------+ +// | ...... | +// +-----------------+------------+-------------+ +// | index_block_offset (8 bytes) | +// +------------------------------+ + + +// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built +// as production quality. +class SimpleTable : public Table { + public: + // Attempt to open the table that is stored in bytes [0..file_size) + // of "file", and read the metadata entries necessary to allow + // retrieving data from the table. + // + // If successful, returns ok and sets "*table" to the newly opened + // table. The client should delete "*table" when no longer needed. + // If there was an error while initializing the table, sets "*table" + // to nullptr and returns a non-ok status. Does not take ownership of + // "*source", but the client must ensure that "source" remains live + // for the duration of the returned table's lifetime. + // + // *file must remain live while this Table is in use. + static Status Open(const Options& options, + const EnvOptions& soptions, + unique_ptr&& file, + uint64_t file_size, + unique_ptr* table); + + bool PrefixMayMatch(const Slice& internal_prefix) override; + + Iterator* NewIterator(const ReadOptions&) override; + + Status Get( + const ReadOptions&, const Slice& key, + void* arg, + bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool), + void (*mark_key_may_exist)(void*) = nullptr) override; + + uint64_t ApproximateOffsetOf(const Slice& key) override; + + bool TEST_KeyInCache(const ReadOptions& options, const Slice& key) override; + + void SetupForCompaction() override; + + TableStats& GetTableStats() override; + + ~SimpleTable(); + + private: + struct Rep; + Rep* rep_; + + explicit SimpleTable(Rep* rep) { + rep_ = rep; + } + friend class TableCache; + friend class SimpleTableIterator; + + Status GetOffset(const Slice& target, uint64_t* offset); + + // No copying allowed + explicit SimpleTable(const Table&) = delete; + void operator=(const Table&) = delete; +}; + +// Iterator to iterate SimpleTable +class SimpleTableIterator: public Iterator { +public: + explicit SimpleTableIterator(SimpleTable* table); + ~SimpleTableIterator(); + + bool Valid() const; + + void SeekToFirst(); + + void SeekToLast(); + + void Seek(const Slice& target); + + void Next(); + + void Prev(); + + Slice key() const; + + Slice value() const; + + Status status() const; + +private: + SimpleTable* table_; + uint64_t offset_; + uint64_t next_offset_; + Slice key_; + Slice value_; + char tmp_str_[4]; + char* key_str_; + char* value_str_; + int value_str_len_; + Status status_; + // No copying allowed + SimpleTableIterator(const SimpleTableIterator&) = delete; + void operator=(const Iterator&) = delete; +}; + +struct SimpleTable::Rep { + ~Rep() { + } + Rep(const EnvOptions& storage_options, uint64_t index_start_offset, + int num_entries) : + soptions(storage_options), index_start_offset(index_start_offset), + num_entries(num_entries) { + } + + Options options; + const EnvOptions& soptions; + Status status; + unique_ptr file; + uint64_t index_start_offset; + int num_entries; + TableStats table_stats; + + const static int user_key_size = 16; + const static int offset_length = 8; + const static int key_footer_len = 8; + + static int GetInternalKeyLength() { + return user_key_size + key_footer_len; + } +}; + +SimpleTable::~SimpleTable() { + delete rep_; +} + +Status SimpleTable::Open(const Options& options, const EnvOptions& soptions, + unique_ptr && file, uint64_t size, + unique_ptr
* table) { + char footer_space[Rep::offset_length]; + Slice footer_input; + Status s = file->Read(size - Rep::offset_length, Rep::offset_length, + &footer_input, footer_space); + if (s.ok()) { + uint64_t index_start_offset = DecodeFixed64(footer_space); + + int num_entries = (size - Rep::offset_length - index_start_offset) + / (Rep::GetInternalKeyLength() + Rep::offset_length); + SimpleTable::Rep* rep = new SimpleTable::Rep(soptions, index_start_offset, + num_entries); + + + rep->file = std::move(file); + rep->options = options; + table->reset(new SimpleTable(rep)); + } + return s; +} + +void SimpleTable::SetupForCompaction() { +} + +TableStats& SimpleTable::GetTableStats() { + return rep_->table_stats; +} + +bool SimpleTable::PrefixMayMatch(const Slice& internal_prefix) { + return true; +} + +Iterator* SimpleTable::NewIterator(const ReadOptions& options) { + return new SimpleTableIterator(this); +} + +Status SimpleTable::GetOffset(const Slice& target, uint64_t* offset) { + uint32_t left = 0; + uint32_t right = rep_->num_entries - 1; + char key_chars[Rep::GetInternalKeyLength()]; + Slice tmp_slice; + + uint32_t target_offset = 0; + while (left <= right) { + uint32_t mid = (left + right + 1) / 2; + + uint64_t offset_to_read = rep_->index_start_offset + + (Rep::GetInternalKeyLength() + Rep::offset_length) * mid; + Status s = rep_->file->Read(offset_to_read, Rep::GetInternalKeyLength(), + &tmp_slice, key_chars); + if (!s.ok()) { + return s; + } + + int compare_result = rep_->options.comparator->Compare(tmp_slice, target); + + if (compare_result < 0) { + if (left == right) { + target_offset = right + 1; + break; + } + left = mid; + } else { + if (left == right) { + target_offset = left; + break; + } + right = mid - 1; + } + } + + if (target_offset >= (uint32_t) rep_->num_entries) { + *offset = rep_->index_start_offset; + return Status::OK(); + } + + char value_offset_chars[Rep::offset_length]; + + int64_t offset_for_value_offset = rep_->index_start_offset + + (Rep::GetInternalKeyLength() + Rep::offset_length) * target_offset + + Rep::GetInternalKeyLength(); + Status s = rep_->file->Read(offset_for_value_offset, Rep::offset_length, + &tmp_slice, value_offset_chars); + if (s.ok()) { + *offset = DecodeFixed64(value_offset_chars); + } + return s; +} + +Status SimpleTable::Get(const ReadOptions& options, const Slice& k, void* arg, + bool (*saver)(void*, const Slice&, const Slice&, bool), + void (*mark_key_may_exist)(void*)) { + Status s; + SimpleTableIterator* iter = new SimpleTableIterator(this); + for (iter->Seek(k); iter->Valid(); iter->Next()) { + if (!(*saver)(arg, iter->key(), iter->value(), true)) { + break; + } + } + s = iter->status(); + delete iter; + return s; +} + +bool SimpleTable::TEST_KeyInCache(const ReadOptions& options, + const Slice& key) { + return false; +} + +uint64_t SimpleTable::ApproximateOffsetOf(const Slice& key) { + return 0; +} + +SimpleTableIterator::SimpleTableIterator(SimpleTable* table) : + table_(table) { + key_str_ = new char[table->rep_->GetInternalKeyLength()]; + value_str_len_ = -1; + SeekToFirst(); +} + +SimpleTableIterator::~SimpleTableIterator() { + delete key_str_; + if (value_str_len_ >= 0) { + delete value_str_; + } +} + +bool SimpleTableIterator::Valid() const { + return offset_ < table_->rep_->index_start_offset && offset_ >= 0; +} + +void SimpleTableIterator::SeekToFirst() { + next_offset_ = 0; + Next(); +} + +void SimpleTableIterator::SeekToLast() { + assert(false); +} + +void SimpleTableIterator::Seek(const Slice& target) { + Status s = table_->GetOffset(target, &next_offset_); + if (!s.ok()) { + status_ = s; + } + Next(); +} + +void SimpleTableIterator::Next() { + offset_ = next_offset_; + if (offset_ >= table_->rep_->index_start_offset) { + return; + } + Slice result; + int internal_key_size = table_->rep_->GetInternalKeyLength(); + + Status s = table_->rep_->file->Read(next_offset_, internal_key_size, &result, + key_str_); + next_offset_ += internal_key_size; + key_ = result; + + Slice value_size_slice; + s = table_->rep_->file->Read(next_offset_, 4, &value_size_slice, tmp_str_); + next_offset_ += 4; + uint32_t value_size = DecodeFixed32(tmp_str_); + + Slice value_slice; + if ((int) value_size > value_str_len_) { + if (value_str_len_ >= 0) { + delete value_str_; + } + value_str_ = new char[value_size]; + value_str_len_ = value_size; + } + char* value_str_ = new char[value_size]; + s = table_->rep_->file->Read(next_offset_, value_size, &value_slice, + value_str_); + next_offset_ += value_size; + value_ = value_slice; +} + +void SimpleTableIterator::Prev() { + assert(false); +} + +Slice SimpleTableIterator::key() const { + Log(table_->rep_->options.info_log, "key!!!!"); + return key_; +} + +Slice SimpleTableIterator::value() const { + return value_; +} + +Status SimpleTableIterator::status() const { + return status_; +} + +class SimpleTableBuilder : public TableBuilder { + public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). The output file + // will be part of level specified by 'level'. A value of -1 means + // that the caller does not know which level the output file will reside. + SimpleTableBuilder(const Options& options, WritableFile* file, int level=-1); + + // REQUIRES: Either Finish() or Abandon() has been called. + ~SimpleTableBuilder(); + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override; + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + + private: + struct Rep; + Rep* rep_; + + // No copying allowed + SimpleTableBuilder(const SimpleTableBuilder&) = delete; + void operator=(const SimpleTableBuilder&) = delete; +}; + +struct SimpleTableBuilder::Rep { + Options options; + WritableFile* file; + uint64_t offset = 0; + Status status; + + uint64_t num_entries = 0; + + bool closed = false; // Either Finish() or Abandon() has been called. + + const static int user_key_size = 16; + const static int offset_length = 8; + const static int key_footer_len = 8; + + static int GetInternalKeyLength() { + return user_key_size + key_footer_len; + } + + std::string index; + + Rep(const Options& opt, WritableFile* f) + : options(opt), + file(f) { + } + ~Rep() { + } +}; + +SimpleTableBuilder::SimpleTableBuilder(const Options& options, + WritableFile* file, int level) + : TableBuilder(level), rep_(new SimpleTableBuilder::Rep(options, file)) { +} + +SimpleTableBuilder::~SimpleTableBuilder() { + delete(rep_); +} + +void SimpleTableBuilder::Add(const Slice& key, const Slice& value) { + assert((int) key.size() == Rep::GetInternalKeyLength()); + + // Update index + rep_->index.append(key.data(), key.size()); + PutFixed64(&(rep_->index), rep_->offset); + + // Write key-value pair + rep_->file->Append(key); + rep_->offset += Rep::GetInternalKeyLength(); + + std::string size; + int value_size = value.size(); + PutFixed32(&size, value_size); + Slice sizeSlice(size); + rep_->file->Append(sizeSlice); + rep_->file->Append(value); + rep_->offset += value_size + 4; + + rep_->num_entries++; +} + +Status SimpleTableBuilder::status() const { + return Status::OK(); +} + +Status SimpleTableBuilder::Finish() { + Rep* r = rep_; + assert(!r->closed); + r->closed = true; + + uint64_t index_offset = rep_->offset; + Slice index_slice(rep_->index); + rep_->file->Append(index_slice); + rep_->offset += index_slice.size(); + + std::string index_offset_str; + PutFixed64(&index_offset_str, index_offset); + Slice foot_slice(index_offset_str); + rep_->file->Append(foot_slice); + rep_->offset += foot_slice.size(); + + return Status::OK(); +} + +void SimpleTableBuilder::Abandon() { + rep_->closed = true; +} + +uint64_t SimpleTableBuilder::NumEntries() const { + return rep_->num_entries; +} + +uint64_t SimpleTableBuilder::FileSize() const { + return rep_->offset; +} + +class SimpleTableFactory : public TableFactory { + public: + ~SimpleTableFactory() {} + SimpleTableFactory() {} + const char* Name() const override { + return "SimpleTable"; + } + Status OpenTable(const Options& options, const EnvOptions& soptions, + unique_ptr && file, uint64_t file_size, + unique_ptr
* table) const; + + TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, + int level, const bool enable_compression) const; +}; + +Status SimpleTableFactory::OpenTable(const Options& options, + const EnvOptions& soptions, + unique_ptr && file, + uint64_t file_size, + unique_ptr
* table) const { + + return SimpleTable::Open(options, soptions, std::move(file), file_size, + table); +} + +TableBuilder* SimpleTableFactory::GetTableBuilder( + const Options& options, WritableFile* file, int level, + const bool enable_compression) const { + return new SimpleTableBuilder(options, file, level); +} + + +namespace anon { +class AtomicCounter { + private: + port::Mutex mu_; + int count_; + public: + AtomicCounter() : count_(0) { } + void Increment() { + MutexLock l(&mu_); + count_++; + } + int Read() { + MutexLock l(&mu_); + return count_; + } + void Reset() { + MutexLock l(&mu_); + count_ = 0; + } +}; + +} + +// Special Env used to delay background operations +class SpecialEnv : public EnvWrapper { + public: + // sstable Sync() calls are blocked while this pointer is non-nullptr. + port::AtomicPointer delay_sstable_sync_; + + // Simulate no-space errors while this pointer is non-nullptr. + port::AtomicPointer no_space_; + + // Simulate non-writable file system while this pointer is non-nullptr + port::AtomicPointer non_writable_; + + // Force sync of manifest files to fail while this pointer is non-nullptr + port::AtomicPointer manifest_sync_error_; + + // Force write to manifest files to fail while this pointer is non-nullptr + port::AtomicPointer manifest_write_error_; + + bool count_random_reads_; + anon::AtomicCounter random_read_counter_; + + anon::AtomicCounter sleep_counter_; + + explicit SpecialEnv(Env* base) : EnvWrapper(base) { + delay_sstable_sync_.Release_Store(nullptr); + no_space_.Release_Store(nullptr); + non_writable_.Release_Store(nullptr); + count_random_reads_ = false; + manifest_sync_error_.Release_Store(nullptr); + manifest_write_error_.Release_Store(nullptr); + } + + Status NewWritableFile(const std::string& f, unique_ptr* r, + const EnvOptions& soptions) { + class SSTableFile : public WritableFile { + private: + SpecialEnv* env_; + unique_ptr base_; + + public: + SSTableFile(SpecialEnv* env, unique_ptr&& base) + : env_(env), + base_(std::move(base)) { + } + Status Append(const Slice& data) { + if (env_->no_space_.Acquire_Load() != nullptr) { + // Drop writes on the floor + return Status::OK(); + } else { + return base_->Append(data); + } + } + Status Close() { return base_->Close(); } + Status Flush() { return base_->Flush(); } + Status Sync() { + while (env_->delay_sstable_sync_.Acquire_Load() != nullptr) { + env_->SleepForMicroseconds(100000); + } + return base_->Sync(); + } + }; + class ManifestFile : public WritableFile { + private: + SpecialEnv* env_; + unique_ptr base_; + public: + ManifestFile(SpecialEnv* env, unique_ptr&& b) + : env_(env), base_(std::move(b)) { } + Status Append(const Slice& data) { + if (env_->manifest_write_error_.Acquire_Load() != nullptr) { + return Status::IOError("simulated writer error"); + } else { + return base_->Append(data); + } + } + Status Close() { return base_->Close(); } + Status Flush() { return base_->Flush(); } + Status Sync() { + if (env_->manifest_sync_error_.Acquire_Load() != nullptr) { + return Status::IOError("simulated sync error"); + } else { + return base_->Sync(); + } + } + }; + + if (non_writable_.Acquire_Load() != nullptr) { + return Status::IOError("simulated write error"); + } + + Status s = target()->NewWritableFile(f, r, soptions); + if (s.ok()) { + if (strstr(f.c_str(), ".sst") != nullptr) { + r->reset(new SSTableFile(this, std::move(*r))); + } else if (strstr(f.c_str(), "MANIFEST") != nullptr) { + r->reset(new ManifestFile(this, std::move(*r))); + } + } + return s; + } + + Status NewRandomAccessFile(const std::string& f, + unique_ptr* r, + const EnvOptions& soptions) { + class CountingFile : public RandomAccessFile { + private: + unique_ptr target_; + anon::AtomicCounter* counter_; + public: + CountingFile(unique_ptr&& target, + anon::AtomicCounter* counter) + : target_(std::move(target)), counter_(counter) { + } + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + counter_->Increment(); + return target_->Read(offset, n, result, scratch); + } + }; + + Status s = target()->NewRandomAccessFile(f, r, soptions); + if (s.ok() && count_random_reads_) { + r->reset(new CountingFile(std::move(*r), &random_read_counter_)); + } + return s; + } + + virtual void SleepForMicroseconds(int micros) { + sleep_counter_.Increment(); + target()->SleepForMicroseconds(micros); + } +}; + +class SimpleTableDBTest { + protected: + public: + std::string dbname_; + SpecialEnv* env_; + DB* db_; + + Options last_options_; + + SimpleTableDBTest() : env_(new SpecialEnv(Env::Default())) { + dbname_ = test::TmpDir() + "/simple_table_db_test"; + ASSERT_OK(DestroyDB(dbname_, Options())); + db_ = nullptr; + Reopen(); + } + + ~SimpleTableDBTest() { + delete db_; + ASSERT_OK(DestroyDB(dbname_, Options())); + delete env_; + } + + // Return the current option configuration. + Options CurrentOptions() { + Options options; + options.table_factory.reset(new SimpleTableFactory()); + return options; + } + + DBImpl* dbfull() { + return reinterpret_cast(db_); + } + + void Reopen(Options* options = nullptr) { + ASSERT_OK(TryReopen(options)); + } + + void Close() { + delete db_; + db_ = nullptr; + } + + void DestroyAndReopen(Options* options = nullptr) { + //Destroy using last options + Destroy(&last_options_); + ASSERT_OK(TryReopen(options)); + } + + void Destroy(Options* options) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, *options)); + } + + Status PureReopen(Options* options, DB** db) { + return DB::Open(*options, dbname_, db); + } + + Status TryReopen(Options* options = nullptr) { + delete db_; + db_ = nullptr; + Options opts; + if (options != nullptr) { + opts = *options; + } else { + opts = CurrentOptions(); + opts.create_if_missing = true; + } + last_options_ = opts; + + return DB::Open(opts, dbname_, &db_); + } + + Status Put(const Slice& k, const Slice& v) { + return db_->Put(WriteOptions(), k, v); + } + + Status Delete(const std::string& k) { + return db_->Delete(WriteOptions(), k); + } + + std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + // Return a string that contains all key,value pairs in order, + // formatted like "(k1->v1)(k2->v2)". + std::string Contents() { + std::vector forward; + std::string result; + Iterator* iter = db_->NewIterator(ReadOptions()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + std::string s = IterStatus(iter); + result.push_back('('); + result.append(s); + result.push_back(')'); + forward.push_back(s); + } + + // Check reverse iteration results are the reverse of forward results + unsigned int matched = 0; + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + ASSERT_LT(matched, forward.size()); + ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]); + matched++; + } + ASSERT_EQ(matched, forward.size()); + + delete iter; + return result; + } + + std::string AllEntriesFor(const Slice& user_key) { + Iterator* iter = dbfull()->TEST_NewInternalIterator(); + InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); + iter->Seek(target.Encode()); + std::string result; + if (!iter->status().ok()) { + result = iter->status().ToString(); + } else { + result = "[ "; + bool first = true; + while (iter->Valid()) { + ParsedInternalKey ikey; + if (!ParseInternalKey(iter->key(), &ikey)) { + result += "CORRUPTED"; + } else { + if (last_options_.comparator->Compare(ikey.user_key, user_key) != 0) { + break; + } + if (!first) { + result += ", "; + } + first = false; + switch (ikey.type) { + case kTypeValue: + result += iter->value().ToString(); + break; + case kTypeMerge: + // keep it the same as kTypeValue for testing kMergePut + result += iter->value().ToString(); + break; + case kTypeDeletion: + result += "DEL"; + break; + case kTypeLogData: + assert(false); + break; + } + } + iter->Next(); + } + if (!first) { + result += " "; + } + result += "]"; + } + delete iter; + return result; + } + + int NumTableFilesAtLevel(int level) { + std::string property; + ASSERT_TRUE( + db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level), + &property)); + return atoi(property.c_str()); + } + + int TotalTableFiles() { + int result = 0; + for (int level = 0; level < db_->NumberLevels(); level++) { + result += NumTableFilesAtLevel(level); + } + return result; + } + + // Return spread of files per level + std::string FilesPerLevel() { + std::string result; + int last_non_zero_offset = 0; + for (int level = 0; level < db_->NumberLevels(); level++) { + int f = NumTableFilesAtLevel(level); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } + + int CountFiles() { + std::vector files; + env_->GetChildren(dbname_, &files); + + std::vector logfiles; + if (dbname_ != last_options_.wal_dir) { + env_->GetChildren(last_options_.wal_dir, &logfiles); + } + + return static_cast(files.size() + logfiles.size()); + } + + int CountLiveFiles() { + std::vector files; + uint64_t manifest_file_size; + db_->GetLiveFiles(files, &manifest_file_size); + return files.size(); + } + + uint64_t Size(const Slice& start, const Slice& limit) { + Range r(start, limit); + uint64_t size; + db_->GetApproximateSizes(&r, 1, &size); + return size; + } + + void Compact(const Slice& start, const Slice& limit) { + db_->CompactRange(&start, &limit); + } + + // Do n memtable compactions, each of which produces an sstable + // covering the range [small,large]. + void MakeTables(int n, const std::string& small, const std::string& large) { + for (int i = 0; i < n; i++) { + Put(small, "begin"); + Put(large, "end"); + dbfull()->TEST_FlushMemTable(); + } + } + + // Prevent pushing of new sstables into deeper levels by adding + // tables that cover a specified range to all levels. + void FillLevels(const std::string& smallest, const std::string& largest) { + MakeTables(db_->NumberLevels(), smallest, largest); + } + + void DumpFileCounts(const char* label) { + fprintf(stderr, "---\n%s:\n", label); + fprintf(stderr, "maxoverlap: %lld\n", + static_cast( + dbfull()->TEST_MaxNextLevelOverlappingBytes())); + for (int level = 0; level < db_->NumberLevels(); level++) { + int num = NumTableFilesAtLevel(level); + if (num > 0) { + fprintf(stderr, " level %3d : %d files\n", level, num); + } + } + } + + std::string DumpSSTableList() { + std::string property; + db_->GetProperty("rocksdb.sstables", &property); + return property; + } + + std::string IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; + } + + Options OptionsForLogIterTest() { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.WAL_ttl_seconds = 1000; + return options; + } + + std::unique_ptr OpenTransactionLogIter( + const SequenceNumber seq) { + unique_ptr iter; + Status status = dbfull()->GetUpdatesSince(seq, &iter); + ASSERT_OK(status); + ASSERT_TRUE(iter->Valid()); + return std::move(iter); + } + + std::string DummyString(size_t len, char c = 'a') { + return std::string(len, c); + } +}; + +TEST(SimpleTableDBTest, Empty) { + ASSERT_TRUE(db_ != nullptr); + ASSERT_EQ("NOT_FOUND", Get("0000000000000foo")); +} + +TEST(SimpleTableDBTest, ReadWrite) { + ASSERT_OK(Put("0000000000000foo", "v1")); + ASSERT_EQ("v1", Get("0000000000000foo")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("0000000000000foo", "v3")); + ASSERT_EQ("v3", Get("0000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); +} + +TEST(SimpleTableDBTest, Flush) { + ASSERT_OK(Put("0000000000000foo", "v1")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("0000000000000foo", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("0000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); +} + +TEST(SimpleTableDBTest, Flush2) { + ASSERT_OK(Put("0000000000000bar", "b")); + ASSERT_OK(Put("0000000000000foo", "v1")); + dbfull()->TEST_FlushMemTable(); + + ASSERT_OK(Put("0000000000000foo", "v2")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("0000000000000foo")); + + ASSERT_OK(Put("0000000000000eee", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("0000000000000eee")); + + ASSERT_OK(Delete("0000000000000bar")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); + + ASSERT_OK(Put("0000000000000eee", "v5")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v5", Get("0000000000000eee")); +} + +static std::string Key(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "key_______%06d", i); + return std::string(buf); +} + +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} + +TEST(SimpleTableDBTest, CompactionTrigger) { + Options options = CurrentOptions(); + options.write_buffer_size = 100<<10; //100KB + options.num_levels = 3; + options.max_mem_compaction_level = 0; + options.level0_file_num_compaction_trigger = 3; + Reopen(&options); + + Random rnd(301); + + for (int num = 0; + num < options.level0_file_num_compaction_trigger - 1; + num++) { + std::vector values; + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + } + + //generate one more file in level-0, and should trigger level-0 compaction + std::vector values; + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/table_cache.cc b/db/table_cache.cc index a51ee7fac..2482ee0f9 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -12,7 +12,7 @@ #include "db/filename.h" #include "rocksdb/statistics.h" -#include "table/table.h" +#include "rocksdb/table.h" #include "util/coding.h" #include "util/stop_watch.h" @@ -71,7 +71,9 @@ Status TableCache::FindTable(const EnvOptions& toptions, file->Hint(RandomAccessFile::RANDOM); } StopWatch sw(env_, options_->statistics, TABLE_OPEN_IO_MICROS); - s = Table::Open(*options_, toptions, std::move(file), file_size, &table); + s = options_->table_factory->OpenTable(*options_, toptions, + std::move(file), + file_size, &table); } if (!s.ok()) { @@ -134,7 +136,7 @@ Status TableCache::Get(const ReadOptions& options, if (s.ok()) { Table* t = reinterpret_cast(cache_->Value(handle)); - s = t->InternalGet(options, k, arg, saver, mark_key_may_exist); + s = t->Get(options, k, arg, saver, mark_key_may_exist); cache_->Release(handle); } else if (options.read_tier && s.IsIncomplete()) { // Couldnt find Table in cache but treat as kFound if no_io set diff --git a/db/table_cache.h b/db/table_cache.h index 0928d3b32..135b04435 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -16,7 +16,7 @@ #include "rocksdb/env.h" #include "rocksdb/cache.h" #include "port/port.h" -#include "table/table.h" +#include "rocksdb/table.h" namespace rocksdb { diff --git a/db/table_stats_collector_test.cc b/db/table_stats_collector_test.cc index 586b65d40..16387ed3a 100644 --- a/db/table_stats_collector_test.cc +++ b/db/table_stats_collector_test.cc @@ -10,9 +10,8 @@ #include "db/dbformat.h" #include "db/db_impl.h" #include "db/table_stats_collector.h" -#include "rocksdb/table_builder.h" #include "rocksdb/table_stats.h" -#include "table/table.h" +#include "rocksdb/table.h" #include "util/coding.h" #include "util/testharness.h" #include "util/testutil.h" @@ -89,7 +88,8 @@ void MakeBuilder( std::unique_ptr* builder) { writable->reset(new FakeWritableFile); builder->reset( - new TableBuilder(options, writable->get()) + options.table_factory->GetTableBuilder(options, writable->get(), 0, + true) ); } @@ -98,7 +98,7 @@ void OpenTable( const std::string& contents, std::unique_ptr
* table) { std::unique_ptr file(new FakeRandomeAccessFile(contents)); - auto s = Table::Open( + auto s = options.table_factory->OpenTable( options, EnvOptions(), std::move(file), diff --git a/db/version_set.cc b/db/version_set.cc index 5c93550cf..38d6c79f1 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -19,7 +19,7 @@ #include "db/table_cache.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" -#include "rocksdb/table_builder.h" +#include "rocksdb/table.h" #include "table/merger.h" #include "table/two_level_iterator.h" #include "util/coding.h" @@ -294,11 +294,11 @@ struct Saver { }; } -// Called from TableCache::Get and InternalGet when file/block in which key may -// exist are not there in TableCache/BlockCache respectively. In this case we -// can't guarantee that key does not exist and are not permitted to do IO to be -// certain.Set the status=kFound and value_found=false to let the caller know -// that key may exist but is not there in memory +// Called from TableCache::Get and Table::Get when file/block in which +// key may exist are not there in TableCache/BlockCache respectively. In this +// case we can't guarantee that key does not exist and are not permitted to do +// IO to be certain.Set the status=kFound and value_found=false to let the +// caller know that key may exist but is not there in memory static void MarkKeyMayExist(void* arg) { Saver* s = reinterpret_cast(arg); s->state = kFound; diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 79abacb78..c158d3f38 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -17,6 +17,9 @@ #include "rocksdb/statistics.h" #include "rocksdb/table_stats.h" #include "rocksdb/universal_compaction.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" namespace rocksdb { @@ -29,6 +32,7 @@ class MergeOperator; class Snapshot; class CompactionFilter; class CompactionFilterFactory; +class TableFactory; using std::shared_ptr; @@ -579,6 +583,11 @@ struct Options { // MemTableRep. std::shared_ptr memtable_factory; + // This is a factory that provides TableFactory objects. + // Default: a factory that provides a default implementation of + // Table and TableBuilder. + std::shared_ptr table_factory; + // This is a factory that provides compaction filter objects which allow // an application to modify/delete a key-value during background compaction. // Default: a factory that doesn't provide any object diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h new file mode 100644 index 000000000..244c91fb7 --- /dev/null +++ b/include/rocksdb/table.h @@ -0,0 +1,189 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/table_stats.h" + +namespace rocksdb { + +struct Options; +class RandomAccessFile; +struct ReadOptions; +class TableCache; +class WritableFile; + +using std::unique_ptr; + +// TableBuilder provides the interface used to build a Table +// (an immutable and sorted map from keys to values). +// +// Multiple threads can invoke const methods on a TableBuilder without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same TableBuilder must use +// external synchronization. + +class TableBuilder { + public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). The output file + // will be part of level specified by 'level'. A value of -1 means + // that the caller does not know which level the output file will reside. + // + // If enable_compression=true, this table will follow the compression + // setting given in parameter options. If enable_compression=false, the + // table will not be compressed. + explicit TableBuilder(int level = -1, const bool enable_compression = true) : + level_(level) { + } + + // REQUIRES: Either Finish() or Abandon() has been called. + virtual ~TableBuilder() {} + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + virtual void Add(const Slice& key, const Slice& value) = 0; + + // Return non-ok iff some error has been detected. + virtual Status status() const = 0; + + // Finish building the table. + // REQUIRES: Finish(), Abandon() have not been called + virtual Status Finish() = 0; + + // Indicate that the contents of this builder should be abandoned. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + virtual void Abandon() = 0; + + // Number of calls to Add() so far. + virtual uint64_t NumEntries() const = 0; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + virtual uint64_t FileSize() const = 0; + +protected: + int level_; +}; + +// A Table is a sorted map from strings to strings. Tables are +// immutable and persistent. A Table may be safely accessed from +// multiple threads without external synchronization. +class Table { + public: + virtual ~Table() {} + + // Determine whether there is a chance that the current table file + // contains the key a key starting with iternal_prefix. The specific + // table implementation can use bloom filter and/or other heuristic + // to filter out this table as a whole. + virtual bool PrefixMayMatch(const Slice& internal_prefix) = 0; + + // Returns a new iterator over the table contents. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + virtual Iterator* NewIterator(const ReadOptions&) = 0; + + // Given a key, return an approximate byte offset in the file where + // the data for that key begins (or would begin if the key were + // present in the file). The returned value is in terms of file + // bytes, and so includes effects like compression of the underlying data. + // E.g., the approximate offset of the last key in the table will + // be close to the file length. + virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0; + + // Returns true if the block for the specified key is in cache. + // REQUIRES: key is in this table. + virtual bool TEST_KeyInCache(const ReadOptions& options, + const Slice& key) = 0; + + // Set up the table for Compaction. Might change some parameters with + // posix_fadvise + virtual void SetupForCompaction() = 0; + + virtual TableStats& GetTableStats() = 0; + + // Get function issued to look for specific key. + // The table will search the first entry in the table whose user key + // matches key, and pass it to the call back function handle_result, + // with the first argument to be parameter arg, and the last bool + // parameter to be whether an I/O is issued. + // mark_key_may_exist call back is called when it is configured to be + // memory only and the key is not found in the block cache, with + // the parameter to be arg. + virtual Status Get( + const ReadOptions&, const Slice& key, + void* arg, + bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool), + void (*mark_key_may_exist)(void*) = nullptr) = 0; +}; + +struct TableStatsNames { + static const std::string kDataSize; + static const std::string kIndexSize; + static const std::string kRawKeySize; + static const std::string kRawValueSize; + static const std::string kNumDataBlocks; + static const std::string kNumEntries; + static const std::string kFilterPolicy; +}; + +// A base class for table factories +class TableFactory { + public: + virtual ~TableFactory() {} + + // The name of the comparator. + // + // The client of this package should switch to a new name whenever + // the table format implementation changes. + // + // Names starting with "rocksdb." are reserved and should not be used + // by any clients of this package. + virtual const char* Name() const = 0; + + // Returns a Table object table that can fetch data from file specified + // in parameter file. It's the caller's responsibility to make sure + // file is in the correct format. + // + // OpenTable() is called in two places: + // (1) TableCache::FindTable() calls the function when table cache miss + // and cache the table object returned. + // (1) SstFileReader (for SST Dump) opens the table and dump the table + // contents using the interator of the table. + virtual Status OpenTable(const Options& options, + const EnvOptions& soptions, + unique_ptr&& file, + uint64_t file_size, + unique_ptr
* table) const = 0; + + // Return a table builder to write to a file for this table type. + // + // It is called in several places: + // (1) When flushing memtable to a level-0 output file, it creates a table + // builder (In DBImpl::WriteLevel0Table(), by calling BuildTable()) + // (2) During compaction, it gets the builder for writing compaction output + // files in DBImpl::OpenCompactionOutputFile(). + // (3) When recovering from transaction logs, it creates a table builder to + // write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery, + // by calling BuildTable()) + // (4) When running Repairer, it creates a table builder to convert logs to + // SST files (In Repairer::ConvertLogToTable() by calling BuildTable()) + virtual TableBuilder* GetTableBuilder( + const Options& options, WritableFile* file, int level, + const bool enable_compression) const = 0; +}; +} // namespace rocksdb diff --git a/table/table.cc b/table/block_based_table.cc similarity index 87% rename from table/table.cc rename to table/block_based_table.cc index 6243ab144..3105a044f 100644 --- a/table/table.cc +++ b/table/block_based_table.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "table/table.h" +#include "table/block_based_table.h" #include "db/dbformat.h" @@ -17,11 +17,11 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/options.h" #include "rocksdb/statistics.h" +#include "rocksdb/table.h" #include "table/block.h" #include "table/filter_block.h" #include "table/format.h" -#include "table/table.h" #include "table/two_level_iterator.h" #include "util/coding.h" @@ -35,7 +35,7 @@ namespace rocksdb { // varints. const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1; -struct Table::Rep { +struct BlockBasedTable::Rep { ~Rep() { delete filter; delete [] filter_data; @@ -59,8 +59,12 @@ struct Table::Rep { TableStats table_stats; }; +BlockBasedTable::~BlockBasedTable() { + delete rep_; +} + // Helper function to setup the cache key's prefix for the Table. -void Table::SetupCacheKeyPrefix(Rep* rep) { +void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) { assert(kMaxCacheKeyPrefixSize >= 10); rep->cache_key_prefix_size = 0; if (rep->options.block_cache) { @@ -105,11 +109,11 @@ Status ReadBlock(RandomAccessFile* file, } // end of anonymous namespace -Status Table::Open(const Options& options, - const EnvOptions& soptions, - unique_ptr&& file, - uint64_t size, - unique_ptr
* table) { +Status BlockBasedTable::Open(const Options& options, + const EnvOptions& soptions, + unique_ptr && file, + uint64_t size, + unique_ptr
* table) { table->reset(); if (size < Footer::kEncodedLength) { return Status::InvalidArgument("file is too short to be an sstable"); @@ -139,7 +143,7 @@ Status Table::Open(const Options& options, if (s.ok()) { // We've successfully read the footer and the index block: we're // ready to serve requests. - Rep* rep = new Table::Rep(soptions); + BlockBasedTable::Rep* rep = new BlockBasedTable::Rep(soptions); rep->options = options; rep->file = std::move(file); rep->metaindex_handle = footer.metaindex_handle(); @@ -147,8 +151,8 @@ Status Table::Open(const Options& options, SetupCacheKeyPrefix(rep); rep->filter_data = nullptr; rep->filter = nullptr; - table->reset(new Table(rep)); - (*table)->ReadMeta(footer); + table->reset(new BlockBasedTable(rep)); + ((BlockBasedTable*) (table->get()))->ReadMeta(footer); } else { if (index_block) delete index_block; } @@ -156,7 +160,7 @@ Status Table::Open(const Options& options, return s; } -void Table::SetupForCompaction() { +void BlockBasedTable::SetupForCompaction() { switch (rep_->options.access_hint_on_compaction_start) { case Options::NONE: break; @@ -175,11 +179,11 @@ void Table::SetupForCompaction() { compaction_optimized_ = true; } -const TableStats& Table::GetTableStats() const { +TableStats& BlockBasedTable::GetTableStats() { return rep_->table_stats; } -void Table::ReadMeta(const Footer& footer) { +void BlockBasedTable::ReadMeta(const Footer& footer) { // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates // it is an empty block. // TODO: we never really verify check sum for meta index block @@ -222,7 +226,7 @@ void Table::ReadMeta(const Footer& footer) { delete meta; } -void Table::ReadFilter(const Slice& filter_handle_value) { +void BlockBasedTable::ReadFilter(const Slice& filter_handle_value) { Slice v = filter_handle_value; BlockHandle filter_handle; if (!filter_handle.DecodeFrom(&v).ok()) { @@ -230,7 +234,7 @@ void Table::ReadFilter(const Slice& filter_handle_value) { } // TODO: We might want to unify with ReadBlock() if we start - // requiring checksum verification in Table::Open. + // requiring checksum verification in BlockBasedTable::Open. ReadOptions opt; BlockContents block; if (!ReadBlockContents(rep_->file.get(), opt, filter_handle, &block, @@ -243,7 +247,7 @@ void Table::ReadFilter(const Slice& filter_handle_value) { rep_->filter = new FilterBlockReader(rep_->options, block.data); } -Status Table::ReadStats(const Slice& handle_value, Rep* rep) { +Status BlockBasedTable::ReadStats(const Slice& handle_value, Rep* rep) { Slice v = handle_value; BlockHandle handle; if (!handle.DecodeFrom(&v).ok()) { @@ -322,10 +326,6 @@ Status Table::ReadStats(const Slice& handle_value, Rep* rep) { return s; } -Table::~Table() { - delete rep_; -} - static void DeleteBlock(void* arg, void* ignored) { delete reinterpret_cast(arg); } @@ -343,13 +343,13 @@ static void ReleaseBlock(void* arg, void* h) { // Convert an index iterator value (i.e., an encoded BlockHandle) // into an iterator over the contents of the corresponding block. -Iterator* Table::BlockReader(void* arg, - const ReadOptions& options, - const Slice& index_value, - bool* didIO, - bool for_compaction) { +Iterator* BlockBasedTable::BlockReader(void* arg, + const ReadOptions& options, + const Slice& index_value, + bool* didIO, + bool for_compaction) { const bool no_io = (options.read_tier == kBlockCacheTier); - Table* table = reinterpret_cast(arg); + BlockBasedTable* table = reinterpret_cast(arg); Cache* block_cache = table->rep_->options.block_cache.get(); std::shared_ptr statistics = table->rep_->options.statistics; Block* block = nullptr; @@ -427,11 +427,11 @@ Iterator* Table::BlockReader(void* arg, return iter; } -Iterator* Table::BlockReader(void* arg, - const ReadOptions& options, - const EnvOptions& soptions, - const Slice& index_value, - bool for_compaction) { +Iterator* BlockBasedTable::BlockReader(void* arg, + const ReadOptions& options, + const EnvOptions& soptions, + const Slice& index_value, + bool for_compaction) { return BlockReader(arg, options, index_value, nullptr, for_compaction); } @@ -448,7 +448,7 @@ Iterator* Table::BlockReader(void* arg, // in memory. When blooms may need to be paged in, we should refactor so that // this is only ever called lazily. In particular, this shouldn't be called // while the DB lock is held like it is now. -bool Table::PrefixMayMatch(const Slice& internal_prefix) const { +bool BlockBasedTable::PrefixMayMatch(const Slice& internal_prefix) { FilterBlockReader* filter = rep_->filter; bool may_match = true; Status s; @@ -497,7 +497,11 @@ bool Table::PrefixMayMatch(const Slice& internal_prefix) const { return may_match; } -Iterator* Table::NewIterator(const ReadOptions& options) const { +Iterator* Table::NewIterator(const ReadOptions& options) { + return nullptr; +} + +Iterator* BlockBasedTable::NewIterator(const ReadOptions& options) { if (options.prefix) { InternalKey internal_prefix(*options.prefix, 0, kTypeValue); if (!PrefixMayMatch(internal_prefix.Encode())) { @@ -509,14 +513,15 @@ Iterator* Table::NewIterator(const ReadOptions& options) const { return NewTwoLevelIterator( rep_->index_block->NewIterator(rep_->options.comparator), - &Table::BlockReader, const_cast(this), options, rep_->soptions); + &BlockBasedTable::BlockReader, const_cast(this), + options, rep_->soptions); } -Status Table::InternalGet(const ReadOptions& options, const Slice& k, - void* arg, - bool (*saver)(void*, const Slice&, const Slice&, - bool), - void (*mark_key_may_exist)(void*)) { +Status BlockBasedTable::Get(const ReadOptions& options, const Slice& k, + void* arg, + bool (*saver)(void*, const Slice&, const Slice&, + bool), + void (*mark_key_may_exist)(void*)) { Status s; Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator); bool done = false; @@ -566,16 +571,17 @@ bool SaveDidIO(void* arg, const Slice& key, const Slice& value, bool didIO) { *reinterpret_cast(arg) = didIO; return false; } -bool Table::TEST_KeyInCache(const ReadOptions& options, const Slice& key) { - // We use InternalGet() as it has logic that checks whether we read the +bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, + const Slice& key) { + // We use Get() as it has logic that checks whether we read the // block from the disk or not. bool didIO = false; - Status s = InternalGet(options, key, &didIO, SaveDidIO); + Status s = Get(options, key, &didIO, SaveDidIO); assert(s.ok()); return !didIO; } -uint64_t Table::ApproximateOffsetOf(const Slice& key) const { +uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) { Iterator* index_iter = rep_->index_block->NewIterator(rep_->options.comparator); index_iter->Seek(key); @@ -602,8 +608,8 @@ uint64_t Table::ApproximateOffsetOf(const Slice& key) const { return result; } -const std::string Table::kFilterBlockPrefix = "filter."; -const std::string Table::kStatsBlock = "rocksdb.stats"; +const std::string BlockBasedTable::kFilterBlockPrefix = "filter."; +const std::string BlockBasedTable::kStatsBlock = "rocksdb.stats"; const std::string TableStatsNames::kDataSize = "rocksdb.data.size"; const std::string TableStatsNames::kIndexSize = "rocksdb.index.size"; diff --git a/table/table.h b/table/block_based_table.h similarity index 83% rename from table/table.h rename to table/block_based_table.h index 13dc3f32b..e1ff04940 100644 --- a/table/table.h +++ b/table/block_based_table.h @@ -13,6 +13,7 @@ #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/table_stats.h" +#include "rocksdb/table.h" namespace rocksdb { @@ -23,13 +24,14 @@ struct Options; class RandomAccessFile; struct ReadOptions; class TableCache; +class Table; using std::unique_ptr; // A Table is a sorted map from strings to strings. Tables are // immutable and persistent. A Table may be safely accessed from // multiple threads without external synchronization. -class Table { +class BlockBasedTable : public Table { public: static const std::string kFilterBlockPrefix; static const std::string kStatsBlock; @@ -52,14 +54,18 @@ class Table { uint64_t file_size, unique_ptr
* table); - ~Table(); - - bool PrefixMayMatch(const Slice& internal_prefix) const; + bool PrefixMayMatch(const Slice& internal_prefix) override; // Returns a new iterator over the table contents. // The result of NewIterator() is initially invalid (caller must // call one of the Seek methods on the iterator before using it). - Iterator* NewIterator(const ReadOptions&) const; + Iterator* NewIterator(const ReadOptions&) override; + + Status Get( + const ReadOptions&, const Slice& key, + void* arg, + bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool), + void (*mark_key_may_exist)(void*) = nullptr) override; // Given a key, return an approximate byte offset in the file where // the data for that key begins (or would begin if the key were @@ -67,24 +73,25 @@ class Table { // bytes, and so includes effects like compression of the underlying data. // E.g., the approximate offset of the last key in the table will // be close to the file length. - uint64_t ApproximateOffsetOf(const Slice& key) const; + uint64_t ApproximateOffsetOf(const Slice& key) override; // Returns true if the block for the specified key is in cache. // REQUIRES: key is in this table. - bool TEST_KeyInCache(const ReadOptions& options, const Slice& key); + bool TEST_KeyInCache(const ReadOptions& options, const Slice& key) override; // Set up the table for Compaction. Might change some parameters with // posix_fadvise - void SetupForCompaction(); + void SetupForCompaction() override; + + TableStats& GetTableStats() override; - const TableStats& GetTableStats() const; + ~BlockBasedTable(); private: struct Rep; Rep* rep_; bool compaction_optimized_; - explicit Table(Rep* rep) : compaction_optimized_(false) { rep_ = rep; } static Iterator* BlockReader(void*, const ReadOptions&, const EnvOptions& soptions, const Slice&, bool for_compaction); @@ -95,12 +102,6 @@ class Table { // after a call to Seek(key), until handle_result returns false. // May not make such a call if filter policy says that key is not present. friend class TableCache; - Status InternalGet( - const ReadOptions&, const Slice& key, - void* arg, - bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool), - void (*mark_key_may_exist)(void*) = nullptr); - void ReadMeta(const Footer& footer); void ReadFilter(const Slice& filter_handle_value); @@ -108,19 +109,14 @@ class Table { static void SetupCacheKeyPrefix(Rep* rep); - // No copying allowed - Table(const Table&); - void operator=(const Table&); -}; + explicit BlockBasedTable(Rep* rep) : + compaction_optimized_(false) { + rep_ = rep; + } -struct TableStatsNames { - static const std::string kDataSize; - static const std::string kIndexSize; - static const std::string kRawKeySize; - static const std::string kRawValueSize; - static const std::string kNumDataBlocks; - static const std::string kNumEntries; - static const std::string kFilterPolicy; + // No copying allowed + explicit BlockBasedTable(const Table&) = delete; + void operator=(const Table&) = delete; }; } // namespace rocksdb diff --git a/table/table_builder.cc b/table/block_based_table_builder.cc similarity index 89% rename from table/table_builder.cc rename to table/block_based_table_builder.cc index a9dd21ff6..1b4db69f2 100644 --- a/table/table_builder.cc +++ b/table/block_based_table_builder.cc @@ -7,19 +7,20 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "rocksdb/table_builder.h" +#include "table/block_based_table_builder.h" #include #include #include "rocksdb/comparator.h" +#include "rocksdb/table.h" #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" #include "rocksdb/options.h" +#include "table/block_based_table.h" #include "table/block_builder.h" #include "table/filter_block.h" #include "table/format.h" -#include "table/table.h" #include "util/coding.h" #include "util/crc32c.h" #include "util/stop_watch.h" @@ -71,7 +72,7 @@ void LogStatsCollectionError( } // anonymous namespace -struct TableBuilder::Rep { +struct BlockBasedTableBuilder::Rep { Options options; Options index_block_options; WritableFile* file; @@ -119,37 +120,22 @@ struct TableBuilder::Rep { } }; -TableBuilder::TableBuilder(const Options& options, WritableFile* file, - int level, const bool enable_compression) - : rep_(new Rep(options, file, enable_compression)), level_(level) { +BlockBasedTableBuilder::BlockBasedTableBuilder(const Options& options, + WritableFile* file, int level, + const bool enable_compression) + : TableBuilder(level), rep_(new Rep(options, file, enable_compression)) { if (rep_->filter_block != nullptr) { rep_->filter_block->StartBlock(0); } } -TableBuilder::~TableBuilder() { +BlockBasedTableBuilder::~BlockBasedTableBuilder() { assert(rep_->closed); // Catch errors where caller forgot to call Finish() delete rep_->filter_block; delete rep_; } -Status TableBuilder::ChangeOptions(const Options& options) { - // Note: if more fields are added to Options, update - // this function to catch changes that should not be allowed to - // change in the middle of building a Table. - if (options.comparator != rep_->options.comparator) { - return Status::InvalidArgument("changing comparator while building table"); - } - - // Note that any live BlockBuilders point to rep_->options and therefore - // will automatically pick up the updated options. - rep_->options = options; - rep_->index_block_options = options; - rep_->index_block_options.block_restart_interval = 1; - return Status::OK(); -} - -void TableBuilder::Add(const Slice& key, const Slice& value) { +void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { Rep* r = rep_; assert(!r->closed); if (!ok()) return; @@ -204,7 +190,7 @@ void TableBuilder::Add(const Slice& key, const Slice& value) { } } -void TableBuilder::Flush() { +void BlockBasedTableBuilder::Flush() { Rep* r = rep_; assert(!r->closed); if (!ok()) return; @@ -222,7 +208,8 @@ void TableBuilder::Flush() { ++r->num_data_blocks; } -void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) { +void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block, + BlockHandle* handle) { // File format contains a sequence of blocks where each block has: // block_data: uint8[n] // type: uint8 @@ -302,9 +289,9 @@ void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) { block->Reset(); } -void TableBuilder::WriteRawBlock(const Slice& block_contents, - CompressionType type, - BlockHandle* handle) { +void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, + CompressionType type, + BlockHandle* handle) { Rep* r = rep_; StopWatch sw(r->options.env, r->options.statistics, WRITE_RAW_BLOCK_MICROS); handle->set_offset(r->offset); @@ -323,11 +310,11 @@ void TableBuilder::WriteRawBlock(const Slice& block_contents, } } -Status TableBuilder::status() const { +Status BlockBasedTableBuilder::status() const { return rep_->status; } -Status TableBuilder::Finish() { +Status BlockBasedTableBuilder::Finish() { Rep* r = rep_; Flush(); assert(!r->closed); @@ -370,7 +357,7 @@ Status TableBuilder::Finish() { if (r->filter_block != nullptr) { // Add mapping from ".Name" to location // of filter data. - std::string key = Table::kFilterBlockPrefix; + std::string key = BlockBasedTable::kFilterBlockPrefix; key.append(r->options.filter_policy->Name()); std::string handle_encoding; filter_block_handle.EncodeTo(&handle_encoding); @@ -435,7 +422,7 @@ Status TableBuilder::Finish() { std::string handle_encoding; stats_block_handle.EncodeTo(&handle_encoding); meta_block_handles.insert( - std::make_pair(Table::kStatsBlock, handle_encoding) + std::make_pair(BlockBasedTable::kStatsBlock, handle_encoding) ); } // end of stats block writing @@ -466,17 +453,17 @@ Status TableBuilder::Finish() { return r->status; } -void TableBuilder::Abandon() { +void BlockBasedTableBuilder::Abandon() { Rep* r = rep_; assert(!r->closed); r->closed = true; } -uint64_t TableBuilder::NumEntries() const { +uint64_t BlockBasedTableBuilder::NumEntries() const { return rep_->num_entries; } -uint64_t TableBuilder::FileSize() const { +uint64_t BlockBasedTableBuilder::FileSize() const { return rep_->offset; } diff --git a/include/rocksdb/table_builder.h b/table/block_based_table_builder.h similarity index 59% rename from include/rocksdb/table_builder.h rename to table/block_based_table_builder.h index da0e07570..b551e49a0 100644 --- a/include/rocksdb/table_builder.h +++ b/table/block_based_table_builder.h @@ -1,18 +1,13 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// TableBuilder provides the interface used to build a Table -// (an immutable and sorted map from keys to values). -// -// Multiple threads can invoke const methods on a TableBuilder without -// external synchronization, but if any of the threads may call a -// non-const method, all threads accessing the same TableBuilder must use -// external synchronization. - -#ifndef STORAGE_ROCKSDB_INCLUDE_TABLE_BUILDER_H_ -#define STORAGE_ROCKSDB_INCLUDE_TABLE_BUILDER_H_ +#pragma once #include #include "rocksdb/options.h" #include "rocksdb/status.h" @@ -22,64 +17,48 @@ namespace rocksdb { class BlockBuilder; class BlockHandle; class WritableFile; +class TableBuilder; + -class TableBuilder { +class BlockBasedTableBuilder : public TableBuilder { public: // Create a builder that will store the contents of the table it is // building in *file. Does not close the file. It is up to the // caller to close the file after calling Finish(). The output file // will be part of level specified by 'level'. A value of -1 means // that the caller does not know which level the output file will reside. - // - // If enable_compression=true, this table will follow the compression - // setting given in parameter options. If enable_compression=false, the - // table will not be compressed. - TableBuilder(const Options& options, WritableFile* file, int level=-1, - const bool enable_compression=true); + BlockBasedTableBuilder(const Options& options, WritableFile* file, + int level = -1, const bool enable_compression = true); // REQUIRES: Either Finish() or Abandon() has been called. - ~TableBuilder(); - - // Change the options used by this builder. Note: only some of the - // option fields can be changed after construction. If a field is - // not allowed to change dynamically and its value in the structure - // passed to the constructor is different from its value in the - // structure passed to this method, this method will return an error - // without changing any fields. - Status ChangeOptions(const Options& options); + ~BlockBasedTableBuilder(); // Add key,value to the table being constructed. // REQUIRES: key is after any previously added key according to comparator. // REQUIRES: Finish(), Abandon() have not been called - void Add(const Slice& key, const Slice& value); - - // Advanced operation: flush any buffered key/value pairs to file. - // Can be used to ensure that two adjacent entries never live in - // the same data block. Most clients should not need to use this method. - // REQUIRES: Finish(), Abandon() have not been called - void Flush(); + void Add(const Slice& key, const Slice& value) override; // Return non-ok iff some error has been detected. - Status status() const; + Status status() const override; // Finish building the table. Stops using the file passed to the // constructor after this function returns. // REQUIRES: Finish(), Abandon() have not been called - Status Finish(); + Status Finish() override; // Indicate that the contents of this builder should be abandoned. Stops // using the file passed to the constructor after this function returns. // If the caller is not going to call Finish(), it must call Abandon() // before destroying this builder. // REQUIRES: Finish(), Abandon() have not been called - void Abandon(); + void Abandon() override; // Number of calls to Add() so far. - uint64_t NumEntries() const; + uint64_t NumEntries() const override; // Size of the file generated so far. If invoked after a successful // Finish() call, returns the size of the final generated file. - uint64_t FileSize() const; + uint64_t FileSize() const override; private: bool ok() const { return status().ok(); } @@ -88,13 +67,17 @@ class TableBuilder { struct Rep; Rep* rep_; - int level_; + + // Advanced operation: flush any buffered key/value pairs to file. + // Can be used to ensure that two adjacent entries never live in + // the same data block. Most clients should not need to use this method. + // REQUIRES: Finish(), Abandon() have not been called + void Flush(); // No copying allowed - TableBuilder(const TableBuilder&); - void operator=(const TableBuilder&); + BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete; + void operator=(const BlockBasedTableBuilder&) = delete; }; } // namespace rocksdb -#endif // STORAGE_ROCKSDB_INCLUDE_TABLE_BUILDER_H_ diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc new file mode 100644 index 000000000..c2c3a79ba --- /dev/null +++ b/table/block_based_table_factory.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + + +#include "table/block_based_table_factory.h" + +#include +#include +#include "table/block_based_table_builder.h" +#include "table/block_based_table.h" +#include "port/port.h" + +namespace rocksdb { + +Status BlockBasedTableFactory::OpenTable(const Options& options, + const EnvOptions& soptions, + unique_ptr && file, + uint64_t file_size, + unique_ptr
* table) const { + + return BlockBasedTable::Open(options, soptions, std::move(file), file_size, + table); +} + +TableBuilder* BlockBasedTableFactory::GetTableBuilder( + const Options& options, WritableFile* file, int level, + const bool enable_compression) const { + return new BlockBasedTableBuilder(options, file, level, enable_compression); +} +} // namespace rocksdb diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h new file mode 100644 index 000000000..fa2cafbc0 --- /dev/null +++ b/table/block_based_table_factory.h @@ -0,0 +1,48 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include + +#include "rocksdb/table.h" + +namespace rocksdb { + +struct Options; +struct EnvOptions; + +using std::unique_ptr; +class Status; +class RandomAccessFile; +class WritableFile; +class Table; +class TableBuilder; +class BlockBasedTable; +class BlockBasedTableBuilder; + +class BlockBasedTableFactory: public TableFactory { +public: + ~BlockBasedTableFactory() { + } + BlockBasedTableFactory() { + } + const char* Name() const override { + return "BlockBasedTable"; + } + Status OpenTable(const Options& options, const EnvOptions& soptions, + unique_ptr && file, uint64_t file_size, + unique_ptr
* table) const override; + + TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, + int level, const bool enable_compression) const + override; +}; + +} // namespace rocksdb diff --git a/table/block_builder.cc b/table/block_builder.cc index 592d7136c..0382dfc24 100644 --- a/table/block_builder.cc +++ b/table/block_builder.cc @@ -36,7 +36,6 @@ #include #include #include "rocksdb/comparator.h" -#include "rocksdb/table_builder.h" #include "util/coding.h" namespace rocksdb { diff --git a/table/block_test.cc b/table/block_test.cc index eb141160c..f1b16faff 100644 --- a/table/block_test.cc +++ b/table/block_test.cc @@ -10,10 +10,9 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" -#include "rocksdb/table_builder.h" +#include "rocksdb/table.h" #include "table/block.h" #include "table/block_builder.h" -#include "table/table.h" #include "table/format.h" #include "util/random.h" #include "util/testharness.h" diff --git a/table/format.h b/table/format.h index 91e74638f..82f86884e 100644 --- a/table/format.h +++ b/table/format.h @@ -12,7 +12,7 @@ #include #include "rocksdb/slice.h" #include "rocksdb/status.h" -#include "rocksdb/table_builder.h" +#include "rocksdb/table.h" namespace rocksdb { diff --git a/table/table_test.cc b/table/table_test.cc index 5ed8c7070..161edd0b1 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -17,12 +17,13 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" -#include "rocksdb/table_builder.h" +#include "rocksdb/table.h" #include "rocksdb/memtablerep.h" #include "table/block.h" #include "table/block_builder.h" #include "table/format.h" -#include "table/table.h" +#include "table/block_based_table.h" +#include "table/block_based_table_builder.h" #include "util/random.h" #include "util/testharness.h" #include "util/testutil.h" @@ -237,19 +238,19 @@ class BlockConstructor: public Constructor { BlockConstructor(); }; -class TableConstructor: public Constructor { +class BlockBasedTableConstructor: public Constructor { public: - explicit TableConstructor( + explicit BlockBasedTableConstructor( const Comparator* cmp) : Constructor(cmp) { } - ~TableConstructor() { + ~BlockBasedTableConstructor() { Reset(); } virtual Status FinishImpl(const Options& options, const KVMap& data) { Reset(); sink_.reset(new StringSink()); - TableBuilder builder(options, sink_.get()); + BlockBasedTableBuilder builder(options, sink_.get()); for (KVMap::const_iterator it = data.begin(); it != data.end(); @@ -265,8 +266,11 @@ class TableConstructor: public Constructor { // Open the table uniq_id_ = cur_uniq_id_++; source_.reset(new StringSource(sink_->contents(), uniq_id_)); - return Table::Open(options, soptions, std::move(source_), - sink_->contents().size(), &table_); + unique_ptr table_factory; + return options.table_factory->OpenTable(options, soptions, + std::move(source_), + sink_->contents().size(), + &table_); } virtual Iterator* NewIterator() const { @@ -279,8 +283,10 @@ class TableConstructor: public Constructor { virtual Status Reopen(const Options& options) { source_.reset(new StringSource(sink_->contents(), uniq_id_)); - return Table::Open(options, soptions, std::move(source_), - sink_->contents().size(), &table_); + return options.table_factory->OpenTable(options, soptions, + std::move(source_), + sink_->contents().size(), + &table_); } virtual Table* table() { @@ -300,12 +306,12 @@ class TableConstructor: public Constructor { unique_ptr source_; unique_ptr
table_; - TableConstructor(); + BlockBasedTableConstructor(); static uint64_t cur_uniq_id_; const EnvOptions soptions; }; -uint64_t TableConstructor::cur_uniq_id_ = 1; +uint64_t BlockBasedTableConstructor::cur_uniq_id_ = 1; // A helper class that converts internal format keys into user keys class KeyConvertingIterator: public Iterator { @@ -533,7 +539,7 @@ class Harness { } switch (args.type) { case TABLE_TEST: - constructor_ = new TableConstructor(options_.comparator); + constructor_ = new BlockBasedTableConstructor(options_.comparator); break; case BLOCK_TEST: constructor_ = new BlockConstructor(options_.comparator); @@ -857,7 +863,7 @@ class TableTest { }; // This test include all the basic checks except those for index size and block // size, which will be conducted in separated unit tests. TEST(TableTest, BasicTableStats) { - TableConstructor c(BytewiseComparator()); + BlockBasedTableConstructor c(BytewiseComparator()); c.Add("a1", "val1"); c.Add("b2", "val2"); @@ -901,7 +907,7 @@ TEST(TableTest, BasicTableStats) { } TEST(TableTest, FilterPolicyNameStats) { - TableConstructor c(BytewiseComparator()); + BlockBasedTableConstructor c(BytewiseComparator()); c.Add("a1", "val1"); std::vector keys; KVMap kvmap; @@ -941,7 +947,7 @@ TEST(TableTest, IndexSizeStat) { // Each time we load one more key to the table. the table index block // size is expected to be larger than last time's. for (size_t i = 1; i < keys.size(); ++i) { - TableConstructor c(BytewiseComparator()); + BlockBasedTableConstructor c(BytewiseComparator()); for (size_t j = 0; j < i; ++j) { c.Add(keys[j], "val"); } @@ -962,7 +968,7 @@ TEST(TableTest, IndexSizeStat) { TEST(TableTest, NumBlockStat) { Random rnd(test::RandomSeed()); - TableConstructor c(BytewiseComparator()); + BlockBasedTableConstructor c(BytewiseComparator()); Options options; options.compression = kNoCompression; options.block_restart_interval = 1; @@ -984,7 +990,7 @@ TEST(TableTest, NumBlockStat) { } TEST(TableTest, ApproximateOffsetOfPlain) { - TableConstructor c(BytewiseComparator()); + BlockBasedTableConstructor c(BytewiseComparator()); c.Add("k01", "hello"); c.Add("k02", "hello2"); c.Add("k03", std::string(10000, 'x')); @@ -1015,7 +1021,7 @@ TEST(TableTest, ApproximateOffsetOfPlain) { static void Do_Compression_Test(CompressionType comp) { Random rnd(301); - TableConstructor c(BytewiseComparator()); + BlockBasedTableConstructor c(BytewiseComparator()); std::string tmp; c.Add("k01", "hello"); c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); @@ -1071,7 +1077,7 @@ TEST(TableTest, BlockCacheLeak) { opt.block_cache = NewLRUCache(16*1024*1024); // big enough so we don't ever // lose cached values. - TableConstructor c(BytewiseComparator()); + BlockBasedTableConstructor c(BytewiseComparator()); c.Add("k01", "hello"); c.Add("k02", "hello2"); c.Add("k03", std::string(10000, 'x')); diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index a5c1e592e..965aa65a1 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -9,9 +9,10 @@ #include "table/two_level_iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" #include "table/block.h" #include "table/format.h" -#include "table/table.h" #include "table/iterator_wrapper.h" namespace rocksdb { diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc index c2a69af7b..6ae059201 100644 --- a/tools/sst_dump.cc +++ b/tools/sst_dump.cc @@ -3,7 +3,6 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. // -#include "table/table.h" #include #include @@ -15,7 +14,7 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" -#include "rocksdb/table_builder.h" +#include "rocksdb/table.h" #include "table/block.h" #include "table/block_builder.h" #include "table/format.h" @@ -76,7 +75,10 @@ Status SstFileReader::ReadSequential(bool print_kv, } uint64_t file_size; table_options.env->GetFileSize(file_name_, &file_size); - s = Table::Open(table_options, soptions_, std::move(file), file_size, &table); + unique_ptr table_factory; + s = table_options.table_factory->OpenTable(table_options, soptions_, + std::move(file), file_size, + &table); if(!s.ok()) { return s; } diff --git a/util/options.cc b/util/options.cc index 795b08d77..34552615b 100644 --- a/util/options.cc +++ b/util/options.cc @@ -17,6 +17,7 @@ #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" #include "rocksdb/merge_operator.h" +#include "table/block_based_table_factory.h" namespace rocksdb { @@ -91,6 +92,8 @@ Options::Options() filter_deletes(false), max_sequential_skip_in_iterations(8), memtable_factory(std::shared_ptr(new SkipListFactory)), + table_factory( + std::shared_ptr(new BlockBasedTableFactory())), compaction_filter_factory( std::shared_ptr( new DefaultCompactionFilterFactory())), @@ -114,6 +117,7 @@ Options::Dump(Logger* log) const compaction_filter_factory->Name()); Log(log," Options.memtable_factory: %s", memtable_factory->Name()); + Log(log," Options.table_factory: %s", table_factory->Name()); Log(log," Options.error_if_exists: %d", error_if_exists); Log(log," Options.create_if_missing: %d", create_if_missing); Log(log," Options.paranoid_checks: %d", paranoid_checks);