diff --git a/HISTORY.md b/HISTORY.md index f7fac7b0d..0a69d12da 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,8 +6,12 @@ executed in high priority thread pool. ## Unreleased (will be relased in 2.8) -* By default, checksums are verified on every read from database +## Unreleased + +### Public API changes +* Removed arena.h from public header files. +* By default, checksums are verified on every read from database ## 2.7.0 (01/28/2014) diff --git a/Makefile b/Makefile index 99892b761..c7cac9249 100644 --- a/Makefile +++ b/Makefile @@ -6,11 +6,7 @@ INSTALL_PATH ?= $(CURDIR) #----------------------------------------------- -# Uncomment exactly one of the lines labelled (A), (B), and (C) below -# to switch between compilation modes. - -# OPT ?= -DNDEBUG # (A) Production use (optimized mode) -OPT += -O2 -fno-omit-frame-pointer -momit-leaf-frame-pointer +OPT += -fno-omit-frame-pointer -momit-leaf-frame-pointer #----------------------------------------------- # detect what platform we're building on @@ -57,6 +53,7 @@ TESTS = \ auto_roll_logger_test \ block_test \ bloom_test \ + dynamic_bloom_test \ c_test \ cache_test \ coding_test \ @@ -75,6 +72,7 @@ TESTS = \ merge_test \ redis_test \ reduce_levels_test \ + plain_table_db_test \ simple_table_db_test \ skiplist_test \ stringappend_test \ @@ -93,6 +91,7 @@ TOOLS = \ db_repl_stress \ blob_store_bench + PROGRAMS = db_bench signal_test $(TESTS) $(TOOLS) BENCHMARKS = db_bench_sqlite3 db_bench_tree_db table_reader_bench @@ -143,11 +142,11 @@ all: $(LIBRARY) $(PROGRAMS) # Will also generate shared libraries. release: $(MAKE) clean - OPT=-DNDEBUG $(MAKE) all -j32 + OPT="-DNDEBUG -O2" $(MAKE) all -j32 coverage: $(MAKE) clean - COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) all check + COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) all check -j32 (cd coverage; ./coverage_test.sh) # Delete intermediate files find . 
-type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; @@ -248,6 +247,9 @@ table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJEC bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +dynamic_bloom_test: util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) @@ -278,11 +280,14 @@ crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + $(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh index ceae38192..2d6062009 100755 --- a/build_tools/format-diff.sh +++ b/build_tools/format-diff.sh @@ -47,7 +47,6 @@ fi # ln -s `git rev-parse 
--show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH # fi # fi - set -e uncommitted_code=`git diff HEAD` @@ -55,7 +54,6 @@ uncommitted_code=`git diff HEAD` # If there's no uncommitted changes, we assume user are doing post-commit # format check, in which case we'll check the modified lines from latest commit. # Otherwise, we'll check format of the uncommitted code only. -format_last_commit=0 if [ -z "$uncommitted_code" ] then # Check the format of last commit diff --git a/coverage/coverage_test.sh b/coverage/coverage_test.sh index 7a8b5e0fe..08dbd05a5 100755 --- a/coverage/coverage_test.sh +++ b/coverage/coverage_test.sh @@ -44,6 +44,11 @@ $GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null | tee -a $RECENT_REPORT && echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n" +# Unless otherwise specified, we'll not generate html report by default +if [ -z "$HTML" ]; then + exit 0 +fi + # Generate the html report. If we cannot find lcov in this machine, we'll simply # skip this step. echo "Generating the html coverage report..." 
diff --git a/db/builder.cc b/db/builder.cc index 61671db0d..08e76b539 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -9,16 +9,16 @@ #include "db/builder.h" -#include "db/filename.h" #include "db/dbformat.h" +#include "db/filename.h" #include "db/merge_helper.h" #include "db/table_cache.h" #include "db/version_edit.h" #include "rocksdb/db.h" -#include "rocksdb/table.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" +#include "rocksdb/table.h" #include "table/block_based_table_builder.h" #include "util/stop_watch.h" @@ -26,20 +26,18 @@ namespace rocksdb { class TableFactory; -TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, +TableBuilder* NewTableBuilder(const Options& options, + const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) { - return options.table_factory->GetTableBuilder(options, file, - compression_type); + return options.table_factory->NewTableBuilder(options, internal_comparator, + file, compression_type); } -Status BuildTable(const std::string& dbname, - Env* env, - const Options& options, - const EnvOptions& soptions, - TableCache* table_cache, - Iterator* iter, - FileMetaData* meta, - const Comparator* user_comparator, +Status BuildTable(const std::string& dbname, Env* env, const Options& options, + const EnvOptions& soptions, TableCache* table_cache, + Iterator* iter, FileMetaData* meta, + const InternalKeyComparator& internal_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, const CompressionType compression) { @@ -64,8 +62,8 @@ Status BuildTable(const std::string& dbname, return s; } - TableBuilder* builder = GetTableBuilder(options, file.get(), - compression); + TableBuilder* builder = + NewTableBuilder(options, internal_comparator, file.get(), compression); // the first key is the smallest key Slice key = iter->key(); @@ -73,8 +71,8 @@ Status BuildTable(const std::string& dbname, 
meta->smallest_seqno = GetInternalKeySeqno(key); meta->largest_seqno = meta->smallest_seqno; - MergeHelper merge(user_comparator, options.merge_operator.get(), - options.info_log.get(), + MergeHelper merge(internal_comparator.user_comparator(), + options.merge_operator.get(), options.info_log.get(), true /* internal key corruption is not ok */); if (purge) { @@ -103,8 +101,8 @@ Status BuildTable(const std::string& dbname, // If the key is the same as the previous key (and it is not the // first key), then we skip it, since it is an older version. // Otherwise we output the key and mark it as the "new" previous key. - if (!is_first_key && !user_comparator->Compare(prev_ikey.user_key, - this_ikey.user_key)) { + if (!is_first_key && !internal_comparator.user_comparator()->Compare( + prev_ikey.user_key, this_ikey.user_key)) { // seqno within the same key are in decreasing order assert(this_ikey.sequence < prev_ikey.sequence); } else { @@ -202,10 +200,8 @@ Status BuildTable(const std::string& dbname, if (s.ok()) { // Verify that the table is usable - Iterator* it = table_cache->NewIterator(ReadOptions(), - soptions, - meta->number, - meta->file_size); + Iterator* it = table_cache->NewIterator(ReadOptions(), soptions, + internal_comparator, *meta); s = it->status(); delete it; } diff --git a/db/builder.h b/db/builder.h index 2600dc24b..630162968 100644 --- a/db/builder.h +++ b/db/builder.h @@ -24,23 +24,20 @@ class VersionEdit; class TableBuilder; class WritableFile; - -extern TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, - CompressionType compression_type); +extern TableBuilder* NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type); // Build a Table file from the contents of *iter. The generated file // will be named according to meta->number. On success, the rest of // *meta will be filled with metadata about the generated table. 
// If no data is present in *iter, meta->file_size will be set to // zero, and no Table file will be produced. -extern Status BuildTable(const std::string& dbname, - Env* env, - const Options& options, - const EnvOptions& soptions, - TableCache* table_cache, - Iterator* iter, +extern Status BuildTable(const std::string& dbname, Env* env, + const Options& options, const EnvOptions& soptions, + TableCache* table_cache, Iterator* iter, FileMetaData* meta, - const Comparator* user_comparator, + const InternalKeyComparator& internal_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, const CompressionType compression); diff --git a/db/column_family.cc b/db/column_family.cc index 6f396f29f..ba8bd643f 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -17,6 +17,7 @@ #include "db/internal_stats.h" #include "db/compaction_picker.h" #include "db/table_properties_collector.h" +#include "util/autovector.h" #include "util/hash_skiplist_rep.h" namespace rocksdb { @@ -184,7 +185,7 @@ ColumnFamilyData::~ColumnFamilyData() { if (mem_ != nullptr) { delete mem_->Unref(); } - std::vector to_delete; + autovector to_delete; imm_.current()->Unref(&to_delete); for (MemTable* m : to_delete) { delete m; diff --git a/db/column_family.h b/db/column_family.h index 0aa97699a..999433add 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -16,7 +16,7 @@ #include "rocksdb/options.h" #include "rocksdb/env.h" -#include "db/memtablelist.h" +#include "db/memtable_list.h" #include "db/write_batch_internal.h" #include "db/table_cache.h" @@ -40,7 +40,7 @@ struct SuperVersion { // We need to_delete because during Cleanup(), imm->Unref() returns // all memtables that we need to free through this vector. 
We then // delete all those memtables outside of mutex, during destruction - std::vector to_delete; + autovector to_delete; // should be called outside the mutex SuperVersion(); diff --git a/db/db_bench.cc b/db/db_bench.cc index 8355a3f0c..bdf842375 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -24,6 +24,7 @@ #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "rocksdb/statistics.h" +#include "rocksdb/perf_context.h" #include "port/port.h" #include "util/bit_set.h" #include "util/crc32c.h" @@ -389,6 +390,8 @@ DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when " DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when" " this is greater than 0."); +DEFINE_int32(perf_level, 0, "Level of perf collection"); + static bool ValidateRateLimit(const char* flagname, double value) { static constexpr double EPSILON = 1e-10; if ( value < -EPSILON ) { @@ -728,6 +731,7 @@ struct SharedState { port::Mutex mu; port::CondVar cv; int total; + int perf_level; // Each thread goes through the following states: // (1) initializing @@ -739,7 +743,7 @@ struct SharedState { long num_done; bool start; - SharedState() : cv(&mu) { } + SharedState() : cv(&mu), perf_level(FLAGS_perf_level) { } }; // Per-thread state for concurrent executions of the same benchmark. 
@@ -847,6 +851,7 @@ class Benchmark { fprintf(stdout, "Memtablerep: vector\n"); break; } + fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level); PrintWarnings(); fprintf(stdout, "------------------------------------------------\n"); @@ -1202,6 +1207,7 @@ class Benchmark { } } + SetPerfLevel(static_cast (shared->perf_level)); thread->stats.Start(thread->tid); (arg->bm->*(arg->method))(thread); thread->stats.Stop(); diff --git a/db/db_impl.cc b/db/db_impl.cc index 91e327a8b..5e00b42a2 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -22,13 +22,13 @@ #include #include "db/builder.h" -#include "db/dbformat.h" #include "db/db_iter.h" +#include "db/dbformat.h" #include "db/filename.h" #include "db/log_reader.h" #include "db/log_writer.h" #include "db/memtable.h" -#include "db/memtablelist.h" +#include "db/memtable_list.h" #include "db/merge_context.h" #include "db/merge_helper.h" #include "db/prefix_filter_iterator.h" @@ -48,12 +48,13 @@ #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "port/port.h" #include "table/block.h" #include "table/block_based_table_factory.h" #include "table/merger.h" +#include "table/table_builder.h" #include "table/two_level_iterator.h" #include "util/auto_roll_logger.h" +#include "util/autovector.h" #include "util/build_version.h" #include "util/coding.h" #include "util/hash_skiplist_rep.h" @@ -61,13 +62,12 @@ #include "util/mutexlock.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" -#include "util/autovector.h" namespace rocksdb { const std::string default_column_family_name("default"); -void dumpLeveldbBuildVersion(Logger * log); +void DumpLeveldbBuildVersion(Logger * log); // Information kept for every waiting writer struct DBImpl::Writer { @@ -141,7 +141,10 @@ Options SanitizeOptions(const std::string& dbname, DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { DBOptions result = src; - ClipToRange(&result.max_open_files, 20, 1000000); + // 
result.max_open_files means an "infinite" open files. + if (result.max_open_files != -1) { + ClipToRange(&result.max_open_files, 20, 1000000); + } if (result.max_background_flushes == 0) { result.max_background_flushes = 1; } @@ -210,10 +213,6 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) : env_(options.env), dbname_(dbname), options_(SanitizeOptions(dbname, options)), - // Reserve ten files or so for other uses and give the rest to TableCache. - table_cache_(NewLRUCache(options_.max_open_files - 10, - options_.table_cache_numshardbits, - options_.table_cache_remove_scan_count_limit)), db_lock_(nullptr), mutex_(options.use_adaptive_mutex), shutting_down_(nullptr), @@ -239,18 +238,27 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) env_->GetAbsolutePath(dbname, &db_absolute_path_); + // Reserve ten files or so for other uses and give the rest to TableCache. + // Give a large number for setting of "infinite" open files. + const int table_cache_size = + (options_.max_open_files == -1) ? 4194304 : options_.max_open_files - 10; + // Reserve ten files or so for other uses and give the rest to TableCache. 
+ table_cache_ = + NewLRUCache(table_cache_size, options_.table_cache_numshardbits, + options_.table_cache_remove_scan_count_limit); + versions_.reset( new VersionSet(dbname_, &options_, storage_options_, table_cache_.get())); column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); - dumpLeveldbBuildVersion(options_.info_log.get()); + DumpLeveldbBuildVersion(options_.info_log.get()); // TODO(icanadi) dump DBOptions and ColumnFamilyOptions separately // options_.Dump(options_.info_log.get()); char name[100]; - Status st = env_->GetHostName(name, 100L); - if (st.ok()) { + Status s = env_->GetHostName(name, 100L); + if (s.ok()) { host_name_ = name; } else { Log(options_.info_log, "Can't get hostname, use localhost as host name."); @@ -283,6 +291,10 @@ DBImpl::~DBImpl() { env_->UnlockFile(db_lock_); } + // versions need to be destroyed before table_cache since it can hold + // references to table_cache. + versions_.reset(); + LogFlush(options_.info_log); } @@ -396,7 +408,7 @@ void DBImpl::MaybeDumpStats() { } // Returns the list of live files in 'sst_live' and the list -// of all files in the filesystem in 'all_files'. +// of all files in the filesystem in 'candidate_files'. // no_full_scan = true -- never do the full scan using GetChildren() // force = false -- don't force the full scan, except every // options_.delete_obsolete_files_period_micros @@ -448,15 +460,18 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, versions_->AddLiveFiles(&deletion_state.sst_live); if (doing_the_full_scan) { - // set of all files in the directory - env_->GetChildren(dbname_, &deletion_state.all_files); // Ignore errors + // set of all files in the directory. We'll exclude files that are still + // alive in the subsequent processings. 
+ env_->GetChildren( + dbname_, &deletion_state.candidate_files + ); // Ignore errors //Add log files in wal_dir if (options_.wal_dir != dbname_) { std::vector log_files; env_->GetChildren(options_.wal_dir, &log_files); // Ignore errors - deletion_state.all_files.insert( - deletion_state.all_files.end(), + deletion_state.candidate_files.insert( + deletion_state.candidate_files.end(), log_files.begin(), log_files.end() ); @@ -469,11 +484,10 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, // files in sst_delete_files and log_delete_files. // It is not necessary to hold the mutex when invoking this method. void DBImpl::PurgeObsoleteFiles(DeletionState& state) { - // check if there is anything to do - if (!state.all_files.size() && - !state.sst_delete_files.size() && - !state.log_delete_files.size()) { + if (state.candidate_files.empty() && + state.sst_delete_files.empty() && + state.log_delete_files.empty()) { return; } @@ -483,100 +497,114 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { if (state.manifest_file_number == 0) { return; } - - uint64_t number; - FileType type; std::vector old_log_files; // Now, convert live list to an unordered set, WITHOUT mutex held; // set is slow. - std::unordered_set live_set(state.sst_live.begin(), - state.sst_live.end()); - - state.all_files.reserve(state.all_files.size() + - state.sst_delete_files.size()); + std::unordered_set sst_live( + state.sst_live.begin(), state.sst_live.end() + ); + + auto& candidate_files = state.candidate_files; + candidate_files.reserve( + candidate_files.size() + + state.sst_delete_files.size() + + state.log_delete_files.size()); + // We may ignore the dbname when generating the file names. 
+ const char* kDumbDbName = ""; for (auto file : state.sst_delete_files) { - state.all_files.push_back(TableFileName("", file->number).substr(1)); + candidate_files.push_back( + TableFileName(kDumbDbName, file->number).substr(1) + ); delete file; } - state.all_files.reserve(state.all_files.size() + - state.log_delete_files.size()); - for (auto filenum : state.log_delete_files) { - if (filenum > 0) { - state.all_files.push_back(LogFileName("", filenum).substr(1)); + for (auto file_num : state.log_delete_files) { + if (file_num > 0) { + candidate_files.push_back( + LogFileName(kDumbDbName, file_num).substr(1) + ); } } - // dedup state.all_files so we don't try to delete the same + // dedup state.candidate_files so we don't try to delete the same // file twice - sort(state.all_files.begin(), state.all_files.end()); - auto unique_end = unique(state.all_files.begin(), state.all_files.end()); - - for (size_t i = 0; state.all_files.begin() + i < unique_end; i++) { - if (ParseFileName(state.all_files[i], &number, &type)) { - bool keep = true; - switch (type) { - case kLogFile: - keep = ((number >= state.log_number) || - (number == state.prev_log_number)); - break; - case kDescriptorFile: - // Keep my manifest file, and any newer incarnations' - // (in case there is a race that allows other incarnations) - keep = (number >= state.manifest_file_number); - break; - case kTableFile: - keep = (live_set.find(number) != live_set.end()); - break; - case kTempFile: - // Any temp files that are currently being written to must - // be recorded in pending_outputs_, which is inserted into "live" - keep = (live_set.find(number) != live_set.end()); - break; - case kInfoLogFile: - keep = true; - if (number != 0) { - old_log_files.push_back(state.all_files[i]); - } - break; - case kCurrentFile: - case kDBLockFile: - case kIdentityFile: - case kMetaDatabase: - keep = true; - break; - } + sort(candidate_files.begin(), candidate_files.end()); + candidate_files.erase( + 
unique(candidate_files.begin(), candidate_files.end()), + candidate_files.end() + ); + + for (const auto& to_delete : candidate_files) { + uint64_t number; + FileType type; + // Ignore file if we cannot recognize it. + if (!ParseFileName(to_delete, &number, &type)) { + continue; + } - if (!keep) { - if (type == kTableFile) { - // evict from cache - TableCache::Evict(table_cache_.get(), number); + bool keep = true; + switch (type) { + case kLogFile: + keep = ((number >= state.log_number) || + (number == state.prev_log_number)); + break; + case kDescriptorFile: + // Keep my manifest file, and any newer incarnations' + // (in case there is a race that allows other incarnations) + keep = (number >= state.manifest_file_number); + break; + case kTableFile: + keep = (sst_live.find(number) != sst_live.end()); + break; + case kTempFile: + // Any temp files that are currently being written to must + // be recorded in pending_outputs_, which is inserted into "live" + keep = (sst_live.find(number) != sst_live.end()); + break; + case kInfoLogFile: + keep = true; + if (number != 0) { + old_log_files.push_back(to_delete); } - std::string fname = ((type == kLogFile) ? options_.wal_dir : dbname_) + - "/" + state.all_files[i]; + break; + case kCurrentFile: + case kDBLockFile: + case kIdentityFile: + case kMetaDatabase: + keep = true; + break; + } + + if (keep) { + continue; + } + + if (type == kTableFile) { + // evict from cache + TableCache::Evict(table_cache_.get(), number); + } + std::string fname = ((type == kLogFile) ? 
options_.wal_dir : dbname_) + + "/" + to_delete; + Log(options_.info_log, + "Delete type=%d #%lu", + int(type), + (unsigned long)number); + + if (type == kLogFile && + (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0)) { + Status s = env_->RenameFile(fname, + ArchivedLogFileName(options_.wal_dir, number)); + if (!s.ok()) { Log(options_.info_log, - "Delete type=%d #%lu", - int(type), - (unsigned long)number); - - Status st; - if (type == kLogFile && (options_.WAL_ttl_seconds > 0 || - options_.WAL_size_limit_MB > 0)) { - st = env_->RenameFile(fname, - ArchivedLogFileName(options_.wal_dir, number)); - if (!st.ok()) { - Log(options_.info_log, - "RenameFile logfile #%lu FAILED -- %s\n", - (unsigned long)number, st.ToString().c_str()); - } - } else { - st = env_->DeleteFile(fname); - if (!st.ok()) { - Log(options_.info_log, "Delete type=%d #%lu FAILED -- %s\n", - int(type), (unsigned long)number, st.ToString().c_str()); - } - } + "RenameFile logfile #%lu FAILED -- %s\n", + (unsigned long)number, s.ToString().c_str()); + } + } else { + Status s = env_->DeleteFile(fname); + if (!s.ok()) { + Log(options_.info_log, "Delete type=%d #%lu FAILED -- %s\n", + int(type), (unsigned long)number, s.ToString().c_str()); } } } @@ -805,10 +833,11 @@ Status DBImpl::Recover( if (!s.ok()) { return s; } - uint64_t number; - FileType type; + std::vector logs; for (size_t i = 0; i < filenames.size(); i++) { + uint64_t number; + FileType type; if (ParseFileName(filenames[i], &number, &type) && type == kLogFile && ((number >= min_log) || (number == prev_log))) { @@ -824,12 +853,12 @@ Status DBImpl::Recover( // Recover in the order in which the logs were generated std::sort(logs.begin(), logs.end()); - for (size_t i = 0; s.ok() && i < logs.size(); i++) { + for (const auto& log : logs) { // The previous incarnation may not have written any MANIFEST // records after allocating this log number. So we manually // update the file number allocation counter in VersionSet. 
- versions_->MarkFileNumberUsed(logs[i]); - s = RecoverLogFile(logs[i], &max_sequence, read_only); + versions_->MarkFileNumberUsed(log); + s = RecoverLogFile(log, &max_sequence, read_only); } if (s.ok()) { @@ -1011,7 +1040,7 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, { mutex_.Unlock(); s = BuildTable(dbname_, env_, *cfd->full_options(), storage_options_, - cfd->table_cache(), iter, &meta, cfd->user_comparator(), + cfd->table_cache(), iter, &meta, cfd->internal_comparator(), newest_snapshot, earliest_seqno_in_memtable, GetCompressionFlush(*cfd->full_options())); LogFlush(options_.info_log); @@ -1045,7 +1074,7 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, } Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, - std::vector& mems, VersionEdit* edit, + autovector& mems, VersionEdit* edit, uint64_t* filenumber) { mutex_.AssertHeld(); const uint64_t start_micros = env_->NowMicros(); @@ -1062,21 +1091,20 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, Status s; { mutex_.Unlock(); - std::vector list; + std::vector memtables; for (MemTable* m : mems) { Log(options_.info_log, "Flushing memtable with log file: %lu\n", (unsigned long)m->GetLogNumber()); - list.push_back(m->NewIterator()); + memtables.push_back(m->NewIterator()); } - Iterator* iter = - NewMergingIterator(&cfd->internal_comparator(), &list[0], list.size()); - Log(options_.info_log, - "Level-0 flush table #%lu: started", + Iterator* iter = NewMergingIterator(env_, &cfd->internal_comparator(), + &memtables[0], memtables.size()); + Log(options_.info_log, "Level-0 flush table #%lu: started", (unsigned long)meta.number); s = BuildTable(dbname_, env_, *cfd->full_options(), storage_options_, - cfd->table_cache(), iter, &meta, cfd->user_comparator(), + cfd->table_cache(), iter, &meta, cfd->internal_comparator(), newest_snapshot, earliest_seqno_in_memtable, GetCompressionFlush(*cfd->full_options())); LogFlush(options_.info_log); @@ 
-1092,7 +1120,6 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, } base->Unref(); - // re-acquire the most current version base = cfd->current(); @@ -1145,7 +1172,7 @@ Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd, // Save the contents of the earliest memtable as a new Table uint64_t file_number; - std::vector mems; + autovector mems; cfd->imm()->PickMemtablesToFlush(&mems); if (mems.empty()) { Log(options_.info_log, "Nothing in memstore to flush"); @@ -1763,8 +1790,7 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, void DBImpl::BackgroundCallFlush() { bool madeProgress = false; - DeletionState deletion_state(default_cfd_->options()->max_write_buffer_number, - true); + DeletionState deletion_state(true); assert(bg_flush_scheduled_); MutexLock l(&mutex_); @@ -1815,8 +1841,7 @@ uint64_t DBImpl::TEST_GetLevel0TotalSize() { void DBImpl::BackgroundCallCompaction() { bool madeProgress = false; - DeletionState deletion_state(default_cfd_->options()->max_write_buffer_number, - true); + DeletionState deletion_state(true); MaybeDumpStats(); @@ -2077,8 +2102,9 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { *cfd->full_options(), compact->compaction->output_level(), compact->compaction->enable_compression()); - compact->builder.reset(GetTableBuilder( - *cfd->full_options(), compact->outfile.get(), compression_type)); + compact->builder.reset( + NewTableBuilder(*cfd->full_options(), cfd->internal_comparator(), + compact->outfile.get(), compression_type)); } LogFlush(options_.info_log); return s; @@ -2126,8 +2152,9 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, if (s.ok() && current_entries > 0) { // Verify that the table is usable ColumnFamilyData* cfd = compact->compaction->column_family_data(); + FileMetaData meta(output_number, current_bytes); Iterator* iter = cfd->table_cache()->NewIterator( - ReadOptions(), storage_options_, output_number, current_bytes); + ReadOptions(), storage_options_, 
cfd->internal_comparator(), meta); s = iter->status(); delete iter; if (s.ok()) { @@ -2641,8 +2668,9 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, // Collect iterators for files in L0 - Ln super_version->current->AddIterators(options, storage_options_, &iterator_list); - Iterator* internal_iter = NewMergingIterator( - &cfd->internal_comparator(), &iterator_list[0], iterator_list.size()); + Iterator* internal_iter = + NewMergingIterator(env_, &cfd->internal_comparator(), &iterator_list[0], + iterator_list.size()); IterState* cleanup = new IterState(this, &mutex_, super_version); internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); @@ -2677,8 +2705,8 @@ std::pair DBImpl::GetTailingIteratorPair( std::vector list; super_version->imm->AddIterators(options, &list); super_version->current->AddIterators(options, storage_options_, &list); - Iterator* immutable_iter = - NewMergingIterator(&cfd->internal_comparator(), &list[0], list.size()); + Iterator* immutable_iter = NewMergingIterator( + env_, &cfd->internal_comparator(), &list[0], list.size()); // create a DBIter that only uses memtable content; see NewIterator() immutable_iter = @@ -2739,6 +2767,8 @@ Status DBImpl::GetImpl(const ReadOptions& options, const Slice& key, std::string* value, bool* value_found) { StopWatch sw(env_, options_.statistics.get(), DB_GET, false); + StopWatchNano snapshot_timer(env_, false); + StartPerfTimer(&snapshot_timer); mutex_.Lock(); auto cfd = versions_->GetColumnFamilySet()->GetColumnFamily(column_family.id); @@ -2766,6 +2796,7 @@ Status DBImpl::GetImpl(const ReadOptions& options, // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case. 
LookupKey lkey(key, snapshot); + BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer); if (get_version->mem->Get(lkey, value, &s, merge_context, *cfd->full_options())) { // Done @@ -2775,12 +2806,19 @@ Status DBImpl::GetImpl(const ReadOptions& options, // Done RecordTick(options_.statistics.get(), MEMTABLE_HIT); } else { + StopWatchNano from_files_timer(env_, false); + StartPerfTimer(&from_files_timer); + get_version->current->Get(options, lkey, value, &s, &merge_context, &stats, *cfd->full_options(), value_found); have_stat_update = true; + BumpPerfTime(&perf_context.get_from_output_files_time, &from_files_timer); RecordTick(options_.statistics.get(), MEMTABLE_MISS); } + StopWatchNano post_process_timer(env_, false); + StartPerfTimer(&post_process_timer); + bool delete_get_version = false; if (!cfd->options()->disable_seek_compaction && have_stat_update) { mutex_.Lock(); @@ -2805,8 +2843,10 @@ Status DBImpl::GetImpl(const ReadOptions& options, } // Note, tickers are atomic now - no lock protection needed any more. + RecordTick(options_.statistics.get(), NUMBER_KEYS_READ); RecordTick(options_.statistics.get(), BYTES_READ, value->size()); + BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer); return s; } @@ -2816,6 +2856,9 @@ std::vector DBImpl::MultiGet( const std::vector& keys, std::vector* values) { StopWatch sw(env_, options_.statistics.get(), DB_MULTIGET, false); + StopWatchNano snapshot_timer(env_, false); + StartPerfTimer(&snapshot_timer); + SequenceNumber snapshot; struct MultiGetColumnFamilyData { @@ -2856,6 +2899,7 @@ std::vector DBImpl::MultiGet( // Keep track of bytes that we read for statistics-recording later uint64_t bytes_read = 0; + BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer); // For each of the given keys, apply the entire "get" process as follows: // First look in the memtable, then in the immutable memtable (if any). 
@@ -2889,6 +2933,9 @@ std::vector DBImpl::MultiGet( } } + // Post processing (decrement reference counts and record statistics) + StopWatchNano post_process_timer(env_, false); + StartPerfTimer(&post_process_timer); autovector superversions_to_delete; bool schedule_flush_or_compaction = false; @@ -2921,6 +2968,7 @@ std::vector DBImpl::MultiGet( RecordTick(options_.statistics.get(), NUMBER_MULTIGET_CALLS); RecordTick(options_.statistics.get(), NUMBER_MULTIGET_KEYS_READ, num_keys); RecordTick(options_.statistics.get(), NUMBER_MULTIGET_BYTES_READ, bytes_read); + BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer); return stat_list; } @@ -3080,6 +3128,8 @@ Status DBImpl::Delete(const WriteOptions& options, } Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { + StopWatchNano pre_post_process_timer(env_, false); + StartPerfTimer(&pre_post_process_timer); Writer w(&mutex_); w.batch = my_batch; w.sync = options.sync; @@ -3148,6 +3198,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { if (options.disableWAL) { flush_on_destroy_ = true; } + BumpPerfTime(&perf_context.write_pre_and_post_process_time, + &pre_post_process_timer); if (!options.disableWAL) { StopWatchNano timer(env_); @@ -3156,7 +3208,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { status = log_->AddRecord(log_entry); RecordTick(options_.statistics.get(), WAL_FILE_SYNCED, 1); RecordTick(options_.statistics.get(), WAL_FILE_BYTES, log_entry.size()); - BumpPerfTime(&perf_context.wal_write_time, &timer); if (status.ok() && options.sync) { if (options_.use_fsync) { StopWatch(env_, options_.statistics.get(), WAL_FILE_SYNC_MICROS); @@ -3166,12 +3217,17 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { status = log_->file()->Sync(); } } + BumpPerfTime(&perf_context.write_wal_time, &timer); } if (status.ok()) { + StopWatchNano write_memtable_timer(env_, false); + // reading the column family set 
outside of DB mutex -- should lock versions_->GetColumnFamilySet()->Lock(); + StartPerfTimer(&write_memtable_timer); status = WriteBatchInternal::InsertInto( updates, column_family_memtables_.get(), 0, this, false); + BumpPerfTime(&perf_context.write_memtable_time, &write_memtable_timer); versions_->GetColumnFamilySet()->Unlock(); if (!status.ok()) { @@ -3184,6 +3240,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER, last_sequence); } + StartPerfTimer(&pre_post_process_timer); if (updates == &tmp_batch_) tmp_batch_.Clear(); mutex_.Lock(); if (status.ok()) { @@ -3211,6 +3268,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { writers_.front()->cv.Signal(); } mutex_.Unlock(); + BumpPerfTime(&perf_context.write_pre_and_post_process_time, + &pre_post_process_timer); return status; } @@ -3420,7 +3479,7 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) { } else { unique_ptr lfile; - MemTable* memtmp = nullptr; + MemTable* new_mem = nullptr; // Attempt to switch to a new memtable and trigger compaction of old. // Do this without holding the dbmutex lock. @@ -3439,7 +3498,7 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) { // (compression, etc) but err on the side of caution. lfile->SetPreallocationBlockSize(1.1 * cfd->options()->write_buffer_size); - memtmp = new MemTable(cfd->internal_comparator(), *cfd->options()); + new_mem = new MemTable(cfd->internal_comparator(), *cfd->options()); new_superversion = new SuperVersion(); } } @@ -3447,7 +3506,7 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) { if (!s.ok()) { // Avoid chewing through file number space in a tight loop. 
versions_->ReuseFileNumber(new_log_number); - assert (!memtmp); + assert (!new_mem); break; } logfile_number_ = new_log_number; @@ -3457,12 +3516,12 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) { if (force) { cfd->imm()->FlushRequested(); } - memtmp->Ref(); - memtmp->SetLogNumber(logfile_number_); - cfd->SetMemtable(memtmp); + new_mem->Ref(); + new_mem->SetLogNumber(logfile_number_); + cfd->SetMemtable(new_mem); Log(options_.info_log, "New memtable created with log file: #%lu\n", (unsigned long)logfile_number_); - force = false; // Do not force another compaction if have room + force = false; // Do not force another compaction if have room MaybeScheduleFlushOrCompaction(); delete cfd->InstallSuperVersion(new_superversion); } @@ -3552,10 +3611,10 @@ Status DBImpl::DeleteFile(std::string name) { } int level; - FileMetaData metadata; + FileMetaData *metadata; ColumnFamilyData* cfd; VersionEdit edit; - DeletionState deletion_state(0, true); + DeletionState deletion_state(true); { MutexLock l(&mutex_); status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd); @@ -3567,7 +3626,7 @@ Status DBImpl::DeleteFile(std::string name) { assert((level > 0) && (level < cfd->NumberLevels())); // If the file is being compacted no need to delete. - if (metadata.being_compacted) { + if (metadata->being_compacted) { Log(options_.info_log, "DeleteFile %s Skipped. 
File about to be compacted\n", name.c_str()); return Status::OK(); @@ -3866,7 +3925,7 @@ Status DestroyDB(const std::string& dbname, const Options& options) { // // A global method that can dump out the build version -void dumpLeveldbBuildVersion(Logger * log) { +void DumpLeveldbBuildVersion(Logger * log) { Log(log, "Git sha %s", rocksdb_build_git_sha); Log(log, "Compile time %s %s", rocksdb_build_compile_time, rocksdb_build_compile_date); diff --git a/db/db_impl.h b/db/db_impl.h index 1d117599c..bb32ea046 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -7,24 +7,26 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once + #include #include #include #include #include + #include "db/dbformat.h" #include "db/log_writer.h" #include "db/snapshot.h" #include "db/column_family.h" #include "db/version_edit.h" +#include "memtable_list.h" +#include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/transaction_log.h" -#include "port/port.h" -#include "util/stats_logger.h" -#include "memtablelist.h" #include "util/autovector.h" +#include "util/stats_logger.h" #include "db/internal_stats.h" namespace rocksdb { @@ -178,7 +180,7 @@ class DBImpl : public DB { // needed for CleanupIteratorState struct DeletionState { inline bool HaveSomethingToDelete() const { - return all_files.size() || + return candidate_files.size() || sst_delete_files.size() || log_delete_files.size(); } @@ -186,7 +188,7 @@ class DBImpl : public DB { // a list of all files that we'll consider deleting // (every once in a while this is filled up with all files // in the DB directory) - std::vector all_files; + std::vector candidate_files; // the list of all live sst files that cannot be deleted std::vector sst_live; @@ -198,7 +200,7 @@ class DBImpl : public DB { std::vector log_delete_files; // a list of memtables to be free - std::vector 
memtables_to_free; + autovector memtables_to_free; SuperVersion* superversion_to_free; // if nullptr nothing to free @@ -208,12 +210,10 @@ class DBImpl : public DB { // that corresponds to the set of files in 'live'. uint64_t manifest_file_number, log_number, prev_log_number; - explicit DeletionState(const int num_memtables = 0, - bool create_superversion = false) { + explicit DeletionState(bool create_superversion = false) { manifest_file_number = 0; log_number = 0; prev_log_number = 0; - memtables_to_free.reserve(num_memtables); superversion_to_free = nullptr; new_superversion = create_superversion ? new SuperVersion() : nullptr; } @@ -232,7 +232,7 @@ class DBImpl : public DB { }; // Returns the list of live files in 'live' and the list - // of all files in the filesystem in 'all_files'. + // of all files in the filesystem in 'candidate_files'. // If force == false and the last call was less than // options_.delete_obsolete_files_period_micros microseconds ago, // it will not fill up the deletion_state @@ -291,7 +291,7 @@ class DBImpl : public DB { // concurrent flush memtables to storage. 
Status WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, VersionEdit* edit); - Status WriteLevel0Table(ColumnFamilyData* cfd, std::vector& mems, + Status WriteLevel0Table(ColumnFamilyData* cfd, autovector& mems, VersionEdit* edit, uint64_t* filenumber); uint64_t SlowdownAmount(int n, double bottom, double top); diff --git a/db/db_iter.cc b/db/db_iter.cc index 71bb2e57c..b8d9038a1 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -102,7 +102,8 @@ class DBIter: public Iterator { virtual void SeekToLast(); private: - void FindNextUserEntry(bool skipping); + inline void FindNextUserEntry(bool skipping); + void FindNextUserEntryInternal(bool skipping); void FindPrevUserEntry(); bool ParseKey(ParsedInternalKey* key); void MergeValuesNewToOld(); @@ -191,7 +192,15 @@ void DBIter::Next() { // // NOTE: In between, saved_key_ can point to a user key that has // a delete marker -void DBIter::FindNextUserEntry(bool skipping) { +inline void DBIter::FindNextUserEntry(bool skipping) { + StopWatchNano timer(env_, false); + StartPerfTimer(&timer); + FindNextUserEntryInternal(skipping); + BumpPerfTime(&perf_context.find_next_user_entry_time, &timer); +} + +// Actual implementation of DBIter::FindNextUserEntry() +void DBIter::FindNextUserEntryInternal(bool skipping) { // Loop until we hit an acceptable entry to yield assert(iter_->Valid()); assert(direction_ == kForward); @@ -226,10 +235,7 @@ void DBIter::FindNextUserEntry(bool skipping) { valid_ = true; MergeValuesNewToOld(); // Go to a different state machine return; - case kTypeColumnFamilyDeletion: - case kTypeColumnFamilyValue: - case kTypeColumnFamilyMerge: - case kTypeLogData: + default: assert(false); break; } @@ -429,13 +435,16 @@ void DBIter::FindPrevUserEntry() { } void DBIter::Seek(const Slice& target) { - direction_ = kForward; - ClearSavedValue(); saved_key_.clear(); AppendInternalKey( &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek)); + StopWatchNano internal_seek_timer(env_, false); 
+ StartPerfTimer(&internal_seek_timer); iter_->Seek(saved_key_); + BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); if (iter_->Valid()) { + direction_ = kForward; + ClearSavedValue(); FindNextUserEntry(false /*not skipping */); } else { valid_ = false; @@ -445,7 +454,10 @@ void DBIter::Seek(const Slice& target) { void DBIter::SeekToFirst() { direction_ = kForward; ClearSavedValue(); + StopWatchNano internal_seek_timer(env_, false); + StartPerfTimer(&internal_seek_timer); iter_->SeekToFirst(); + BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); if (iter_->Valid()) { FindNextUserEntry(false /* not skipping */); } else { @@ -464,7 +476,10 @@ void DBIter::SeekToLast() { direction_ = kReverse; ClearSavedValue(); + StopWatchNano internal_seek_timer(env_, false); + StartPerfTimer(&internal_seek_timer); iter_->SeekToLast(); + BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); FindPrevUserEntry(); } diff --git a/db/db_test.cc b/db/db_test.cc index 65b1dffd3..1edb14799 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -11,25 +11,29 @@ #include #include -#include "rocksdb/db.h" -#include "rocksdb/filter_policy.h" +#include "db/dbformat.h" #include "db/db_impl.h" #include "db/filename.h" #include "db/version_set.h" #include "db/write_batch_internal.h" -#include "table/block_based_table_factory.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" #include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/perf_context.h" +#include "table/plain_table_factory.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" +#include "table/block_based_table_factory.h" #include "util/hash.h" +#include "util/hash_linklist_rep.h" #include "util/logging.h" #include "util/mutexlock.h" +#include "util/statistics.h" #include "util/testharness.h" #include "util/testutil.h" -#include "util/statistics.h" #include 
"utilities/merge_operators.h" namespace rocksdb { @@ -241,12 +245,17 @@ class SpecialEnv : public EnvWrapper { class DBTest { private: const FilterPolicy* filter_policy_; + static std::unique_ptr prefix_1_transform; + static std::unique_ptr noop_transform; protected: // Sequence of option configurations to try enum OptionConfig { kDefault, + kPlainTableFirstBytePrefix, + kPlainTableAllBytesPrefix, kVectorRep, + kHashLinkList, kMergePut, kFilter, kUncompressed, @@ -260,6 +269,7 @@ class DBTest { kHashSkipList, kUniversalCompaction, kCompressedBlockCache, + kInfiniteMaxOpenFiles, kEnd }; int option_config_; @@ -277,7 +287,8 @@ class DBTest { kNoSkip = 0, kSkipDeletesFilterFirst = 1, kSkipUniversalCompaction = 2, - kSkipMergePut = 4 + kSkipMergePut = 4, + kSkipPlainTable = 8 }; DBTest() : option_config_(kDefault), @@ -299,20 +310,27 @@ class DBTest { // Switch to a fresh database with the next option configuration to // test. Return false if there are no more configurations to test. bool ChangeOptions(int skip_mask = kNoSkip) { - option_config_++; - // skip some options - if (skip_mask & kSkipDeletesFilterFirst && - option_config_ == kDeletesFilterFirst) { - option_config_++; - } - if (skip_mask & kSkipUniversalCompaction && - option_config_ == kUniversalCompaction) { - option_config_++; - } - if (skip_mask & kSkipMergePut && option_config_ == kMergePut) { - option_config_++; + for(option_config_++; option_config_ < kEnd; option_config_++) { + if ((skip_mask & kSkipDeletesFilterFirst) && + option_config_ == kDeletesFilterFirst) { + continue; + } + if ((skip_mask & kSkipUniversalCompaction) && + option_config_ == kUniversalCompaction) { + continue; + } + if ((skip_mask & kSkipMergePut) && option_config_ == kMergePut) { + continue; + } + if ((skip_mask & kSkipPlainTable) + && (option_config_ == kPlainTableAllBytesPrefix + || option_config_ == kPlainTableFirstBytePrefix)) { + continue; + } + break; } + if (option_config_ >= kEnd) { Destroy(&last_options_); return false; 
@@ -345,6 +363,18 @@ class DBTest { options.memtable_factory.reset( NewHashSkipListRepFactory(NewFixedPrefixTransform(1))); break; + case kPlainTableFirstBytePrefix: + options.table_factory.reset(new PlainTableFactory()); + options.prefix_extractor = prefix_1_transform.get(); + options.allow_mmap_reads = true; + options.max_sequential_skip_in_iterations = 999999; + break; + case kPlainTableAllBytesPrefix: + options.table_factory.reset(new PlainTableFactory()); + options.prefix_extractor = noop_transform.get(); + options.allow_mmap_reads = true; + options.max_sequential_skip_in_iterations = 999999; + break; case kMergePut: options.merge_operator = MergeOperators::CreatePutOperator(); break; @@ -380,12 +410,19 @@ class DBTest { case kVectorRep: options.memtable_factory.reset(new VectorRepFactory(100)); break; + case kHashLinkList: + options.memtable_factory.reset( + NewHashLinkListRepFactory(NewFixedPrefixTransform(1), 4)); + break; case kUniversalCompaction: options.compaction_style = kCompactionStyleUniversal; break; case kCompressedBlockCache: options.block_cache_compressed = NewLRUCache(8*1024*1024); break; + case kInfiniteMaxOpenFiles: + options.max_open_files = -1; + break; default: break; } @@ -526,10 +563,7 @@ class DBTest { case kTypeDeletion: result += "DEL"; break; - case kTypeColumnFamilyDeletion: - case kTypeColumnFamilyValue: - case kTypeColumnFamilyMerge: - case kTypeLogData: + default: assert(false); break; } @@ -680,6 +714,72 @@ class DBTest { delete iter; } + // Used to test InplaceUpdate + + // If previous value is nullptr or delta is > than previous value, + // sets newValue with delta + // If previous value is not empty, + // updates previous value with 'b' string of previous value size - 1. 
+ static UpdateStatus + updateInPlaceSmallerSize(char* prevValue, uint32_t* prevSize, + Slice delta, std::string* newValue) { + if (prevValue == nullptr) { + *newValue = std::string(delta.size(), 'c'); + return UpdateStatus::UPDATED; + } else { + *prevSize = *prevSize - 1; + std::string str_b = std::string(*prevSize, 'b'); + memcpy(prevValue, str_b.c_str(), str_b.size()); + return UpdateStatus::UPDATED_INPLACE; + } + } + + static UpdateStatus + updateInPlaceSmallerVarintSize(char* prevValue, uint32_t* prevSize, + Slice delta, std::string* newValue) { + if (prevValue == nullptr) { + *newValue = std::string(delta.size(), 'c'); + return UpdateStatus::UPDATED; + } else { + *prevSize = 1; + std::string str_b = std::string(*prevSize, 'b'); + memcpy(prevValue, str_b.c_str(), str_b.size()); + return UpdateStatus::UPDATED_INPLACE; + } + } + + static UpdateStatus + updateInPlaceLargerSize(char* prevValue, uint32_t* prevSize, + Slice delta, std::string* newValue) { + *newValue = std::string(delta.size(), 'c'); + return UpdateStatus::UPDATED; + } + + static UpdateStatus + updateInPlaceNoAction(char* prevValue, uint32_t* prevSize, + Slice delta, std::string* newValue) { + return UpdateStatus::UPDATE_FAILED; + } + + // Utility method to test InplaceUpdate + void validateNumberOfEntries(int numValues) { + Iterator* iter = dbfull()->TEST_NewInternalIterator(); + iter->SeekToFirst(); + ASSERT_EQ(iter->status().ok(), true); + int seq = numValues; + while (iter->Valid()) { + ParsedInternalKey ikey; + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + + // checks sequence number for updates + ASSERT_EQ(ikey.sequence, (unsigned)seq--); + iter->Next(); + } + delete iter; + ASSERT_EQ(0, seq); + } + void CopyFile(const std::string& source, const std::string& destination, uint64_t size = 0) { const EnvOptions soptions; @@ -705,6 +805,10 @@ class DBTest { } }; +std::unique_ptr DBTest::prefix_1_transform( + NewFixedPrefixTransform(1)); +std::unique_ptr 
DBTest::noop_transform( + NewNoopTransform()); static std::string Key(int i) { char buf[100]; @@ -718,19 +822,19 @@ static long TestGetTickerCount(const Options& options, Tickers ticker_type) { TEST(DBTest, Empty) { do { - ASSERT_TRUE(db_ != nullptr); - ASSERT_EQ("NOT_FOUND", Get("foo")); - } while (ChangeOptions()); -} + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + Reopen(&options); -TEST(DBTest, ReadWrite) { - do { ASSERT_OK(Put("foo", "v1")); ASSERT_EQ("v1", Get("foo")); - ASSERT_OK(Put("bar", "v2")); - ASSERT_OK(Put("foo", "v3")); - ASSERT_EQ("v3", Get("foo")); - ASSERT_EQ("v2", Get("bar")); + + env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls + Put("k1", std::string(100000, 'x')); // Fill memtable + Put("k2", std::string(100000, 'y')); // Trigger compaction + ASSERT_EQ("v1", Get("foo")); + env_->delay_sstable_sync_.Release_Store(nullptr); // Release sync calls } while (ChangeOptions()); } @@ -769,7 +873,7 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); // Create a new talbe. - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); // index/filter blocks added to block cache right after table creation. ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); @@ -1051,7 +1155,10 @@ TEST(DBTest, KeyMayExist) { ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); delete options.filter_policy; - } while (ChangeOptions()); + + // KeyMayExist function only checks data in block caches, which is not used + // by plain table format. 
+ } while (ChangeOptions(kSkipPlainTable)); } TEST(DBTest, NonBlockingIteration) { @@ -1111,7 +1218,9 @@ TEST(DBTest, NonBlockingIteration) { ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); delete iter; - } while (ChangeOptions()); + // This test verifies block cache behaviors, which is not used by plain + // table format. + } while (ChangeOptions(kSkipPlainTable)); } // A delete is skipped for key if KeyMayExist(key) returns False @@ -1250,7 +1359,13 @@ TEST(DBTest, IterMulti) { ASSERT_EQ(IterStatus(iter), "a->va"); iter->Seek("ax"); ASSERT_EQ(IterStatus(iter), "b->vb"); + + SetPerfLevel(kEnableTime); + perf_context.Reset(); iter->Seek("b"); + ASSERT_TRUE((int) perf_context.seek_internal_seek_time > 0); + ASSERT_TRUE((int) perf_context.find_next_user_entry_time > 0); + SetPerfLevel(kDisable); ASSERT_EQ(IterStatus(iter), "b->vb"); iter->Seek("z"); ASSERT_EQ(IterStatus(iter), "(invalid)"); @@ -1265,7 +1380,12 @@ TEST(DBTest, IterMulti) { // Switch from forward to reverse iter->SeekToFirst(); iter->Next(); + SetPerfLevel(kEnableTime); + perf_context.Reset(); iter->Next(); + ASSERT_EQ(0, (int) perf_context.seek_internal_seek_time); + ASSERT_TRUE((int) perf_context.find_next_user_entry_time > 0); + SetPerfLevel(kDisable); iter->Prev(); ASSERT_EQ(IterStatus(iter), "b->vb"); @@ -1696,22 +1816,42 @@ TEST(DBTest, NumImmutableMemTable) { std::string big_value(1000000, 'x'); std::string num; + SetPerfLevel(kEnableTime);; ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "0"); + perf_context.Reset(); + Get("k1"); + ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "1"); + perf_context.Reset(); + Get("k1"); + ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); + perf_context.Reset(); + Get("k2"); + 
ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "2"); + perf_context.Reset(); + Get("k2"); + ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); + perf_context.Reset(); + Get("k3"); + ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); + perf_context.Reset(); + Get("k1"); + ASSERT_EQ(3, (int) perf_context.get_from_memtable_count); dbfull()->Flush(FlushOptions()); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "0"); + SetPerfLevel(kDisable); } while (ChangeCompactOptions()); } @@ -1720,11 +1860,16 @@ TEST(DBTest, FLUSH) { Options options = CurrentOptions(); WriteOptions writeOpt = WriteOptions(); writeOpt.disableWAL = true; + SetPerfLevel(kEnableTime);; ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1")); // this will now also flush the last 2 writes dbfull()->Flush(FlushOptions()); ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1")); + perf_context.Reset(); + Get("foo"); + ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); + Reopen(); ASSERT_EQ("v1", Get("foo")); ASSERT_EQ("v1", Get("bar")); @@ -1736,7 +1881,9 @@ TEST(DBTest, FLUSH) { Reopen(); ASSERT_EQ("v2", Get("bar")); + perf_context.Reset(); ASSERT_EQ("v2", Get("foo")); + ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); writeOpt.disableWAL = false; ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v3")); @@ -1748,6 +1895,8 @@ TEST(DBTest, FLUSH) { // has WAL enabled. 
ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v3", Get("bar")); + + SetPerfLevel(kDisable); } while (ChangeCompactOptions()); } @@ -2559,9 +2708,9 @@ TEST(DBTest, InPlaceUpdate) { options.inplace_update_support = true; options.env = env_; options.write_buffer_size = 100000; + Reopen(&options); // Update key with values of smaller size - Reopen(&options); int numValues = 10; for (int i = numValues; i > 0; i--) { std::string value = DummyString(i, 'a'); @@ -2569,50 +2718,133 @@ TEST(DBTest, InPlaceUpdate) { ASSERT_EQ(value, Get("key")); } - int count = 0; - Iterator* iter = dbfull()->TEST_NewInternalIterator(); - iter->SeekToFirst(); - ASSERT_EQ(iter->status().ok(), true); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - count++; - // All updates with the same sequence number. - ASSERT_EQ(ikey.sequence, (unsigned)1); - iter->Next(); - } // Only 1 instance for that key. - ASSERT_EQ(count, 1); - delete iter; + validateNumberOfEntries(1); + + } while (ChangeCompactOptions()); +} + +TEST(DBTest, InPlaceUpdateLargeNewValue) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + options.env = env_; + options.write_buffer_size = 100000; + Reopen(&options); // Update key with values of larger size - DestroyAndReopen(&options); - numValues = 10; + int numValues = 10; for (int i = 0; i < numValues; i++) { std::string value = DummyString(i, 'a'); ASSERT_OK(Put("key", value)); ASSERT_EQ(value, Get("key")); } - count = 0; - iter = dbfull()->TEST_NewInternalIterator(); - iter->SeekToFirst(); - ASSERT_EQ(iter->status().ok(), true); - int seq = numValues; - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - count++; - // No inplace updates. 
All updates are puts with new seq number - ASSERT_EQ(ikey.sequence, (unsigned)seq--); - iter->Next(); + // All 10 updates exist in the internal iterator + validateNumberOfEntries(numValues); + + } while (ChangeCompactOptions()); +} + + +TEST(DBTest, InPlaceUpdateCallbackSmallerSize) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + rocksdb::DBTest::updateInPlaceSmallerSize; + Reopen(&options); + + // Update key with values of smaller size + int numValues = 10; + ASSERT_OK(Put("key", DummyString(numValues, 'a'))); + ASSERT_EQ(DummyString(numValues, 'c'), Get("key")); + + for (int i = numValues; i > 0; i--) { + ASSERT_OK(Put("key", DummyString(i, 'a'))); + ASSERT_EQ(DummyString(i - 1, 'b'), Get("key")); + } + + // Only 1 instance for that key. + validateNumberOfEntries(1); + + } while (ChangeCompactOptions()); +} + +TEST(DBTest, InPlaceUpdateCallbackSmallerVarintSize) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + rocksdb::DBTest::updateInPlaceSmallerVarintSize; + Reopen(&options); + + // Update key with values of smaller varint size + int numValues = 265; + ASSERT_OK(Put("key", DummyString(numValues, 'a'))); + ASSERT_EQ(DummyString(numValues, 'c'), Get("key")); + + for (int i = numValues; i > 0; i--) { + ASSERT_OK(Put("key", DummyString(i, 'a'))); + ASSERT_EQ(DummyString(1, 'b'), Get("key")); + } + + // Only 1 instance for that key. 
+ validateNumberOfEntries(1); + + } while (ChangeCompactOptions()); +} + +TEST(DBTest, InPlaceUpdateCallbackLargeNewValue) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + rocksdb::DBTest::updateInPlaceLargerSize; + Reopen(&options); + + // Update key with values of larger size + int numValues = 10; + for (int i = 0; i < numValues; i++) { + ASSERT_OK(Put("key", DummyString(i, 'a'))); + ASSERT_EQ(DummyString(i, 'c'), Get("key")); } + + // No inplace updates. All updates are puts with new seq number // All 10 updates exist in the internal iterator - ASSERT_EQ(count, numValues); - delete iter; + validateNumberOfEntries(numValues); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, InPlaceUpdateCallbackNoAction) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + rocksdb::DBTest::updateInPlaceNoAction; + Reopen(&options); + + // Callback function requests no actions from db + ASSERT_OK(Put("key", DummyString(1, 'a'))); + ASSERT_EQ(Get("key"), "NOT_FOUND"); } while (ChangeCompactOptions()); } @@ -2653,9 +2885,7 @@ class DeleteFilter : public CompactionFilter { class ChangeFilter : public CompactionFilter { public: - explicit ChangeFilter(int argv) { - assert(argv == 100); - } + explicit ChangeFilter() {} virtual bool Filter(int level, const Slice& key, const Slice& value, std::string* new_value, @@ -2697,19 +2927,16 @@ class DeleteFilterFactory : public CompactionFilterFactory { class ChangeFilterFactory : public CompactionFilterFactory { public: - explicit ChangeFilterFactory(int argv) : argv_(argv) {} + explicit ChangeFilterFactory() {} virtual std::unique_ptr CreateCompactionFilter(const CompactionFilter::Context& context) override { - 
return std::unique_ptr(new ChangeFilter(argv_)); + return std::unique_ptr(new ChangeFilter()); } virtual const char* Name() const override { return "ChangeFilterFactory"; } - - private: - const int argv_; }; TEST(DBTest, CompactionFilter) { @@ -2856,7 +3083,7 @@ TEST(DBTest, CompactionFilterWithValueChange) { options.num_levels = 3; options.max_mem_compaction_level = 0; options.compaction_filter_factory = - std::make_shared(100); + std::make_shared(); Reopen(&options); // Write 100K+1 keys, these are written to a few files @@ -3000,7 +3227,8 @@ TEST(DBTest, ApproximateSizes) { ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_GT(NumTableFilesAtLevel(1), 0); } - } while (ChangeOptions(kSkipUniversalCompaction)); + // ApproximateOffsetOf() is not yet implemented in plain table format. + } while (ChangeOptions(kSkipUniversalCompaction | kSkipPlainTable)); } TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { @@ -3038,7 +3266,8 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { dbfull()->TEST_CompactRange(0, nullptr, nullptr); } - } while (ChangeOptions()); + // ApproximateOffsetOf() is not yet implemented in plain table format. + } while (ChangeOptions(kSkipPlainTable)); } TEST(DBTest, IteratorPinsRef) { @@ -3122,7 +3351,9 @@ TEST(DBTest, HiddenValuesAreRemoved) { ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); - } while (ChangeOptions(kSkipUniversalCompaction)); + // ApproximateOffsetOf() is not yet implemented in plain table format, + // which is used by Size(). 
+ } while (ChangeOptions(kSkipUniversalCompaction | kSkipPlainTable)); } TEST(DBTest, CompactBetweenSnapshots) { @@ -4790,7 +5021,9 @@ TEST(DBTest, Randomized) { // TODO(sanjay): Test Get() works int p = rnd.Uniform(100); int minimum = 0; - if (option_config_ == kHashSkipList) { + if (option_config_ == kHashSkipList || + option_config_ == kHashLinkList || + option_config_ == kPlainTableFirstBytePrefix) { minimum = 1; } if (p < 45) { // Put @@ -4969,20 +5202,22 @@ TEST(DBTest, PrefixScan) { snprintf(buf, sizeof(buf), "03______:"); prefix = Slice(buf, 8); key = Slice(buf, 9); - auto prefix_extractor = NewFixedPrefixTransform(8); // db configs env_->count_random_reads_ = true; Options options = CurrentOptions(); options.env = env_; options.no_block_cache = true; - options.filter_policy = NewBloomFilterPolicy(10); - options.prefix_extractor = prefix_extractor; + options.filter_policy = NewBloomFilterPolicy(10); + options.prefix_extractor = NewFixedPrefixTransform(8); options.whole_key_filtering = false; options.disable_auto_compactions = true; options.max_background_compactions = 2; options.create_if_missing = true; options.disable_seek_compaction = true; - options.memtable_factory.reset(NewHashSkipListRepFactory(prefix_extractor)); + // Tricky: options.prefix_extractor will be released by + // NewHashSkipListRepFactory after use. + options.memtable_factory.reset( + NewHashSkipListRepFactory(options.prefix_extractor)); // prefix specified, with blooms: 2 RAND I/Os // SeekToFirst diff --git a/db/dbformat.cc b/db/dbformat.cc index 3d7e61010..43560bc83 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -6,9 +6,9 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "db/dbformat.h" #include -#include "db/dbformat.h" #include "port/port.h" #include "util/coding.h" #include "util/perf_context_imp.h" @@ -72,6 +72,28 @@ int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const { return r; } +int InternalKeyComparator::Compare(const ParsedInternalKey& a, + const ParsedInternalKey& b) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + // decreasing type (though sequence# should be enough to disambiguate) + int r = user_comparator_->Compare(a.user_key, b.user_key); + BumpPerfCount(&perf_context.user_key_comparison_count); + if (r == 0) { + if (a.sequence > b.sequence) { + r = -1; + } else if (a.sequence < b.sequence) { + r = +1; + } else if (a.type > b.type) { + r = -1; + } else if (a.type < b.type) { + r = +1; + } + } + return r; +} + void InternalKeyComparator::FindShortestSeparator( std::string* start, const Slice& limit) const { diff --git a/db/dbformat.h b/db/dbformat.h index 82031cf5c..be46d14a1 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -25,7 +25,9 @@ class InternalKey; // Value types encoded as the last component of internal keys. // DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk // data structures. -enum ValueType { +// The highest bit of the value type needs to be reserved to SST tables +// for them to do more flexible encoding. 
+enum ValueType : unsigned char { kTypeDeletion = 0x0, kTypeValue = 0x1, kTypeMerge = 0x2, @@ -33,7 +35,9 @@ enum ValueType { kTypeColumnFamilyDeletion = 0x4, kTypeColumnFamilyValue = 0x5, kTypeColumnFamilyMerge = 0x6, + kMaxValue = 0x7F }; + // kValueTypeForSeek defines the ValueType that should be passed when // constructing a ParsedInternalKey object for seeking to a particular // sequence number (since we sort sequence numbers in decreasing order @@ -99,6 +103,7 @@ class InternalKeyComparator : public Comparator { name_("rocksdb.InternalKeyComparator:" + std::string(user_comparator_->Name())) { } + virtual ~InternalKeyComparator() {} virtual const char* Name() const; virtual int Compare(const Slice& a, const Slice& b) const; @@ -110,6 +115,7 @@ class InternalKeyComparator : public Comparator { const Comparator* user_comparator() const { return user_comparator_; } int Compare(const InternalKey& a, const InternalKey& b) const; + int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const; }; // Filter policy wrapper that converts from internal keys to user keys @@ -166,6 +172,7 @@ inline bool ParseInternalKey(const Slice& internal_key, unsigned char c = num & 0xff; result->sequence = num >> 8; result->type = static_cast(c); + assert(result->type <= ValueType::kMaxValue); result->user_key = Slice(internal_key.data(), n - 8); return (c <= static_cast(kValueTypeForSeek)); } diff --git a/db/log_format.h b/db/log_format.h index 10a31ba27..919c087e2 100644 --- a/db/log_format.h +++ b/db/log_format.h @@ -17,7 +17,6 @@ namespace log { enum RecordType { // Zero is reserved for preallocated files kZeroType = 0, - kFullType = 1, // For fragments diff --git a/db/memtable.cc b/db/memtable.cc index 1616a1227..2f84a289e 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -17,10 +17,14 @@ #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/merge_operator.h" +#include "rocksdb/slice_transform.h" +#include "util/arena.h" #include "util/coding.h" 
-#include "util/mutexlock.h" #include "util/murmurhash.h" +#include "util/mutexlock.h" +#include "util/perf_context_imp.h" #include "util/statistics.h" +#include "util/stop_watch.h" namespace std { template <> @@ -37,9 +41,8 @@ MemTable::MemTable(const InternalKeyComparator& cmp, const ColumnFamilyOptions& options) : comparator_(cmp), refs_(0), - arena_impl_(options.arena_block_size), - table_(options.memtable_factory->CreateMemTableRep(comparator_, - &arena_impl_)), + arena_(options.arena_block_size), + table_(options.memtable_factory->CreateMemTableRep(comparator_, &arena_)), flush_in_progress_(false), flush_completed_(false), file_number_(0), @@ -47,23 +50,36 @@ MemTable::MemTable(const InternalKeyComparator& cmp, mem_next_logfile_number_(0), mem_logfile_number_(0), locks_(options.inplace_update_support ? options.inplace_update_num_locks - : 0) {} + : 0), + prefix_extractor_(options.prefix_extractor) { + if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) { + prefix_bloom_.reset(new DynamicBloom(options.memtable_prefix_bloom_bits, + options.memtable_prefix_bloom_probes)); + } +} MemTable::~MemTable() { assert(refs_ == 0); } size_t MemTable::ApproximateMemoryUsage() { - return arena_impl_.ApproximateMemoryUsage() + - table_->ApproximateMemoryUsage(); + return arena_.ApproximateMemoryUsage() + table_->ApproximateMemoryUsage(); } -int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr) +int MemTable::KeyComparator::operator()(const char* prefix_len_key1, + const char* prefix_len_key2) const { + // Internal keys are encoded as length-prefixed strings. + Slice k1 = GetLengthPrefixedSlice(prefix_len_key1); + Slice k2 = GetLengthPrefixedSlice(prefix_len_key2); + return comparator.Compare(k1, k2); +} + +int MemTable::KeyComparator::operator()(const char* prefix_len_key, + const Slice& key) const { // Internal keys are encoded as length-prefixed strings. 
- Slice a = GetLengthPrefixedSlice(aptr); - Slice b = GetLengthPrefixedSlice(bptr); - return comparator.Compare(a, b); + Slice a = GetLengthPrefixedSlice(prefix_len_key); + return comparator.Compare(a, key); } Slice MemTableRep::UserKey(const char* key) const { @@ -74,7 +90,7 @@ Slice MemTableRep::UserKey(const char* key) const { // Encode a suitable internal key target for "target" and return it. // Uses *scratch as scratch space, and the returned pointer will point // into this scratch space. -static const char* EncodeKey(std::string* scratch, const Slice& target) { +const char* EncodeKey(std::string* scratch, const Slice& target) { scratch->clear(); PutVarint32(scratch, target.size()); scratch->append(target.data(), target.size()); @@ -83,27 +99,53 @@ static const char* EncodeKey(std::string* scratch, const Slice& target) { class MemTableIterator: public Iterator { public: - MemTableIterator(MemTableRep* table, const ReadOptions& options) - : iter_() { + MemTableIterator(const MemTable& mem, const ReadOptions& options) + : mem_(mem), iter_(), dynamic_prefix_seek_(false), valid_(false) { if (options.prefix) { - iter_.reset(table->GetPrefixIterator(*options.prefix)); + iter_.reset(mem_.table_->GetPrefixIterator(*options.prefix)); } else if (options.prefix_seek) { - iter_.reset(table->GetDynamicPrefixIterator()); + dynamic_prefix_seek_ = true; + iter_.reset(mem_.table_->GetDynamicPrefixIterator()); } else { - iter_.reset(table->GetIterator()); + iter_.reset(mem_.table_->GetIterator()); } } - virtual bool Valid() const { return iter_->Valid(); } - virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); } - virtual void SeekToFirst() { iter_->SeekToFirst(); } - virtual void SeekToLast() { iter_->SeekToLast(); } - virtual void Next() { iter_->Next(); } - virtual void Prev() { iter_->Prev(); } + virtual bool Valid() const { return valid_; } + virtual void Seek(const Slice& k) { + if (dynamic_prefix_seek_ && mem_.prefix_bloom_ && + 
!mem_.prefix_bloom_->MayContain( + mem_.prefix_extractor_->Transform(ExtractUserKey(k)))) { + valid_ = false; + return; + } + iter_->Seek(k, nullptr); + valid_ = iter_->Valid(); + } + virtual void SeekToFirst() { + iter_->SeekToFirst(); + valid_ = iter_->Valid(); + } + virtual void SeekToLast() { + iter_->SeekToLast(); + valid_ = iter_->Valid(); + } + virtual void Next() { + assert(Valid()); + iter_->Next(); + valid_ = iter_->Valid(); + } + virtual void Prev() { + assert(Valid()); + iter_->Prev(); + valid_ = iter_->Valid(); + } virtual Slice key() const { + assert(Valid()); return GetLengthPrefixedSlice(iter_->key()); } virtual Slice value() const { + assert(Valid()); Slice key_slice = GetLengthPrefixedSlice(iter_->key()); return GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); } @@ -111,8 +153,10 @@ class MemTableIterator: public Iterator { virtual Status status() const { return Status::OK(); } private: - std::unique_ptr iter_; - std::string tmp_; // For passing to EncodeKey + const MemTable& mem_; + std::shared_ptr iter_; + bool dynamic_prefix_seek_; + bool valid_; // No copying allowed MemTableIterator(const MemTableIterator&); @@ -120,7 +164,7 @@ class MemTableIterator: public Iterator { }; Iterator* MemTable::NewIterator(const ReadOptions& options) { - return new MemTableIterator(table_.get(), options); + return new MemTableIterator(*this, options); } port::RWMutex* MemTable::GetLock(const Slice& key) { @@ -128,7 +172,7 @@ port::RWMutex* MemTable::GetLock(const Slice& key) { } void MemTable::Add(SequenceNumber s, ValueType type, - const Slice& key, + const Slice& key, /* user key */ const Slice& value) { // Format of an entry is concatenation of: // key_size : varint32 of internal_key.size() @@ -141,7 +185,7 @@ void MemTable::Add(SequenceNumber s, ValueType type, const size_t encoded_len = VarintLength(internal_key_size) + internal_key_size + VarintLength(val_size) + val_size; - char* buf = arena_impl_.Allocate(encoded_len); + char* buf = 
arena_.Allocate(encoded_len); char* p = EncodeVarint32(buf, internal_key_size); memcpy(p, key.data(), key_size); p += key_size; @@ -152,6 +196,11 @@ void MemTable::Add(SequenceNumber s, ValueType type, assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len); table_->Insert(buf); + if (prefix_bloom_) { + assert(prefix_extractor_); + prefix_bloom_->Add(prefix_extractor_->Transform(key)); + } + // The first sequence number inserted into the memtable assert(first_seqno_ == 0 || s > first_seqno_); if (first_seqno_ == 0) { @@ -161,17 +210,28 @@ void MemTable::Add(SequenceNumber s, ValueType type, bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, MergeContext& merge_context, const Options& options) { - Slice memkey = key.memtable_key(); - std::unique_ptr iter( - table_->GetIterator(key.user_key())); - iter->Seek(memkey.data()); + StopWatchNano memtable_get_timer(options.env, false); + StartPerfTimer(&memtable_get_timer); + + Slice mem_key = key.memtable_key(); + Slice user_key = key.user_key(); + + std::unique_ptr iter; + if (prefix_bloom_ && + !prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key))) { + // iter is null if prefix bloom says the key does not exist + } else { + iter.reset(table_->GetIterator(user_key)); + iter->Seek(key.internal_key(), mem_key.data()); + } bool merge_in_progress = s->IsMergeInProgress(); auto merge_operator = options.merge_operator.get(); auto logger = options.info_log; std::string merge_result; - for (; iter->Valid(); iter->Next()) { + bool found_final_value = false; + for (; !found_final_value && iter && iter->Valid(); iter->Next()) { // entry format is: // klength varint32 // userkey char[klength-8] @@ -182,7 +242,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, // sequence number since the Seek() call above should have skipped // all entries with overly large sequence numbers. 
const char* entry = iter->key(); - uint32_t key_length; + uint32_t key_length = 0; const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); if (comparator_.comparator.user_comparator()->Compare( Slice(key_ptr, key_length - 8), key.user_key()) == 0) { @@ -209,7 +269,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, if (options.inplace_update_support) { GetLock(key.user_key())->Unlock(); } - return true; + found_final_value = true; + break; } case kTypeDeletion: { if (merge_in_progress) { @@ -224,7 +285,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, } else { *s = Status::NotFound(); } - return true; + found_final_value = true; + break; } case kTypeMerge: { Slice v = GetLengthPrefixedSlice(key_ptr + key_length); @@ -244,10 +306,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, } break; } - case kTypeColumnFamilyDeletion: - case kTypeColumnFamilyValue: - case kTypeColumnFamilyMerge: - case kTypeLogData: + default: assert(false); break; } @@ -259,25 +318,27 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, // No change to value, since we have not yet found a Put/Delete - if (merge_in_progress) { + if (!found_final_value && merge_in_progress) { *s = Status::MergeInProgress(""); } - return false; + BumpPerfTime(&perf_context.get_from_memtable_time, &memtable_get_timer); + BumpPerfCount(&perf_context.get_from_memtable_count); + return found_final_value; } -bool MemTable::Update(SequenceNumber seq, ValueType type, +void MemTable::Update(SequenceNumber seq, const Slice& key, const Slice& value) { LookupKey lkey(key, seq); - Slice memkey = lkey.memtable_key(); + Slice mem_key = lkey.memtable_key(); std::unique_ptr iter( - table_->GetIterator(lkey.user_key())); - iter->Seek(memkey.data()); + table_->GetIterator(lkey.user_key())); + iter->Seek(lkey.internal_key(), mem_key.data()); if (iter->Valid()) { // entry format is: - // klength varint32 + // key_length 
varint32 // userkey char[klength-8] // tag uint64 // vlength varint32 @@ -286,7 +347,7 @@ bool MemTable::Update(SequenceNumber seq, ValueType type, // sequence number since the Seek() call above should have skipped // all entries with overly large sequence numbers. const char* entry = iter->key(); - uint32_t key_length; + uint32_t key_length = 0; const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); if (comparator_.comparator.user_comparator()->Compare( Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) { @@ -294,32 +355,105 @@ bool MemTable::Update(SequenceNumber seq, ValueType type, const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); switch (static_cast(tag & 0xff)) { case kTypeValue: { - uint32_t vlength; - GetVarint32Ptr(key_ptr + key_length, - key_ptr + key_length+5, &vlength); - // Update value, if newValue size <= curValue size - if (value.size() <= vlength) { + Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); + uint32_t prev_size = prev_value.size(); + uint32_t new_size = value.size(); + + // Update value, if new value size <= previous value size + if (new_size <= prev_size ) { char* p = EncodeVarint32(const_cast(key_ptr) + key_length, - value.size()); + new_size); WriteLock wl(GetLock(lkey.user_key())); memcpy(p, value.data(), value.size()); assert((unsigned)((p + value.size()) - entry) == (unsigned)(VarintLength(key_length) + key_length + VarintLength(value.size()) + value.size())); - return true; + return; } } default: // If the latest value is kTypeDeletion, kTypeMerge or kTypeLogData - // then we probably don't have enough space to update in-place - // Maybe do something later - // Return false, and do normal Add() - return false; + // we don't have enough space for update inplace + Add(seq, kTypeValue, key, value); + return; } } } - // Key doesn't exist + // key doesn't exist + Add(seq, kTypeValue, key, value); +} + +bool MemTable::UpdateCallback(SequenceNumber seq, + const Slice& key, + const Slice& delta, 
+ const Options& options) { + LookupKey lkey(key, seq); + Slice memkey = lkey.memtable_key(); + + std::shared_ptr iter( + table_->GetIterator(lkey.user_key())); + iter->Seek(lkey.internal_key(), memkey.data()); + + if (iter->Valid()) { + // entry format is: + // key_length varint32 + // userkey char[klength-8] + // tag uint64 + // vlength varint32 + // value char[vlength] + // Check that it belongs to same user key. We do not check the + // sequence number since the Seek() call above should have skipped + // all entries with overly large sequence numbers. + const char* entry = iter->key(); + uint32_t key_length = 0; + const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if (comparator_.comparator.user_comparator()->Compare( + Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) { + // Correct user key + const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); + switch (static_cast(tag & 0xff)) { + case kTypeValue: { + Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); + uint32_t prev_size = prev_value.size(); + + char* prev_buffer = const_cast(prev_value.data()); + uint32_t new_prev_size = prev_size; + + std::string str_value; + WriteLock wl(GetLock(lkey.user_key())); + auto status = options.inplace_callback(prev_buffer, &new_prev_size, + delta, &str_value); + if (status == UpdateStatus::UPDATED_INPLACE) { + // Value already updated by callback. + assert(new_prev_size <= prev_size); + if (new_prev_size < prev_size) { + // overwrite the new prev_size + char* p = EncodeVarint32(const_cast(key_ptr) + key_length, + new_prev_size); + if (VarintLength(new_prev_size) < VarintLength(prev_size)) { + // shift the value buffer as well. 
+ memcpy(p, prev_buffer, new_prev_size); + } + } + RecordTick(options.statistics.get(), NUMBER_KEYS_UPDATED); + return true; + } else if (status == UpdateStatus::UPDATED) { + Add(seq, kTypeValue, key, Slice(str_value)); + RecordTick(options.statistics.get(), NUMBER_KEYS_WRITTEN); + return true; + } else if (status == UpdateStatus::UPDATE_FAILED) { + // No action required. Return. + return true; + } + } + default: + break; + } + } + } + // If the latest value is not kTypeValue + // or key doesn't exist return false; } @@ -331,13 +465,13 @@ size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) { // The iterator only needs to be ordered within the same user key. std::unique_ptr iter( table_->GetIterator(key.user_key())); - iter->Seek(memkey.data()); + iter->Seek(key.internal_key(), memkey.data()); size_t num_successive_merges = 0; for (; iter->Valid(); iter->Next()) { const char* entry = iter->key(); - uint32_t key_length; + uint32_t key_length = 0; const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); if (!comparator_.comparator.user_comparator()->Compare( Slice(iter_key_ptr, key_length - 8), key.user_key()) == 0) { diff --git a/db/memtable.h b/db/memtable.h index 415c7070b..61bebaee0 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -16,7 +16,8 @@ #include "db/version_edit.h" #include "rocksdb/db.h" #include "rocksdb/memtablerep.h" -#include "util/arena_impl.h" +#include "util/arena.h" +#include "util/dynamic_bloom.h" namespace rocksdb { @@ -29,7 +30,10 @@ class MemTable { struct KeyComparator : public MemTableRep::KeyComparator { const InternalKeyComparator comparator; explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { } - virtual int operator()(const char* a, const char* b) const; + virtual int operator()(const char* prefix_len_key1, + const char* prefix_len_key2) const; + virtual int operator()(const char* prefix_len_key, + const Slice& key) const override; }; // MemTables are reference counted. 
The initial reference count @@ -94,16 +98,31 @@ class MemTable { bool Get(const LookupKey& key, std::string* value, Status* s, MergeContext& merge_context, const Options& options); - // Update the value and return status ok, - // if key exists in current memtable - // if new sizeof(new_value) <= sizeof(old_value) && - // old_value for that key is a put i.e. kTypeValue - // else return false, and status - NotUpdatable() - // else return false, and status - NotFound() - bool Update(SequenceNumber seq, ValueType type, + // Attempts to update the new_value inplace, else does normal Add + // Pseudocode + // if key exists in current memtable && prev_value is of type kTypeValue + // if new sizeof(new_value) <= sizeof(prev_value) + // update inplace + // else add(key, new_value) + // else add(key, new_value) + void Update(SequenceNumber seq, const Slice& key, const Slice& value); + // If prev_value for key exists, attempts to update it inplace. + // else returns false + // Pseudocode + // if key exists in current memtable && prev_value is of type kTypeValue + // new_value = delta(prev_value) + // if sizeof(new_value) <= sizeof(prev_value) + // update inplace + // else add(key, new_value) + // else return false + bool UpdateCallback(SequenceNumber seq, + const Slice& key, + const Slice& delta, + const Options& options); + // Returns the number of successive merge entries starting from the newest + // entry for the key up to the last non-merge entry or last entry for the + // key in the memtable. 
@@ -142,7 +161,7 @@ class MemTable { KeyComparator comparator_; int refs_; - ArenaImpl arena_impl_; + Arena arena_; unique_ptr table_; // These are used to manage memtable flushes to storage @@ -150,7 +169,7 @@ class MemTable { bool flush_completed_; // finished the flush uint64_t file_number_; // filled up after flush is complete - // The udpates to be applied to the transaction log when this + // The updates to be applied to the transaction log when this // memtable is flushed to storage. VersionEdit edit_; @@ -173,6 +192,11 @@ class MemTable { // Get the lock associated for the key port::RWMutex* GetLock(const Slice& key); + + const SliceTransform* const prefix_extractor_; + std::unique_ptr prefix_bloom_; }; +extern const char* EncodeKey(std::string* scratch, const Slice& target); + } // namespace rocksdb diff --git a/db/memtablelist.cc b/db/memtable_list.cc similarity index 94% rename from db/memtablelist.cc rename to db/memtable_list.cc index cbfb7c85e..240edde15 100644 --- a/db/memtablelist.cc +++ b/db/memtable_list.cc @@ -3,7 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. // -#include "db/memtablelist.h" +#include "db/memtable_list.h" #include #include "rocksdb/db.h" @@ -31,7 +31,7 @@ MemTableListVersion::MemTableListVersion(MemTableListVersion* old) { void MemTableListVersion::Ref() { ++refs_; } -void MemTableListVersion::Unref(std::vector* to_delete) { +void MemTableListVersion::Unref(autovector* to_delete) { assert(refs_ >= 1); --refs_; if (refs_ == 0) { @@ -103,7 +103,7 @@ bool MemTableList::IsFlushPending() { } // Returns the memtables that need to be flushed. 
-void MemTableList::PickMemtablesToFlush(std::vector* ret) { +void MemTableList::PickMemtablesToFlush(autovector* ret) { const auto& memlist = current_->memlist_; for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { MemTable* m = *it; @@ -113,18 +113,18 @@ void MemTableList::PickMemtablesToFlush(std::vector* ret) { if (num_flush_not_started_ == 0) { imm_flush_needed.Release_Store(nullptr); } - m->flush_in_progress_ = true; // flushing will start very soon + m->flush_in_progress_ = true; // flushing will start very soon ret->push_back(m); } } - flush_requested_ = false; // start-flush request is complete + flush_requested_ = false; // start-flush request is complete } // Record a successful flush in the manifest file Status MemTableList::InstallMemtableFlushResults( - ColumnFamilyData* cfd, const std::vector& mems, VersionSet* vset, + ColumnFamilyData* cfd, const autovector& mems, VersionSet* vset, Status flushStatus, port::Mutex* mu, Logger* info_log, uint64_t file_number, - std::set& pending_outputs, std::vector* to_delete, + std::set& pending_outputs, autovector* to_delete, Directory* db_directory) { mu->AssertHeld(); diff --git a/db/memtablelist.h b/db/memtable_list.h similarity index 90% rename from db/memtablelist.h rename to db/memtable_list.h index d4fee3afd..9ade48798 100644 --- a/db/memtablelist.h +++ b/db/memtable_list.h @@ -3,18 +3,25 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
// - #pragma once + #include #include #include #include +#include #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/iterator.h" + #include "db/dbformat.h" +#include "db/memtable.h" #include "db/skiplist.h" #include "db/memtable.h" +#include "rocksdb/db.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "util/autovector.h" namespace rocksdb { @@ -30,7 +37,7 @@ class MemTableListVersion { explicit MemTableListVersion(MemTableListVersion* old = nullptr); void Ref(); - void Unref(std::vector* to_delete = nullptr); + void Unref(autovector* to_delete = nullptr); int size() const; @@ -89,14 +96,14 @@ class MemTableList { // Returns the earliest memtables that needs to be flushed. The returned // memtables are guaranteed to be in the ascending order of created time. - void PickMemtablesToFlush(std::vector* mems); + void PickMemtablesToFlush(autovector* mems); // Commit a successful flush in the manifest file Status InstallMemtableFlushResults( - ColumnFamilyData* cfd, const std::vector& m, VersionSet* vset, + ColumnFamilyData* cfd, const autovector& m, VersionSet* vset, Status flushStatus, port::Mutex* mu, Logger* info_log, uint64_t file_number, std::set& pending_outputs, - std::vector* to_delete, Directory* db_directory); + autovector* to_delete, Directory* db_directory); // New memtables are inserted at the front of the list. // Takes ownership of the referenced held on *m by the caller of Add(). 
diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index 0934de0cd..472cc719a 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -174,6 +174,13 @@ void ProfileKeyComparison() { HistogramImpl hist_put; HistogramImpl hist_get; + HistogramImpl hist_get_snapshot; + HistogramImpl hist_get_memtable; + HistogramImpl hist_get_post_process; + HistogramImpl hist_num_memtable_checked; + HistogramImpl hist_write_pre_post; + HistogramImpl hist_write_wal_time; + HistogramImpl hist_write_memtable_time; std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n"; @@ -192,16 +199,37 @@ void ProfileKeyComparison() { perf_context.Reset(); db->Put(write_options, key, value); + hist_write_pre_post.Add(perf_context.write_pre_and_post_process_time); + hist_write_wal_time.Add(perf_context.write_wal_time); + hist_write_memtable_time.Add(perf_context.write_memtable_time); hist_put.Add(perf_context.user_key_comparison_count); perf_context.Reset(); db->Get(read_options, key, &value); + hist_get_snapshot.Add(perf_context.get_snapshot_time); + hist_get_memtable.Add(perf_context.get_from_memtable_time); + hist_num_memtable_checked.Add(perf_context.get_from_memtable_count); + hist_get_post_process.Add(perf_context.get_post_process_time); hist_get.Add(perf_context.user_key_comparison_count); } std::cout << "Put uesr key comparison: \n" << hist_put.ToString() << "Get uesr key comparison: \n" << hist_get.ToString(); - + std::cout << "Put(): Pre and Post Process Time: \n" + << hist_write_pre_post.ToString() + << " Writing WAL time: \n" + << hist_write_wal_time.ToString() << "\n" + << " Writing Mem Table time: \n" + << hist_write_memtable_time.ToString() << "\n"; + + std::cout << "Get(): Time to get snapshot: \n" + << hist_get_snapshot.ToString() + << " Time to get value from memtables: \n" + << hist_get_memtable.ToString() << "\n" + << " Number of memtables checked: \n" + << hist_num_memtable_checked.ToString() << "\n" + << " Time to post process: \n" + 
<< hist_get_post_process.ToString() << "\n"; } TEST(PerfContextTest, KeyComparisonCount) { @@ -259,8 +287,8 @@ TEST(PerfContextTest, SeekKeyComparison) { db->Put(write_options, key, value); auto put_time = timer.ElapsedNanos(); hist_put_time.Add(put_time); - hist_wal_time.Add(perf_context.wal_write_time); - hist_time_diff.Add(put_time - perf_context.wal_write_time); + hist_wal_time.Add(perf_context.write_wal_time); + hist_time_diff.Add(put_time - perf_context.write_wal_time); } std::cout << "Put time:\n" << hist_put_time.ToString() diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc new file mode 100644 index 000000000..0d554278c --- /dev/null +++ b/db/plain_table_db_test.cc @@ -0,0 +1,337 @@ +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include +#include + +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "table/plain_table_factory.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "utilities/merge_operators.h" + +using std::unique_ptr; + +namespace rocksdb { + +class PlainTableDBTest { + protected: + private: + std::string dbname_; + Env* env_; + DB* db_; + + Options last_options_; + static std::unique_ptr prefix_transform; + + public: + PlainTableDBTest() : env_(Env::Default()) { + dbname_ = test::TmpDir() + "/plain_table_db_test"; + ASSERT_OK(DestroyDB(dbname_, Options())); + db_ = nullptr; + Reopen(); + } + + ~PlainTableDBTest() { + delete db_; + ASSERT_OK(DestroyDB(dbname_, Options())); + } + + // Return the current option configuration. 
+ Options CurrentOptions() { + Options options; + options.table_factory.reset(new PlainTableFactory(16, 2, 0.8)); + options.prefix_extractor = prefix_transform.get(); + options.allow_mmap_reads = true; + return options; + } + + DBImpl* dbfull() { + return reinterpret_cast(db_); + } + + void Reopen(Options* options = nullptr) { + ASSERT_OK(TryReopen(options)); + } + + void Close() { + delete db_; + db_ = nullptr; + } + + void DestroyAndReopen(Options* options = nullptr) { + //Destroy using last options + Destroy(&last_options_); + ASSERT_OK(TryReopen(options)); + } + + void Destroy(Options* options) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, *options)); + } + + Status PureReopen(Options* options, DB** db) { + return DB::Open(*options, dbname_, db); + } + + Status TryReopen(Options* options = nullptr) { + delete db_; + db_ = nullptr; + Options opts; + if (options != nullptr) { + opts = *options; + } else { + opts = CurrentOptions(); + opts.create_if_missing = true; + } + last_options_ = opts; + + return DB::Open(opts, dbname_, &db_); + } + + Status Put(const Slice& k, const Slice& v) { + return db_->Put(WriteOptions(), k, v); + } + + Status Delete(const std::string& k) { + return db_->Delete(WriteOptions(), k); + } + + std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + + int NumTableFilesAtLevel(int level) { + std::string property; + ASSERT_TRUE( + db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level), + &property)); + return atoi(property.c_str()); + } + + // Return spread of files per level + std::string FilesPerLevel() { + std::string result; + int last_non_zero_offset = 0; + for (int level = 0; level < db_->NumberLevels(); level++) { + int f = 
NumTableFilesAtLevel(level); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } + + std::string IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; + } +}; + +std::unique_ptr PlainTableDBTest::prefix_transform( + NewFixedPrefixTransform(8)); + +TEST(PlainTableDBTest, Empty) { + ASSERT_TRUE(dbfull() != nullptr); + ASSERT_EQ("NOT_FOUND", Get("0000000000000foo")); +} + +TEST(PlainTableDBTest, ReadWrite) { + ASSERT_OK(Put("1000000000000foo", "v1")); + ASSERT_EQ("v1", Get("1000000000000foo")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("1000000000000foo", "v3")); + ASSERT_EQ("v3", Get("1000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); +} + +TEST(PlainTableDBTest, Flush) { + ASSERT_OK(Put("1000000000000foo", "v1")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("1000000000000foo", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("1000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); +} + +TEST(PlainTableDBTest, Iterator) { + ASSERT_OK(Put("1000000000foo002", "v_2")); + ASSERT_OK(Put("0000000000000bar", "random")); + ASSERT_OK(Put("1000000000foo001", "v1")); + ASSERT_OK(Put("3000000000000bar", "bar_v")); + ASSERT_OK(Put("1000000000foo003", "v__3")); + ASSERT_OK(Put("1000000000foo004", "v__4")); + ASSERT_OK(Put("1000000000foo005", "v__5")); + ASSERT_OK(Put("1000000000foo007", "v__7")); + ASSERT_OK(Put("1000000000foo008", "v__8")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v1", Get("1000000000foo001")); + ASSERT_EQ("v__3", Get("1000000000foo003")); + ReadOptions ro; + Iterator* iter = dbfull()->NewIterator(ro); + iter->Seek("1000000000foo001"); + ASSERT_TRUE(iter->Valid()); + 
ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo002", iter->key().ToString()); + ASSERT_EQ("v_2", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo003", iter->key().ToString()); + ASSERT_EQ("v__3", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo004", iter->key().ToString()); + ASSERT_EQ("v__4", iter->value().ToString()); + + iter->Seek("3000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + ASSERT_EQ("bar_v", iter->value().ToString()); + + iter->Seek("1000000000foo000"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); + + iter->Seek("1000000000foo005"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo005", iter->key().ToString()); + ASSERT_EQ("v__5", iter->value().ToString()); + + iter->Seek("1000000000foo006"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo007", iter->key().ToString()); + ASSERT_EQ("v__7", iter->value().ToString()); + + iter->Seek("1000000000foo008"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo008", iter->key().ToString()); + ASSERT_EQ("v__8", iter->value().ToString()); + + iter->Seek("1000000000foo009"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + + + delete iter; +} + +TEST(PlainTableDBTest, Flush2) { + ASSERT_OK(Put("0000000000000bar", "b")); + ASSERT_OK(Put("1000000000000foo", "v1")); + dbfull()->TEST_FlushMemTable(); + + ASSERT_OK(Put("1000000000000foo", "v2")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("1000000000000foo")); + + ASSERT_OK(Put("0000000000000eee", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("0000000000000eee")); + + ASSERT_OK(Delete("0000000000000bar")); + 
dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); + + ASSERT_OK(Put("0000000000000eee", "v5")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v5", Get("0000000000000eee")); +} + +static std::string Key(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "key_______%06d", i); + return std::string(buf); +} + +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} + +TEST(PlainTableDBTest, CompactionTrigger) { + Options options = CurrentOptions(); + options.write_buffer_size = 100 << 10; //100KB + options.num_levels = 3; + options.max_mem_compaction_level = 0; + options.level0_file_num_compaction_trigger = 3; + Reopen(&options); + + Random rnd(301); + + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; + num++) { + std::vector values; + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + } + + //generate one more file in level-0, and should trigger level-0 compaction + std::vector values; + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/prefix_test.cc b/db/prefix_test.cc index 7e5e9cc0e..ca00c31b3 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -16,11 +16,15 @@ DEFINE_bool(trigger_deadlock, false, DEFINE_uint64(bucket_count, 100000, "number of buckets"); DEFINE_uint64(num_locks, 10001, "number of locks"); DEFINE_bool(random_prefix, false, "randomize prefix"); -DEFINE_uint64(total_prefixes, 1000, "total number of prefixes"); 
-DEFINE_uint64(items_per_prefix, 10, "total number of values per prefix"); -DEFINE_int64(write_buffer_size, 1000000000, ""); -DEFINE_int64(max_write_buffer_number, 8, ""); -DEFINE_int64(min_write_buffer_number_to_merge, 7, ""); +DEFINE_uint64(total_prefixes, 100000, "total number of prefixes"); +DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix"); +DEFINE_int64(write_buffer_size, 33554432, ""); +DEFINE_int64(max_write_buffer_number, 2, ""); +DEFINE_int64(min_write_buffer_number_to_merge, 1, ""); +DEFINE_int32(skiplist_height, 4, ""); +DEFINE_int32(memtable_prefix_bloom_bits, 10000000, ""); +DEFINE_int32(memtable_prefix_bloom_probes, 10, ""); +DEFINE_int32(value_size, 40, ""); // Path to the database on file system const std::string kDbName = rocksdb::test::TmpDir() + "/prefix_test"; @@ -104,218 +108,265 @@ class PrefixTest { options.min_write_buffer_number_to_merge = FLAGS_min_write_buffer_number_to_merge; - options.comparator = new TestKeyComparator(); - if (FLAGS_use_prefix_hash_memtable) { - auto prefix_extractor = NewFixedPrefixTransform(8); - options.prefix_extractor = prefix_extractor; - options.memtable_factory.reset(NewHashSkipListRepFactory( - prefix_extractor, FLAGS_bucket_count)); - } + options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits; + options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes; Status s = DB::Open(options, kDbName, &db); ASSERT_OK(s); return std::shared_ptr(db); } + + bool NextOptions() { + // skip some options + option_config_++; + if (option_config_ < kEnd) { + auto prefix_extractor = NewFixedPrefixTransform(8); + options.prefix_extractor = prefix_extractor; + switch(option_config_) { + case kHashSkipList: + options.memtable_factory.reset( + NewHashSkipListRepFactory(options.prefix_extractor, + FLAGS_bucket_count, + FLAGS_skiplist_height)); + return true; + case kHashLinkList: + options.memtable_factory.reset( + NewHashLinkListRepFactory(options.prefix_extractor, + 
FLAGS_bucket_count)); + return true; + default: + return false; + } + } + return false; + } + + PrefixTest() : option_config_(kBegin) { + options.comparator = new TestKeyComparator(); + } ~PrefixTest() { delete options.comparator; } protected: + enum OptionConfig { + kBegin, + kHashSkipList, + kHashLinkList, + kEnd + }; + int option_config_; Options options; }; TEST(PrefixTest, DynamicPrefixIterator) { + while (NextOptions()) { + std::cout << "*** Mem table: " << options.memtable_factory->Name() + << std::endl; + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + std::vector prefixes; + for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) { + prefixes.push_back(i); + } - DestroyDB(kDbName, Options()); - auto db = OpenDb(); - WriteOptions write_options; - ReadOptions read_options; + if (FLAGS_random_prefix) { + std::random_shuffle(prefixes.begin(), prefixes.end()); + } - std::vector prefixes; - for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) { - prefixes.push_back(i); - } + HistogramImpl hist_put_time; + HistogramImpl hist_put_comparison; - if (FLAGS_random_prefix) { - std::random_shuffle(prefixes.begin(), prefixes.end()); - } + // insert x random prefix, each with y continuous element. + for (auto prefix : prefixes) { + for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { + TestKey test_key(prefix, sorted); - // insert x random prefix, each with y continuous element. 
- for (auto prefix : prefixes) { - for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { - TestKey test_key(prefix, sorted); + Slice key = TestKeyToSlice(test_key); + std::string value(FLAGS_value_size, 0); - Slice key = TestKeyToSlice(test_key); - std::string value = "v" + std::to_string(sorted); + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + ASSERT_OK(db->Put(write_options, key, value)); + hist_put_time.Add(timer.ElapsedNanos()); + hist_put_comparison.Add(perf_context.user_key_comparison_count); + } + } + + std::cout << "Put key comparison: \n" << hist_put_comparison.ToString() + << "Put time: \n" << hist_put_time.ToString(); - ASSERT_OK(db->Put(write_options, key, value)); + // test seek existing keys + HistogramImpl hist_seek_time; + HistogramImpl hist_seek_comparison; + + if (FLAGS_use_prefix_hash_memtable) { + read_options.prefix_seek = true; } - } + std::unique_ptr iter(db->NewIterator(read_options)); - // test seek existing keys - HistogramImpl hist_seek_time; - HistogramImpl hist_seek_comparison; + for (auto prefix : prefixes) { + TestKey test_key(prefix, FLAGS_items_per_prefix / 2); + Slice key = TestKeyToSlice(test_key); + std::string value = "v" + std::to_string(0); - if (FLAGS_use_prefix_hash_memtable) { - read_options.prefix_seek = true; - } - std::unique_ptr iter(db->NewIterator(read_options)); - - for (auto prefix : prefixes) { - TestKey test_key(prefix, FLAGS_items_per_prefix / 2); - Slice key = TestKeyToSlice(test_key); - std::string value = "v" + std::to_string(0); - - perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); - uint64_t total_keys = 0; - for (iter->Seek(key); iter->Valid(); iter->Next()) { - if (FLAGS_trigger_deadlock) { - std::cout << "Behold the deadlock!\n"; - db->Delete(write_options, iter->key()); + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + uint64_t total_keys = 0; + for (iter->Seek(key); iter->Valid(); iter->Next()) { + if (FLAGS_trigger_deadlock) 
{ + std::cout << "Behold the deadlock!\n"; + db->Delete(write_options, iter->key()); + } + auto test_key = SliceToTestKey(iter->key()); + if (test_key->prefix != prefix) break; + total_keys++; } - auto test_key = SliceToTestKey(iter->key()); - if (test_key->prefix != prefix) break; - total_keys++; + hist_seek_time.Add(timer.ElapsedNanos()); + hist_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2); } - hist_seek_time.Add(timer.ElapsedNanos()); - hist_seek_comparison.Add(perf_context.user_key_comparison_count); - ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2); - } - std::cout << "Seek key comparison: \n" - << hist_seek_comparison.ToString() - << "Seek time: \n" - << hist_seek_time.ToString(); - - // test non-existing keys - HistogramImpl hist_no_seek_time; - HistogramImpl hist_no_seek_comparison; - - for (auto prefix = FLAGS_total_prefixes; - prefix < FLAGS_total_prefixes + 100; - prefix++) { - TestKey test_key(prefix, 0); - Slice key = TestKeyToSlice(test_key); - - perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); - iter->Seek(key); - hist_no_seek_time.Add(timer.ElapsedNanos()); - hist_no_seek_comparison.Add(perf_context.user_key_comparison_count); - ASSERT_TRUE(!iter->Valid()); - } + std::cout << "Seek key comparison: \n" + << hist_seek_comparison.ToString() + << "Seek time: \n" + << hist_seek_time.ToString(); - std::cout << "non-existing Seek key comparison: \n" - << hist_no_seek_comparison.ToString() - << "non-existing Seek time: \n" - << hist_no_seek_time.ToString(); -} + // test non-existing keys + HistogramImpl hist_no_seek_time; + HistogramImpl hist_no_seek_comparison; -TEST(PrefixTest, PrefixHash) { + for (auto prefix = FLAGS_total_prefixes; + prefix < FLAGS_total_prefixes + 10000; + prefix++) { + TestKey test_key(prefix, 0); + Slice key = TestKeyToSlice(test_key); - DestroyDB(kDbName, Options()); - auto db = OpenDb(); - 
WriteOptions write_options; - ReadOptions read_options; + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + iter->Seek(key); + hist_no_seek_time.Add(timer.ElapsedNanos()); + hist_no_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_TRUE(!iter->Valid()); + } - std::vector prefixes; - for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) { - prefixes.push_back(i); + std::cout << "non-existing Seek key comparison: \n" + << hist_no_seek_comparison.ToString() + << "non-existing Seek time: \n" + << hist_no_seek_time.ToString(); } +} - if (FLAGS_random_prefix) { - std::random_shuffle(prefixes.begin(), prefixes.end()); - } +TEST(PrefixTest, PrefixHash) { + while (NextOptions()) { + std::cout << "*** Mem table: " << options.memtable_factory->Name() + << std::endl; + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + std::vector prefixes; + for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) { + prefixes.push_back(i); + } - // insert x random prefix, each with y continuous element. - HistogramImpl hist_put_time; - HistogramImpl hist_put_comparison; + if (FLAGS_random_prefix) { + std::random_shuffle(prefixes.begin(), prefixes.end()); + } - for (auto prefix : prefixes) { - for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { - TestKey test_key(prefix, sorted); + // insert x random prefix, each with y continuous element. 
+ HistogramImpl hist_put_time; + HistogramImpl hist_put_comparison; - Slice key = TestKeyToSlice(test_key); - std::string value = "v" + std::to_string(sorted); + for (auto prefix : prefixes) { + for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { + TestKey test_key(prefix, sorted); - perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); - ASSERT_OK(db->Put(write_options, key, value)); - hist_put_time.Add(timer.ElapsedNanos()); - hist_put_comparison.Add(perf_context.user_key_comparison_count); + Slice key = TestKeyToSlice(test_key); + std::string value = "v" + std::to_string(sorted); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + ASSERT_OK(db->Put(write_options, key, value)); + hist_put_time.Add(timer.ElapsedNanos()); + hist_put_comparison.Add(perf_context.user_key_comparison_count); + } } - } - std::cout << "Put key comparison: \n" << hist_put_comparison.ToString() - << "Put time: \n" << hist_put_time.ToString(); + std::cout << "Put key comparison: \n" << hist_put_comparison.ToString() + << "Put time: \n" << hist_put_time.ToString(); - // test seek existing keys - HistogramImpl hist_seek_time; - HistogramImpl hist_seek_comparison; + // test seek existing keys + HistogramImpl hist_seek_time; + HistogramImpl hist_seek_comparison; - for (auto prefix : prefixes) { - TestKey test_key(prefix, 0); - Slice key = TestKeyToSlice(test_key); - std::string value = "v" + std::to_string(0); + for (auto prefix : prefixes) { + TestKey test_key(prefix, 0); + Slice key = TestKeyToSlice(test_key); + std::string value = "v" + std::to_string(0); - Slice key_prefix; - if (FLAGS_use_prefix_hash_memtable) { - key_prefix = options.prefix_extractor->Transform(key); - read_options.prefix = &key_prefix; - } - std::unique_ptr iter(db->NewIterator(read_options)); + Slice key_prefix; + if (FLAGS_use_prefix_hash_memtable) { + key_prefix = options.prefix_extractor->Transform(key); + read_options.prefix = &key_prefix; + } + std::unique_ptr 
iter(db->NewIterator(read_options)); - perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); - uint64_t total_keys = 0; - for (iter->Seek(key); iter->Valid(); iter->Next()) { - if (FLAGS_trigger_deadlock) { - std::cout << "Behold the deadlock!\n"; - db->Delete(write_options, iter->key()); + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + uint64_t total_keys = 0; + for (iter->Seek(key); iter->Valid(); iter->Next()) { + if (FLAGS_trigger_deadlock) { + std::cout << "Behold the deadlock!\n"; + db->Delete(write_options, iter->key()); + } + auto test_key = SliceToTestKey(iter->key()); + if (test_key->prefix != prefix) break; + total_keys++; } - auto test_key = SliceToTestKey(iter->key()); - if (test_key->prefix != prefix) break; - total_keys++; + hist_seek_time.Add(timer.ElapsedNanos()); + hist_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_EQ(total_keys, FLAGS_items_per_prefix); } - hist_seek_time.Add(timer.ElapsedNanos()); - hist_seek_comparison.Add(perf_context.user_key_comparison_count); - ASSERT_EQ(total_keys, FLAGS_items_per_prefix); - } - std::cout << "Seek key comparison: \n" - << hist_seek_comparison.ToString() - << "Seek time: \n" - << hist_seek_time.ToString(); + std::cout << "Seek key comparison: \n" + << hist_seek_comparison.ToString() + << "Seek time: \n" + << hist_seek_time.ToString(); - // test non-existing keys - HistogramImpl hist_no_seek_time; - HistogramImpl hist_no_seek_comparison; + // test non-existing keys + HistogramImpl hist_no_seek_time; + HistogramImpl hist_no_seek_comparison; - for (auto prefix = FLAGS_total_prefixes; - prefix < FLAGS_total_prefixes + 100; - prefix++) { - TestKey test_key(prefix, 0); - Slice key = TestKeyToSlice(test_key); + for (auto prefix = FLAGS_total_prefixes; + prefix < FLAGS_total_prefixes + 100; + prefix++) { + TestKey test_key(prefix, 0); + Slice key = TestKeyToSlice(test_key); - if (FLAGS_use_prefix_hash_memtable) { - Slice key_prefix = 
options.prefix_extractor->Transform(key); - read_options.prefix = &key_prefix; + if (FLAGS_use_prefix_hash_memtable) { + Slice key_prefix = options.prefix_extractor->Transform(key); + read_options.prefix = &key_prefix; + } + std::unique_ptr iter(db->NewIterator(read_options)); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + iter->Seek(key); + hist_no_seek_time.Add(timer.ElapsedNanos()); + hist_no_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_TRUE(!iter->Valid()); } - std::unique_ptr iter(db->NewIterator(read_options)); - perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); - iter->Seek(key); - hist_no_seek_time.Add(timer.ElapsedNanos()); - hist_no_seek_comparison.Add(perf_context.user_key_comparison_count); - ASSERT_TRUE(!iter->Valid()); + std::cout << "non-existing Seek key comparison: \n" + << hist_no_seek_comparison.ToString() + << "non-existing Seek time: \n" + << hist_no_seek_time.ToString(); } - - std::cout << "non-existing Seek key comparison: \n" - << hist_no_seek_comparison.ToString() - << "non-existing Seek time: \n" - << hist_no_seek_time.ToString(); } } diff --git a/db/repair.cc b/db/repair.cc index 72387a71d..ed11870b0 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -231,10 +231,8 @@ class Repairer { FileMetaData meta; meta.number = next_file_number_++; Iterator* iter = mem->NewIterator(); - status = BuildTable(dbname_, env_, options_, storage_options_, - table_cache_, iter, &meta, - icmp_.user_comparator(), 0, 0, - kNoCompression); + status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_, + iter, &meta, icmp_, 0, 0, kNoCompression); delete iter; delete mem->Unref(); delete cf_mems_default; @@ -275,8 +273,9 @@ class Repairer { int counter = 0; Status status = env_->GetFileSize(fname, &t->meta.file_size); if (status.ok()) { + FileMetaData dummy_meta(t->meta.number, t->meta.file_size); Iterator* iter = table_cache_->NewIterator( - ReadOptions(), storage_options_, 
t->meta.number, t->meta.file_size); + ReadOptions(), storage_options_, icmp_, dummy_meta); bool empty = true; ParsedInternalKey parsed; t->min_sequence = 0; diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc index 0f3b89d9b..3d1420c0c 100644 --- a/db/simple_table_db_test.cc +++ b/db/simple_table_db_test.cc @@ -22,6 +22,8 @@ #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" #include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/table_builder.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" @@ -31,6 +33,7 @@ using std::unique_ptr; +// IS THIS FILE STILL NEEDED? namespace rocksdb { // SimpleTable is a simple table format for UNIT TEST ONLY. It is not built @@ -84,15 +87,13 @@ public: Iterator* NewIterator(const ReadOptions&) override; - Status Get( - const ReadOptions&, const Slice& key, void* arg, - bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool), - void (*mark_key_may_exist)(void*) = nullptr) override; + Status Get(const ReadOptions&, const Slice& key, void* arg, + bool (*handle_result)(void* arg, const ParsedInternalKey& k, + const Slice& v, bool), + void (*mark_key_may_exist)(void*) = nullptr) override; uint64_t ApproximateOffsetOf(const Slice& key) override; - bool TEST_KeyInCache(const ReadOptions& options, const Slice& key) override; - void SetupForCompaction() override; TableProperties& GetTableProperties() override; @@ -244,7 +245,8 @@ Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) { return s; } - int compare_result = rep_->options.comparator->Compare(tmp_slice, target); + InternalKeyComparator ikc(rep_->options.comparator); + int compare_result = ikc.Compare(tmp_slice, target); if (compare_result < 0) { if (left == right) { @@ -279,14 +281,20 @@ Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) { return s; } -Status SimpleTableReader::Get( - const ReadOptions& options, const Slice& k, void* 
arg, - bool (*saver)(void*, const Slice&, const Slice&, bool), - void (*mark_key_may_exist)(void*)) { +Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k, + void* arg, + bool (*saver)(void*, const ParsedInternalKey&, + const Slice&, bool), + void (*mark_key_may_exist)(void*)) { Status s; SimpleTableIterator* iter = new SimpleTableIterator(this); for (iter->Seek(k); iter->Valid(); iter->Next()) { - if (!(*saver)(arg, iter->key(), iter->value(), true)) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(iter->key(), &parsed_key)) { + return Status::Corruption(Slice()); + } + + if (!(*saver)(arg, parsed_key, iter->value(), true)) { break; } } @@ -295,11 +303,6 @@ Status SimpleTableReader::Get( return s; } -bool SimpleTableReader::TEST_KeyInCache(const ReadOptions& options, - const Slice& key) { - return false; -} - uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) { return 0; } @@ -540,27 +543,30 @@ public: const char* Name() const override { return "SimpleTable"; } - Status GetTableReader(const Options& options, const EnvOptions& soptions, - unique_ptr && file, - uint64_t file_size, + Status NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_key, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const; - TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, + TableBuilder* NewTableBuilder(const Options& options, + const InternalKeyComparator& internal_key, + WritableFile* file, CompressionType compression_type) const; }; -Status SimpleTableFactory::GetTableReader( +Status SimpleTableFactory::NewTableReader( const Options& options, const EnvOptions& soptions, - unique_ptr && file, uint64_t file_size, + const InternalKeyComparator& internal_key, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const { return SimpleTableReader::Open(options, soptions, std::move(file), file_size, table_reader); } -TableBuilder* 
SimpleTableFactory::GetTableBuilder( - const Options& options, WritableFile* file, - CompressionType compression_type) const { +TableBuilder* SimpleTableFactory::NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_key, + WritableFile* file, CompressionType compression_type) const { return new SimpleTableBuilder(options, file, compression_type); } diff --git a/db/skiplist.h b/db/skiplist.h index 2c9c4a6de..e713fe42a 100644 --- a/db/skiplist.h +++ b/db/skiplist.h @@ -34,8 +34,8 @@ #include #include #include "port/port.h" +#include "util/arena.h" #include "util/random.h" -#include "rocksdb/arena.h" namespace rocksdb { @@ -48,7 +48,8 @@ class SkipList { // Create a new SkipList object that will use "cmp" for comparing keys, // and will allocate memory using "*arena". Objects allocated in the arena // must remain allocated for the lifetime of the skiplist object. - explicit SkipList(Comparator cmp, Arena* arena); + explicit SkipList(Comparator cmp, Arena* arena, + int32_t max_height = 12, int32_t branching_factor = 4); // Insert key into the list. // REQUIRES: nothing that compares equal to key is currently in the list. 
@@ -102,7 +103,8 @@ class SkipList { }; private: - enum { kMaxHeight = 12 }; + const int32_t kMaxHeight_; + const int32_t kBranching_; // Immutable after construction Comparator const compare_; @@ -115,8 +117,8 @@ class SkipList { port::AtomicPointer max_height_; // Height of the entire list // Used for optimizing sequential insert patterns - Node* prev_[kMaxHeight]; - int prev_height_; + Node** prev_; + int32_t prev_height_; inline int GetMaxHeight() const { return static_cast( @@ -258,13 +260,12 @@ inline void SkipList::Iterator::SeekToLast() { template int SkipList::RandomHeight() { // Increase height with probability 1 in kBranching - static const unsigned int kBranching = 4; int height = 1; - while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) { + while (height < kMaxHeight_ && ((rnd_.Next() % kBranching_) == 0)) { height++; } assert(height > 0); - assert(height <= kMaxHeight); + assert(height <= kMaxHeight_); return height; } @@ -354,14 +355,24 @@ typename SkipList::Node* SkipList::FindLast() } template -SkipList::SkipList(Comparator cmp, Arena* arena) - : compare_(cmp), +SkipList::SkipList(Comparator cmp, Arena* arena, + int32_t max_height, + int32_t branching_factor) + : kMaxHeight_(max_height), + kBranching_(branching_factor), + compare_(cmp), arena_(arena), - head_(NewNode(0 /* any key will do */, kMaxHeight)), + head_(NewNode(0 /* any key will do */, max_height)), max_height_(reinterpret_cast(1)), prev_height_(1), rnd_(0xdeadbeef) { - for (int i = 0; i < kMaxHeight; i++) { + assert(kMaxHeight_ > 0); + assert(kBranching_ > 0); + // Allocate the prev_ Node* array, directly from the passed-in arena. + // prev_ does not need to be freed, as its life cycle is tied up with + // the arena as a whole. 
+ prev_ = (Node**) arena_->AllocateAligned(sizeof(Node*) * kMaxHeight_); + for (int i = 0; i < kMaxHeight_; i++) { head_->SetNext(i, nullptr); prev_[i] = head_; } diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc index dcbaf0abb..b87ddcbb0 100644 --- a/db/skiplist_test.cc +++ b/db/skiplist_test.cc @@ -10,7 +10,7 @@ #include "db/skiplist.h" #include #include "rocksdb/env.h" -#include "util/arena_impl.h" +#include "util/arena.h" #include "util/hash.h" #include "util/random.h" #include "util/testharness.h" @@ -34,9 +34,9 @@ struct TestComparator { class SkipTest { }; TEST(SkipTest, Empty) { - ArenaImpl arena_impl; + Arena arena; TestComparator cmp; - SkipList list(cmp, &arena_impl); + SkipList list(cmp, &arena); ASSERT_TRUE(!list.Contains(10)); SkipList::Iterator iter(&list); @@ -54,9 +54,9 @@ TEST(SkipTest, InsertAndLookup) { const int R = 5000; Random rnd(1000); std::set keys; - ArenaImpl arena_impl; + Arena arena; TestComparator cmp; - SkipList list(cmp, &arena_impl); + SkipList list(cmp, &arena); for (int i = 0; i < N; i++) { Key key = rnd.Next() % R; if (keys.insert(key).second) { @@ -209,14 +209,14 @@ class ConcurrentTest { // Current state of the test State current_; - ArenaImpl arena_impl_; + Arena arena_; // SkipList is not protected by mu_. We just use a single writer // thread to modify it. 
SkipList list_; public: - ConcurrentTest() : list_(TestComparator(), &arena_impl_) { } + ConcurrentTest() : list_(TestComparator(), &arena_) {} // REQUIRES: External synchronization void WriteStep(Random* rnd) { diff --git a/db/table_cache.cc b/db/table_cache.cc index adf94182d..3301b98d9 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -10,9 +10,10 @@ #include "db/table_cache.h" #include "db/filename.h" +#include "db/version_edit.h" #include "rocksdb/statistics.h" -#include "rocksdb/table.h" +#include "table/table_reader.h" #include "util/coding.h" #include "util/stop_watch.h" @@ -34,7 +35,6 @@ static Slice GetSliceForFileNumber(uint64_t* file_number) { sizeof(*file_number)); } -// TODO(icanadi) Options -> DBOptions TableCache::TableCache(const std::string& dbname, const Options* options, const EnvOptions& storage_options, Cache* const cache) : env_(options->env), @@ -46,7 +46,16 @@ TableCache::TableCache(const std::string& dbname, const Options* options, TableCache::~TableCache() { } +TableReader* TableCache::GetTableReaderFromHandle(Cache::Handle* handle) { + return reinterpret_cast(cache_->Value(handle)); +} + +void TableCache::ReleaseHandle(Cache::Handle* handle) { + cache_->Release(handle); +} + Status TableCache::FindTable(const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, uint64_t file_number, uint64_t file_size, Cache::Handle** handle, bool* table_io, const bool no_io) { @@ -70,8 +79,9 @@ Status TableCache::FindTable(const EnvOptions& toptions, file->Hint(RandomAccessFile::RANDOM); } StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS); - s = options_->table_factory->GetTableReader( - *options_, toptions, std::move(file), file_size, &table_reader); + s = options_->table_factory->NewTableReader( + *options_, toptions, internal_comparator, std::move(file), file_size, + &table_reader); } if (!s.ok()) { @@ -89,25 +99,28 @@ Status TableCache::FindTable(const EnvOptions& toptions, Iterator* 
TableCache::NewIterator(const ReadOptions& options, const EnvOptions& toptions, - uint64_t file_number, - uint64_t file_size, + const InternalKeyComparator& icomparator, + const FileMetaData& file_meta, TableReader** table_reader_ptr, bool for_compaction) { if (table_reader_ptr != nullptr) { *table_reader_ptr = nullptr; } - - Cache::Handle* handle = nullptr; - Status s = FindTable(toptions, file_number, file_size, &handle, - nullptr, options.read_tier == kBlockCacheTier); + Cache::Handle* handle = file_meta.table_reader_handle; + Status s; + if (!handle) { + s = FindTable(toptions, icomparator, file_meta.number, file_meta.file_size, + &handle, nullptr, options.read_tier == kBlockCacheTier); + } if (!s.ok()) { return NewErrorIterator(s); } - TableReader* table_reader = - reinterpret_cast(cache_->Value(handle)); + TableReader* table_reader = GetTableReaderFromHandle(handle); Iterator* result = table_reader->NewIterator(options); - result->RegisterCleanup(&UnrefEntry, cache_, handle); + if (!file_meta.table_reader_handle) { + result->RegisterCleanup(&UnrefEntry, cache_, handle); + } if (table_reader_ptr != nullptr) { *table_reader_ptr = table_reader; } @@ -120,22 +133,24 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, } Status TableCache::Get(const ReadOptions& options, - uint64_t file_number, - uint64_t file_size, - const Slice& k, - void* arg, - bool (*saver)(void*, const Slice&, const Slice&, bool), - bool* table_io, - void (*mark_key_may_exist)(void*)) { - Cache::Handle* handle = nullptr; - Status s = FindTable(storage_options_, file_number, file_size, - &handle, table_io, - options.read_tier == kBlockCacheTier); + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const Slice& k, void* arg, + bool (*saver)(void*, const ParsedInternalKey&, + const Slice&, bool), + bool* table_io, void (*mark_key_may_exist)(void*)) { + Cache::Handle* handle = file_meta.table_reader_handle; + Status s; + if (!handle) { + s = 
FindTable(storage_options_, internal_comparator, file_meta.number, + file_meta.file_size, &handle, table_io, + options.read_tier == kBlockCacheTier); + } if (s.ok()) { - TableReader* t = - reinterpret_cast(cache_->Value(handle)); + TableReader* t = GetTableReaderFromHandle(handle); s = t->Get(options, k, arg, saver, mark_key_may_exist); - cache_->Release(handle); + if (!file_meta.table_reader_handle) { + ReleaseHandle(handle); + } } else if (options.read_tier && s.IsIncomplete()) { // Couldnt find Table in cache but treat as kFound if no_io set (*mark_key_may_exist)(arg); @@ -145,19 +160,17 @@ Status TableCache::Get(const ReadOptions& options, } bool TableCache::PrefixMayMatch(const ReadOptions& options, - uint64_t file_number, - uint64_t file_size, - const Slice& internal_prefix, - bool* table_io) { + const InternalKeyComparator& icomparator, + uint64_t file_number, uint64_t file_size, + const Slice& internal_prefix, bool* table_io) { Cache::Handle* handle = nullptr; - Status s = FindTable(storage_options_, file_number, - file_size, &handle, table_io); + Status s = FindTable(storage_options_, icomparator, file_number, file_size, + &handle, table_io); bool may_match = true; if (s.ok()) { - TableReader* t = - reinterpret_cast(cache_->Value(handle)); + TableReader* t = GetTableReaderFromHandle(handle); may_match = t->PrefixMayMatch(internal_prefix); - cache_->Release(handle); + ReleaseHandle(handle); } return may_match; } diff --git a/db/table_cache.h b/db/table_cache.h index 9807aeb00..44f47e353 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -12,15 +12,18 @@ #pragma once #include #include + #include "db/dbformat.h" -#include "rocksdb/env.h" -#include "rocksdb/cache.h" #include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/env.h" #include "rocksdb/table.h" +#include "table/table_reader.h" namespace rocksdb { class Env; +struct FileMetaData; class TableCache { public: @@ -35,10 +38,9 @@ class TableCache { // the returned iterator. 
The returned "*tableptr" object is owned by // the cache and should not be deleted, and is valid for as long as the // returned iterator is live. - Iterator* NewIterator(const ReadOptions& options, - const EnvOptions& toptions, - uint64_t file_number, - uint64_t file_size, + Iterator* NewIterator(const ReadOptions& options, const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, TableReader** table_reader_ptr = nullptr, bool for_compaction = false); @@ -46,33 +48,40 @@ class TableCache { // call (*handle_result)(arg, found_key, found_value) repeatedly until // it returns false. Status Get(const ReadOptions& options, - uint64_t file_number, - uint64_t file_size, - const Slice& k, - void* arg, - bool (*handle_result)(void*, const Slice&, const Slice&, bool), - bool* table_io, - void (*mark_key_may_exist)(void*) = nullptr); + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const Slice& k, void* arg, + bool (*handle_result)(void*, const ParsedInternalKey&, + const Slice&, bool), + bool* table_io, void (*mark_key_may_exist)(void*) = nullptr); // Determine whether the table may contain the specified prefix. 
If - // the table index of blooms are not in memory, this may cause an I/O - bool PrefixMayMatch(const ReadOptions& options, uint64_t file_number, - uint64_t file_size, const Slice& internal_prefix, - bool* table_io); + // the table index or blooms are not in memory, this may cause an I/O + bool PrefixMayMatch(const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + uint64_t file_number, uint64_t file_size, + const Slice& internal_prefix, bool* table_io); // Evict any entry for the specified file number static void Evict(Cache* cache, uint64_t file_number); + // Find table reader + Status FindTable(const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, + uint64_t file_number, uint64_t file_size, Cache::Handle**, + bool* table_io = nullptr, const bool no_io = false); + + // Get TableReader from a cache handle. + TableReader* GetTableReaderFromHandle(Cache::Handle* handle); + + // Release the handle from a cache + void ReleaseHandle(Cache::Handle* handle); + private: Env* const env_; const std::string dbname_; const Options* options_; const EnvOptions& storage_options_; Cache* const cache_; - - Status FindTable(const EnvOptions& toptions, uint64_t file_number, - uint64_t file_size, Cache::Handle**, bool* table_io=nullptr, - const bool no_io = false); }; } // namespace rocksdb diff --git a/db/table_properties_collector.cc b/db/table_properties_collector.cc index 3654663c1..25bd70036 100644 --- a/db/table_properties_collector.cc +++ b/db/table_properties_collector.cc @@ -10,87 +10,6 @@ namespace rocksdb { -namespace { - void AppendProperty( - std::string& props, - const std::string& key, - const std::string& value, - const std::string& prop_delim, - const std::string& kv_delim) { - props.append(key); - props.append(kv_delim); - props.append(value); - props.append(prop_delim); - } - - template - void AppendProperty( - std::string& props, - const std::string& key, - const TValue& value, - const std::string& prop_delim, - 
const std::string& kv_delim) { - AppendProperty( - props, key, std::to_string(value), prop_delim, kv_delim - ); - } -} - -std::string TableProperties::ToString( - const std::string& prop_delim, - const std::string& kv_delim) const { - std::string result; - result.reserve(1024); - - // Basic Info - AppendProperty( - result, "# data blocks", num_data_blocks, prop_delim, kv_delim - ); - AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim); - - AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim); - AppendProperty( - result, - "raw average key size", - num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0, - prop_delim, - kv_delim - ); - AppendProperty( - result, "raw value size", raw_value_size, prop_delim, kv_delim - ); - AppendProperty( - result, - "raw average value size", - num_entries != 0 ? 1.0 * raw_value_size / num_entries : 0.0, - prop_delim, - kv_delim - ); - - AppendProperty(result, "data block size", data_size, prop_delim, kv_delim); - AppendProperty(result, "index block size", index_size, prop_delim, kv_delim); - AppendProperty( - result, "filter block size", filter_size, prop_delim, kv_delim - ); - AppendProperty( - result, - "(estimated) table size", - data_size + index_size + filter_size, - prop_delim, - kv_delim - ); - - AppendProperty( - result, - "filter policy name", - filter_policy_name.empty() ? 
std::string("N/A") : filter_policy_name, - prop_delim, - kv_delim - ); - - return result; -} - Status InternalKeyPropertiesCollector::Add( const Slice& key, const Slice& value) { ParsedInternalKey ikey; @@ -106,7 +25,7 @@ Status InternalKeyPropertiesCollector::Add( } Status InternalKeyPropertiesCollector::Finish( - TableProperties::UserCollectedProperties* properties) { + UserCollectedProperties* properties) { assert(properties); assert(properties->find( InternalKeyTablePropertiesNames::kDeletedKeys) == properties->end()); @@ -118,7 +37,7 @@ Status InternalKeyPropertiesCollector::Finish( return Status::OK(); } -TableProperties::UserCollectedProperties +UserCollectedProperties InternalKeyPropertiesCollector::GetReadableProperties() const { return { { "kDeletedKeys", std::to_string(deleted_keys_) } @@ -137,11 +56,11 @@ Status UserKeyTablePropertiesCollector::Add( } Status UserKeyTablePropertiesCollector::Finish( - TableProperties::UserCollectedProperties* properties) { + UserCollectedProperties* properties) { return collector_->Finish(properties); } -TableProperties::UserCollectedProperties +UserCollectedProperties UserKeyTablePropertiesCollector::GetReadableProperties() const { return collector_->GetReadableProperties(); } @@ -151,7 +70,7 @@ const std::string InternalKeyTablePropertiesNames::kDeletedKeys = "rocksdb.deleted.keys"; uint64_t GetDeletedKeys( - const TableProperties::UserCollectedProperties& props) { + const UserCollectedProperties& props) { auto pos = props.find(InternalKeyTablePropertiesNames::kDeletedKeys); if (pos == props.end()) { return 0; diff --git a/db/table_properties_collector.h b/db/table_properties_collector.h index 533130db7..6cf56291a 100644 --- a/db/table_properties_collector.h +++ b/db/table_properties_collector.h @@ -24,15 +24,13 @@ class InternalKeyPropertiesCollector : public TablePropertiesCollector { public: virtual Status Add(const Slice& key, const Slice& value) override; - virtual Status Finish( - 
TableProperties::UserCollectedProperties* properties) override; + virtual Status Finish(UserCollectedProperties* properties) override; virtual const char* Name() const override { return "InternalKeyPropertiesCollector"; } - TableProperties::UserCollectedProperties - GetReadableProperties() const override; + UserCollectedProperties GetReadableProperties() const override; private: uint64_t deleted_keys_ = 0; @@ -61,13 +59,11 @@ class UserKeyTablePropertiesCollector : public TablePropertiesCollector { virtual Status Add(const Slice& key, const Slice& value) override; - virtual Status Finish( - TableProperties::UserCollectedProperties* properties) override; + virtual Status Finish(UserCollectedProperties* properties) override; virtual const char* Name() const override { return collector_->Name(); } - TableProperties::UserCollectedProperties - GetReadableProperties() const override; + UserCollectedProperties GetReadableProperties() const override; protected: std::shared_ptr collector_; diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 6f405b28a..961a7302b 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -7,12 +7,14 @@ #include #include -#include "db/dbformat.h" #include "db/db_impl.h" +#include "db/dbformat.h" #include "db/table_properties_collector.h" -#include "rocksdb/table_properties.h" #include "rocksdb/table.h" #include "table/block_based_table_factory.h" +#include "table/meta_blocks.h" +#include "table/plain_table_factory.h" +#include "table/table_builder.h" #include "util/coding.h" #include "util/testharness.h" #include "util/testutil.h" @@ -20,8 +22,6 @@ namespace rocksdb { class TablePropertiesTest { - private: - unique_ptr table_reader_; }; // TODO(kailiu) the following classes should be moved to some more general @@ -83,30 +83,13 @@ class DumbLogger : public Logger { }; // Utilities test functions -void MakeBuilder( - const Options& options, - std::unique_ptr* 
writable, - std::unique_ptr* builder) { +void MakeBuilder(const Options& options, + const InternalKeyComparator& internal_comparator, + std::unique_ptr* writable, + std::unique_ptr* builder) { writable->reset(new FakeWritableFile); - builder->reset( - options.table_factory->GetTableBuilder(options, writable->get(), - options.compression)); -} - -void OpenTable( - const Options& options, - const std::string& contents, - std::unique_ptr* table_reader) { - - std::unique_ptr file(new FakeRandomeAccessFile(contents)); - auto s = options.table_factory->GetTableReader( - options, - EnvOptions(), - std::move(file), - contents.size(), - table_reader - ); - ASSERT_OK(s); + builder->reset(options.table_factory->NewTableBuilder( + options, internal_comparator, writable->get(), options.compression)); } // Collects keys that starts with "A" in a table. @@ -114,10 +97,10 @@ class RegularKeysStartWithA: public TablePropertiesCollector { public: const char* Name() const { return "RegularKeysStartWithA"; } - Status Finish(TableProperties::UserCollectedProperties* properties) { + Status Finish(UserCollectedProperties* properties) { std::string encoded; PutVarint32(&encoded, count_); - *properties = TableProperties::UserCollectedProperties { + *properties = UserCollectedProperties { { "TablePropertiesTest", "Rocksdb" }, { "Count", encoded } }; @@ -132,8 +115,7 @@ class RegularKeysStartWithA: public TablePropertiesCollector { return Status::OK(); } - virtual TableProperties::UserCollectedProperties - GetReadableProperties() const { + virtual UserCollectedProperties GetReadableProperties() const { return {}; } @@ -142,23 +124,65 @@ class RegularKeysStartWithA: public TablePropertiesCollector { uint32_t count_ = 0; }; -TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) { - Options options; - +extern uint64_t kBlockBasedTableMagicNumber; +extern uint64_t kPlainTableMagicNumber; +void TestCustomizedTablePropertiesCollector( + uint64_t magic_number, bool encode_as_internal, const 
Options& options, + const InternalKeyComparator& internal_comparator) { // make sure the entries will be inserted with order. std::map kvs = { - {"About", "val5"}, // starts with 'A' - {"Abstract", "val2"}, // starts with 'A' - {"Around", "val7"}, // starts with 'A' - {"Beyond", "val3"}, - {"Builder", "val1"}, - {"Cancel", "val4"}, - {"Find", "val6"}, + {"About ", "val5"}, // starts with 'A' + {"Abstract", "val2"}, // starts with 'A' + {"Around ", "val7"}, // starts with 'A' + {"Beyond ", "val3"}, + {"Builder ", "val1"}, + {"Cancel ", "val4"}, + {"Find ", "val6"}, }; + // -- Step 1: build table + std::unique_ptr builder; + std::unique_ptr writable; + MakeBuilder(options, internal_comparator, &writable, &builder); + + for (const auto& kv : kvs) { + if (encode_as_internal) { + InternalKey ikey(kv.first, 0, ValueType::kTypeValue); + builder->Add(ikey.Encode(), kv.second); + } else { + builder->Add(kv.first, kv.second); + } + } + ASSERT_OK(builder->Finish()); + + // -- Step 2: Read properties + FakeRandomeAccessFile readable(writable->contents()); + TableProperties props; + Status s = ReadTableProperties( + &readable, + writable->contents().size(), + magic_number, + Env::Default(), + nullptr, + &props + ); + ASSERT_OK(s); + + auto user_collected = props.user_collected_properties; + + ASSERT_EQ("Rocksdb", user_collected.at("TablePropertiesTest")); + + uint32_t starts_with_A = 0; + Slice key(user_collected.at("Count")); + ASSERT_TRUE(GetVarint32(&key, &starts_with_A)); + ASSERT_EQ(3u, starts_with_A); +} + +TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) { // Test properties collectors with internal keys or regular keys + // for block based table for (bool encode_as_internal : { true, false }) { - // -- Step 1: build table + Options options; auto collector = new RegularKeysStartWithA(); if (encode_as_internal) { options.table_properties_collectors = { @@ -168,97 +192,111 @@ TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) { 
options.table_properties_collectors.resize(1); options.table_properties_collectors[0].reset(collector); } - std::unique_ptr builder; - std::unique_ptr writable; - MakeBuilder(options, &writable, &builder); - - for (const auto& kv : kvs) { - if (encode_as_internal) { - InternalKey ikey(kv.first, 0, ValueType::kTypeValue); - builder->Add(ikey.Encode(), kv.second); - } else { - builder->Add(kv.first, kv.second); - } - } - ASSERT_OK(builder->Finish()); - - // -- Step 2: Open table - std::unique_ptr table_reader; - OpenTable(options, writable->contents(), &table_reader); - const auto& properties = - table_reader->GetTableProperties().user_collected_properties; - - ASSERT_EQ("Rocksdb", properties.at("TablePropertiesTest")); - - uint32_t starts_with_A = 0; - Slice key(properties.at("Count")); - ASSERT_TRUE(GetVarint32(&key, &starts_with_A)); - ASSERT_EQ(3u, starts_with_A); + test::PlainInternalKeyComparator ikc(options.comparator); + TestCustomizedTablePropertiesCollector(kBlockBasedTableMagicNumber, + encode_as_internal, options, ikc); } + + // test plain table + Options options; + options.table_properties_collectors.push_back( + std::make_shared() + ); + options.table_factory = std::make_shared(8, 8, 0); + test::PlainInternalKeyComparator ikc(options.comparator); + TestCustomizedTablePropertiesCollector(kPlainTableMagicNumber, true, options, + ikc); } -TEST(TablePropertiesTest, InternalKeyPropertiesCollector) { +void TestInternalKeyPropertiesCollector( + uint64_t magic_number, + bool sanitized, + std::shared_ptr table_factory) { InternalKey keys[] = { - InternalKey("A", 0, ValueType::kTypeValue), - InternalKey("B", 0, ValueType::kTypeValue), - InternalKey("C", 0, ValueType::kTypeValue), - InternalKey("W", 0, ValueType::kTypeDeletion), - InternalKey("X", 0, ValueType::kTypeDeletion), - InternalKey("Y", 0, ValueType::kTypeDeletion), - InternalKey("Z", 0, ValueType::kTypeDeletion), + InternalKey("A ", 0, ValueType::kTypeValue), + InternalKey("B ", 0, 
ValueType::kTypeValue), + InternalKey("C ", 0, ValueType::kTypeValue), + InternalKey("W ", 0, ValueType::kTypeDeletion), + InternalKey("X ", 0, ValueType::kTypeDeletion), + InternalKey("Y ", 0, ValueType::kTypeDeletion), + InternalKey("Z ", 0, ValueType::kTypeDeletion), }; - for (bool sanitized : { false, true }) { - std::unique_ptr builder; - std::unique_ptr writable; - Options options; - if (sanitized) { - options.table_properties_collectors = { - std::make_shared() - }; - // with sanitization, even regular properties collector will be able to - // handle internal keys. - auto comparator = options.comparator; - // HACK: Set options.info_log to avoid writing log in - // SanitizeOptions(). - options.info_log = std::make_shared(); - options = SanitizeOptions( - "db", // just a place holder - nullptr, // with skip internal key comparator - nullptr, // don't care filter policy - options - ); - options.comparator = comparator; - } else { - options.table_properties_collectors = { - std::make_shared() - }; - } - - MakeBuilder(options, &writable, &builder); - for (const auto& k : keys) { - builder->Add(k.Encode(), "val"); - } + std::unique_ptr builder; + std::unique_ptr writable; + Options options; + test::PlainInternalKeyComparator pikc(options.comparator); + + options.table_factory = table_factory; + if (sanitized) { + options.table_properties_collectors = { + std::make_shared() + }; + // with sanitization, even regular properties collector will be able to + // handle internal keys. + auto comparator = options.comparator; + // HACK: Set options.info_log to avoid writing log in + // SanitizeOptions(). 
+ options.info_log = std::make_shared(); + options = SanitizeOptions("db", // just a place holder + &pikc, nullptr, // don't care filter policy + options); + options.comparator = comparator; + } else { + options.table_properties_collectors = { + std::make_shared() + }; + } - ASSERT_OK(builder->Finish()); + MakeBuilder(options, pikc, &writable, &builder); + for (const auto& k : keys) { + builder->Add(k.Encode(), "val"); + } - std::unique_ptr table_reader; - OpenTable(options, writable->contents(), &table_reader); - const auto& properties = - table_reader->GetTableProperties().user_collected_properties; + ASSERT_OK(builder->Finish()); + + FakeRandomeAccessFile readable(writable->contents()); + TableProperties props; + Status s = ReadTableProperties( + &readable, + writable->contents().size(), + magic_number, + Env::Default(), + nullptr, + &props + ); + ASSERT_OK(s); - uint64_t deleted = GetDeletedKeys(properties); - ASSERT_EQ(4u, deleted); + auto user_collected = props.user_collected_properties; + uint64_t deleted = GetDeletedKeys(user_collected); + ASSERT_EQ(4u, deleted); - if (sanitized) { - uint32_t starts_with_A = 0; - Slice key(properties.at("Count")); - ASSERT_TRUE(GetVarint32(&key, &starts_with_A)); - ASSERT_EQ(1u, starts_with_A); - } + if (sanitized) { + uint32_t starts_with_A = 0; + Slice key(user_collected.at("Count")); + ASSERT_TRUE(GetVarint32(&key, &starts_with_A)); + ASSERT_EQ(1u, starts_with_A); } } +TEST(TablePropertiesTest, InternalKeyPropertiesCollector) { + TestInternalKeyPropertiesCollector( + kBlockBasedTableMagicNumber, + true /* sanitize */, + std::make_shared() + ); + TestInternalKeyPropertiesCollector( + kBlockBasedTableMagicNumber, + true /* not sanitize */, + std::make_shared() + ); + TestInternalKeyPropertiesCollector( + kPlainTableMagicNumber, + false /* not sanitize */, + std::make_shared(8, 8, 0) + ); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/version_edit.cc b/db/version_edit.cc index 
50fc0dec5..87d303e25 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -78,12 +78,10 @@ void VersionEdit::EncodeTo(std::string* dst) const { PutVarint64(dst, last_sequence_); } - for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); - iter != deleted_files_.end(); - ++iter) { + for (const auto& deleted : deleted_files_) { PutVarint32(dst, kDeletedFile); - PutVarint32(dst, iter->first); // level - PutVarint64(dst, iter->second); // file number + PutVarint32(dst, deleted.first /* level */); + PutVarint64(dst, deleted.second /* file number */); } for (size_t i = 0; i < new_files_.size(); i++) { diff --git a/db/version_edit.h b/db/version_edit.h index b2df9f8d3..bd5f0df95 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -12,6 +12,7 @@ #include #include #include +#include "rocksdb/cache.h" #include "db/dbformat.h" namespace rocksdb { @@ -29,8 +30,17 @@ struct FileMetaData { SequenceNumber smallest_seqno;// The smallest seqno in this file SequenceNumber largest_seqno; // The largest seqno in this file - FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0), - being_compacted(false) {} + // Needs to be disposed when refs becomes 0. 
+ Cache::Handle* table_reader_handle; + + FileMetaData(uint64_t number, uint64_t file_size) + : refs(0), + allowed_seeks(1 << 30), + number(number), + file_size(file_size), + being_compacted(false), + table_reader_handle(nullptr) {} + FileMetaData() : FileMetaData(0, 0) {} }; class VersionEdit { @@ -70,6 +80,7 @@ class VersionEdit { const InternalKey& largest, const SequenceNumber& smallest_seqno, const SequenceNumber& largest_seqno) { + assert(smallest_seqno <= largest_seqno); FileMetaData f; f.number = file; f.file_size = file_size; @@ -77,13 +88,12 @@ class VersionEdit { f.largest = largest; f.smallest_seqno = smallest_seqno; f.largest_seqno = largest_seqno; - assert(smallest_seqno <= largest_seqno); new_files_.push_back(std::make_pair(level, f)); } // Delete the specified "file" from the specified "level". void DeleteFile(int level, uint64_t file) { - deleted_files_.insert(std::make_pair(level, file)); + deleted_files_.insert({level, file}); } // Number of edits @@ -120,7 +130,7 @@ class VersionEdit { private: friend class VersionSet; - typedef std::set< std::pair > DeletedFileSet; + typedef std::set< std::pair> DeletedFileSet; bool GetLevel(Slice* input, int* level, const char** msg); diff --git a/db/version_set.cc b/db/version_set.cc index 1f64171c7..228d323b7 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -14,6 +14,7 @@ #include #include #include + #include "db/filename.h" #include "db/log_reader.h" #include "db/log_writer.h" @@ -23,7 +24,7 @@ #include "db/compaction.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" -#include "rocksdb/table.h" +#include "table/table_reader.h" #include "table/merger.h" #include "table/two_level_iterator.h" #include "util/coding.h" @@ -54,6 +55,10 @@ Version::~Version() { assert(f->refs > 0); f->refs--; if (f->refs <= 0) { + if (f->table_reader_handle) { + cfd_->table_cache()->ReleaseHandle(f->table_reader_handle); + f->table_reader_handle = nullptr; + } vset_->obsolete_files_.push_back(f); } } @@ 
-188,11 +193,10 @@ class Version::LevelFileNumIterator : public Iterator { mutable char value_buf_[16]; }; -static Iterator* GetFileIterator(void* arg, - const ReadOptions& options, +static Iterator* GetFileIterator(void* arg, const ReadOptions& options, const EnvOptions& soptions, - const Slice& file_value, - bool for_compaction) { + const InternalKeyComparator& icomparator, + const Slice& file_value, bool for_compaction) { TableCache* cache = reinterpret_cast(arg); if (file_value.size() != 16) { return NewErrorIterator( @@ -205,12 +209,11 @@ static Iterator* GetFileIterator(void* arg, options_copy = options; options_copy.prefix = nullptr; } - return cache->NewIterator(options.prefix ? options_copy : options, - soptions, - DecodeFixed64(file_value.data()), - DecodeFixed64(file_value.data() + 8), - nullptr /* don't need reference to table*/, - for_compaction); + FileMetaData meta(DecodeFixed64(file_value.data()), + DecodeFixed64(file_value.data() + 8)); + return cache->NewIterator( + options.prefix ? 
options_copy : options, soptions, icomparator, meta, + nullptr /* don't need reference to table*/, for_compaction); } } @@ -230,7 +233,8 @@ bool Version::PrefixMayMatch(const ReadOptions& options, may_match = true; } else { may_match = cfd_->table_cache()->PrefixMayMatch( - options, DecodeFixed64(level_iter->value().data()), + options, cfd_->internal_comparator(), + DecodeFixed64(level_iter->value().data()), DecodeFixed64(level_iter->value().data() + 8), internal_prefix, nullptr); } @@ -252,7 +256,7 @@ Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, } } return NewTwoLevelIterator(level_iter, &GetFileIterator, cfd_->table_cache(), - options, soptions); + options, soptions, cfd_->internal_comparator()); } void Version::AddIterators(const ReadOptions& options, @@ -261,7 +265,7 @@ void Version::AddIterators(const ReadOptions& options, // Merge all level zero files together since they may overlap for (const FileMetaData* file : files_[0]) { iters->push_back(cfd_->table_cache()->NewIterator( - options, soptions, file->number, file->file_size)); + options, soptions, cfd_->internal_comparator(), *file)); } // For levels > 0, we can use a concatenating iterator that sequentially @@ -311,83 +315,73 @@ static void MarkKeyMayExist(void* arg) { } } -static bool SaveValue(void* arg, const Slice& ikey, const Slice& v, bool didIO){ +static bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, + const Slice& v, bool didIO) { Saver* s = reinterpret_cast(arg); MergeContext* merge_contex = s->merge_context; std::string merge_result; // temporary area for merge results later assert(s != nullptr && merge_contex != nullptr); - ParsedInternalKey parsed_key; // TODO: didIO and Merge? s->didIO = didIO; - if (!ParseInternalKey(ikey, &parsed_key)) { - // TODO: what about corrupt during Merge? - s->state = kCorrupt; - } else { - if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) { - // Key matches. 
Process it - switch (parsed_key.type) { - case kTypeValue: - if (kNotFound == s->state) { - s->state = kFound; - s->value->assign(v.data(), v.size()); - } else if (kMerge == s->state) { - assert(s->merge_operator != nullptr); - s->state = kFound; - if (!s->merge_operator->FullMerge(s->user_key, &v, - merge_contex->GetOperands(), - s->value, s->logger)) { - RecordTick(s->statistics, NUMBER_MERGE_FAILURES); - s->state = kCorrupt; - } - } else { - assert(false); + if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) { + // Key matches. Process it + switch (parsed_key.type) { + case kTypeValue: + if (kNotFound == s->state) { + s->state = kFound; + s->value->assign(v.data(), v.size()); + } else if (kMerge == s->state) { + assert(s->merge_operator != nullptr); + s->state = kFound; + if (!s->merge_operator->FullMerge(s->user_key, &v, + merge_contex->GetOperands(), + s->value, s->logger)) { + RecordTick(s->statistics, NUMBER_MERGE_FAILURES); + s->state = kCorrupt; } - return false; + } else { + assert(false); + } + return false; - case kTypeDeletion: - if (kNotFound == s->state) { - s->state = kDeleted; - } else if (kMerge == s->state) { - s->state = kFound; + case kTypeDeletion: + if (kNotFound == s->state) { + s->state = kDeleted; + } else if (kMerge == s->state) { + s->state = kFound; if (!s->merge_operator->FullMerge(s->user_key, nullptr, merge_contex->GetOperands(), s->value, s->logger)) { - RecordTick(s->statistics, NUMBER_MERGE_FAILURES); - s->state = kCorrupt; - } - } else { - assert(false); + RecordTick(s->statistics, NUMBER_MERGE_FAILURES); + s->state = kCorrupt; } - return false; - - case kTypeMerge: - assert(s->state == kNotFound || s->state == kMerge); - s->state = kMerge; - merge_contex->PushOperand(v); - while (merge_contex->GetNumOperands() >= 2) { - // Attempt to merge operands together via user associateive merge - if (s->merge_operator->PartialMerge(s->user_key, - merge_contex->GetOperand(0), - merge_contex->GetOperand(1), - &merge_result, - 
s->logger)) { - merge_contex->PushPartialMergeResult(merge_result); - } else { - // Associative merge returns false ==> stack the operands - break; - } - } - return true; - - case kTypeColumnFamilyDeletion: - case kTypeColumnFamilyValue: - case kTypeColumnFamilyMerge: - case kTypeLogData: + } else { assert(false); + } + return false; + + case kTypeMerge: + assert(s->state == kNotFound || s->state == kMerge); + s->state = kMerge; + merge_contex->PushOperand(v); + while (merge_contex->GetNumOperands() >= 2) { + // Attempt to merge operands together via user associateive merge + if (s->merge_operator->PartialMerge( + s->user_key, merge_contex->GetOperand(0), + merge_contex->GetOperand(1), &merge_result, s->logger)) { + merge_contex->PushPartialMergeResult(merge_result); + } else { + // Associative merge returns false ==> stack the operands break; + } } + return true; + + default: + assert(false); + break; } } @@ -524,8 +518,8 @@ void Version::Get(const ReadOptions& options, prev_file = f; #endif bool tableIO = false; - *status = cfd_->table_cache()->Get(options, f->number, f->file_size, ikey, - &saver, SaveValue, &tableIO, + *status = cfd_->table_cache()->Get(options, cfd_->internal_comparator(), + *f, ikey, &saver, SaveValue, &tableIO, MarkKeyMayExist); // TODO: examine the behavior for corrupted key if (!status->ok()) { @@ -707,7 +701,7 @@ bool CompareSeqnoDescending(const Version::Fsize& first, return false; } -} // anonymous namespace +} // anonymous namespace void Version::UpdateFilesBySize() { // No need to sort the highest level because it is never compacted. 
@@ -756,12 +750,14 @@ void Version::Ref() { ++refs_; } -void Version::Unref() { +bool Version::Unref() { assert(refs_ >= 1); --refs_; if (refs_ == 0) { delete this; + return true; } + return false; } bool Version::NeedsCompaction() const { @@ -1200,10 +1196,15 @@ class VersionSet::Builder { FileMetaData* f = to_unref[i]; f->refs--; if (f->refs <= 0) { + if (f->table_reader_handle) { + cfd_->table_cache()->ReleaseHandle(f->table_reader_handle); + f->table_reader_handle = nullptr; + } delete f; } } } + delete[] levels_; base_->Unref(); } @@ -1280,19 +1281,17 @@ class VersionSet::Builder { // Delete files const VersionEdit::DeletedFileSet& del = edit->deleted_files_; - for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin(); - iter != del.end(); - ++iter) { - const int level = iter->first; - const uint64_t number = iter->second; + for (const auto& del_file : del) { + const auto level = del_file.first; + const auto number = del_file.second; levels_[level].deleted_files.insert(number); CheckConsistencyForDeletes(edit, number, level); } // Add new files - for (size_t i = 0; i < edit->new_files_.size(); i++) { - const int level = edit->new_files_[i].first; - FileMetaData* f = new FileMetaData(edit->new_files_[i].second); + for (const auto& new_file : edit->new_files_) { + const int level = new_file.first; + FileMetaData* f = new FileMetaData(new_file.second); f->refs = 1; // We arrange to automatically compact this file after @@ -1325,23 +1324,21 @@ class VersionSet::Builder { for (int level = 0; level < base_->NumberLevels(); level++) { // Merge the set of added files with the set of pre-existing files. // Drop any deleted files. Store the result in *v. 
- const std::vector& base_files = base_->files_[level]; - std::vector::const_iterator base_iter = base_files.begin(); - std::vector::const_iterator base_end = base_files.end(); - const FileSet* added = levels_[level].added_files; - v->files_[level].reserve(base_files.size() + added->size()); - for (FileSet::const_iterator added_iter = added->begin(); - added_iter != added->end(); - ++added_iter) { + const auto& base_files = base_->files_[level]; + auto base_iter = base_files.begin(); + auto base_end = base_files.end(); + const auto& added_files = *levels_[level].added_files; + v->files_[level].reserve(base_files.size() + added_files.size()); + + for (const auto& added : added_files) { // Add all smaller files listed in base_ - for (std::vector::const_iterator bpos - = std::upper_bound(base_iter, base_end, *added_iter, cmp); + for (auto bpos = std::upper_bound(base_iter, base_end, added, cmp); base_iter != bpos; ++base_iter) { MaybeAddFile(v, level, *base_iter); } - MaybeAddFile(v, level, *added_iter); + MaybeAddFile(v, level, added); } // Add remaining base files @@ -1353,11 +1350,24 @@ class VersionSet::Builder { CheckConsistency(v); } + void LoadTableHandlers() { + for (int level = 0; level < cfd_->NumberLevels(); level++) { + for (auto& file_meta : *(levels_[level].added_files)) { + assert (!file_meta->table_reader_handle); + bool table_io; + cfd_->table_cache()->FindTable( + base_->vset_->storage_options_, cfd_->internal_comparator(), + file_meta->number, file_meta->file_size, + &file_meta->table_reader_handle, &table_io, false); + } + } + } + void MaybeAddFile(Version* v, int level, FileMetaData* f) { if (levels_[level].deleted_files.count(f->number) > 0) { // File is deleted: do nothing } else { - std::vector* files = &v->files_[level]; + auto* files = &v->files_[level]; if (level > 0 && !files->empty()) { // Must not overlap assert(cfd_->internal_comparator().Compare( @@ -1442,13 +1452,12 @@ Status VersionSet::LogAndApply(ColumnFamilyData* 
column_family_data, ManifestWriter* last_writer = &w; assert(!manifest_writers_.empty()); assert(manifest_writers_.front() == &w); - std::deque::iterator iter = manifest_writers_.begin(); - for (; iter != manifest_writers_.end(); ++iter) { - if ((*iter)->cfd->GetID() != column_family_data->GetID()) { + for (const auto& writer : manifest_writers_) { + if (writer->cfd->GetID() != column_family_data->GetID()) { // group commits across column families are not yet supported break; } - last_writer = *iter; + last_writer = writer; LogAndApplyHelper(column_family_data, &builder, v, last_writer->edit, mu); batch_edits.push_back(last_writer->edit); } @@ -1456,7 +1465,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, // Initialize new descriptor log file if necessary by creating // a temporary file that contains a snapshot of the current version. - std::string new_manifest_file; + std::string new_manifest_filename; uint64_t new_manifest_file_size = 0; Status s; // we will need this if we are creating new manifest @@ -1470,11 +1479,11 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, } if (new_descriptor_log) { - new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_); + new_manifest_filename = DescriptorFileName(dbname_, manifest_file_number_); edit->SetNextFile(next_file_number_); } - // Unlock during expensive MANIFEST log write. New writes cannot get here + // Unlock during expensive operations. New writes cannot get here // because &w is ensuring that all new writes get queued. { // calculate the amount of data being compacted at every level @@ -1484,11 +1493,18 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, mu->Unlock(); + if (options_->max_open_files == -1) { + // unlimited table cache. Pre-load table handle now. + // Need to do it out of the mutex. 
+ builder.LoadTableHandlers(); + } + // This is fine because everything inside of this block is serialized -- // only one thread can be here at the same time - if (!new_manifest_file.empty()) { + if (!new_manifest_filename.empty()) { unique_ptr descriptor_file; - s = env_->NewWritableFile(new_manifest_file, &descriptor_file, + s = env_->NewWritableFile(new_manifest_filename, + &descriptor_file, storage_options_); if (s.ok()) { descriptor_log_.reset(new log::Writer(std::move(descriptor_file))); @@ -1536,7 +1552,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, // If we just created a new descriptor file, install it by writing a // new CURRENT file that points to it. - if (s.ok() && !new_manifest_file.empty()) { + if (s.ok() && !new_manifest_filename.empty()) { s = SetCurrentFile(env_, dbname_, manifest_file_number_); if (s.ok() && old_manifest_file_number < manifest_file_number_) { // delete old manifest file @@ -1573,9 +1589,9 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, Log(options_->info_log, "Error in committing version %lu", (unsigned long)v->GetVersionNumber()); delete v; - if (!new_manifest_file.empty()) { + if (!new_manifest_filename.empty()) { descriptor_log_.reset(); - env_->DeleteFile(new_manifest_file); + env_->DeleteFile(new_manifest_filename); } } @@ -1631,27 +1647,33 @@ Status VersionSet::Recover( std::set column_families_not_found; // Read "CURRENT" file, which contains a pointer to the current manifest file - std::string current; - Status s = ReadFileToString(env_, CurrentFileName(dbname_), ¤t); + std::string manifest_filename; + Status s = ReadFileToString( + env_, CurrentFileName(dbname_), &manifest_filename + ); if (!s.ok()) { return s; } - if (current.empty() || current[current.size()-1] != '\n') { + if (manifest_filename.empty() || + manifest_filename.back() != '\n') { return Status::Corruption("CURRENT file does not end with newline"); } - current.resize(current.size() - 1); + // remove the 
trailing '\n' + manifest_filename.resize(manifest_filename.size() - 1); Log(options_->info_log, "Recovering from manifest file:%s\n", - current.c_str()); + manifest_filename.c_str()); - std::string dscname = dbname_ + "/" + current; - unique_ptr file; - s = env_->NewSequentialFile(dscname, &file, storage_options_); + manifest_filename = dbname_ + "/" + manifest_filename; + unique_ptr manifest_file; + s = env_->NewSequentialFile( + manifest_filename, &manifest_file, storage_options_ + ); if (!s.ok()) { return s; } uint64_t manifest_file_size; - s = env_->GetFileSize(dscname, &manifest_file_size); + s = env_->GetFileSize(manifest_filename, &manifest_file_size); if (!s.ok()) { return s; } @@ -1682,8 +1704,8 @@ Status VersionSet::Recover( { VersionSet::LogReporter reporter; reporter.status = &s; - log::Reader reader(std::move(file), &reporter, true/*checksum*/, - 0/*initial_offset*/); + log::Reader reader(std::move(manifest_file), &reporter, true /*checksum*/, + 0 /*initial_offset*/); Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { @@ -1797,7 +1819,6 @@ Status VersionSet::Recover( } } } - file.reset(); if (s.ok()) { if (!have_next_file) { @@ -1846,7 +1867,7 @@ Status VersionSet::Recover( "manifest_file_number is %lu, next_file_number is %lu, " "last_sequence is %lu, log_number is %lu," "prev_log_number is %lu\n", - current.c_str(), + manifest_filename.c_str(), (unsigned long)manifest_file_number_, (unsigned long)next_file_number_, (unsigned long)last_sequence_, @@ -2229,8 +2250,8 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { // approximate offset of "ikey" within the table. 
TableReader* table_reader_ptr; Iterator* iter = v->cfd_->table_cache()->NewIterator( - ReadOptions(), storage_options_, files[i]->number, - files[i]->file_size, &table_reader_ptr); + ReadOptions(), storage_options_, v->cfd_->internal_comparator(), + *(files[i]), &table_reader_ptr); if (table_reader_ptr != nullptr) { result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode()); } @@ -2285,8 +2306,9 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { if (c->level() + which == 0) { for (const auto& file : *c->inputs(which)) { list[num++] = c->column_family_data()->table_cache()->NewIterator( - options, storage_options_compactions_, file->number, - file->file_size, nullptr, true /* for compaction */); + options, storage_options_compactions_, + c->column_family_data()->internal_comparator(), *file, nullptr, + true /* for compaction */); } } else { // Create concatenating iterator for the files from this level @@ -2295,13 +2317,14 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { c->column_family_data()->internal_comparator(), c->inputs(which)), &GetFileIterator, c->column_family_data()->table_cache(), options, - storage_options_, true /* for compaction */); + storage_options_, c->column_family_data()->internal_comparator(), + true /* for compaction */); } } } assert(num <= space); Iterator* result = NewMergingIterator( - &c->column_family_data()->internal_comparator(), list, num); + env_, &c->column_family_data()->internal_comparator(), list, num); delete[] list; return result; } @@ -2356,14 +2379,14 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { } Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, - FileMetaData* meta, + FileMetaData** meta, ColumnFamilyData** cfd) { for (auto cfd_iter : *column_family_set_) { Version* version = cfd_iter->current(); for (int level = 0; level < version->NumberLevels(); level++) { for (const auto& file : version->files_[level]) { if (file->number == number) { - *meta = *file; 
+ *meta = file; *filelevel = level; *cfd = cfd_iter; return Status::OK(); diff --git a/db/version_set.h b/db/version_set.h index 43705bf99..e2cbd5643 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -85,8 +85,8 @@ class Version { }; void Get(const ReadOptions&, const LookupKey& key, std::string* val, Status* status, MergeContext* merge_context, - GetStats* stats, const Options& db_option, bool* value_found = - nullptr); + GetStats* stats, const Options& db_option, + bool* value_found = nullptr); // Adds "stats" into the current state. Returns true if a new // compaction may need to be triggered, false otherwise. @@ -101,7 +101,9 @@ class Version { // Reference count management (so Versions do not disappear out from // under live iterators) void Ref(); - void Unref(); + // Decrease reference count. Delete the object if no reference left + // and return true. Otherwise, return false. + bool Unref(); // Returns true iff some level needs a compaction. bool NeedsCompaction() const; @@ -384,7 +386,7 @@ class VersionSet { bool VerifyCompactionFileConsistency(Compaction* c); Status GetMetadataForFile(uint64_t number, int* filelevel, - FileMetaData* metadata, ColumnFamilyData** cfd); + FileMetaData** metadata, ColumnFamilyData** cfd); void GetLiveFilesMetaData( std::vector *metadata); diff --git a/db/write_batch.cc b/db/write_batch.cc index 1132b3551..084091aad 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -146,7 +146,7 @@ Status WriteBatch::Iterate(Handler* handler) const { return Status::Corruption("unknown WriteBatch tag"); } } - if (found != WriteBatchInternal::Count(this)) { + if (found != WriteBatchInternal::Count(this)) { return Status::Corruption("WriteBatch has wrong count"); } else { return Status::OK(); @@ -261,14 +261,45 @@ class MemTableInserter : public WriteBatch::Handler { } MemTable* mem = cf_mems_->GetMemTable(); const Options* options = cf_mems_->GetFullOptions(); - if (options->inplace_update_support && - mem->Update(sequence_, 
kTypeValue, key, value)) { + if (!options->inplace_update_support) { + mem->Add(sequence_, kTypeValue, key, value); + } else if (options->inplace_callback == nullptr) { + mem->Update(sequence_, key, value); RecordTick(options->statistics.get(), NUMBER_KEYS_UPDATED); } else { - mem->Add(sequence_, kTypeValue, key, value); + if (mem->UpdateCallback(sequence_, key, value, *options)) { + } else { + // key not found in memtable. Do sst get, update, add + SnapshotImpl read_from_snapshot; + read_from_snapshot.number_ = sequence_; + ReadOptions ropts; + ropts.snapshot = &read_from_snapshot; + + std::string prev_value; + std::string merged_value; + Status s = db_->Get(ropts, key, &prev_value); + char* prev_buffer = const_cast(prev_value.c_str()); + uint32_t prev_size = prev_value.size(); + auto status = options->inplace_callback(s.ok() ? prev_buffer : nullptr, + s.ok() ? &prev_size : nullptr, + value, &merged_value); + if (status == UpdateStatus::UPDATED_INPLACE) { + // prev_value is updated in-place with final value. + mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size)); + RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN); + } else if (status == UpdateStatus::UPDATED) { + // merged_value contains the final value. + mem->Add(sequence_, kTypeValue, key, Slice(merged_value)); + RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN); + } + } } + // Since all Puts are logged in trasaction logs (if enabled), always bump + // sequence number. Even if the update eventually fails and does not result + // in memtable add/update. 
sequence_++; } + virtual void MergeCF(uint32_t column_family_id, const Slice& key, const Slice& value) { bool found = cf_mems_->Seek(column_family_id); @@ -333,6 +364,7 @@ class MemTableInserter : public WriteBatch::Handler { sequence_++; } + virtual void DeleteCF(uint32_t column_family_id, const Slice& key) { bool found = cf_mems_->Seek(column_family_id); if (!found || IgnoreUpdate()) { diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index a2dee2959..d56d7107a 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -58,10 +58,7 @@ static std::string PrintContents(WriteBatch* b) { state.append(")"); count++; break; - case kTypeColumnFamilyDeletion: - case kTypeColumnFamilyValue: - case kTypeColumnFamilyMerge: - case kTypeLogData: + default: assert(false); break; } diff --git a/include/rocksdb/arena.h b/include/rocksdb/arena.h deleted file mode 100644 index 642b61408..000000000 --- a/include/rocksdb/arena.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Arena class defines memory allocation methods. It's used by memtable and -// skiplist. - -#ifndef STORAGE_ROCKSDB_INCLUDE_ARENA_H_ -#define STORAGE_ROCKSDB_INCLUDE_ARENA_H_ - -#include -#include - -namespace rocksdb { - -class Arena { - public: - Arena() {}; - virtual ~Arena() {}; - - // Return a pointer to a newly allocated memory block of "bytes" bytes. - virtual char* Allocate(size_t bytes) = 0; - - // Allocate memory with the normal alignment guarantees provided by malloc. 
- virtual char* AllocateAligned(size_t bytes) = 0; - - // Returns an estimate of the total memory used by arena. - virtual const size_t ApproximateMemoryUsage() = 0; - - // Returns the total number of bytes in all blocks allocated so far. - virtual const size_t MemoryAllocatedBytes() = 0; - - private: - // No copying allowed - Arena(const Arena&); - void operator=(const Arena&); -}; - -} // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_ARENA_H_ diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 7d58e1546..b5821bac2 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -102,7 +102,10 @@ class Cache { virtual uint64_t NewId() = 0; // returns the maximum configured capacity of the cache - virtual size_t GetCapacity() = 0; + virtual size_t GetCapacity() const = 0; + + // returns the memory size for the entries residing in the cache. + virtual size_t GetUsage() const = 0; // Call this on shutdown if you want to speed it up. Cache will disown // any underlying data and will not free it on delete. This call will leak diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 484582c90..48a4d33d4 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -438,7 +438,7 @@ class WritableFile { // This asks the OS to initiate flushing the cached data to disk, // without waiting for completion. // Default implementation does nothing. - virtual Status RangeSync(off64_t offset, off64_t nbytes) { + virtual Status RangeSync(off_t offset, off_t nbytes) { return Status::OK(); } diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index 9e24942ac..e9a41aedd 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -33,8 +33,7 @@ // iteration over the entire collection is rare since doing so requires all the // keys to be copied into a sorted data structure. 
-#ifndef STORAGE_ROCKSDB_DB_MEMTABLEREP_H_ -#define STORAGE_ROCKSDB_DB_MEMTABLEREP_H_ +#pragma once #include @@ -52,7 +51,11 @@ class MemTableRep { public: // Compare a and b. Return a negative value if a is less than b, 0 if they // are equal, and a positive value if a is greater than b - virtual int operator()(const char* a, const char* b) const = 0; + virtual int operator()(const char* prefix_len_key1, + const char* prefix_len_key2) const = 0; + + virtual int operator()(const char* prefix_len_key, + const Slice& key) const = 0; virtual ~KeyComparator() { } }; @@ -100,7 +103,7 @@ class MemTableRep { virtual void Prev() = 0; // Advance to the first entry with a key >= target - virtual void Seek(const char* target) = 0; + virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0; // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. @@ -175,26 +178,22 @@ public: } }; -// HashSkipListRep is backed by hash map of buckets. Each bucket is a skip -// list. All the keys with the same prefix will be in the same bucket. -// The prefix is determined using user supplied SliceTransform. It has -// to match prefix_extractor in options.prefix_extractor. -// -// Iteration over the entire collection is implemented by dumping all the keys -// into a separate skip list. Thus, these data structures are best used when -// iteration over the entire collection is rare. -// -// Parameters: -// transform: The prefix extractor that returns prefix when supplied a user -// key. Has to match options.prefix_extractor -// bucket_count: Number of buckets in a hash_map. Each bucket needs -// 8 bytes. By default, we set buckets to one million, which -// will take 8MB of memory. If you know the number of keys you'll -// keep in hash map, set bucket count to be approximately twice -// the number of keys +// This class contains a fixed array of buckets, each +// pointing to a skiplist (null if the bucket is empty). 
+// bucket_count: number of fixed array buckets +// skiplist_height: the max height of the skiplist +// skiplist_branching_factor: probabilistic size ratio between adjacent +// link lists in the skiplist extern MemTableRepFactory* NewHashSkipListRepFactory( - const SliceTransform* transform, size_t bucket_count = 1000000); + const SliceTransform* transform, size_t bucket_count = 1000000, + int32_t skiplist_height = 4, int32_t skiplist_branching_factor = 4 +); -} +// The factory is to create memtables with a hashed linked list: +// it contains a fixed array of buckets, each pointing to a sorted single +// linked list (null if the bucket is empty). +// bucket_count: number of fixed array buckets +extern MemTableRepFactory* NewHashLinkListRepFactory( + const SliceTransform* transform, size_t bucket_count = 50000); -#endif // STORAGE_ROCKSDB_DB_MEMTABLEREP_H_ +} diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 47ee930e8..4623543fd 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -34,6 +34,7 @@ class TablePropertiesCollector; class Slice; class SliceTransform; class Statistics; +class InternalKeyComparator; using std::shared_ptr; @@ -65,6 +66,12 @@ struct CompressionOptions { : window_bits(wbits), level(lev), strategy(strategy) {} }; +enum UpdateStatus { // Return status For inplace update callback + UPDATE_FAILED = 0, // Nothing to update + UPDATED_INPLACE = 1, // Value updated inplace + UPDATED = 2, // No inplace update. Merged value set +}; + struct Options; struct ColumnFamilyOptions { @@ -410,13 +417,17 @@ struct ColumnFamilyOptions { // the tables. // Default: emtpy vector -- no user-defined statistics collection will be // performed. - std::vector> - table_properties_collectors; - - // Allows thread-safe inplace updates. Requires Updates iff - // * key exists in current memtable - // * new sizeof(new_value) <= sizeof(old_value) - // * old_value for that key is a put i.e. 
kTypeValue + typedef std::vector> + TablePropertiesCollectors; + TablePropertiesCollectors table_properties_collectors; + + // Allows thread-safe inplace updates. + // If inplace_callback function is not set, + // Put(key, new_value) will update inplace the existing_value iff + // * key exists in current memtable + // * new sizeof(new_value) <= sizeof(existing_value) + // * existing_value for that key is a put i.e. kTypeValue + // If inplace_callback function is set, check doc for inplace_callback. // Default: false. bool inplace_update_support; @@ -424,6 +435,55 @@ struct ColumnFamilyOptions { // Default: 10000, if inplace_update_support = true, else 0. size_t inplace_update_num_locks; + // existing_value - pointer to previous value (from both memtable and sst). + // nullptr if key doesn't exist + // existing_value_size - pointer to size of existing_value). + // nullptr if key doesn't exist + // delta_value - Delta value to be merged with the existing_value. + // Stored in transaction logs. + // merged_value - Set when delta is applied on the previous value. + + // Applicable only when inplace_update_support is true, + // this callback function is called at the time of updating the memtable + // as part of a Put operation, lets say Put(key, delta_value). It allows the + // 'delta_value' specified as part of the Put operation to be merged with + // an 'existing_value' of the key in the database. + + // If the merged value is smaller in size that the 'existing_value', + // then this function can update the 'existing_value' buffer inplace and + // the corresponding 'existing_value'_size pointer, if it wishes to. + // The callback should return UpdateStatus::UPDATED_INPLACE. + // In this case. (In this case, the snapshot-semantics of the rocksdb + // Iterator is not atomic anymore). 
+ + // If the merged value is larger in size than the 'existing_value' or the + // application does not wish to modify the 'existing_value' buffer inplace, + // then the merged value should be returned via *merge_value. It is set by + // merging the 'existing_value' and the Put 'delta_value'. The callback should + // return UpdateStatus::UPDATED in this case. This merged value will be added + // to the memtable. + + // If merging fails or the application does not wish to take any action, + // then the callback should return UpdateStatus::UPDATE_FAILED. + + // Please remember that the original call from the application is Put(key, + // delta_value). So the transaction log (if enabled) will still contain (key, + // delta_value). The 'merged_value' is not stored in the transaction log. + // Hence the inplace_callback function should be consistent across db reopens. + + // Default: nullptr + UpdateStatus (*inplace_callback)(char* existing_value, + uint32_t* existing_value_size, + Slice delta_value, + std::string* merged_value); + + // if prefix_extractor is set and bloom_bits is not 0, create prefix bloom + // for memtable + uint32_t memtable_prefix_bloom_bits; + + // number of hash probes per key + uint32_t memtable_prefix_bloom_probes; + // Maximum number of successive merge operations on a key in the memtable. // // When a merge operation is added to the memtable and the maximum number of @@ -473,9 +533,10 @@ struct DBOptions { shared_ptr info_log; // Number of open files that can be used by the DB. You may need to - // increase this if your database has a large working set (budget - // one open file per 2MB of working set). - // + // increase this if your database has a large working set. Value -1 means + // files opened are always kept open. You can estimate number of files based + // on target_file_size_base and target_file_size_multiplier for level-based + // compaction. For universal-style compaction, you can usually set it to -1. 
// Default: 1000 int max_open_files; diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index 9e900e050..551ca8fe6 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -38,7 +38,27 @@ struct PerfContext { uint64_t internal_key_skipped_count; // total number of deletes skipped over during iteration uint64_t internal_delete_skipped_count; - uint64_t wal_write_time; // total time spent on writing to WAL + + uint64_t get_snapshot_time; // total time spent on getting snapshot + uint64_t get_from_memtable_time; // total time spent on querying memtables + uint64_t get_from_memtable_count; // number of mem tables queried + // total time spent after Get() finds a key + uint64_t get_post_process_time; + uint64_t get_from_output_files_time; // total time reading from output files + // total time spent on seeking child iters + uint64_t seek_child_seek_time; + // number of seek issued in child iterators + uint64_t seek_child_seek_count; + uint64_t seek_min_heap_time; // total time spent on the merge heap + // total time spent on seeking the internal entries + uint64_t seek_internal_seek_time; + // total time spent on iterating internal entries to find the next user entry + uint64_t find_next_user_entry_time; + // total time spent on pre or post processing when writing a record + uint64_t write_pre_and_post_process_time; + uint64_t write_wal_time; // total time spent on writing to WAL + // total time spent on writing to mem tables + uint64_t write_memtable_time; }; extern __thread PerfContext perf_context; diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index f5fbb5924..cddd74bf8 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -7,7 +7,6 @@ #define STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_ #include -#include #include #include #include @@ -18,10 +17,8 @@ namespace rocksdb { /** * Keep adding ticker's here. - * Any ticker should have a value less than TICKER_ENUM_MAX. 
- * Add a new ticker by assigning it the current value of TICKER_ENUM_MAX - * Add a string representation in TickersNameMap below. - * And incrementing TICKER_ENUM_MAX. + * 1. Any ticker should be added before TICKER_ENUM_MAX. + * 2. Add a readable string in TickersNameMap below for the newly added ticker. */ enum Tickers { // total block cache misses @@ -252,7 +249,7 @@ class Statistics { virtual void setTickerCount(Tickers tickerType, uint64_t count) = 0; virtual void measureTime(Histograms histogramType, uint64_t time) = 0; - virtual void histogramData(Histograms type, HistogramData * const data) = 0; + virtual void histogramData(Histograms type, HistogramData* const data) = 0; // String representation of the statistic object. std::string ToString(); }; diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 2d2bfacc4..d4965ca45 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -1,127 +1,81 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Currently we support two types of tables: plain table and block-based table. +// 1. Block-based table: this is the default table type that we inherited from +// LevelDB, which was designed for storing data in hard disk or flash +// device. +// 2. Plain table: it is one of RocksDB's SST file format optimized +// for low query latency on pure-memory or really low-latency media. 
+// +// A tutorial of rocksdb table formats is available here: +// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats +// +// Example code is also available +// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples #pragma once #include -#include +#include +#include + #include "rocksdb/env.h" #include "rocksdb/iterator.h" -#include "rocksdb/table_properties.h" #include "rocksdb/options.h" +#include "rocksdb/status.h" namespace rocksdb { -struct Options; +// -- Block-based Table +class FlushBlockPolicyFactory; class RandomAccessFile; -struct ReadOptions; -class TableCache; +class TableBuilder; +class TableReader; class WritableFile; +struct EnvOptions; +struct Options; using std::unique_ptr; -// TableBuilder provides the interface used to build a Table -// (an immutable and sorted map from keys to values). -// -// Multiple threads can invoke const methods on a TableBuilder without -// external synchronization, but if any of the threads may call a -// non-const method, all threads accessing the same TableBuilder must use -// external synchronization. -class TableBuilder { - public: - // REQUIRES: Either Finish() or Abandon() has been called. - virtual ~TableBuilder() {} - - // Add key,value to the table being constructed. - // REQUIRES: key is after any previously added key according to comparator. - // REQUIRES: Finish(), Abandon() have not been called - virtual void Add(const Slice& key, const Slice& value) = 0; - - // Return non-ok iff some error has been detected. - virtual Status status() const = 0; - - // Finish building the table. - // REQUIRES: Finish(), Abandon() have not been called - virtual Status Finish() = 0; - - // Indicate that the contents of this builder should be abandoned. - // If the caller is not going to call Finish(), it must call Abandon() - // before destroying this builder. 
- // REQUIRES: Finish(), Abandon() have not been called - virtual void Abandon() = 0; - - // Number of calls to Add() so far. - virtual uint64_t NumEntries() const = 0; - - // Size of the file generated so far. If invoked after a successful - // Finish() call, returns the size of the final generated file. - virtual uint64_t FileSize() const = 0; -}; - -// A Table is a sorted map from strings to strings. Tables are -// immutable and persistent. A Table may be safely accessed from -// multiple threads without external synchronization. -class TableReader { - public: - virtual ~TableReader() {} - - // Determine whether there is a chance that the current table file - // contains the key a key starting with iternal_prefix. The specific - // table implementation can use bloom filter and/or other heuristic - // to filter out this table as a whole. - virtual bool PrefixMayMatch(const Slice& internal_prefix) = 0; - - // Returns a new iterator over the table contents. - // The result of NewIterator() is initially invalid (caller must - // call one of the Seek methods on the iterator before using it). - virtual Iterator* NewIterator(const ReadOptions&) = 0; - - // Given a key, return an approximate byte offset in the file where - // the data for that key begins (or would begin if the key were - // present in the file). The returned value is in terms of file - // bytes, and so includes effects like compression of the underlying data. - // E.g., the approximate offset of the last key in the table will - // be close to the file length. - virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0; - - // Returns true if the block for the specified key is in cache. - // REQUIRES: key is in this table. - virtual bool TEST_KeyInCache(const ReadOptions& options, - const Slice& key) = 0; - - // Set up the table for Compaction. 
Might change some parameters with - // posix_fadvise - virtual void SetupForCompaction() = 0; - - virtual TableProperties& GetTableProperties() = 0; - - // Calls (*result_handler)(handle_context, ...) repeatedly, starting with - // the entry found after a call to Seek(key), until result_handler returns - // false, where k is the actual internal key for a row found and v as the - // value of the key. didIO is true if I/O is involved in the operation. May - // not make such a call if filter policy says that key is not present. - // - // mark_key_may_exist_handler needs to be called when it is configured to be - // memory only and the key is not found in the block cache, with - // the parameter to be handle_context. +// For advanced user only +struct BlockBasedTableOptions { + // @flush_block_policy_factory creates the instances of flush block policy. + // which provides a configurable way to determine when to flush a block in + // the block based tables. If not set, table builder will use the default + // block flush policy, which cut blocks by block size (please refer to + // `FlushBlockBySizePolicy`). + std::shared_ptr flush_block_policy_factory; + + // TODO(kailiu) Temporarily disable this feature by making the default value + // to be false. // - // readOptions is the options for the read - // key is the key to search for - virtual Status Get( - const ReadOptions& readOptions, - const Slice& key, - void* handle_context, - bool (*result_handler)(void* handle_context, const Slice& k, - const Slice& v, bool didIO), - void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0; + // Indicating if we'd put index/filter blocks to the block cache. + // If not specified, each "table reader" object will pre-load index/filter + // block during table initialization. + bool cache_index_and_filter_blocks = false; }; -// A base class for table factories +// Create default block based table factory. 
+extern TableFactory* NewBlockBasedTableFactory( + const BlockBasedTableOptions& table_options = BlockBasedTableOptions()); + +// -- Plain Table +// @user_key_len: plain table has optimization for fix-sized keys, which can be +// specified via user_key_len. Alternatively, you can pass +// `kPlainTableVariableLength` if your keys have variable +// lengths. +// @bloom_bits_per_key: the number of bits used for bloom filer per key. You may +// disable it by passing a zero. +// @hash_table_ratio: the desired utilization of the hash table used for prefix +// hashing. hash_table_ratio = number of prefixes / #buckets +// in the hash table +const uint32_t kPlainTableVariableLength = 0; +extern TableFactory* NewPlainTableFactory( + uint32_t user_key_len = kPlainTableVariableLength, + int bloom_bits_per_key = 10, double hash_table_ratio = 0.75); + +// A base class for table factories. class TableFactory { public: virtual ~TableFactory() {} @@ -139,7 +93,7 @@ class TableFactory { // in parameter file. It's the caller's responsibility to make sure // file is in the correct format. // - // GetTableReader() is called in two places: + // NewTableReader() is called in two places: // (1) TableCache::FindTable() calls the function when table cache miss // and cache the table object returned. // (1) SstFileReader (for SST Dump) opens the table and dump the table @@ -150,9 +104,10 @@ class TableFactory { // file is a file handler to handle the file for the table // file_size is the physical file size of the file // table_reader is the output table reader - virtual Status GetTableReader( + virtual Status NewTableReader( const Options& options, const EnvOptions& soptions, - unique_ptr && file, uint64_t file_size, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const = 0; // Return a table builder to write to a file for this table type. @@ -173,8 +128,9 @@ class TableFactory { // file is a handle of a writable file. 
It is the caller's responsibility to // keep the file open and close the file after closing the table builder. // compression_type is the compression type to use in this table. - virtual TableBuilder* GetTableBuilder( - const Options& options, WritableFile* file, - CompressionType compression_type) const = 0; + virtual TableBuilder* NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const = 0; }; + } // namespace rocksdb diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index 8824ca13c..1d4b9e344 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -1,28 +1,25 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once #include #include - #include "rocksdb/status.h" namespace rocksdb { +// -- Table Properties +// Other than basic table properties, each table may also have the user +// collected properties. +// The value of the user-collected properties are encoded as raw bytes -- +// users have to interprete these values by themselves. +typedef std::unordered_map UserCollectedProperties; + // TableProperties contains a bunch of read-only properties of its associated // table. struct TableProperties { public: - // Other than basic table properties, each table may also have the user - // collected properties. - // The value of the user-collected properties are encoded as raw bytes -- - // users have to interprete these values by themselves. 
- typedef - std::unordered_map - UserCollectedProperties; - // the total size of all data blocks. uint64_t data_size = 0; // the size of index block. @@ -37,6 +34,10 @@ struct TableProperties { uint64_t num_data_blocks = 0; // the number of entries in this table uint64_t num_entries = 0; + // format version, reserved for backward compatibility + uint64_t format_version = 0; + // If 0, key is variable length. Otherwise number of bytes for each key. + uint64_t fixed_key_len = 0; // The name of the filter policy used in this table. // If no filter policy is used, `filter_policy_name` will be an empty string. @@ -47,17 +48,32 @@ struct TableProperties { // convert this object to a human readable form // @prop_delim: delimiter for each property. - std::string ToString( - const std::string& prop_delim = "; ", - const std::string& kv_delim = "=") const; + std::string ToString(const std::string& prop_delim = "; ", + const std::string& kv_delim = "=") const; }; +// table properties' human-readable names in the property block. +struct TablePropertiesNames { + static const std::string kDataSize; + static const std::string kIndexSize; + static const std::string kFilterSize; + static const std::string kRawKeySize; + static const std::string kRawValueSize; + static const std::string kNumDataBlocks; + static const std::string kNumEntries; + static const std::string kFormatVersion; + static const std::string kFixedKeyLen; + static const std::string kFilterPolicy; +}; + +extern const std::string kPropertiesBlock; + // `TablePropertiesCollector` provides the mechanism for users to collect // their own interested properties. This class is essentially a collection // of callback functions that will be invoked during table building. class TablePropertiesCollector { public: - virtual ~TablePropertiesCollector() { } + virtual ~TablePropertiesCollector() {} // Add() will be called when a new key/value pair is inserted into the table. 
// @params key the original key that is inserted into the table. @@ -68,23 +84,20 @@ class TablePropertiesCollector { // for writing the properties block. // @params properties User will add their collected statistics to // `properties`. - virtual Status Finish( - TableProperties::UserCollectedProperties* properties) = 0; + virtual Status Finish(UserCollectedProperties* properties) = 0; // The name of the properties collector can be used for debugging purpose. virtual const char* Name() const = 0; // Return the human-readable properties, where the key is property name and // the value is the human-readable form of value. - virtual TableProperties::UserCollectedProperties - GetReadableProperties() const = 0; + virtual UserCollectedProperties GetReadableProperties() const = 0; }; // Extra properties // Below is a list of non-basic properties that are collected by database // itself. Especially some properties regarding to the internal keys (which // is unknown to `table`). -extern uint64_t GetDeletedKeys( - const TableProperties::UserCollectedProperties& props); +extern uint64_t GetDeletedKeys(const UserCollectedProperties& props); } // namespace rocksdb diff --git a/port/port_posix.h b/port/port_posix.h index 15ab0dc5b..8ff2480a3 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -396,7 +396,6 @@ inline char* BZip2_Uncompress(const char* input_data, size_t input_length, _stream.next_out = (char *)(output + old_sz); _stream.avail_out = output_len - old_sz; break; - case Z_BUF_ERROR: default: delete[] output; BZ2_bzDecompressEnd(&_stream); diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index a5e546be8..e5f3bd4d2 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -17,15 +17,17 @@ #include "rocksdb/flush_block_policy.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" -#include "rocksdb/table.h" +#include "table/table_builder.h" #include "rocksdb/env.h" #include 
"rocksdb/filter_policy.h" #include "rocksdb/options.h" +#include "db/dbformat.h" #include "table/block_based_table_reader.h" #include "table/block.h" #include "table/block_builder.h" #include "table/filter_block.h" #include "table/format.h" +#include "table/meta_blocks.h" #include "util/coding.h" #include "util/crc32c.h" #include "util/stop_watch.h" @@ -34,51 +36,24 @@ namespace rocksdb { namespace { -struct BytewiseLessThan { - bool operator()(const std::string& key1, const std::string& key2) const { - // smaller entries will be placed in front. - return comparator->Compare(key1, key2) <= 0; - } - const Comparator* comparator = BytewiseComparator(); -}; - -// When writing to a block that requires entries to be sorted by -// `BytewiseComparator`, we can buffer the content to `BytewiseSortedMap` -// before writng to store. -typedef std::map BytewiseSortedMap; - -void AddProperties(BytewiseSortedMap& props, std::string name, uint64_t val) { - assert(props.find(name) == props.end()); - - std::string dst; - PutVarint64(&dst, val); - - props.insert( - std::make_pair(name, dst) - ); -} - static bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) { // Check to see if compressed less than 12.5% return compressed_size < raw_size - (raw_size / 8u); } -// Were we encounter any error occurs during user-defined statistics collection, -// we'll write the warning message to info log. -void LogPropertiesCollectionError( - Logger* info_log, const std::string& method, const std::string& name) { - assert(method == "Add" || method == "Finish"); - - std::string msg = - "[Warning] encountered error when calling TablePropertiesCollector::" + - method + "() with collector name: " + name; - Log(info_log, "%s", msg.c_str()); -} - } // anonymous namespace +// kBlockBasedTableMagicNumber was picked by running +// echo http://code.google.com/p/leveldb/ | sha1sum +// and taking the leading 64 bits. 
+// Please note that kBlockBasedTableMagicNumber may also be accessed by +// other .cc files so it have to be explicitly declared with "extern". +extern const uint64_t kBlockBasedTableMagicNumber + = 0xdb4775248b80fb57ull; + struct BlockBasedTableBuilder::Rep { Options options; + const InternalKeyComparator& internal_comparator; WritableFile* file; uint64_t offset = 0; Status status; @@ -98,31 +73,30 @@ struct BlockBasedTableBuilder::Rep { std::string compressed_output; std::unique_ptr flush_block_policy; - Rep(const Options& opt, - WritableFile* f, - FlushBlockPolicyFactory* flush_block_policy_factory, + Rep(const Options& opt, const InternalKeyComparator& icomparator, + WritableFile* f, FlushBlockPolicyFactory* flush_block_policy_factory, CompressionType compression_type) : options(opt), + internal_comparator(icomparator), file(f), - data_block(options), + data_block(options, &internal_comparator), // To avoid linear scan, we make the block_restart_interval to be `1` // in index block builder - index_block(1 /* block_restart_interval */, options.comparator), + index_block(1 /* block_restart_interval */, &internal_comparator), compression_type(compression_type), - filter_block(opt.filter_policy == nullptr ? nullptr - : new FilterBlockBuilder(opt)), + filter_block(opt.filter_policy == nullptr + ? 
nullptr + : new FilterBlockBuilder(opt, &internal_comparator)), flush_block_policy( - flush_block_policy_factory->NewFlushBlockPolicy(data_block)) { - } + flush_block_policy_factory->NewFlushBlockPolicy(data_block)) {} }; BlockBasedTableBuilder::BlockBasedTableBuilder( - const Options& options, - WritableFile* file, - FlushBlockPolicyFactory* flush_block_policy_factory, + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, FlushBlockPolicyFactory* flush_block_policy_factory, CompressionType compression_type) - : rep_(new Rep(options, - file, flush_block_policy_factory, compression_type)) { + : rep_(new Rep(options, internal_comparator, file, + flush_block_policy_factory, compression_type)) { if (rep_->filter_block != nullptr) { rep_->filter_block->StartBlock(0); } @@ -145,7 +119,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { assert(!r->closed); if (!ok()) return; if (r->props.num_entries > 0) { - assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0); + assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0); } auto should_flush = r->flush_block_policy->Update(key, value); @@ -162,7 +136,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { // entries in the first block and < all entries in subsequent // blocks. 
if (ok()) { - r->options.comparator->FindShortestSeparator(&r->last_key, key); + r->internal_comparator.FindShortestSeparator(&r->last_key, key); std::string handle_encoding; r->pending_handle.EncodeTo(&handle_encoding); r->index_block.Add(r->last_key, Slice(handle_encoding)); @@ -179,16 +153,12 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { r->props.raw_key_size += key.size(); r->props.raw_value_size += value.size(); - for (auto collector : r->options.table_properties_collectors) { - Status s = collector->Add(key, value); - if (!s.ok()) { - LogPropertiesCollectionError( - r->options.info_log.get(), - "Add", /* method */ - collector->Name() - ); - } - } + NotifyCollectTableCollectorsOnAdd( + key, + value, + r->options.table_properties_collectors, + r->options.info_log.get() + ); } void BlockBasedTableBuilder::Flush() { @@ -370,7 +340,7 @@ Status BlockBasedTableBuilder::Finish() { // block, we will finish writing all index entries here and flush them // to storage after metaindex block is written. if (ok() && !empty_data_block) { - r->options.comparator->FindShortSuccessor(&r->last_key); + r->internal_comparator.FindShortSuccessor(&r->last_key); std::string handle_encoding; r->pending_handle.EncodeTo(&handle_encoding); @@ -382,14 +352,7 @@ Status BlockBasedTableBuilder::Finish() { // 2. [meta block: properties] // 3. [metaindex block] if (ok()) { - // We use `BytewiseComparator` as the comparator for meta block. - BlockBuilder meta_index_block( - r->options.block_restart_interval, - BytewiseComparator() - ); - // Key: meta block name - // Value: block handle to that meta block - BytewiseSortedMap meta_block_handles; + MetaIndexBuilder meta_index_builer; // Write filter block. if (r->filter_block != nullptr) { @@ -397,104 +360,43 @@ Status BlockBasedTableBuilder::Finish() { // of filter data. 
std::string key = BlockBasedTable::kFilterBlockPrefix; key.append(r->options.filter_policy->Name()); - std::string handle_encoding; - filter_block_handle.EncodeTo(&handle_encoding); - meta_block_handles.insert( - std::make_pair(key, handle_encoding) - ); + meta_index_builer.Add(key, filter_block_handle); } // Write properties block. { - BlockBuilder properties_block( - r->options.block_restart_interval, - BytewiseComparator() - ); - - BytewiseSortedMap properties; - - // Add basic properties - AddProperties( - properties, - BlockBasedTablePropertiesNames::kRawKeySize, - r->props.raw_key_size - ); - AddProperties( - properties, - BlockBasedTablePropertiesNames::kRawValueSize, - r->props.raw_value_size - ); - AddProperties( - properties, - BlockBasedTablePropertiesNames::kDataSize, - r->props.data_size - ); + PropertyBlockBuilder property_block_builder; + std::vector failed_user_prop_collectors; + r->props.filter_policy_name = r->options.filter_policy != nullptr ? + r->options.filter_policy->Name() : ""; r->props.index_size = r->index_block.CurrentSizeEstimate() + kBlockTrailerSize; - AddProperties( - properties, - BlockBasedTablePropertiesNames::kIndexSize, - r->props.index_size - ); - AddProperties( - properties, - BlockBasedTablePropertiesNames::kNumEntries, - r->props.num_entries - ); - AddProperties( - properties, - BlockBasedTablePropertiesNames::kNumDataBlocks, - r->props.num_data_blocks); - if (r->filter_block != nullptr) { - properties.insert({ - BlockBasedTablePropertiesNames::kFilterPolicy, - r->options.filter_policy->Name() - }); - } - AddProperties( - properties, - BlockBasedTablePropertiesNames::kFilterSize, - r->props.filter_size - ); - for (auto collector : r->options.table_properties_collectors) { - TableProperties::UserCollectedProperties user_collected_properties; - Status s = - collector->Finish(&user_collected_properties); - - if (!s.ok()) { - LogPropertiesCollectionError( - r->options.info_log.get(), - "Finish", /* method */ - collector->Name() 
- ); - } else { - properties.insert( - user_collected_properties.begin(), - user_collected_properties.end() - ); - } - } + // Add basic properties + property_block_builder.AddTableProperty(r->props); - for (const auto& stat : properties) { - properties_block.Add(stat.first, stat.second); - } + NotifyCollectTableCollectorsOnFinish( + r->options.table_properties_collectors, + r->options.info_log.get(), + &property_block_builder + ); BlockHandle properties_block_handle; - WriteBlock(&properties_block, &properties_block_handle); - - std::string handle_encoding; - properties_block_handle.EncodeTo(&handle_encoding); - meta_block_handles.insert( - { BlockBasedTable::kPropertiesBlock, handle_encoding } + WriteRawBlock( + property_block_builder.Finish(), + kNoCompression, + &properties_block_handle ); - } // end of properties block writing - for (const auto& metablock : meta_block_handles) { - meta_index_block.Add(metablock.first, metablock.second); - } + meta_index_builer.Add(kPropertiesBlock, + properties_block_handle); + } // end of properties block writing - WriteBlock(&meta_index_block, &metaindex_block_handle); + WriteRawBlock( + meta_index_builer.Finish(), + kNoCompression, + &metaindex_block_handle + ); } // meta blocks and metaindex block. 
// Write index block @@ -504,7 +406,7 @@ Status BlockBasedTableBuilder::Finish() { // Write footer if (ok()) { - Footer footer; + Footer footer(kBlockBasedTableMagicNumber); footer.set_metaindex_handle(metaindex_block_handle); footer.set_index_handle(index_block_handle); std::string footer_encoding; @@ -556,4 +458,7 @@ uint64_t BlockBasedTableBuilder::FileSize() const { return rep_->offset; } +const std::string BlockBasedTable::kFilterBlockPrefix = + "filter."; + } // namespace rocksdb diff --git a/table/block_based_table_builder.h b/table/block_based_table_builder.h index 517f8e785..1c4be1f83 100644 --- a/table/block_based_table_builder.h +++ b/table/block_based_table_builder.h @@ -12,7 +12,7 @@ #include "rocksdb/flush_block_policy.h" #include "rocksdb/options.h" #include "rocksdb/status.h" -#include "rocksdb/table.h" +#include "table/table_builder.h" namespace rocksdb { @@ -20,13 +20,13 @@ class BlockBuilder; class BlockHandle; class WritableFile; - class BlockBasedTableBuilder : public TableBuilder { public: // Create a builder that will store the contents of the table it is // building in *file. Does not close the file. It is up to the // caller to close the file after calling Finish(). 
BlockBasedTableBuilder(const Options& options, + const InternalKeyComparator& internal_comparator, WritableFile* file, FlushBlockPolicyFactory* flush_block_policy_factory, CompressionType compression_type); diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index a9cd35a68..6a4a64462 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -18,17 +18,19 @@ namespace rocksdb { -Status BlockBasedTableFactory::GetTableReader( +Status BlockBasedTableFactory::NewTableReader( const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const { return BlockBasedTable::Open(options, soptions, table_options_, - std::move(file), file_size, table_reader); + internal_comparator, std::move(file), file_size, + table_reader); } -TableBuilder* BlockBasedTableFactory::GetTableBuilder( - const Options& options, WritableFile* file, - CompressionType compression_type) const { +TableBuilder* BlockBasedTableFactory::NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const { auto flush_block_policy_factory = table_options_.flush_block_policy_factory.get(); @@ -45,11 +47,9 @@ TableBuilder* BlockBasedTableFactory::GetTableBuilder( options.block_size_deviation); } - auto table_builder = new BlockBasedTableBuilder( - options, - file, - flush_block_policy_factory, - compression_type); + auto table_builder = + new BlockBasedTableBuilder(options, internal_comparator, file, + flush_block_policy_factory, compression_type); // Delete flush_block_policy_factory only when it's just created from the // options. 
@@ -63,4 +63,9 @@ TableBuilder* BlockBasedTableFactory::GetTableBuilder( return table_builder; } +TableFactory* NewBlockBasedTableFactory( + const BlockBasedTableOptions& table_options) { + return new BlockBasedTableFactory(table_options); +} + } // namespace rocksdb diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index 5a4d1bd6e..556997065 100644 --- a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -14,7 +14,6 @@ #include "rocksdb/flush_block_policy.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "table/block_based_table_options.h" namespace rocksdb { @@ -22,31 +21,26 @@ struct Options; struct EnvOptions; using std::unique_ptr; -class Status; -class RandomAccessFile; -class WritableFile; -class Table; -class TableBuilder; -class BlockBasedTable; class BlockBasedTableBuilder; -class BlockBasedTableFactory: public TableFactory { +class BlockBasedTableFactory : public TableFactory { public: - BlockBasedTableFactory() : BlockBasedTableFactory(BlockBasedTableOptions()) {} - explicit BlockBasedTableFactory(const BlockBasedTableOptions& table_options) + explicit BlockBasedTableFactory( + const BlockBasedTableOptions& table_options = BlockBasedTableOptions()) : table_options_(table_options) {} ~BlockBasedTableFactory() {} const char* Name() const override { return "BlockBasedTable"; } - Status GetTableReader(const Options& options, const EnvOptions& soptions, + Status NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const override; - TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, - CompressionType compression_type) - const override; + TableBuilder* NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const override; private: 
BlockBasedTableOptions table_options_; diff --git a/table/block_based_table_options.h b/table/block_based_table_options.h deleted file mode 100644 index f5774e2bf..000000000 --- a/table/block_based_table_options.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#pragma once -#include - -namespace rocksdb { - -class FlushBlockPolicyFactory; - -struct BlockBasedTableOptions { - // @flush_block_policy_factory creates the instances of flush block policy. - // which provides a configurable way to determine when to flush a block in - // the block based tables. If not set, table builder will use the default - // block flush policy, which cut blocks by block size (please refer to - // `FlushBlockBySizePolicy`). - std::shared_ptr flush_block_policy_factory; - - // TODO(kailiu) Temporarily disable this feature by making the default value - // to be false. Also in master branch, this file is non-public so no user - // will be able to change the value of `cache_index_and_filter_blocks`. - // - // Indicating if we'd put index/filter blocks to the block cache. - // If not specified, each "table reader" object will pre-load index/filter - // block during table initialization. 
- bool cache_index_and_filter_blocks = false; -}; - -} // namespace rocksdb diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index b08ea1934..f4dd5b2ec 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -21,15 +21,17 @@ #include "table/block.h" #include "table/filter_block.h" #include "table/format.h" +#include "table/meta_blocks.h" #include "table/two_level_iterator.h" #include "util/coding.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" -#include "table/block_based_table_options.h" namespace rocksdb { +extern uint64_t kBlockBasedTableMagicNumber; + // The longest the prefix of the cache key used to identify blocks can be. // We are using the fact that we know for Posix files the unique ID is three // varints. @@ -37,12 +39,13 @@ const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1; using std::unique_ptr; struct BlockBasedTable::Rep { - Rep(const EnvOptions& storage_options) : - soptions(storage_options) { - } + Rep(const EnvOptions& storage_options, + const InternalKeyComparator& internal_comparator) + : soptions(storage_options), internal_comparator_(internal_comparator) {} Options options; const EnvOptions& soptions; + const InternalKeyComparator& internal_comparator_; Status status; unique_ptr file; char cache_key_prefix[kMaxCacheKeyPrefixSize]; @@ -223,34 +226,19 @@ Cache::Handle* GetFromBlockCache( Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) { table_reader->reset(); - if (file_size < Footer::kEncodedLength) { - return Status::InvalidArgument("file is too short to be an sstable"); - } - - char footer_space[Footer::kEncodedLength]; - Slice footer_input; - Status s = file->Read(file_size - Footer::kEncodedLength, - Footer::kEncodedLength, &footer_input, footer_space); 
- if (!s.ok()) return s; - - // Check that we actually read the whole footer from the file. It may be - // that size isn't correct. - if (footer_input.size() != Footer::kEncodedLength) { - return Status::InvalidArgument("file is too short to be an sstable"); - } - - Footer footer; - s = footer.DecodeFrom(&footer_input); + Footer footer(kBlockBasedTableMagicNumber); + auto s = ReadFooterFromFile(file.get(), file_size, &footer); if (!s.ok()) return s; // We've successfully read the footer and the index block: we're // ready to serve requests. - Rep* rep = new BlockBasedTable::Rep(soptions); + Rep* rep = new BlockBasedTable::Rep(soptions, internal_comparator); rep->options = options; rep->file = std::move(file); rep->metaindex_handle = footer.metaindex_handle(); @@ -265,10 +253,11 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, // Read the properties meta_iter->Seek(kPropertiesBlock); - if (meta_iter->Valid() && meta_iter->key() == Slice(kPropertiesBlock)) { + if (meta_iter->Valid() && meta_iter->key() == kPropertiesBlock) { s = meta_iter->status(); if (s.ok()) { - s = ReadProperties(meta_iter->value(), rep, &rep->table_properties); + s = ReadProperties(meta_iter->value(), rep->file.get(), rep->options.env, + rep->options.info_log.get(), &rep->table_properties); } if (!s.ok()) { @@ -350,7 +339,7 @@ void BlockBasedTable::SetupForCompaction() { compaction_optimized_ = true; } -TableProperties& BlockBasedTable::GetTableProperties() { +const TableProperties& BlockBasedTable::GetTableProperties() { return rep_->table_properties; } @@ -415,96 +404,6 @@ FilterBlockReader* BlockBasedTable::ReadFilter ( rep->options, block.data, block.heap_allocated); } -Status BlockBasedTable::ReadProperties( - const Slice& handle_value, Rep* rep, TableProperties* table_properties) { - assert(table_properties); - - Slice v = handle_value; - BlockHandle handle; - if (!handle.DecodeFrom(&v).ok()) { - return Status::InvalidArgument("Failed to decode properties 
block handle"); - } - - BlockContents block_contents; - Status s = ReadBlockContents( - rep->file.get(), - ReadOptions(), - handle, - &block_contents, - rep->options.env, - false - ); - - if (!s.ok()) { - return s; - } - - Block properties_block(block_contents); - std::unique_ptr iter( - properties_block.NewIterator(BytewiseComparator()) - ); - - // All pre-defined properties of type uint64_t - std::unordered_map predefined_uint64_properties = { - { BlockBasedTablePropertiesNames::kDataSize, - &table_properties->data_size }, - { BlockBasedTablePropertiesNames::kIndexSize, - &table_properties->index_size }, - { BlockBasedTablePropertiesNames::kFilterSize, - &table_properties->filter_size }, - { BlockBasedTablePropertiesNames::kRawKeySize, - &table_properties->raw_key_size }, - { BlockBasedTablePropertiesNames::kRawValueSize, - &table_properties->raw_value_size }, - { BlockBasedTablePropertiesNames::kNumDataBlocks, - &table_properties->num_data_blocks }, - { BlockBasedTablePropertiesNames::kNumEntries, - &table_properties->num_entries }, - }; - - std::string last_key; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - s = iter->status(); - if (!s.ok()) { - break; - } - - auto key = iter->key().ToString(); - // properties block is strictly sorted with no duplicate key. 
- assert( - last_key.empty() || - BytewiseComparator()->Compare(key, last_key) > 0 - ); - last_key = key; - - auto raw_val = iter->value(); - auto pos = predefined_uint64_properties.find(key); - - if (pos != predefined_uint64_properties.end()) { - // handle predefined rocksdb properties - uint64_t val; - if (!GetVarint64(&raw_val, &val)) { - // skip malformed value - auto error_msg = - "[Warning] detect malformed value in properties meta-block:" - "\tkey: " + key + "\tval: " + raw_val.ToString(); - Log(rep->options.info_log, "%s", error_msg.c_str()); - continue; - } - *(pos->second) = val; - } else if (key == BlockBasedTablePropertiesNames::kFilterPolicy) { - table_properties->filter_policy_name = raw_val.ToString(); - } else { - // handle user-collected - table_properties->user_collected_properties.insert( - std::make_pair(key, raw_val.ToString()) - ); - } - } - - return s; -} - Status BlockBasedTable::GetBlock( const BlockBasedTable* table, const BlockHandle& handle, @@ -764,7 +663,7 @@ Iterator* BlockBasedTable::BlockReader(void* arg, Iterator* iter; if (block != nullptr) { - iter = block->NewIterator(table->rep_->options.comparator); + iter = block->NewIterator(&(table->rep_->internal_comparator_)); if (cache_handle != nullptr) { iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle); } else { @@ -837,7 +736,7 @@ BlockBasedTable::GetFilter(bool no_io) const { // Get the iterator from the index block. 
Iterator* BlockBasedTable::IndexBlockReader(const ReadOptions& options) const { if (rep_->index_block) { - return rep_->index_block->NewIterator(rep_->options.comparator); + return rep_->index_block->NewIterator(&(rep_->internal_comparator_)); } // get index block from cache @@ -858,7 +757,7 @@ Iterator* BlockBasedTable::IndexBlockReader(const ReadOptions& options) const { Iterator* iter; if (entry.value != nullptr) { - iter = entry.value->NewIterator(rep_->options.comparator); + iter = entry.value->NewIterator(&(rep_->internal_comparator_)); if (entry.cache_handle) { iter->RegisterCleanup( &ReleaseBlock, rep_->options.block_cache.get(), entry.cache_handle @@ -872,9 +771,9 @@ Iterator* BlockBasedTable::IndexBlockReader(const ReadOptions& options) const { return iter; } -Iterator* BlockBasedTable::BlockReader(void* arg, - const ReadOptions& options, +Iterator* BlockBasedTable::BlockReader(void* arg, const ReadOptions& options, const EnvOptions& soptions, + const InternalKeyComparator& icomparator, const Slice& index_value, bool for_compaction) { return BlockReader(arg, options, index_value, nullptr, for_compaction); @@ -965,20 +864,15 @@ Iterator* BlockBasedTable::NewIterator(const ReadOptions& options) { } } - return NewTwoLevelIterator( - IndexBlockReader(options), - &BlockBasedTable::BlockReader, - const_cast(this), - options, - rep_->soptions - ); + return NewTwoLevelIterator(IndexBlockReader(options), + &BlockBasedTable::BlockReader, + const_cast(this), options, + rep_->soptions, rep_->internal_comparator_); } Status BlockBasedTable::Get( - const ReadOptions& readOptions, - const Slice& key, - void* handle_context, - bool (*result_handler)(void* handle_context, const Slice& k, + const ReadOptions& readOptions, const Slice& key, void* handle_context, + bool (*result_handler)(void* handle_context, const ParsedInternalKey& k, const Slice& v, bool didIO), void (*mark_key_may_exist_handler)(void* handle_context)) { Status s; @@ -1016,8 +910,13 @@ Status 
BlockBasedTable::Get( // Call the *saver function on each entry/block until it returns false for (block_iter->Seek(key); block_iter->Valid(); block_iter->Next()) { - if (!(*result_handler)(handle_context, block_iter->key(), - block_iter->value(), didIO)) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(block_iter->key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } + + if (!(*result_handler)(handle_context, parsed_key, block_iter->value(), + didIO)) { done = true; break; } @@ -1034,7 +933,8 @@ Status BlockBasedTable::Get( return s; } -bool SaveDidIO(void* arg, const Slice& key, const Slice& value, bool didIO) { +bool SaveDidIO(void* arg, const ParsedInternalKey& key, const Slice& value, + bool didIO) { *reinterpret_cast(arg) = didIO; return false; } @@ -1075,25 +975,4 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) { return result; } -const std::string BlockBasedTable::kFilterBlockPrefix = - "filter."; -const std::string BlockBasedTable::kPropertiesBlock = - "rocksdb.properties"; -const std::string BlockBasedTablePropertiesNames::kDataSize = - "rocksdb.data.size"; -const std::string BlockBasedTablePropertiesNames::kIndexSize = - "rocksdb.index.size"; -const std::string BlockBasedTablePropertiesNames::kFilterSize = - "rocksdb.filter.size"; -const std::string BlockBasedTablePropertiesNames::kRawKeySize = - "rocksdb.raw.key.size"; -const std::string BlockBasedTablePropertiesNames::kRawValueSize = - "rocksdb.raw.value.size"; -const std::string BlockBasedTablePropertiesNames::kNumDataBlocks = - "rocksdb.num.data.blocks"; -const std::string BlockBasedTablePropertiesNames::kNumEntries = - "rocksdb.num.entries"; -const std::string BlockBasedTablePropertiesNames::kFilterPolicy = - "rocksdb.filter.policy"; - } // namespace rocksdb diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 52ece7441..58e5b0716 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -14,8 +14,7 @@ 
#include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/statistics.h" -#include "rocksdb/table_properties.h" -#include "rocksdb/table.h" +#include "table/table_reader.h" #include "util/coding.h" namespace rocksdb { @@ -39,7 +38,6 @@ using std::unique_ptr; class BlockBasedTable : public TableReader { public: static const std::string kFilterBlockPrefix; - static const std::string kPropertiesBlock; // Attempt to open the table that is stored in bytes [0..file_size) // of "file", and read the metadata entries necessary to allow @@ -53,6 +51,7 @@ class BlockBasedTable : public TableReader { // *file must remain live while this Table is in use. static Status Open(const Options& db_options, const EnvOptions& env_options, const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_key_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader); @@ -63,14 +62,13 @@ class BlockBasedTable : public TableReader { // call one of the Seek methods on the iterator before using it). Iterator* NewIterator(const ReadOptions&) override; - Status Get( - const ReadOptions& readOptions, - const Slice& key, - void* handle_context, - bool (*result_handler)(void* handle_context, const Slice& k, - const Slice& v, bool didIO), - void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) - override; + Status Get(const ReadOptions& readOptions, const Slice& key, + void* handle_context, + bool (*result_handler)(void* handle_context, + const ParsedInternalKey& k, const Slice& v, + bool didIO), + void (*mark_key_may_exist_handler)(void* handle_context) = + nullptr) override; // Given a key, return an approximate byte offset in the file where // the data for that key begins (or would begin if the key were @@ -82,13 +80,13 @@ class BlockBasedTable : public TableReader { // Returns true if the block for the specified key is in cache. // REQUIRES: key is in this table. 
- bool TEST_KeyInCache(const ReadOptions& options, const Slice& key) override; + bool TEST_KeyInCache(const ReadOptions& options, const Slice& key); // Set up the table for Compaction. Might change some parameters with // posix_fadvise void SetupForCompaction() override; - TableProperties& GetTableProperties() override; + const TableProperties& GetTableProperties() override; ~BlockBasedTable(); @@ -101,8 +99,9 @@ class BlockBasedTable : public TableReader { bool compaction_optimized_; static Iterator* BlockReader(void*, const ReadOptions&, - const EnvOptions& soptions, const Slice&, - bool for_compaction); + const EnvOptions& soptions, + const InternalKeyComparator& icomparator, + const Slice&, bool for_compaction); static Iterator* BlockReader(void*, const ReadOptions&, const Slice&, bool* didIO, bool for_compaction = false); @@ -142,7 +141,6 @@ class BlockBasedTable : public TableReader { void ReadMeta(const Footer& footer); void ReadFilter(const Slice& filter_handle_value); - static Status ReadProperties(const Slice& handle_value, Rep* rep); // Read the meta block from sst. static Status ReadMetaBlock( @@ -156,10 +154,6 @@ class BlockBasedTable : public TableReader { Rep* rep, size_t* filter_size = nullptr); - // Read the table properties from properties block. 
- static Status ReadProperties( - const Slice& handle_value, Rep* rep, TableProperties* properties); - static void SetupCacheKeyPrefix(Rep* rep); explicit BlockBasedTable(Rep* rep) : @@ -181,15 +175,4 @@ class BlockBasedTable : public TableReader { void operator=(const TableReader&) = delete; }; -struct BlockBasedTablePropertiesNames { - static const std::string kDataSize; - static const std::string kIndexSize; - static const std::string kFilterSize; - static const std::string kRawKeySize; - static const std::string kRawValueSize; - static const std::string kNumDataBlocks; - static const std::string kNumEntries; - static const std::string kFilterPolicy; -}; - } // namespace rocksdb diff --git a/table/block_builder.cc b/table/block_builder.cc index 917601865..f812dbae7 100644 --- a/table/block_builder.cc +++ b/table/block_builder.cc @@ -36,6 +36,7 @@ #include #include #include "rocksdb/comparator.h" +#include "db/dbformat.h" #include "util/coding.h" namespace rocksdb { @@ -51,9 +52,8 @@ BlockBuilder::BlockBuilder(int block_restart_interval, restarts_.push_back(0); // First restart point is at offset 0 } -BlockBuilder::BlockBuilder(const Options& options) - : BlockBuilder(options.block_restart_interval, options.comparator) { -} +BlockBuilder::BlockBuilder(const Options& options, const Comparator* comparator) + : BlockBuilder(options.block_restart_interval, comparator) {} void BlockBuilder::Reset() { buffer_.clear(); diff --git a/table/block_builder.h b/table/block_builder.h index 31faf19b8..ed2f290fd 100644 --- a/table/block_builder.h +++ b/table/block_builder.h @@ -21,7 +21,7 @@ class Comparator; class BlockBuilder { public: BlockBuilder(int block_builder, const Comparator* comparator); - explicit BlockBuilder(const Options& options); + explicit BlockBuilder(const Options& options, const Comparator* comparator); // Reset the contents as if the BlockBuilder was just constructed. 
void Reset(); diff --git a/table/block_test.cc b/table/block_test.cc index 7f33e3a90..588ce6729 100644 --- a/table/block_test.cc +++ b/table/block_test.cc @@ -32,9 +32,12 @@ class BlockTest {}; TEST(BlockTest, SimpleTest) { Random rnd(301); Options options = Options(); + std::unique_ptr ic; + ic.reset(new test::PlainInternalKeyComparator(options.comparator)); + std::vector keys; std::vector values; - BlockBuilder builder(options); + BlockBuilder builder(options, ic.get()); int num_records = 100000; char buf[10]; char* p = &buf[0]; diff --git a/table/filter_block.cc b/table/filter_block.cc index 96ba7cb1d..d7be78e1c 100644 --- a/table/filter_block.cc +++ b/table/filter_block.cc @@ -21,11 +21,12 @@ namespace rocksdb { static const size_t kFilterBaseLg = 11; static const size_t kFilterBase = 1 << kFilterBaseLg; -FilterBlockBuilder::FilterBlockBuilder(const Options& opt) - : policy_(opt.filter_policy), - prefix_extractor_(opt.prefix_extractor), - whole_key_filtering_(opt.whole_key_filtering), - comparator_(opt.comparator){} +FilterBlockBuilder::FilterBlockBuilder(const Options& opt, + const Comparator* internal_comparator) + : policy_(opt.filter_policy), + prefix_extractor_(opt.prefix_extractor), + whole_key_filtering_(opt.whole_key_filtering), + comparator_(internal_comparator) {} void FilterBlockBuilder::StartBlock(uint64_t block_offset) { uint64_t filter_index = (block_offset / kFilterBase); diff --git a/table/filter_block.h b/table/filter_block.h index e47f94653..da19d42e9 100644 --- a/table/filter_block.h +++ b/table/filter_block.h @@ -35,7 +35,8 @@ class FilterPolicy; // (StartBlock AddKey*)* Finish class FilterBlockBuilder { public: - explicit FilterBlockBuilder(const Options& opt); + explicit FilterBlockBuilder(const Options& opt, + const Comparator* internal_comparator); void StartBlock(uint64_t block_offset); void AddKey(const Slice& key); diff --git a/table/filter_block_test.cc b/table/filter_block_test.cc index bc1a0d0ab..1703d59d1 100644 --- 
a/table/filter_block_test.cc +++ b/table/filter_block_test.cc @@ -55,7 +55,7 @@ class FilterBlockTest { }; TEST(FilterBlockTest, EmptyBuilder) { - FilterBlockBuilder builder(options_); + FilterBlockBuilder builder(options_, options_.comparator); Slice block = builder.Finish(); ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block)); FilterBlockReader reader(options_, block); @@ -64,7 +64,7 @@ TEST(FilterBlockTest, EmptyBuilder) { } TEST(FilterBlockTest, SingleChunk) { - FilterBlockBuilder builder(options_); + FilterBlockBuilder builder(options_, options_.comparator); builder.StartBlock(100); builder.AddKey("foo"); builder.AddKey("bar"); @@ -85,7 +85,7 @@ TEST(FilterBlockTest, SingleChunk) { } TEST(FilterBlockTest, MultiChunk) { - FilterBlockBuilder builder(options_); + FilterBlockBuilder builder(options_, options_.comparator); // First filter builder.StartBlock(0); diff --git a/table/format.cc b/table/format.cc index ff6d8fa24..77a55237e 100644 --- a/table/format.cc +++ b/table/format.cc @@ -34,6 +34,7 @@ Status BlockHandle::DecodeFrom(Slice* input) { return Status::Corruption("bad block handle"); } } +const BlockHandle BlockHandle::kNullBlockHandle(0, 0); void Footer::EncodeTo(std::string* dst) const { #ifndef NDEBUG @@ -72,6 +73,30 @@ Status Footer::DecodeFrom(Slice* input) { return result; } +Status ReadFooterFromFile(RandomAccessFile* file, + uint64_t file_size, + Footer* footer) { + if (file_size < Footer::kEncodedLength) { + return Status::InvalidArgument("file is too short to be an sstable"); + } + + char footer_space[Footer::kEncodedLength]; + Slice footer_input; + Status s = file->Read(file_size - Footer::kEncodedLength, + Footer::kEncodedLength, + &footer_input, + footer_space); + if (!s.ok()) return s; + + // Check that we actually read the whole footer from the file. It may be + // that size isn't correct. 
+ if (footer_input.size() != Footer::kEncodedLength) { + return Status::InvalidArgument("file is too short to be an sstable"); + } + + return footer->DecodeFrom(&footer_input); +} + Status ReadBlockContents(RandomAccessFile* file, const ReadOptions& options, const BlockHandle& handle, diff --git a/table/format.h b/table/format.h index 2f1c1e8dc..207527fcb 100644 --- a/table/format.h +++ b/table/format.h @@ -26,6 +26,7 @@ struct ReadOptions; class BlockHandle { public: BlockHandle(); + BlockHandle(uint64_t offset, uint64_t size); // The offset of the block in the file. uint64_t offset() const { return offset_; } @@ -38,19 +39,36 @@ class BlockHandle { void EncodeTo(std::string* dst) const; Status DecodeFrom(Slice* input); + // if the block handle's offset and size are both "0", we will view it + // as a null block handle that points to no where. + bool IsNull() const { + return offset_ == 0 && size_ == 0; + } + + static const BlockHandle& NullBlockHandle() { + return kNullBlockHandle; + } + // Maximum encoding length of a BlockHandle enum { kMaxEncodedLength = 10 + 10 }; private: - uint64_t offset_; - uint64_t size_; + uint64_t offset_ = 0; + uint64_t size_ = 0; + + static const BlockHandle kNullBlockHandle; }; // Footer encapsulates the fixed information stored at the tail // end of every table file. class Footer { public: - Footer() { } + // @table_magic_number serves two purposes: + // 1. Identify different types of the tables. + // 2. Help us to identify if a given file is a valid sst. 
+ Footer(uint64_t table_magic_number) : + kTableMagicNumber(table_magic_number) { + } // The block handle for the metaindex block of the table const BlockHandle& metaindex_handle() const { return metaindex_handle_; } @@ -77,12 +95,13 @@ class Footer { private: BlockHandle metaindex_handle_; BlockHandle index_handle_; + const uint64_t kTableMagicNumber; }; -// kTableMagicNumber was picked by running -// echo http://code.google.com/p/leveldb/ | sha1sum -// and taking the leading 64 bits. -static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull; +// Read the footer from file +Status ReadFooterFromFile(RandomAccessFile* file, + uint64_t file_size, + Footer* footer); // 1-byte type + 32-bit crc static const size_t kBlockTrailerSize = 5; @@ -115,8 +134,13 @@ extern Status UncompressBlockContents(const char* data, // Implementation details follow. Clients should ignore, inline BlockHandle::BlockHandle() - : offset_(~static_cast(0)), - size_(~static_cast(0)) { + : BlockHandle(~static_cast(0), + ~static_cast(0)) { +} + +inline BlockHandle::BlockHandle(uint64_t offset, uint64_t size) + : offset_(offset), + size_(size) { } } // namespace rocksdb diff --git a/table/merger.cc b/table/merger.cc index f5ce7440c..1aed00cc5 100644 --- a/table/merger.cc +++ b/table/merger.cc @@ -11,8 +11,11 @@ #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" +#include "rocksdb/options.h" #include "table/iter_heap.h" #include "table/iterator_wrapper.h" +#include "util/stop_watch.h" +#include "util/perf_context_imp.h" #include @@ -22,10 +25,13 @@ namespace { class MergingIterator : public Iterator { public: - MergingIterator(const Comparator* comparator, Iterator** children, int n) + MergingIterator(Env* const env, const Comparator* comparator, + Iterator** children, int n) : comparator_(comparator), children_(n), current_(nullptr), + use_heap_(true), + env_(env), direction_(kForward), maxHeap_(NewMaxIterHeap(comparator_)), minHeap_ (NewMinIterHeap(comparator_)) { @@ -70,15 +76,52 
@@ class MergingIterator : public Iterator { } virtual void Seek(const Slice& target) { - ClearHeaps(); + // Invalidate the heap. + use_heap_ = false; + IteratorWrapper* first_child = nullptr; + StopWatchNano child_seek_timer(env_, false); + StopWatchNano min_heap_timer(env_, false); for (auto& child : children_) { + StartPerfTimer(&child_seek_timer); child.Seek(target); + BumpPerfTime(&perf_context.seek_child_seek_time, &child_seek_timer); + BumpPerfCount(&perf_context.seek_child_seek_count); + if (child.Valid()) { - minHeap_.push(&child); + // This child has a valid key + if (!use_heap_) { + if (first_child == nullptr) { + // It's the first child with a valid key. Only put it into + // current_. Now the values in the heap should be invalid. + first_child = &child; + } else { + // We have more than one child with valid keys. Initialize + // the heap and put the first child into the heap. + StartPerfTimer(&min_heap_timer); + ClearHeaps(); + BumpPerfTime(&perf_context.seek_min_heap_time, &min_heap_timer); + StartPerfTimer(&min_heap_timer); + minHeap_.push(first_child); + BumpPerfTime(&perf_context.seek_min_heap_time, &min_heap_timer); + } + } + if (use_heap_) { + StartPerfTimer(&min_heap_timer); + minHeap_.push(&child); + BumpPerfTime(&perf_context.seek_min_heap_time, &min_heap_timer); + } + } } - FindSmallest(); - direction_ = kForward; + if (use_heap_) { + // If the heap is valid, need to put the smallest key to current_. + StartPerfTimer(&min_heap_timer); + FindSmallest(); + BumpPerfTime(&perf_context.seek_min_heap_time, &min_heap_timer); + } else { + // The heap is not valid, then the current_ iterator is the first + // one, or null if there is no first child. + current_ = first_child; + } } virtual void Next() { @@ -109,10 +152,14 @@ class MergingIterator : public Iterator { // as the current points to the current record. move the iterator forward. // and if it is valid add it to the heap. 
current_->Next(); - if (current_->Valid()){ - minHeap_.push(current_); + if (use_heap_) { + if (current_->Valid()) { + minHeap_.push(current_); + } + FindSmallest(); + } else if (!current_->Valid()) { + current_ = nullptr; } - FindSmallest(); } virtual void Prev() { @@ -178,6 +225,11 @@ class MergingIterator : public Iterator { const Comparator* comparator_; std::vector children_; IteratorWrapper* current_; + // If the value is true, both of iterators in the heap and current_ + // contain valid rows. If it is false, only current_ can possibly contain + // valid rows. + bool use_heap_; + Env* const env_; // Which direction is the iterator moving? enum Direction { kForward, @@ -189,6 +241,7 @@ class MergingIterator : public Iterator { }; void MergingIterator::FindSmallest() { + assert(use_heap_); if (minHeap_.empty()) { current_ = nullptr; } else { @@ -199,6 +252,7 @@ void MergingIterator::FindSmallest() { } void MergingIterator::FindLargest() { + assert(use_heap_); if (maxHeap_.empty()) { current_ = nullptr; } else { @@ -209,19 +263,21 @@ void MergingIterator::FindLargest() { } void MergingIterator::ClearHeaps() { + use_heap_ = true; maxHeap_ = NewMaxIterHeap(comparator_); minHeap_ = NewMinIterHeap(comparator_); } } // namespace -Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) { +Iterator* NewMergingIterator(Env* const env, const Comparator* cmp, + Iterator** list, int n) { assert(n >= 0); if (n == 0) { return NewEmptyIterator(); } else if (n == 1) { return list[0]; } else { - return new MergingIterator(cmp, list, n); + return new MergingIterator(env, cmp, list, n); } } diff --git a/table/merger.h b/table/merger.h index dbc1f69eb..ea8daa770 100644 --- a/table/merger.h +++ b/table/merger.h @@ -13,6 +13,7 @@ namespace rocksdb { class Comparator; class Iterator; +class Env; // Return an iterator that provided the union of the data in // children[0,n-1]. 
Takes ownership of the child iterators and @@ -22,7 +23,8 @@ class Iterator; // key is present in K child iterators, it will be yielded K times. // // REQUIRES: n >= 0 -extern Iterator* NewMergingIterator( - const Comparator* comparator, Iterator** children, int n); +extern Iterator* NewMergingIterator(Env* const env, + const Comparator* comparator, + Iterator** children, int n); } // namespace rocksdb diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc new file mode 100644 index 000000000..a4d98bb22 --- /dev/null +++ b/table/meta_blocks.cc @@ -0,0 +1,286 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "table/meta_blocks.h" + +#include + +#include "rocksdb/table.h" +#include "table/block.h" +#include "table/format.h" +#include "util/coding.h" + +namespace rocksdb { + +MetaIndexBuilder::MetaIndexBuilder() + : meta_index_block_( + new BlockBuilder(1 /* restart interval */, BytewiseComparator())) { +} + +void MetaIndexBuilder::Add(const std::string& key, + const BlockHandle& handle) { + std::string handle_encoding; + handle.EncodeTo(&handle_encoding); + meta_block_handles_.insert({key, handle_encoding}); +} + +Slice MetaIndexBuilder::Finish() { + for (const auto& metablock : meta_block_handles_) { + meta_index_block_->Add(metablock.first, metablock.second); + } + return meta_index_block_->Finish(); +} + +PropertyBlockBuilder::PropertyBlockBuilder() + : properties_block_( + new BlockBuilder(1 /* restart interval */, BytewiseComparator())) { +} + +void PropertyBlockBuilder::Add(const std::string& name, + const std::string& val) { + props_.insert({name, val}); +} + +void PropertyBlockBuilder::Add(const std::string& name, uint64_t val) { + assert(props_.find(name) == props_.end()); + + std::string dst; + 
PutVarint64(&dst, val); + + Add(name, dst); +} + +void PropertyBlockBuilder::Add( + const UserCollectedProperties& user_collected_properties) { + for (const auto& prop : user_collected_properties) { + Add(prop.first, prop.second); + } +} + +void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { + Add(TablePropertiesNames::kRawKeySize, props.raw_key_size); + Add(TablePropertiesNames::kRawValueSize, props.raw_value_size); + Add(TablePropertiesNames::kDataSize, props.data_size); + Add(TablePropertiesNames::kIndexSize, props.index_size); + Add(TablePropertiesNames::kNumEntries, props.num_entries); + Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks); + Add(TablePropertiesNames::kFilterSize, props.filter_size); + Add(TablePropertiesNames::kFormatVersion, props.format_version); + Add(TablePropertiesNames::kFixedKeyLen, props.fixed_key_len); + + if (!props.filter_policy_name.empty()) { + Add(TablePropertiesNames::kFilterPolicy, + props.filter_policy_name); + } +} + +Slice PropertyBlockBuilder::Finish() { + for (const auto& prop : props_) { + properties_block_->Add(prop.first, prop.second); + } + + return properties_block_->Finish(); +} + +void LogPropertiesCollectionError( + Logger* info_log, const std::string& method, const std::string& name) { + assert(method == "Add" || method == "Finish"); + + std::string msg = + "[Warning] encountered error when calling TablePropertiesCollector::" + + method + "() with collector name: " + name; + Log(info_log, "%s", msg.c_str()); +} + +bool NotifyCollectTableCollectorsOnAdd( + const Slice& key, + const Slice& value, + const Options::TablePropertiesCollectors& collectors, + Logger* info_log) { + bool all_succeeded = true; + for (auto collector : collectors) { + Status s = collector->Add(key, value); + all_succeeded = all_succeeded && s.ok(); + if (!s.ok()) { + LogPropertiesCollectionError( + info_log, "Add", /* method */ collector->Name() + ); + } + } + return all_succeeded; +} + +bool 
NotifyCollectTableCollectorsOnFinish( + const Options::TablePropertiesCollectors& collectors, + Logger* info_log, + PropertyBlockBuilder* builder) { + bool all_succeeded = true; + for (auto collector : collectors) { + UserCollectedProperties user_collected_properties; + Status s = collector->Finish(&user_collected_properties); + + all_succeeded = all_succeeded && s.ok(); + if (!s.ok()) { + LogPropertiesCollectionError( + info_log, "Finish", /* method */ collector->Name() + ); + } else { + builder->Add(user_collected_properties); + } + } + + return all_succeeded; +} + +Status ReadProperties( + const Slice& handle_value, + RandomAccessFile* file, + Env* env, + Logger* logger, + TableProperties* table_properties) { + assert(table_properties); + + Slice v = handle_value; + BlockHandle handle; + if (!handle.DecodeFrom(&v).ok()) { + return Status::InvalidArgument("Failed to decode properties block handle"); + } + + BlockContents block_contents; + ReadOptions read_options; + read_options.verify_checksums = false; + Status s = ReadBlockContents( + file, + read_options, + handle, + &block_contents, + env, + false + ); + + if (!s.ok()) { + return s; + } + + Block properties_block(block_contents); + std::unique_ptr iter( + properties_block.NewIterator(BytewiseComparator()) + ); + + // All pre-defined properties of type uint64_t + std::unordered_map predefined_uint64_properties = { + { TablePropertiesNames::kDataSize, &table_properties->data_size }, + { TablePropertiesNames::kIndexSize, &table_properties->index_size }, + { TablePropertiesNames::kFilterSize, &table_properties->filter_size }, + { TablePropertiesNames::kRawKeySize, &table_properties->raw_key_size }, + { TablePropertiesNames::kRawValueSize, &table_properties->raw_value_size }, + { TablePropertiesNames::kNumDataBlocks, + &table_properties->num_data_blocks }, + { TablePropertiesNames::kNumEntries, &table_properties->num_entries }, + { TablePropertiesNames::kFormatVersion, &table_properties->format_version }, + { 
TablePropertiesNames::kFixedKeyLen, &table_properties->fixed_key_len }, + }; + + std::string last_key; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + s = iter->status(); + if (!s.ok()) { + break; + } + + auto key = iter->key().ToString(); + // properties block is strictly sorted with no duplicate key. + assert( + last_key.empty() || + BytewiseComparator()->Compare(key, last_key) > 0 + ); + last_key = key; + + auto raw_val = iter->value(); + auto pos = predefined_uint64_properties.find(key); + + if (pos != predefined_uint64_properties.end()) { + // handle predefined rocksdb properties + uint64_t val; + if (!GetVarint64(&raw_val, &val)) { + // skip malformed value + auto error_msg = + "[Warning] detect malformed value in properties meta-block:" + "\tkey: " + key + "\tval: " + raw_val.ToString(); + Log(logger, "%s", error_msg.c_str()); + continue; + } + *(pos->second) = val; + } else if (key == TablePropertiesNames::kFilterPolicy) { + table_properties->filter_policy_name = raw_val.ToString(); + } else { + // handle user-collected properties + table_properties->user_collected_properties.insert( + std::make_pair(key, raw_val.ToString()) + ); + } + } + + return s; +} + +Status ReadTableProperties( + RandomAccessFile* file, + uint64_t file_size, + uint64_t table_magic_number, + Env* env, + Logger* info_log, + TableProperties* properties) { + // -- Read metaindex block + Footer footer(table_magic_number); + auto s = ReadFooterFromFile(file, file_size, &footer); + if (!s.ok()) { + return s; + } + + auto metaindex_handle = footer.metaindex_handle(); + BlockContents metaindex_contents; + ReadOptions read_options; + read_options.verify_checksums = false; + s = ReadBlockContents( + file, + read_options, + metaindex_handle, + &metaindex_contents, + env, + false + ); + if (!s.ok()) { + return s; + } + Block metaindex_block(metaindex_contents); + std::unique_ptr meta_iter( + metaindex_block.NewIterator(BytewiseComparator()) + ); + + // -- Read property block + 
meta_iter->Seek(kPropertiesBlock); + TableProperties table_properties; + if (meta_iter->Valid() && + meta_iter->key() == kPropertiesBlock && + meta_iter->status().ok()) { + s = ReadProperties( + meta_iter->value(), + file, + env, + info_log, + properties + ); + } else { + s = Status::Corruption( + "Unable to read the property block from the plain table" + ); + } + + return s; +} + + +} // namespace rocksdb diff --git a/table/meta_blocks.h b/table/meta_blocks.h new file mode 100644 index 000000000..9f236eff6 --- /dev/null +++ b/table/meta_blocks.h @@ -0,0 +1,121 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#pragma once + +#include +#include +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/table_properties.h" +#include "table/block_builder.h" + +namespace rocksdb { + +class BlockBuilder; +class BlockHandle; +class Env; +class Logger; +class RandomAccessFile; +struct TableProperties; + +// An STL style comparator that does the bytewise comparator comparison +// internally. +struct BytewiseLessThan { + bool operator()(const std::string& key1, const std::string& key2) const { + // smaller entries will be placed in front. + return comparator->Compare(key1, key2) < 0; + } + + const Comparator* comparator = BytewiseComparator(); +}; + +// When writing to a block that requires entries to be sorted by +// `BytewiseComparator`, we can buffer the content to `BytewiseSortedMap` +// before writing to store. 
+typedef std::map BytewiseSortedMap; + +class MetaIndexBuilder { + public: + MetaIndexBuilder(const MetaIndexBuilder&) = delete; + MetaIndexBuilder& operator=(const MetaIndexBuilder&) = delete; + + MetaIndexBuilder(); + void Add(const std::string& key, const BlockHandle& handle); + + // Write all the added key/value pairs to the block and return the contents + // of the block. + Slice Finish(); + + private: + // store the sorted key/handle of the metablocks. + BytewiseSortedMap meta_block_handles_; + std::unique_ptr meta_index_block_; +}; + +class PropertyBlockBuilder { + public: + PropertyBlockBuilder(const PropertyBlockBuilder&) = delete; + PropertyBlockBuilder& operator=(const PropertyBlockBuilder&) = delete; + + PropertyBlockBuilder(); + + void AddTableProperty(const TableProperties& props); + void Add(const std::string& key, uint64_t value); + void Add(const std::string& key, const std::string& value); + void Add(const UserCollectedProperties& user_collected_properties); + + // Write all the added entries to the block and return the block contents + Slice Finish(); + + private: + std::unique_ptr properties_block_; + BytewiseSortedMap props_; +}; + +// If we encounter any error during user-defined statistics collection, +// we'll write the warning message to the info log. +void LogPropertiesCollectionError( + Logger* info_log, const std::string& method, const std::string& name); + +// Utility functions that help the table builder to trigger batch events for +// user defined property collectors. +// The return value indicates whether any error occurred; if an error +// occurred, the warning message will be logged. +// NotifyCollectTableCollectorsOnAdd() triggers the `Add` event for all +// property collectors. +bool NotifyCollectTableCollectorsOnAdd( + const Slice& key, + const Slice& value, + const Options::TablePropertiesCollectors& collectors, + Logger* info_log); + +// NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all +// property collectors. 
The collected properties will be added to `builder`. +bool NotifyCollectTableCollectorsOnFinish( + const Options::TablePropertiesCollectors& collectors, + Logger* info_log, + PropertyBlockBuilder* builder); + +// Read the properties from the table. +Status ReadProperties( + const Slice& handle_value, + RandomAccessFile* file, + Env* env, + Logger* logger, + TableProperties* table_properties); + +// Directly read the properties from the properties block of a plain table. +Status ReadTableProperties( + RandomAccessFile* file, + uint64_t file_size, + uint64_t table_magic_number, + Env* env, + Logger* info_log, + TableProperties* properties); + +} // namespace rocksdb diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc new file mode 100644 index 000000000..e33ac39f2 --- /dev/null +++ b/table/plain_table_builder.cc @@ -0,0 +1,198 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/plain_table_builder.h" + +#include +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "table/plain_table_factory.h" +#include "db/dbformat.h" +#include "table/block_builder.h" +#include "table/filter_block.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +namespace { + +// a utility that helps writing block content to the file +// @offset will advance if @block_contents was successfully written. +// @block_handle the block handle this particular block. 
+Status WriteBlock( + const Slice& block_contents, + WritableFile* file, + uint64_t* offset, + BlockHandle* block_handle) { + block_handle->set_offset(*offset); + block_handle->set_size(block_contents.size()); + Status s = file->Append(block_contents); + + if (s.ok()) { + *offset += block_contents.size(); + } + return s; +} + +} // namespace + +// kPlainTableMagicNumber was picked by running +// echo rocksdb.plain.table | sha1sum +// and taking the leading 64 bits. +extern const uint64_t kPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; + +PlainTableBuilder::PlainTableBuilder(const Options& options, + WritableFile* file, + uint32_t user_key_len) : + options_(options), file_(file), user_key_len_(user_key_len) { + properties_.fixed_key_len = user_key_len; + + // for plain table, we put all the data in a big chuck. + properties_.num_data_blocks = 1; + // emphasize that currently plain table doesn't have persistent index or + // filter block. + properties_.index_size = 0; + properties_.filter_size = 0; + properties_.format_version = 0; +} + +PlainTableBuilder::~PlainTableBuilder() { +} + +void PlainTableBuilder::Add(const Slice& key, const Slice& value) { + size_t user_key_size = key.size() - 8; + assert(user_key_len_ == 0 || user_key_size == user_key_len_); + + if (!IsFixedLength()) { + // Write key length + key_size_str_.clear(); + PutVarint32(&key_size_str_, user_key_size); + file_->Append(key_size_str_); + offset_ += key_size_str_.length(); + } + + // Write key + ParsedInternalKey parsed_key; + if (!ParseInternalKey(key, &parsed_key)) { + status_ = Status::Corruption(Slice()); + return; + } + if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) { + file_->Append(Slice(key.data(), user_key_size)); + char tmp_char = PlainTableFactory::kValueTypeSeqId0; + file_->Append(Slice(&tmp_char, 1)); + offset_ += key.size() - 7; + } else { + file_->Append(key); + offset_ += key.size(); + } + + // Write value length + value_size_str_.clear(); + int value_size = 
value.size(); + PutVarint32(&value_size_str_, value_size); + file_->Append(value_size_str_); + + // Write value + file_->Append(value); + offset_ += value_size + value_size_str_.length(); + + properties_.num_entries++; + properties_.raw_key_size += key.size(); + properties_.raw_value_size += value.size(); + + // notify property collectors + NotifyCollectTableCollectorsOnAdd( + key, + value, + options_.table_properties_collectors, + options_.info_log.get() + ); +} + +Status PlainTableBuilder::status() const { return status_; } + +Status PlainTableBuilder::Finish() { + assert(!closed_); + closed_ = true; + + properties_.data_size = offset_; + + // Write the following blocks + // 1. [meta block: properties] + // 2. [metaindex block] + // 3. [footer] + MetaIndexBuilder meta_index_builer; + + PropertyBlockBuilder property_block_builder; + // -- Add basic properties + property_block_builder.AddTableProperty(properties_); + + // -- Add user collected properties + NotifyCollectTableCollectorsOnFinish( + options_.table_properties_collectors, + options_.info_log.get(), + &property_block_builder + ); + + // -- Write property block + BlockHandle property_block_handle; + auto s = WriteBlock( + property_block_builder.Finish(), + file_, + &offset_, + &property_block_handle + ); + if (!s.ok()) { + return s; + } + meta_index_builer.Add(kPropertiesBlock, property_block_handle); + + // -- write metaindex block + BlockHandle metaindex_block_handle; + s = WriteBlock( + meta_index_builer.Finish(), + file_, + &offset_, + &metaindex_block_handle + ); + if (!s.ok()) { + return s; + } + + // Write Footer + Footer footer(kPlainTableMagicNumber); + footer.set_metaindex_handle(metaindex_block_handle); + footer.set_index_handle(BlockHandle::NullBlockHandle()); + std::string footer_encoding; + footer.EncodeTo(&footer_encoding); + s = file_->Append(footer_encoding); + if (s.ok()) { + offset_ += footer_encoding.size(); + } + + return s; +} + +void PlainTableBuilder::Abandon() { + closed_ = true; 
+} + +uint64_t PlainTableBuilder::NumEntries() const { + return properties_.num_entries; +} + +uint64_t PlainTableBuilder::FileSize() const { + return offset_; +} + +} // namespace rocksdb diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h new file mode 100644 index 000000000..1793d1d72 --- /dev/null +++ b/table/plain_table_builder.h @@ -0,0 +1,85 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// IndexedTable is a simple table format for UNIT TEST ONLY. It is not built +// as production quality. + +#pragma once +#include +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "table/table_builder.h" +#include "rocksdb/table_properties.h" + +namespace rocksdb { + +class BlockBuilder; +class BlockHandle; +class WritableFile; +class TableBuilder; + +class PlainTableBuilder: public TableBuilder { +public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). The output file + // will be part of level specified by 'level'. A value of -1 means + // that the caller does not know which level the output file will reside. + PlainTableBuilder(const Options& options, WritableFile* file, + uint32_t user_key_size); + + // REQUIRES: Either Finish() or Abandon() has been called. + ~PlainTableBuilder(); + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override; + + // Finish building the table. 
Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + +private: + Options options_; + WritableFile* file_; + uint64_t offset_ = 0; + Status status_; + TableProperties properties_; + + const size_t user_key_len_; + bool closed_ = false; // Either Finish() or Abandon() has been called. + + std::string key_size_str_; + std::string value_size_str_; + + bool IsFixedLength() const { + return user_key_len_ > 0; + } + + // No copying allowed + PlainTableBuilder(const PlainTableBuilder&) = delete; + void operator=(const PlainTableBuilder&) = delete; +}; + +} // namespace rocksdb + diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc new file mode 100644 index 000000000..c7ee8eb2f --- /dev/null +++ b/table/plain_table_factory.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "table/plain_table_factory.h" + +#include +#include +#include "db/dbformat.h" +#include "table/plain_table_builder.h" +#include "table/plain_table_reader.h" +#include "port/port.h" + +namespace rocksdb { + +Status PlainTableFactory::NewTableReader(const Options& options, + const EnvOptions& soptions, + const InternalKeyComparator& icomp, + unique_ptr&& file, + uint64_t file_size, + unique_ptr* table) const { + return PlainTableReader::Open(options, soptions, icomp, std::move(file), + file_size, table, bloom_bits_per_key_, + hash_table_ratio_); +} + +TableBuilder* PlainTableFactory::NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const { + return new PlainTableBuilder(options, file, user_key_len_); +} + +extern TableFactory* NewPlainTableFactory(uint32_t user_key_len, + int bloom_bits_per_key, + double hash_table_ratio) { + return new PlainTableFactory(user_key_len, bloom_bits_per_key, + hash_table_ratio); +} + +} // namespace rocksdb diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h new file mode 100644 index 000000000..382efe3c1 --- /dev/null +++ b/table/plain_table_factory.h @@ -0,0 +1,76 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include + +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +namespace rocksdb { + +struct Options; +struct EnvOptions; + +using std::unique_ptr; +class Status; +class RandomAccessFile; +class WritableFile; +class Table; +class TableBuilder; + +// IndexedTable requires fixed length key, configured as a constructor +// parameter of the factory class. 
Output file format: +// +-------------+-----------------+ +// | version | user_key_length | +// +------------++------------------------------+ <= key1 offset +// | [key_size] | key1 | value_size | | +// +------------+-------------+-------------+ | +// | value1 | +// | | +// +----------------------------------------+---+ <= key2 offset +// | [key_size] | key2 | value_size | | +// +------------+-------------+-------------+ | +// | value2 | +// | | +// | ...... | +// +-----------------+--------------------------+ +// If user_key_length = kPlainTableVariableLength, it means the key is variable +// length, there will be an extra field for key size encoded before every key. +class PlainTableFactory : public TableFactory { + public: + ~PlainTableFactory() {} + // user_key_size is the length of the user key. If it is set to be + // kPlainTableVariableLength, then it means variable length. Otherwise, all + // the keys need to have the fix length of this value. bloom_bits_per_key is + // number of bits used for bloom filer per key. hash_table_ratio is + // the desired utilization of the hash table used for prefix hashing. 
+ // hash_table_ratio = number of prefixes / #buckets in the hash table + explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength, + int bloom_bits_per_key = 0, + double hash_table_ratio = 0.75) + : user_key_len_(user_key_len), + bloom_bits_per_key_(bloom_bits_per_key), + hash_table_ratio_(hash_table_ratio) {} + const char* Name() const override { return "PlainTable"; } + Status NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table) const override; + TableBuilder* NewTableBuilder(const Options& options, + const InternalKeyComparator& icomparator, + WritableFile* file, + CompressionType compression_type) const + override; + + static const char kValueTypeSeqId0 = 0xFF; + + private: + uint32_t user_key_len_; + int bloom_bits_per_key_; + double hash_table_ratio_; +}; + +} // namespace rocksdb diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc new file mode 100644 index 000000000..b07862bad --- /dev/null +++ b/table/plain_table_reader.cc @@ -0,0 +1,695 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "table/plain_table_reader.h" + +#include + +#include "db/dbformat.h" + +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" + +#include "table/block.h" +#include "table/filter_block.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "table/two_level_iterator.h" +#include "table/plain_table_factory.h" + +#include "util/coding.h" +#include "util/dynamic_bloom.h" +#include "util/hash.h" +#include "util/histogram.h" +#include "util/murmurhash.h" +#include "util/perf_context_imp.h" +#include "util/stop_watch.h" + + +namespace rocksdb { + +namespace { + +inline uint32_t GetSliceHash(Slice const& s) { + return Hash(s.data(), s.size(), 397) ; +} + +inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) { + return hash % num_buckets; +} + +} // namespace + +// Iterator to iterate IndexedTable +class PlainTableIterator : public Iterator { + public: + explicit PlainTableIterator(PlainTableReader* table); + ~PlainTableIterator(); + + bool Valid() const; + + void SeekToFirst(); + + void SeekToLast(); + + void Seek(const Slice& target); + + void Next(); + + void Prev(); + + Slice key() const; + + Slice value() const; + + Status status() const; + + private: + PlainTableReader* table_; + uint32_t offset_; + uint32_t next_offset_; + Slice key_; + Slice value_; + Status status_; + std::string tmp_str_; + // No copying allowed + PlainTableIterator(const PlainTableIterator&) = delete; + void operator=(const Iterator&) = delete; +}; + +extern const uint64_t kPlainTableMagicNumber; +PlainTableReader::PlainTableReader(const EnvOptions& storage_options, + const InternalKeyComparator& icomparator, + uint64_t file_size, int bloom_bits_per_key, + double hash_table_ratio, + const TableProperties& table_properties) + : soptions_(storage_options), + internal_comparator_(icomparator), + file_size_(file_size), + 
kHashTableRatio(hash_table_ratio), + kBloomBitsPerKey(bloom_bits_per_key), + table_properties_(table_properties), + data_end_offset_(table_properties_.data_size), + user_key_len_(table_properties.fixed_key_len) {} + +PlainTableReader::~PlainTableReader() { + delete[] hash_table_; + delete[] sub_index_; + delete bloom_; +} + +Status PlainTableReader::Open(const Options& options, + const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, + uint64_t file_size, + unique_ptr* table_reader, + const int bloom_bits_per_key, + double hash_table_ratio) { + assert(options.allow_mmap_reads); + + if (file_size > kMaxFileSize) { + return Status::NotSupported("File is too large for PlainTableReader!"); + } + + TableProperties table_properties; + auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, + options.env, options.info_log.get(), + &table_properties); + if (!s.ok()) { + return s; + } + + std::unique_ptr new_reader(new PlainTableReader( + soptions, internal_comparator, file_size, bloom_bits_per_key, + hash_table_ratio, table_properties)); + new_reader->file_ = std::move(file); + new_reader->options_ = options; + + // -- Populate Index + s = new_reader->PopulateIndex(); + if (!s.ok()) { + return s; + } + + *table_reader = std::move(new_reader); + return s; +} + +void PlainTableReader::SetupForCompaction() { +} + +bool PlainTableReader::PrefixMayMatch(const Slice& internal_prefix) { + return true; +} + +Iterator* PlainTableReader::NewIterator(const ReadOptions& options) { + return new PlainTableIterator(this); +} + +struct PlainTableReader::IndexRecord { + uint32_t hash; // hash of the prefix + uint32_t offset; // offset of a row + IndexRecord* next; +}; + +// Helper class to track all the index records +class PlainTableReader::IndexRecordList { + public: + explicit IndexRecordList(size_t num_records_per_group) + : kNumRecordsPerGroup(num_records_per_group), + current_group_(nullptr), + 
num_records_in_current_group_(num_records_per_group) {} + + ~IndexRecordList() { + for (size_t i = 0; i < groups_.size(); i++) { + delete[] groups_[i]; + } + } + + void AddRecord(murmur_t hash, uint32_t offset) { + if (num_records_in_current_group_ == kNumRecordsPerGroup) { + current_group_ = AllocateNewGroup(); + num_records_in_current_group_ = 0; + } + auto& new_record = current_group_[num_records_in_current_group_++]; + new_record.hash = hash; + new_record.offset = offset; + new_record.next = nullptr; + } + + size_t GetNumRecords() const { + return (groups_.size() - 1) * kNumRecordsPerGroup + + num_records_in_current_group_; + } + IndexRecord* At(size_t index) { + return &(groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]); + } + + private: + IndexRecord* AllocateNewGroup() { + IndexRecord* result = new IndexRecord[kNumRecordsPerGroup]; + groups_.push_back(result); + return result; + } + + const size_t kNumRecordsPerGroup; + IndexRecord* current_group_; + // List of arrays allocated + std::vector groups_; + size_t num_records_in_current_group_; +}; + +int PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list) { + Slice prev_key_prefix_slice; + uint32_t prev_key_prefix_hash = 0; + uint32_t pos = data_start_offset_; + int key_index_within_prefix = 0; + bool is_first_record = true; + HistogramImpl keys_per_prefix_hist; + // Need map to be ordered to make sure sub indexes generated + // are in order. 
+ + int num_prefixes = 0; + while (pos < data_end_offset_) { + uint32_t key_offset = pos; + ParsedInternalKey key; + Slice value_slice; + status_ = Next(pos, &key, &value_slice, pos); + Slice key_prefix_slice = GetPrefix(key); + + if (is_first_record || prev_key_prefix_slice != key_prefix_slice) { + ++num_prefixes; + if (!is_first_record) { + keys_per_prefix_hist.Add(key_index_within_prefix); + } + key_index_within_prefix = 0; + prev_key_prefix_slice = key_prefix_slice; + prev_key_prefix_hash = GetSliceHash(key_prefix_slice); + } + + if (key_index_within_prefix++ % kIndexIntervalForSamePrefixKeys == 0) { + // Add an index key for every kIndexIntervalForSamePrefixKeys keys + record_list->AddRecord(prev_key_prefix_hash, key_offset); + } + is_first_record = false; + } + + keys_per_prefix_hist.Add(key_index_within_prefix); + Log(options_.info_log, "Number of Keys per prefix Histogram: %s", + keys_per_prefix_hist.ToString().c_str()); + + return num_prefixes; +} + +void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) { + delete[] hash_table_; + + if (kBloomBitsPerKey > 0) { + bloom_ = new DynamicBloom(num_prefixes * kBloomBitsPerKey); + } + double hash_table_size_multipier = + (kHashTableRatio > 1.0) ? 
1.0 : 1.0 / kHashTableRatio; + hash_table_size_ = num_prefixes * hash_table_size_multipier + 1; + hash_table_ = new uint32_t[hash_table_size_]; +} + +size_t PlainTableReader::BucketizeIndexesAndFillBloom( + IndexRecordList& record_list, int num_prefixes, + std::vector* hash_to_offsets, + std::vector* bucket_count) { + size_t sub_index_size_needed = 0; + bool first = true; + uint32_t prev_hash = 0; + size_t num_records = record_list.GetNumRecords(); + for (size_t i = 0; i < num_records; i++) { + IndexRecord* index_record = record_list.At(i); + uint32_t cur_hash = index_record->hash; + if (first || prev_hash != cur_hash) { + prev_hash = cur_hash; + first = false; + if (bloom_) { + bloom_->AddHash(cur_hash); + } + } + uint32_t bucket = GetBucketIdFromHash(cur_hash, hash_table_size_); + IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket]; + index_record->next = prev_bucket_head; + (*hash_to_offsets)[bucket] = index_record; + auto& item_count = (*bucket_count)[bucket]; + if (item_count > 0) { + if (item_count == 1) { + sub_index_size_needed += kOffsetLen + 1; + } + if (item_count == 127) { + // Need more than one byte for length + sub_index_size_needed++; + } + sub_index_size_needed += kOffsetLen; + } + item_count++; + } + return sub_index_size_needed; +} + +void PlainTableReader::FillIndexes( + size_t sub_index_size_needed, + const std::vector& hash_to_offsets, + const std::vector& bucket_count) { + Log(options_.info_log, "Reserving %zu bytes for sub index", + sub_index_size_needed); + // 8 bytes buffer for variable length size + size_t buffer_size = 8 * 8; + size_t buffer_used = 0; + sub_index_size_needed += buffer_size; + sub_index_ = new char[sub_index_size_needed]; + size_t sub_index_offset = 0; + char* prev_ptr; + char* cur_ptr; + uint32_t* sub_index_ptr; + for (int i = 0; i < hash_table_size_; i++) { + uint32_t num_keys_for_bucket = bucket_count[i]; + switch (num_keys_for_bucket) { + case 0: + // No key for bucket + hash_table_[i] = data_end_offset_; + 
break; + case 1: + // point directly to the file offset + hash_table_[i] = hash_to_offsets[i]->offset; + break; + default: + // point to second level indexes. + hash_table_[i] = sub_index_offset | kSubIndexMask; + prev_ptr = sub_index_ + sub_index_offset; + cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket); + sub_index_offset += (cur_ptr - prev_ptr); + if (cur_ptr - prev_ptr > 2 + || (cur_ptr - prev_ptr == 2 && num_keys_for_bucket <= 127)) { + // Need to resize sub_index. Exponentially grow buffer. + buffer_used += cur_ptr - prev_ptr - 1; + if (buffer_used + 4 > buffer_size) { + Log(options_.info_log, "Recalculate suffix_map length to %zu", + sub_index_size_needed); + + sub_index_size_needed += buffer_size; + buffer_size *= 2; + char* new_sub_index = new char[sub_index_size_needed]; + memcpy(new_sub_index, sub_index_, sub_index_offset); + delete[] sub_index_; + sub_index_ = new_sub_index; + } + } + sub_index_ptr = (uint32_t*) (sub_index_ + sub_index_offset); + IndexRecord* record = hash_to_offsets[i]; + int j; + for (j = num_keys_for_bucket - 1; j >= 0 && record; + j--, record = record->next) { + sub_index_ptr[j] = record->offset; + } + assert(j == -1 && record == nullptr); + sub_index_offset += kOffsetLen * num_keys_for_bucket; + break; + } + } + + Log(options_.info_log, "hash table size: %d, suffix_map length %zu", + hash_table_size_, sub_index_size_needed); +} + +Status PlainTableReader::PopulateIndex() { + // Get mmapped memory to file_data_. + Status s = file_->Read(0, file_size_, &file_data_, nullptr); + if (!s.ok()) { + return s; + } + + IndexRecordList record_list(kRecordsPerGroup); + // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows + // for a prefix (starting from the first one), generate a record of (hash, + // offset) and append it to IndexRecordList, which is a data structure created + // to store them. 
+ int num_prefixes = PopulateIndexRecordList(&record_list); + // Calculated hash table and bloom filter size and allocate memory for indexes + // and bloom filter based on the number of prefixes. + AllocateIndexAndBloom(num_prefixes); + + // Bucketize all the index records to a temp data structure, in which for + // each bucket, we generate a linked list of IndexRecord, in reversed order. + std::vector hash_to_offsets(hash_table_size_, nullptr); + std::vector bucket_count(hash_table_size_, 0); + size_t sub_index_size_needed = BucketizeIndexesAndFillBloom( + record_list, num_prefixes, &hash_to_offsets, &bucket_count); + // From the temp data structure, populate indexes. + FillIndexes(sub_index_size_needed, hash_to_offsets, bucket_count); + + return Status::OK(); +} + +Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, + uint32_t prefix_hash, bool& prefix_matched, + uint32_t& ret_offset) { + prefix_matched = false; + int bucket = GetBucketIdFromHash(prefix_hash, hash_table_size_); + uint32_t bucket_value = hash_table_[bucket]; + if (bucket_value == data_end_offset_) { + ret_offset = data_end_offset_; + return Status::OK(); + } else if ((bucket_value & kSubIndexMask) == 0) { + // point directly to the file + ret_offset = bucket_value; + return Status::OK(); + } + + // point to sub-index, need to do a binary search + uint32_t low = 0; + uint64_t prefix_index_offset = bucket_value ^ kSubIndexMask; + + const char* index_ptr = sub_index_ + prefix_index_offset; + uint32_t upper_bound = 0; + const uint32_t* base_ptr = (const uint32_t*) GetVarint32Ptr(index_ptr, + index_ptr + 4, + &upper_bound); + uint32_t high = upper_bound; + ParsedInternalKey mid_key; + ParsedInternalKey parsed_target; + if (!ParseInternalKey(target, &parsed_target)) { + return Status::Corruption(Slice()); + } + + // The key is between [low, high). Do a binary search between it. 
+ while (high - low > 1) { + uint32_t mid = (high + low) / 2; + uint32_t file_offset = base_ptr[mid]; + size_t tmp; + Status s = ReadKey(file_data_.data() + file_offset, &mid_key, tmp); + if (!s.ok()) { + return s; + } + int cmp_result = internal_comparator_.Compare(mid_key, parsed_target); + if (cmp_result < 0) { + low = mid; + } else { + if (cmp_result == 0) { + // Happen to have found the exact key or target is smaller than the + // first key after base_offset. + prefix_matched = true; + ret_offset = file_offset; + return Status::OK(); + } else { + high = mid; + } + } + } + // Both of the key at the position low or low+1 could share the same + // prefix as target. We need to rule out one of them to avoid to go + // to the wrong prefix. + ParsedInternalKey low_key; + size_t tmp; + uint32_t low_key_offset = base_ptr[low]; + Status s = ReadKey(file_data_.data() + low_key_offset, &low_key, tmp); + if (GetPrefix(low_key) == prefix) { + prefix_matched = true; + ret_offset = low_key_offset; + } else if (low + 1 < upper_bound) { + // There is possible a next prefix, return it + prefix_matched = false; + ret_offset = base_ptr[low + 1]; + } else { + // target is larger than a key of the last prefix in this bucket + // but with a different prefix. Key does not exist. 
+ ret_offset = data_end_offset_; + } + return Status::OK(); +} + +bool PlainTableReader::MayHavePrefix(uint32_t hash) { + return bloom_ == nullptr || bloom_->MayContainHash(hash); +} + +Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) { + return options_.prefix_extractor->Transform(target.user_key); +} + +Status PlainTableReader::ReadKey(const char* row_ptr, ParsedInternalKey* key, + size_t& bytes_read) { + const char* key_ptr = nullptr; + bytes_read = 0; + size_t user_key_size = 0; + if (IsFixedLength()) { + user_key_size = user_key_len_; + key_ptr = row_ptr; + } else { + uint32_t tmp_size = 0; + key_ptr = GetVarint32Ptr(row_ptr, file_data_.data() + data_end_offset_, + &tmp_size); + if (key_ptr == nullptr) { + return Status::Corruption("Unable to read the next key"); + } + user_key_size = (size_t)tmp_size; + bytes_read = key_ptr - row_ptr; + } + if (key_ptr + user_key_size + 1 >= file_data_.data() + data_end_offset_) { + return Status::Corruption("Unable to read the next key"); + } + + if (*(key_ptr + user_key_size) == PlainTableFactory::kValueTypeSeqId0) { + // Special encoding for the row with seqID=0 + key->user_key = Slice(key_ptr, user_key_size); + key->sequence = 0; + key->type = kTypeValue; + bytes_read += user_key_size + 1; + } else { + if (row_ptr + user_key_size + 8 >= file_data_.data() + data_end_offset_) { + return Status::Corruption("Unable to read the next key"); + } + if (!ParseInternalKey(Slice(key_ptr, user_key_size + 8), key)) { + return Status::Corruption(Slice()); + } + bytes_read += user_key_size + 8; + } + + return Status::OK(); +} + +Status PlainTableReader::Next(uint32_t offset, ParsedInternalKey* key, + Slice* value, uint32_t& next_offset) { + if (offset == data_end_offset_) { + next_offset = data_end_offset_; + return Status::OK(); + } + + if (offset > data_end_offset_) { + return Status::Corruption("Offset is out of file size"); + } + + const char* row_ptr = file_data_.data() + offset; + size_t bytes_for_key; + Status s 
= ReadKey(row_ptr, key, bytes_for_key); + uint32_t value_size; + const char* value_ptr = GetVarint32Ptr(row_ptr + bytes_for_key, + file_data_.data() + data_end_offset_, + &value_size); + if (value_ptr == nullptr) { + return Status::Corruption("Error reading value length."); + } + next_offset = offset + (value_ptr - row_ptr) + value_size; + if (next_offset > data_end_offset_) { + return Status::Corruption("Reach end of file when reading value"); + } + *value = Slice(value_ptr, value_size); + + return Status::OK(); +} + +Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, + void* arg, + bool (*saver)(void*, const ParsedInternalKey&, + const Slice&, bool), + void (*mark_key_may_exist)(void*)) { + // Check bloom filter first. + Slice prefix_slice = GetPrefix(target); + uint32_t prefix_hash = GetSliceHash(prefix_slice); + if (!MayHavePrefix(prefix_hash)) { + return Status::OK(); + } + uint32_t offset; + bool prefix_match; + Status s = GetOffset(target, prefix_slice, prefix_hash, prefix_match, offset); + if (!s.ok()) { + return s; + } + ParsedInternalKey found_key; + ParsedInternalKey parsed_target; + if (!ParseInternalKey(target, &parsed_target)) { + return Status::Corruption(Slice()); + } + + Slice found_value; + while (offset < data_end_offset_) { + Status s = Next(offset, &found_key, &found_value, offset); + if (!s.ok()) { + return s; + } + if (!prefix_match) { + // Need to verify prefix for the first key found if it is not yet + // checked. 
+ if (GetPrefix(found_key) != prefix_slice) { + return Status::OK(); + } + prefix_match = true; + } + if (internal_comparator_.Compare(found_key, parsed_target) >= 0) { + if (!(*saver)(arg, found_key, found_value, true)) { + break; + } + } + } + return Status::OK(); +} + +uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& key) { + return 0; +} + +PlainTableIterator::PlainTableIterator(PlainTableReader* table) : + table_(table) { + next_offset_ = offset_ = table_->data_end_offset_; +} + +PlainTableIterator::~PlainTableIterator() { +} + +bool PlainTableIterator::Valid() const { + return offset_ < table_->data_end_offset_ + && offset_ >= table_->data_start_offset_; +} + +void PlainTableIterator::SeekToFirst() { + next_offset_ = table_->data_start_offset_; + if (next_offset_ >= table_->data_end_offset_) { + next_offset_ = offset_ = table_->data_end_offset_; + } else { + Next(); + } +} + +void PlainTableIterator::SeekToLast() { + assert(false); +} + +void PlainTableIterator::Seek(const Slice& target) { + Slice prefix_slice = table_->GetPrefix(target); + uint32_t prefix_hash = GetSliceHash(prefix_slice); + if (!table_->MayHavePrefix(prefix_hash)) { + offset_ = next_offset_ = table_->data_end_offset_; + return; + } + bool prefix_match; + status_ = table_->GetOffset(target, prefix_slice, prefix_hash, prefix_match, + next_offset_); + if (!status_.ok()) { + offset_ = next_offset_ = table_->data_end_offset_; + return; + } + + if (next_offset_ < table_-> data_end_offset_) { + for (Next(); status_.ok() && Valid(); Next()) { + if (!prefix_match) { + // Need to verify the first key's prefix + if (table_->GetPrefix(key()) != prefix_slice) { + offset_ = next_offset_ = table_->data_end_offset_; + break; + } + prefix_match = true; + } + if (table_->internal_comparator_.Compare(key(), target) >= 0) { + break; + } + } + } else { + offset_ = table_->data_end_offset_; + } +} + +void PlainTableIterator::Next() { + offset_ = next_offset_; + if (offset_ < table_->data_end_offset_) 
{ + Slice tmp_slice; + ParsedInternalKey parsed_key; + status_ = table_->Next(next_offset_, &parsed_key, &value_, next_offset_); + if (status_.ok()) { + // Make a copy in this case. TODO optimize. + tmp_str_.clear(); + AppendInternalKey(&tmp_str_, parsed_key); + key_ = Slice(tmp_str_); + } else { + offset_ = next_offset_ = table_->data_end_offset_; + } + } +} + +void PlainTableIterator::Prev() { + assert(false); +} + +Slice PlainTableIterator::key() const { + assert(Valid()); + return key_; +} + +Slice PlainTableIterator::value() const { + assert(Valid()); + return value_; +} + +Status PlainTableIterator::status() const { + return status_; +} + +} // namespace rocksdb diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h new file mode 100644 index 000000000..1abe4e35c --- /dev/null +++ b/table/plain_table_reader.h @@ -0,0 +1,220 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/table_reader.h" +#include "table/plain_table_factory.h" + +namespace rocksdb { + +class Block; +class BlockHandle; +class Footer; +struct Options; +class RandomAccessFile; +struct ReadOptions; +class TableCache; +class TableReader; +class DynamicBloom; +class InternalKeyComparator; + +using std::unique_ptr; +using std::unordered_map; +extern const uint32_t kPlainTableVariableLength; + +// Based on following output file format shown in plain_table_factory.h +// When opening the output file, IndexedTableReader creates a hash table +// from key prefixes to offset of the output file. 
IndexedTable will decide +// whether it points to the data offset of the first key with the key prefix +// or the offset of it. If there are too many keys share this prefix, it will +// create a binary search-able index from the suffix to offset on disk. +// +// The implementation of IndexedTableReader requires output file is mmaped +class PlainTableReader: public TableReader { + public: + static Status Open(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table, + const int bloom_bits_per_key, double hash_table_ratio); + + bool PrefixMayMatch(const Slice& internal_prefix); + + Iterator* NewIterator(const ReadOptions&); + + Status Get(const ReadOptions&, const Slice& key, void* arg, + bool (*result_handler)(void* arg, const ParsedInternalKey& k, + const Slice& v, bool), + void (*mark_key_may_exist)(void*) = nullptr); + + uint64_t ApproximateOffsetOf(const Slice& key); + + void SetupForCompaction(); + + const TableProperties& GetTableProperties() { return table_properties_; } + + PlainTableReader(const EnvOptions& storage_options, + const InternalKeyComparator& internal_comparator, + uint64_t file_size, int bloom_num_bits, + double hash_table_ratio, + const TableProperties& table_properties); + ~PlainTableReader(); + + private: + struct IndexRecord; + class IndexRecordList; + + uint32_t* hash_table_ = nullptr; + int hash_table_size_ = 0; + char* sub_index_ = nullptr; + + Options options_; + const EnvOptions& soptions_; + const InternalKeyComparator internal_comparator_; + Status status_; + unique_ptr file_; + + Slice file_data_; + uint32_t version_; + uint32_t file_size_; + + const double kHashTableRatio; + const int kBloomBitsPerKey; + DynamicBloom* bloom_ = nullptr; + + TableProperties table_properties_; + const uint32_t data_start_offset_ = 0; + const uint32_t data_end_offset_; + const size_t user_key_len_; + + static const size_t kNumInternalBytes = 8; + 
static const uint32_t kSubIndexMask = 0x80000000; + static const size_t kOffsetLen = sizeof(uint32_t); + static const uint64_t kMaxFileSize = 1u << 31; + static const size_t kRecordsPerGroup = 256; + // To speed up the search for keys with same prefix, we'll add index key for + // every N keys, where the "N" is determined by + // kIndexIntervalForSamePrefixKeys + static const size_t kIndexIntervalForSamePrefixKeys = 16; + + bool IsFixedLength() const { + return user_key_len_ != kPlainTableVariableLength; + } + + size_t GetFixedInternalKeyLength() const { + return user_key_len_ + kNumInternalBytes; + } + + friend class TableCache; + friend class PlainTableIterator; + + // Internal helper function to generate an IndexRecordList object from all + // the rows, which contains index records as a list. + int PopulateIndexRecordList(IndexRecordList* record_list); + + // Internal helper function to allocate memory for indexes and bloom filters + void AllocateIndexAndBloom(int num_prefixes); + + // Internal helper function to bucket index record list to hash buckets. + // hash_to_offsets is sized of of hash_table_size_, each contains a linked + // list + // of offsets for the hash, in reversed order. + // bucket_count is sized of hash_table_size_. The value is how many index + // records are there in hash_to_offsets for the same bucket. + size_t BucketizeIndexesAndFillBloom( + IndexRecordList& record_list, int num_prefixes, + std::vector* hash_to_offsets, + std::vector* bucket_count); + + // Internal helper class to fill the indexes and bloom filters to internal + // data structures. hash_to_offsets and bucket_count are bucketized indexes + // and counts generated by BucketizeIndexesAndFillBloom(). + void FillIndexes(size_t sub_index_size_needed, + const std::vector& hash_to_offsets, + const std::vector& bucket_count); + + // PopulateIndex() builds index of keys. It must be called before any query + // to the table. 
+ // + // hash_table_ contains buckets size of hash_table_size_, each is a 32-bit + // integer. The lower 31 bits contain an offset value (explained below) and + // the first bit of the integer indicates type of the offset. + // + // +--------------+------------------------------------------------------+ + // | Flag (1 bit) | Offset to binary search buffer or file (31 bits) + + // +--------------+------------------------------------------------------+ + // + // Explanation for the "flag bit": + // + // 0 indicates that the bucket contains only one prefix (no conflict when + // hashing this prefix), whose first row starts from this offset of the + // file. + // 1 indicates that the bucket contains more than one prefixes, or there + // are too many rows for one prefix so we need a binary search for it. In + // this case, the offset indicates the offset of sub_index_ holding the + // binary search indexes of keys for those rows. Those binary search indexes + // are organized in this way: + // + // The first 4 bytes, indicate how many indexes (N) are stored after it. After + // it, there are N 32-bit integers, each points of an offset of the file, + // which + // points to starting of a row. Those offsets need to be guaranteed to be in + // ascending order so the keys they are pointing to are also in ascending + // order + // to make sure we can use them to do binary searches. Below is visual + // presentation of a bucket. + // + // + // number_of_records: varint32 + // record 1 file offset: fixedint32 + // record 2 file offset: fixedint32 + // .... + // record N file offset: fixedint32 + // + Status PopulateIndex(); + + // Check bloom filter to see whether it might contain this prefix. + // The hash of the prefix is given, since it can be reused for index lookup + // too. + bool MayHavePrefix(uint32_t hash); + + Status ReadKey(const char* row_ptr, ParsedInternalKey* key, + size_t& bytes_read); + // Read the key and value at offset to key and value. 
+ // tmp_slice is a tmp slice. + // return next_offset as the offset for the next key. + Status Next(uint32_t offset, ParsedInternalKey* key, Slice* value, + uint32_t& next_offset); + // Get file offset for key target. + // return value prefix_matched is set to true if the offset is confirmed + // for a key with the same prefix as target. + Status GetOffset(const Slice& target, const Slice& prefix, + uint32_t prefix_hash, bool& prefix_matched, + uint32_t& ret_offset); + + Slice GetPrefix(const Slice& target) { + assert(target.size() >= 8); // target is internal key + return options_.prefix_extractor->Transform( + Slice(target.data(), target.size() - 8)); + } + + Slice GetPrefix(const ParsedInternalKey& target); + + // No copying allowed + explicit PlainTableReader(const TableReader&) = delete; + void operator=(const TableReader&) = delete; +}; +} // namespace rocksdb diff --git a/table/table_builder.h b/table/table_builder.h new file mode 100644 index 000000000..ee32cff86 --- /dev/null +++ b/table/table_builder.h @@ -0,0 +1,55 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +namespace rocksdb { + +class Slice; +class Status; + +// TableBuilder provides the interface used to build a Table +// (an immutable and sorted map from keys to values). +// +// Multiple threads can invoke const methods on a TableBuilder without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same TableBuilder must use +// external synchronization. 
+class TableBuilder { + public: + // REQUIRES: Either Finish() or Abandon() has been called. + virtual ~TableBuilder() {} + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + virtual void Add(const Slice& key, const Slice& value) = 0; + + // Return non-ok iff some error has been detected. + virtual Status status() const = 0; + + // Finish building the table. + // REQUIRES: Finish(), Abandon() have not been called + virtual Status Finish() = 0; + + // Indicate that the contents of this builder should be abandoned. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + virtual void Abandon() = 0; + + // Number of calls to Add() so far. + virtual uint64_t NumEntries() const = 0; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + virtual uint64_t FileSize() const = 0; +}; + +} // namespace rocksdb diff --git a/table/table_properties.cc b/table/table_properties.cc new file mode 100644 index 000000000..414b15681 --- /dev/null +++ b/table/table_properties.cc @@ -0,0 +1,114 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "rocksdb/table_properties.h" + +namespace rocksdb { + +namespace { + void AppendProperty( + std::string& props, + const std::string& key, + const std::string& value, + const std::string& prop_delim, + const std::string& kv_delim) { + props.append(key); + props.append(kv_delim); + props.append(value); + props.append(prop_delim); + } + + template + void AppendProperty( + std::string& props, + const std::string& key, + const TValue& value, + const std::string& prop_delim, + const std::string& kv_delim) { + AppendProperty( + props, key, std::to_string(value), prop_delim, kv_delim + ); + } +} + +std::string TableProperties::ToString( + const std::string& prop_delim, + const std::string& kv_delim) const { + std::string result; + result.reserve(1024); + + // Basic Info + AppendProperty( + result, "# data blocks", num_data_blocks, prop_delim, kv_delim + ); + AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim); + + AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim); + AppendProperty( + result, + "raw average key size", + num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0, + prop_delim, + kv_delim + ); + AppendProperty( + result, "raw value size", raw_value_size, prop_delim, kv_delim + ); + AppendProperty( + result, + "raw average value size", + num_entries != 0 ? 1.0 * raw_value_size / num_entries : 0.0, + prop_delim, + kv_delim + ); + + AppendProperty(result, "data block size", data_size, prop_delim, kv_delim); + AppendProperty(result, "index block size", index_size, prop_delim, kv_delim); + AppendProperty( + result, "filter block size", filter_size, prop_delim, kv_delim + ); + AppendProperty( + result, + "(estimated) table size", + data_size + index_size + filter_size, + prop_delim, + kv_delim + ); + + AppendProperty( + result, + "filter policy name", + filter_policy_name.empty() ? 
std::string("N/A") : filter_policy_name, + prop_delim, + kv_delim + ); + + return result; +} + +const std::string TablePropertiesNames::kDataSize = + "rocksdb.data.size"; +const std::string TablePropertiesNames::kIndexSize = + "rocksdb.index.size"; +const std::string TablePropertiesNames::kFilterSize = + "rocksdb.filter.size"; +const std::string TablePropertiesNames::kRawKeySize = + "rocksdb.raw.key.size"; +const std::string TablePropertiesNames::kRawValueSize = + "rocksdb.raw.value.size"; +const std::string TablePropertiesNames::kNumDataBlocks = + "rocksdb.num.data.blocks"; +const std::string TablePropertiesNames::kNumEntries = + "rocksdb.num.entries"; +const std::string TablePropertiesNames::kFilterPolicy = + "rocksdb.filter.policy"; +const std::string TablePropertiesNames::kFormatVersion = + "rocksdb.format.version"; +const std::string TablePropertiesNames::kFixedKeyLen = + "rocksdb.fixed.key.length"; + +extern const std::string kPropertiesBlock = "rocksdb.properties"; + +} // namespace rocksdb diff --git a/table/table_reader.h b/table/table_reader.h new file mode 100644 index 000000000..9acbb33d0 --- /dev/null +++ b/table/table_reader.h @@ -0,0 +1,71 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +namespace rocksdb { + +class Iterator; +struct ParsedInternalKey; +class Slice; +struct ReadOptions; +struct TableProperties; + +// A Table is a sorted map from strings to strings. Tables are +// immutable and persistent. 
A Table may be safely accessed from +// multiple threads without external synchronization. +class TableReader { + public: + virtual ~TableReader() {} + + // Determine whether there is a chance that the current table file + // contains the key a key starting with iternal_prefix. The specific + // table implementation can use bloom filter and/or other heuristic + // to filter out this table as a whole. + virtual bool PrefixMayMatch(const Slice& internal_prefix) = 0; + + // Returns a new iterator over the table contents. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + virtual Iterator* NewIterator(const ReadOptions&) = 0; + + // Given a key, return an approximate byte offset in the file where + // the data for that key begins (or would begin if the key were + // present in the file). The returned value is in terms of file + // bytes, and so includes effects like compression of the underlying data. + // E.g., the approximate offset of the last key in the table will + // be close to the file length. + virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0; + + // Set up the table for Compaction. Might change some parameters with + // posix_fadvise + virtual void SetupForCompaction() = 0; + + virtual const TableProperties& GetTableProperties() = 0; + + // Calls (*result_handler)(handle_context, ...) repeatedly, starting with + // the entry found after a call to Seek(key), until result_handler returns + // false, where k is the actual internal key for a row found and v as the + // value of the key. didIO is true if I/O is involved in the operation. May + // not make such a call if filter policy says that key is not present. + // + // mark_key_may_exist_handler needs to be called when it is configured to be + // memory only and the key is not found in the block cache, with + // the parameter to be handle_context. 
+ // + // readOptions is the options for the read + // key is the key to search for + virtual Status Get( + const ReadOptions& readOptions, const Slice& key, void* handle_context, + bool (*result_handler)(void* arg, const ParsedInternalKey& k, + const Slice& v, bool didIO), + void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0; +}; + +} // namespace rocksdb diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index e7b6b0b7a..f746592fe 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -6,12 +6,13 @@ #include #include "rocksdb/db.h" -#include "rocksdb/table.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" #include "db/db_impl.h" #include "db/dbformat.h" #include "port/atomic_pointer.h" #include "table/block_based_table_factory.h" +#include "table/plain_table_factory.h" #include "util/histogram.h" #include "util/testharness.h" #include "util/testutil.h" @@ -33,8 +34,8 @@ static std::string MakeKey(int i, int j, bool through_db) { return key.Encode().ToString(); } -static bool DummySaveValue(void* arg, const Slice& ikey, const Slice& v, - bool didIO) { +static bool DummySaveValue(void* arg, const ParsedInternalKey& ikey, + const Slice& v, bool didIO) { return false; } @@ -70,7 +71,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, Status s; if (!through_db) { env->NewWritableFile(file_name, &file, env_options); - tb = opts.table_factory->GetTableBuilder(opts, file.get(), + tb = opts.table_factory->NewTableBuilder(opts, file.get(), CompressionType::kNoCompression); } else { s = DB::Open(opts, dbname, &db); @@ -101,7 +102,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, Status s = env->NewRandomAccessFile(file_name, &raf, env_options); uint64_t file_size; env->GetFileSize(file_name, &file_size); - s = opts.table_factory->GetTableReader(opts, env_options, std::move(raf), + s = opts.table_factory->NewTableReader(opts, env_options, std::move(raf), 
file_size, &table_reader); } @@ -218,6 +219,8 @@ DEFINE_bool(iterator, false, "For test iterator"); DEFINE_bool(through_db, false, "If enable, a DB instance will be created and " "the query will be against DB. Otherwise, will be directly against " "a table reader."); +DEFINE_bool(plain_table, false, "Use PlainTable"); + int main(int argc, char** argv) { google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + @@ -230,10 +233,23 @@ int main(int argc, char** argv) { options.prefix_extractor = rocksdb::NewFixedPrefixTransform( FLAGS_prefix_len); } - options.SetUpDefaultFlushBlockPolicyFactory(); rocksdb::ReadOptions ro; rocksdb::EnvOptions env_options; options.create_if_missing = true; + options.compression = rocksdb::CompressionType::kNoCompression; + options.internal_comparator = + new rocksdb::InternalKeyComparator(options.comparator); + + if (FLAGS_plain_table) { + options.allow_mmap_reads = true; + env_options.use_mmap_reads = true; + tf = new rocksdb::PlainTableFactory(16, (FLAGS_prefix_len == 16) ? 0 : 8, + 0.75); + options.prefix_extractor = rocksdb::NewFixedPrefixTransform( + FLAGS_prefix_len); + } else { + tf = new rocksdb::BlockBasedTableFactory(); + } options.table_factory = std::shared_ptr(tf); TableReaderBenchmark(options, env_options, ro, FLAGS_num_keys1, diff --git a/table/table_test.cc b/table/table_test.cc index 5b312f272..e473b8007 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -6,6 +6,7 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include #include #include #include @@ -16,17 +17,22 @@ #include "util/statistics.h" #include "db/memtable.h" #include "db/write_batch_internal.h" + #include "rocksdb/cache.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" #include "rocksdb/memtablerep.h" +#include "table/block.h" #include "table/block_based_table_builder.h" #include "table/block_based_table_factory.h" #include "table/block_based_table_reader.h" #include "table/block_builder.h" -#include "table/block.h" #include "table/format.h" +#include "table/meta_blocks.h" +#include "table/plain_table_factory.h" + #include "util/random.h" #include "util/testharness.h" #include "util/testutil.h" @@ -34,15 +40,12 @@ namespace rocksdb { namespace { + // Return reverse of "key". // Used to test non-lexicographic comparators. -static std::string Reverse(const Slice& key) { - std::string str(key.ToString()); - std::string rev(""); - for (std::string::reverse_iterator rit = str.rbegin(); - rit != str.rend(); ++rit) { - rev.push_back(*rit); - } +std::string Reverse(const Slice& key) { + auto rev = key.ToString(); + std::reverse(rev.begin(), rev.end()); return rev; } @@ -71,10 +74,10 @@ class ReverseKeyComparator : public Comparator { *key = Reverse(s); } }; -} // namespace -static ReverseKeyComparator reverse_key_comparator; -static void Increment(const Comparator* cmp, std::string* key) { +ReverseKeyComparator reverse_key_comparator; + +void Increment(const Comparator* cmp, std::string* key) { if (cmp == BytewiseComparator()) { key->push_back('\0'); } else { @@ -86,7 +89,6 @@ static void Increment(const Comparator* cmp, std::string* key) { } // An STL comparator that uses a Comparator -namespace anon { struct STLLessThan { const Comparator* cmp; @@ -96,6 +98,7 @@ struct STLLessThan { return cmp->Compare(Slice(a), Slice(b)) < 0; } }; + } // namespace class StringSink: public WritableFile { @@ -120,8 +123,9 @@ class StringSink: public WritableFile { 
class StringSource: public RandomAccessFile { public: - StringSource(const Slice& contents, uint64_t uniq_id) - : contents_(contents.data(), contents.size()), uniq_id_(uniq_id) { + StringSource(const Slice& contents, uint64_t uniq_id, bool mmap) + : contents_(contents.data(), contents.size()), uniq_id_(uniq_id), + mmap_(mmap) { } virtual ~StringSource() { } @@ -136,8 +140,12 @@ class StringSource: public RandomAccessFile { if (offset + n > contents_.size()) { n = contents_.size() - offset; } - memcpy(scratch, &contents_[offset], n); - *result = Slice(scratch, n); + if (!mmap_) { + memcpy(scratch, &contents_[offset], n); + *result = Slice(scratch, n); + } else { + *result = Slice(&contents_[offset], n); + } return Status::OK(); } @@ -155,15 +163,16 @@ class StringSource: public RandomAccessFile { private: std::string contents_; uint64_t uniq_id_; + bool mmap_; }; -typedef std::map KVMap; +typedef std::map KVMap; // Helper class for tests to unify the interface between // BlockBuilder/TableBuilder and Block/Table. class Constructor { public: - explicit Constructor(const Comparator* cmp) : data_(anon::STLLessThan(cmp)) { } + explicit Constructor(const Comparator* cmp) : data_(STLLessThan(cmp)) {} virtual ~Constructor() { } void Add(const std::string& key, const Slice& value) { @@ -174,8 +183,9 @@ class Constructor { // been added so far. 
Returns the keys in sorted order in "*keys" // and stores the key/value pairs in "*kvmap" void Finish(const Options& options, - std::vector* keys, - KVMap* kvmap) { + const InternalKeyComparator& internal_comparator, + std::vector* keys, KVMap* kvmap) { + last_internal_key_ = &internal_comparator; *kvmap = data_; keys->clear(); for (KVMap::const_iterator it = data_.begin(); @@ -184,12 +194,14 @@ class Constructor { keys->push_back(it->first); } data_.clear(); - Status s = FinishImpl(options, *kvmap); + Status s = FinishImpl(options, internal_comparator, *kvmap); ASSERT_TRUE(s.ok()) << s.ToString(); } // Construct the data structure from the data in "data" - virtual Status FinishImpl(const Options& options, const KVMap& data) = 0; + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) = 0; virtual Iterator* NewIterator() const = 0; @@ -197,6 +209,9 @@ class Constructor { virtual DB* db() const { return nullptr; } // Overridden in DBConstructor + protected: + const InternalKeyComparator* last_internal_key_; + private: KVMap data_; }; @@ -210,10 +225,12 @@ class BlockConstructor: public Constructor { ~BlockConstructor() { delete block_; } - virtual Status FinishImpl(const Options& options, const KVMap& data) { + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) { delete block_; block_ = nullptr; - BlockBuilder builder(options); + BlockBuilder builder(options, &internal_comparator); for (KVMap::const_iterator it = data.begin(); it != data.end(); @@ -241,49 +258,97 @@ class BlockConstructor: public Constructor { BlockConstructor(); }; -class BlockBasedTableConstructor: public Constructor { +// A helper class that converts internal format keys into user keys +class KeyConvertingIterator: public Iterator { public: - explicit BlockBasedTableConstructor(const Comparator* cmp) - : Constructor(cmp) {} - ~BlockBasedTableConstructor() { 
- Reset(); + explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { } + virtual ~KeyConvertingIterator() { delete iter_; } + virtual bool Valid() const { return iter_->Valid(); } + virtual void Seek(const Slice& target) { + ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + iter_->Seek(encoded); + } + virtual void SeekToFirst() { iter_->SeekToFirst(); } + virtual void SeekToLast() { iter_->SeekToLast(); } + virtual void Next() { iter_->Next(); } + virtual void Prev() { iter_->Prev(); } + + virtual Slice key() const { + assert(Valid()); + ParsedInternalKey key; + if (!ParseInternalKey(iter_->key(), &key)) { + status_ = Status::Corruption("malformed internal key"); + return Slice("corrupted key"); + } + return key.user_key; } - virtual Status FinishImpl(const Options& options, const KVMap& data) { + virtual Slice value() const { return iter_->value(); } + virtual Status status() const { + return status_.ok() ? 
iter_->status() : status_; + } + + private: + mutable Status status_; + Iterator* iter_; + + // No copying allowed + KeyConvertingIterator(const KeyConvertingIterator&); + void operator=(const KeyConvertingIterator&); +}; + +class TableConstructor: public Constructor { + public: + explicit TableConstructor(const Comparator* cmp, + bool convert_to_internal_key = false) + : Constructor(cmp), convert_to_internal_key_(convert_to_internal_key) {} + ~TableConstructor() { Reset(); } + + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) { Reset(); sink_.reset(new StringSink()); - std::unique_ptr flush_policy_factory( - new FlushBlockBySizePolicyFactory(options.block_size, - options.block_size_deviation)); - - BlockBasedTableBuilder builder( - options, - sink_.get(), - flush_policy_factory.get(), - options.compression); + unique_ptr builder; + builder.reset(options.table_factory->NewTableBuilder( + options, internal_comparator, sink_.get(), options.compression)); for (KVMap::const_iterator it = data.begin(); it != data.end(); ++it) { - builder.Add(it->first, it->second); - ASSERT_TRUE(builder.status().ok()); + if (convert_to_internal_key_) { + ParsedInternalKey ikey(it->first, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + builder->Add(encoded, it->second); + } else { + builder->Add(it->first, it->second); + } + ASSERT_TRUE(builder->status().ok()); } - Status s = builder.Finish(); + Status s = builder->Finish(); ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(sink_->contents().size(), builder.FileSize()); + ASSERT_EQ(sink_->contents().size(), builder->FileSize()); // Open the table uniq_id_ = cur_uniq_id_++; - source_.reset(new StringSource(sink_->contents(), uniq_id_)); - return options.table_factory->GetTableReader(options, soptions, - std::move(source_), - sink_->contents().size(), - &table_reader_); + source_.reset(new StringSource(sink_->contents(), 
uniq_id_, + options.allow_mmap_reads)); + return options.table_factory->NewTableReader( + options, soptions, internal_comparator, std::move(source_), + sink_->contents().size(), &table_reader_); } virtual Iterator* NewIterator() const { - return table_reader_->NewIterator(ReadOptions()); + Iterator* iter = table_reader_->NewIterator(ReadOptions()); + if (convert_to_internal_key_) { + return new KeyConvertingIterator(iter); + } else { + return iter; + } } uint64_t ApproximateOffsetOf(const Slice& key) const { @@ -291,11 +356,12 @@ class BlockBasedTableConstructor: public Constructor { } virtual Status Reopen(const Options& options) { - source_.reset(new StringSource(sink_->contents(), uniq_id_)); - return options.table_factory->GetTableReader(options, soptions, - std::move(source_), - sink_->contents().size(), - &table_reader_); + source_.reset( + new StringSource(sink_->contents(), uniq_id_, + options.allow_mmap_reads)); + return options.table_factory->NewTableReader( + options, soptions, *last_internal_key_, std::move(source_), + sink_->contents().size(), &table_reader_); } virtual TableReader* table_reader() { @@ -309,59 +375,19 @@ class BlockBasedTableConstructor: public Constructor { sink_.reset(); source_.reset(); } + bool convert_to_internal_key_; uint64_t uniq_id_; unique_ptr sink_; unique_ptr source_; unique_ptr table_reader_; - BlockBasedTableConstructor(); + TableConstructor(); static uint64_t cur_uniq_id_; const EnvOptions soptions; }; -uint64_t BlockBasedTableConstructor::cur_uniq_id_ = 1; - -// A helper class that converts internal format keys into user keys -class KeyConvertingIterator: public Iterator { - public: - explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { } - virtual ~KeyConvertingIterator() { delete iter_; } - virtual bool Valid() const { return iter_->Valid(); } - virtual void Seek(const Slice& target) { - ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); - std::string encoded; - AppendInternalKey(&encoded, 
ikey); - iter_->Seek(encoded); - } - virtual void SeekToFirst() { iter_->SeekToFirst(); } - virtual void SeekToLast() { iter_->SeekToLast(); } - virtual void Next() { iter_->Next(); } - virtual void Prev() { iter_->Prev(); } - - virtual Slice key() const { - assert(Valid()); - ParsedInternalKey key; - if (!ParseInternalKey(iter_->key(), &key)) { - status_ = Status::Corruption("malformed internal key"); - return Slice("corrupted key"); - } - return key.user_key; - } - - virtual Slice value() const { return iter_->value(); } - virtual Status status() const { - return status_.ok() ? iter_->status() : status_; - } - - private: - mutable Status status_; - Iterator* iter_; - - // No copying allowed - KeyConvertingIterator(const KeyConvertingIterator&); - void operator=(const KeyConvertingIterator&); -}; +uint64_t TableConstructor::cur_uniq_id_ = 1; class MemTableConstructor: public Constructor { public: @@ -378,7 +404,9 @@ class MemTableConstructor: public Constructor { ~MemTableConstructor() { delete memtable_->Unref(); } - virtual Status FinishImpl(const Options& options, const KVMap& data) { + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) { delete memtable_->Unref(); Options memtable_options; memtable_options.memtable_factory = table_factory_; @@ -414,7 +442,9 @@ class DBConstructor: public Constructor { ~DBConstructor() { delete db_; } - virtual Status FinishImpl(const Options& options, const KVMap& data) { + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) { delete db_; db_ = nullptr; NewDB(); @@ -480,7 +510,9 @@ static bool BZip2CompressionSupported() { #endif enum TestType { - TABLE_TEST, + BLOCK_BASED_TABLE_TEST, + PLAIN_TABLE_SEMI_FIXED_PREFIX, + PLAIN_TABLE_FULL_STR_PREFIX, BLOCK_TEST, MEMTABLE_TEST, DB_TEST @@ -493,49 +525,98 @@ struct TestArgs { CompressionType compression; }; - static std::vector 
GenerateArgList() { - std::vector ret; - TestType test_type[4] = {TABLE_TEST, BLOCK_TEST, MEMTABLE_TEST, DB_TEST}; - int test_type_len = 4; - bool reverse_compare[2] = {false, true}; - int reverse_compare_len = 2; - int restart_interval[3] = {16, 1, 1024}; - int restart_interval_len = 3; + std::vector test_args; + std::vector test_types = { + BLOCK_BASED_TABLE_TEST, PLAIN_TABLE_SEMI_FIXED_PREFIX, + PLAIN_TABLE_FULL_STR_PREFIX, BLOCK_TEST, + MEMTABLE_TEST, DB_TEST}; + std::vector reverse_compare_types = {false, true}; + std::vector restart_intervals = {16, 1, 1024}; // Only add compression if it is supported - std::vector compression_types; - compression_types.push_back(kNoCompression); + std::vector compression_types = {kNoCompression}; #ifdef SNAPPY - if (SnappyCompressionSupported()) + if (SnappyCompressionSupported()) { compression_types.push_back(kSnappyCompression); + } #endif #ifdef ZLIB - if (ZlibCompressionSupported()) + if (ZlibCompressionSupported()) { compression_types.push_back(kZlibCompression); + } #endif #ifdef BZIP2 - if (BZip2CompressionSupported()) + if (BZip2CompressionSupported()) { compression_types.push_back(kBZip2Compression); + } #endif - for(int i =0; i < test_type_len; i++) - for (int j =0; j < reverse_compare_len; j++) - for (int k =0; k < restart_interval_len; k++) - for (unsigned int n =0; n < compression_types.size(); n++) { - TestArgs one_arg; - one_arg.type = test_type[i]; - one_arg.reverse_compare = reverse_compare[j]; - one_arg.restart_interval = restart_interval[k]; - one_arg.compression = compression_types[n]; - ret.push_back(one_arg); - } + for (auto test_type : test_types) { + for (auto reverse_compare : reverse_compare_types) { + if (test_type == PLAIN_TABLE_SEMI_FIXED_PREFIX || + test_type == PLAIN_TABLE_FULL_STR_PREFIX) { + // Plain table doesn't use restart index or compression. 
+ TestArgs one_arg; + one_arg.type = test_type; + one_arg.reverse_compare = reverse_compare; + one_arg.restart_interval = restart_intervals[0]; + one_arg.compression = compression_types[0]; + test_args.push_back(one_arg); + continue; + } - return ret; + for (auto restart_interval : restart_intervals) { + for (auto compression_type : compression_types) { + TestArgs one_arg; + one_arg.type = test_type; + one_arg.reverse_compare = reverse_compare; + one_arg.restart_interval = restart_interval; + one_arg.compression = compression_type; + test_args.push_back(one_arg); + } + } + } + } + return test_args; } +// In order to make all tests run for plain table format, including +// those operating on empty keys, create a new prefix transformer which +// return fixed prefix if the slice is not shorter than the prefix length, +// and the full slice if it is shorter. +class FixedOrLessPrefixTransform : public SliceTransform { + private: + const size_t prefix_len_; + + public: + explicit FixedOrLessPrefixTransform(size_t prefix_len) : + prefix_len_(prefix_len) { + } + + virtual const char* Name() const { + return "rocksdb.FixedPrefix"; + } + + virtual Slice Transform(const Slice& src) const { + assert(InDomain(src)); + if (src.size() < prefix_len_) { + return src; + } + return Slice(src.data(), prefix_len_); + } + + virtual bool InDomain(const Slice& src) const { + return true; + } + + virtual bool InRange(const Slice& dst) const { + return (dst.size() <= prefix_len_); + } +}; + class Harness { public: Harness() : constructor_(nullptr) { } @@ -553,9 +634,40 @@ class Harness { if (args.reverse_compare) { options_.comparator = &reverse_key_comparator; } + + internal_comparator_.reset( + new test::PlainInternalKeyComparator(options_.comparator)); + + support_prev_ = true; + only_support_prefix_seek_ = false; + BlockBasedTableOptions table_options; switch (args.type) { - case TABLE_TEST: - constructor_ = new BlockBasedTableConstructor(options_.comparator); + case 
BLOCK_BASED_TABLE_TEST: + table_options.flush_block_policy_factory.reset( + new FlushBlockBySizePolicyFactory(options_.block_size, + options_.block_size_deviation)); + options_.table_factory.reset(new BlockBasedTableFactory(table_options)); + constructor_ = new TableConstructor(options_.comparator); + break; + case PLAIN_TABLE_SEMI_FIXED_PREFIX: + support_prev_ = false; + only_support_prefix_seek_ = true; + options_.prefix_extractor = prefix_transform.get(); + options_.allow_mmap_reads = true; + options_.table_factory.reset(new PlainTableFactory()); + constructor_ = new TableConstructor(options_.comparator, true); + internal_comparator_.reset( + new InternalKeyComparator(options_.comparator)); + break; + case PLAIN_TABLE_FULL_STR_PREFIX: + support_prev_ = false; + only_support_prefix_seek_ = true; + options_.prefix_extractor = noop_transform.get(); + options_.allow_mmap_reads = true; + options_.table_factory.reset(new PlainTableFactory()); + constructor_ = new TableConstructor(options_.comparator, true); + internal_comparator_.reset( + new InternalKeyComparator(options_.comparator)); break; case BLOCK_TEST: constructor_ = new BlockConstructor(options_.comparator); @@ -580,10 +692,12 @@ class Harness { void Test(Random* rnd) { std::vector keys; KVMap data; - constructor_->Finish(options_, &keys, &data); + constructor_->Finish(options_, *internal_comparator_, &keys, &data); TestForwardScan(keys, data); - TestBackwardScan(keys, data); + if (support_prev_) { + TestBackwardScan(keys, data); + } TestRandomAccess(rnd, keys, data); } @@ -626,7 +740,7 @@ class Harness { KVMap::const_iterator model_iter = data.begin(); if (kVerbose) fprintf(stderr, "---\n"); for (int i = 0; i < 200; i++) { - const int toss = rnd->Uniform(5); + const int toss = rnd->Uniform(support_prev_ ? 
5 : 3); switch (toss) { case 0: { if (iter->Valid()) { @@ -718,17 +832,20 @@ class Harness { } else { const int index = rnd->Uniform(keys.size()); std::string result = keys[index]; - switch (rnd->Uniform(3)) { + switch (rnd->Uniform(support_prev_ ? 3 : 1)) { case 0: // Return an existing key break; case 1: { // Attempt to return something smaller than an existing key - if (result.size() > 0 && result[result.size()-1] > '\0') { - result[result.size()-1]--; + if (result.size() > 0 && result[result.size() - 1] > '\0' + && (!only_support_prefix_seek_ + || options_.prefix_extractor->Transform(result).size() + < result.size())) { + result[result.size() - 1]--; } break; - } + } case 2: { // Return something larger than an existing key Increment(options_.comparator, &result); @@ -745,50 +862,17 @@ class Harness { private: Options options_ = Options(); Constructor* constructor_; + bool support_prev_; + bool only_support_prefix_seek_; + shared_ptr internal_comparator_; + static std::unique_ptr noop_transform; + static std::unique_ptr prefix_transform; }; -// Test the empty key -TEST(Harness, SimpleEmptyKey) { - std::vector args = GenerateArgList(); - for (unsigned int i = 0; i < args.size(); i++) { - Init(args[i]); - Random rnd(test::RandomSeed() + 1); - Add("", "v"); - Test(&rnd); - } -} - -TEST(Harness, SimpleSingle) { - std::vector args = GenerateArgList(); - for (unsigned int i = 0; i < args.size(); i++) { - Init(args[i]); - Random rnd(test::RandomSeed() + 2); - Add("abc", "v"); - Test(&rnd); - } -} - -TEST(Harness, SimpleMulti) { - std::vector args = GenerateArgList(); - for (unsigned int i = 0; i < args.size(); i++) { - Init(args[i]); - Random rnd(test::RandomSeed() + 3); - Add("abc", "v"); - Add("abcd", "v"); - Add("ac", "v2"); - Test(&rnd); - } -} - -TEST(Harness, SimpleSpecialKey) { - std::vector args = GenerateArgList(); - for (unsigned int i = 0; i < args.size(); i++) { - Init(args[i]); - Random rnd(test::RandomSeed() + 4); - Add("\xff\xff", "v3"); - Test(&rnd); - 
} -} +std::unique_ptr Harness::noop_transform( + NewNoopTransform()); +std::unique_ptr Harness::prefix_transform( + new FixedOrLessPrefixTransform(2)); static bool Between(uint64_t val, uint64_t low, uint64_t high) { bool result = (val >= low) && (val <= high); @@ -801,12 +885,30 @@ static bool Between(uint64_t val, uint64_t low, uint64_t high) { return result; } -class TableTest { }; +// Tests against all kinds of tables +class TableTest { + public: + const InternalKeyComparator& GetPlainInternalComparator( + const Comparator* comp) { + if (!plain_internal_comparator) { + plain_internal_comparator.reset( + new test::PlainInternalKeyComparator(comp)); + } + return *plain_internal_comparator; + } + + private: + std::unique_ptr plain_internal_comparator; +}; + +class GeneralTableTest : public TableTest {}; +class BlockBasedTableTest : public TableTest {}; +class PlainTableTest : public TableTest {}; // This test include all the basic checks except those for index size and block // size, which will be conducted in separated unit tests. -TEST(TableTest, BasicTableProperties) { - BlockBasedTableConstructor c(BytewiseComparator()); +TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) { + TableConstructor c(BytewiseComparator()); c.Add("a1", "val1"); c.Add("b2", "val2"); @@ -824,7 +926,8 @@ TEST(TableTest, BasicTableProperties) { options.compression = kNoCompression; options.block_restart_interval = 1; - c.Finish(options, &keys, &kvmap); + c.Finish(options, GetPlainInternalComparator(options.comparator), &keys, + &kvmap); auto& props = c.table_reader()->GetTableProperties(); ASSERT_EQ(kvmap.size(), props.num_entries); @@ -838,7 +941,7 @@ TEST(TableTest, BasicTableProperties) { ASSERT_EQ("", props.filter_policy_name); // no filter policy is used // Verify data size. 
- BlockBuilder block_builder(options); + BlockBuilder block_builder(options, options.comparator); for (const auto& item : kvmap) { block_builder.Add(item.first, item.second); } @@ -849,8 +952,8 @@ TEST(TableTest, BasicTableProperties) { ); } -TEST(TableTest, FilterPolicyNameProperties) { - BlockBasedTableConstructor c(BytewiseComparator()); +TEST(BlockBasedTableTest, FilterPolicyNameProperties) { + TableConstructor c(BytewiseComparator()); c.Add("a1", "val1"); std::vector keys; KVMap kvmap; @@ -860,7 +963,8 @@ TEST(TableTest, FilterPolicyNameProperties) { ); options.filter_policy = filter_policy.get(); - c.Finish(options, &keys, &kvmap); + c.Finish(options, GetPlainInternalComparator(options.comparator), &keys, + &kvmap); auto& props = c.table_reader()->GetTableProperties(); ASSERT_EQ("rocksdb.BuiltinBloomFilter", props.filter_policy_name); } @@ -874,7 +978,7 @@ static std::string RandomString(Random* rnd, int len) { // It's very hard to figure out the index block size of a block accurately. // To make sure we get the index size, we just make sure as key number // grows, the filter block size also grows. -TEST(TableTest, IndexSizeStat) { +TEST(BlockBasedTableTest, IndexSizeStat) { uint64_t last_index_size = 0; // we need to use random keys since the pure human readable texts @@ -890,7 +994,7 @@ TEST(TableTest, IndexSizeStat) { // Each time we load one more key to the table. the table index block // size is expected to be larger than last time's. 
for (size_t i = 1; i < keys.size(); ++i) { - BlockBasedTableConstructor c(BytewiseComparator()); + TableConstructor c(BytewiseComparator()); for (size_t j = 0; j < i; ++j) { c.Add(keys[j], "val"); } @@ -901,7 +1005,8 @@ TEST(TableTest, IndexSizeStat) { options.compression = kNoCompression; options.block_restart_interval = 1; - c.Finish(options, &ks, &kvmap); + c.Finish(options, GetPlainInternalComparator(options.comparator), &ks, + &kvmap); auto index_size = c.table_reader()->GetTableProperties().index_size; ASSERT_GT(index_size, last_index_size); @@ -909,9 +1014,9 @@ TEST(TableTest, IndexSizeStat) { } } -TEST(TableTest, NumBlockStat) { +TEST(BlockBasedTableTest, NumBlockStat) { Random rnd(test::RandomSeed()); - BlockBasedTableConstructor c(BytewiseComparator()); + TableConstructor c(BytewiseComparator()); Options options; options.compression = kNoCompression; options.block_restart_interval = 1; @@ -925,7 +1030,8 @@ TEST(TableTest, NumBlockStat) { std::vector ks; KVMap kvmap; - c.Finish(options, &ks, &kvmap); + c.Finish(options, GetPlainInternalComparator(options.comparator), &ks, + &kvmap); ASSERT_EQ( kvmap.size(), c.table_reader()->GetTableProperties().num_data_blocks @@ -972,7 +1078,7 @@ class BlockCacheProperties { long data_block_cache_hit = 0; }; -TEST(TableTest, BlockCacheTest) { +TEST(BlockBasedTableTest, BlockCacheTest) { // -- Table construction Options options; options.create_if_missing = true; @@ -986,9 +1092,10 @@ TEST(TableTest, BlockCacheTest) { std::vector keys; KVMap kvmap; - BlockBasedTableConstructor c(BytewiseComparator()); + TableConstructor c(BytewiseComparator()); c.Add("key", "value"); - c.Finish(options, &keys, &kvmap); + c.Finish(options, GetPlainInternalComparator(options.comparator), &keys, + &kvmap); // -- PART 1: Open with regular block cache. // Since block_cache is disabled, no cache activities will be involved. 
@@ -1106,8 +1213,83 @@ TEST(TableTest, BlockCacheTest) { } } -TEST(TableTest, ApproximateOffsetOfPlain) { - BlockBasedTableConstructor c(BytewiseComparator()); +TEST(BlockBasedTableTest, BlockCacheLeak) { + // Check that when we reopen a table we don't lose access to blocks already + // in the cache. This test checks whether the Table actually makes use of the + // unique ID from the file. + + Options opt; + unique_ptr ikc; + ikc.reset(new test::PlainInternalKeyComparator(opt.comparator)); + opt.block_size = 1024; + opt.compression = kNoCompression; + opt.block_cache = + NewLRUCache(16 * 1024 * 1024); // big enough so we don't ever + // lose cached values. + + TableConstructor c(BytewiseComparator()); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector keys; + KVMap kvmap; + c.Finish(opt, *ikc, &keys, &kvmap); + + unique_ptr iter(c.NewIterator()); + iter->SeekToFirst(); + while (iter->Valid()) { + iter->key(); + iter->value(); + iter->Next(); + } + ASSERT_OK(iter->status()); + + ASSERT_OK(c.Reopen(opt)); + auto table_reader = dynamic_cast(c.table_reader()); + for (const std::string& key : keys) { + ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key)); + } +} + +extern const uint64_t kPlainTableMagicNumber; +TEST(PlainTableTest, BasicPlainTableProperties) { + PlainTableFactory factory(8, 8, 0); + StringSink sink; + Options options; + InternalKeyComparator ikc(options.comparator); + std::unique_ptr builder( + factory.NewTableBuilder(options, ikc, &sink, kNoCompression)); + + for (char c = 'a'; c <= 'z'; ++c) { + std::string key(8, c); + key.append("\1 "); // PlainTable expects internal key structure + std::string value(28, c + 42); + builder->Add(key, value); + } + ASSERT_OK(builder->Finish()); + + StringSource source(sink.contents(), 72242, true); + + 
TableProperties props; + auto s = ReadTableProperties(&source, sink.contents().size(), + kPlainTableMagicNumber, Env::Default(), nullptr, + &props); + ASSERT_OK(s); + + ASSERT_EQ(0ul, props.index_size); + ASSERT_EQ(0ul, props.filter_size); + ASSERT_EQ(16ul * 26, props.raw_key_size); + ASSERT_EQ(28ul * 26, props.raw_value_size); + ASSERT_EQ(26ul, props.num_entries); + ASSERT_EQ(1ul, props.num_data_blocks); +} + +TEST(GeneralTableTest, ApproximateOffsetOfPlain) { + TableConstructor c(BytewiseComparator()); c.Add("k01", "hello"); c.Add("k02", "hello2"); c.Add("k03", std::string(10000, 'x')); @@ -1118,9 +1300,10 @@ TEST(TableTest, ApproximateOffsetOfPlain) { std::vector keys; KVMap kvmap; Options options; + test::PlainInternalKeyComparator internal_comparator(options.comparator); options.block_size = 1024; options.compression = kNoCompression; - c.Finish(options, &keys, &kvmap); + c.Finish(options, internal_comparator, &keys, &kvmap); ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); @@ -1136,9 +1319,9 @@ TEST(TableTest, ApproximateOffsetOfPlain) { } -static void Do_Compression_Test(CompressionType comp) { +static void DoCompressionTest(CompressionType comp) { Random rnd(301); - BlockBasedTableConstructor c(BytewiseComparator()); + TableConstructor c(BytewiseComparator()); std::string tmp; c.Add("k01", "hello"); c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); @@ -1147,19 +1330,20 @@ static void Do_Compression_Test(CompressionType comp) { std::vector keys; KVMap kvmap; Options options; + test::PlainInternalKeyComparator ikc(options.comparator); options.block_size = 1024; options.compression = comp; - c.Finish(options, &keys, &kvmap); + c.Finish(options, ikc, &keys, &kvmap); ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); 
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6100)); } -TEST(TableTest, ApproximateOffsetOfCompressed) { +TEST(GeneralTableTest, ApproximateOffsetOfCompressed) { CompressionType compression_state[2]; int valid = 0; if (!SnappyCompressionSupported()) { @@ -1178,49 +1362,11 @@ TEST(TableTest, ApproximateOffsetOfCompressed) { for(int i =0; i < valid; i++) { - Do_Compression_Test(compression_state[i]); + DoCompressionTest(compression_state[i]); } } -TEST(TableTest, BlockCacheLeak) { - // Check that when we reopen a table we don't lose access to blocks already - // in the cache. This test checks whether the Table actually makes use of the - // unique ID from the file. - - Options opt; - opt.block_size = 1024; - opt.compression = kNoCompression; - opt.block_cache = NewLRUCache(16*1024*1024); // big enough so we don't ever - // lose cached values. 
- - BlockBasedTableConstructor c(BytewiseComparator()); - c.Add("k01", "hello"); - c.Add("k02", "hello2"); - c.Add("k03", std::string(10000, 'x')); - c.Add("k04", std::string(200000, 'x')); - c.Add("k05", std::string(300000, 'x')); - c.Add("k06", "hello3"); - c.Add("k07", std::string(100000, 'x')); - std::vector keys; - KVMap kvmap; - c.Finish(opt, &keys, &kvmap); - - unique_ptr iter(c.NewIterator()); - iter->SeekToFirst(); - while (iter->Valid()) { - iter->key(); - iter->value(); - iter->Next(); - } - ASSERT_OK(iter->status()); - - ASSERT_OK(c.Reopen(opt)); - for (const std::string& key: keys) { - ASSERT_TRUE(c.table_reader()->TEST_KeyInCache(ReadOptions(), key)); - } -} - TEST(Harness, Randomized) { std::vector args = GenerateArgList(); for (unsigned int i = 0; i < args.size(); i++) { @@ -1297,6 +1443,49 @@ TEST(MemTableTest, Simple) { delete memtable->Unref(); } +// Test the empty key +TEST(Harness, SimpleEmptyKey) { + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); + Random rnd(test::RandomSeed() + 1); + Add("", "v"); + Test(&rnd); + } +} + +TEST(Harness, SimpleSingle) { + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); + Random rnd(test::RandomSeed() + 2); + Add("abc", "v"); + Test(&rnd); + } +} + +TEST(Harness, SimpleMulti) { + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); + Random rnd(test::RandomSeed() + 3); + Add("abc", "v"); + Add("abcd", "v"); + Add("ac", "v2"); + Test(&rnd); + } +} + +TEST(Harness, SimpleSpecialKey) { + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); + Random rnd(test::RandomSeed() + 4); + Add("\xff\xff", "v3"); + Test(&rnd); + } +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index ac2d8d3d9..65a58ad93 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -20,18 +20,17 @@ namespace rocksdb { namespace { typedef 
Iterator* (*BlockFunction)(void*, const ReadOptions&, - const EnvOptions& soptions, const Slice&, - bool for_compaction); + const EnvOptions& soptions, + const InternalKeyComparator& icomparator, + const Slice&, bool for_compaction); class TwoLevelIterator: public Iterator { public: - TwoLevelIterator( - Iterator* index_iter, - BlockFunction block_function, - void* arg, - const ReadOptions& options, - const EnvOptions& soptions, - bool for_compaction); + TwoLevelIterator(Iterator* index_iter, BlockFunction block_function, + void* arg, const ReadOptions& options, + const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + bool for_compaction); virtual ~TwoLevelIterator(); @@ -76,6 +75,7 @@ class TwoLevelIterator: public Iterator { void* arg_; const ReadOptions options_; const EnvOptions& soptions_; + const InternalKeyComparator& internal_comparator_; Status status_; IteratorWrapper index_iter_; IteratorWrapper data_iter_; // May be nullptr @@ -86,20 +86,17 @@ class TwoLevelIterator: public Iterator { }; TwoLevelIterator::TwoLevelIterator( - Iterator* index_iter, - BlockFunction block_function, - void* arg, - const ReadOptions& options, - const EnvOptions& soptions, - bool for_compaction) + Iterator* index_iter, BlockFunction block_function, void* arg, + const ReadOptions& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, bool for_compaction) : block_function_(block_function), arg_(arg), options_(options), soptions_(soptions), + internal_comparator_(internal_comparator), index_iter_(index_iter), data_iter_(nullptr), - for_compaction_(for_compaction) { -} + for_compaction_(for_compaction) {} TwoLevelIterator::~TwoLevelIterator() { } @@ -181,8 +178,9 @@ void TwoLevelIterator::InitDataBlock() { // data_iter_ is already constructed with this iterator, so // no need to change anything } else { - Iterator* iter = (*block_function_)(arg_, options_, soptions_, handle, - for_compaction_); + Iterator* iter = + 
(*block_function_)(arg_, options_, soptions_, internal_comparator_, + handle, for_compaction_); data_block_handle_.assign(handle.data(), handle.size()); SetDataIterator(iter); } @@ -191,15 +189,14 @@ void TwoLevelIterator::InitDataBlock() { } // namespace -Iterator* NewTwoLevelIterator( - Iterator* index_iter, - BlockFunction block_function, - void* arg, - const ReadOptions& options, - const EnvOptions& soptions, - bool for_compaction) { - return new TwoLevelIterator(index_iter, block_function, arg, - options, soptions, for_compaction); +Iterator* NewTwoLevelIterator(Iterator* index_iter, + BlockFunction block_function, void* arg, + const ReadOptions& options, + const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + bool for_compaction) { + return new TwoLevelIterator(index_iter, block_function, arg, options, + soptions, internal_comparator, for_compaction); } } // namespace rocksdb diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h index 85aed3f14..d313dcb18 100644 --- a/table/two_level_iterator.h +++ b/table/two_level_iterator.h @@ -14,6 +14,7 @@ namespace rocksdb { struct ReadOptions; +class InternalKeyComparator; // Return a new two level iterator. 
A two-level iterator contains an // index iterator whose values point to a sequence of blocks where @@ -27,14 +28,11 @@ struct ReadOptions; extern Iterator* NewTwoLevelIterator( Iterator* index_iter, Iterator* (*block_function)( - void* arg, - const ReadOptions& options, - const EnvOptions& soptions, - const Slice& index_value, - bool for_compaction), - void* arg, - const ReadOptions& options, - const EnvOptions& soptions, + void* arg, const ReadOptions& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + const Slice& index_value, bool for_compaction), + void* arg, const ReadOptions& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, bool for_compaction = false); } // namespace rocksdb diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc index 903889556..79b361841 100644 --- a/tools/sst_dump.cc +++ b/tools/sst_dump.cc @@ -15,6 +15,7 @@ #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/table.h" +#include "rocksdb/table_properties.h" #include "table/block.h" #include "table/block_builder.h" #include "table/format.h" @@ -38,22 +39,50 @@ class SstFileReader { bool has_to, const std::string& to_key); + Status ReadTableProperties(TableProperties* table_properties); uint64_t GetReadNumber() { return read_num_; } -private: + private: + Status NewTableReader(const std::string& file_path); + std::string file_name_; uint64_t read_num_; bool verify_checksum_; bool output_hex_; EnvOptions soptions_; + + Status init_result_; + unique_ptr table_reader_; + unique_ptr file_; + // table_options_ and internal_comparator_ will also be used in + // ReadSequential internally (specifically, seek-related operations) + Options table_options_; + InternalKeyComparator internal_comparator_; }; SstFileReader::SstFileReader(const std::string& file_path, bool verify_checksum, bool output_hex) - :file_name_(file_path), read_num_(0), verify_checksum_(verify_checksum), - output_hex_(output_hex) { - 
std::cout << "Process " << file_path << "\n"; + :file_name_(file_path), read_num_(0), verify_checksum_(verify_checksum), + output_hex_(output_hex), internal_comparator_(BytewiseComparator()) { + fprintf(stdout, "Process %s\n", file_path.c_str()); + + init_result_ = NewTableReader(file_name_); +} + +Status SstFileReader::NewTableReader(const std::string& file_path) { + Status s = table_options_.env->NewRandomAccessFile(file_path, &file_, + soptions_); + if (!s.ok()) { + return s; + } + uint64_t file_size; + table_options_.env->GetFileSize(file_path, &file_size); + unique_ptr table_factory; + s = table_options_.table_factory->NewTableReader( + table_options_, soptions_, internal_comparator_, std::move(file_), + file_size, &table_reader_); + return s; } Status SstFileReader::ReadSequential(bool print_kv, @@ -61,29 +90,12 @@ Status SstFileReader::ReadSequential(bool print_kv, bool has_from, const std::string& from_key, bool has_to, - const std::string& to_key) -{ - unique_ptr table_reader; - InternalKeyComparator internal_comparator_(BytewiseComparator()); - Options table_options; - table_options.comparator = &internal_comparator_; - unique_ptr file; - Status s = table_options.env->NewRandomAccessFile(file_name_, &file, - soptions_); - if(!s.ok()) { - return s; - } - uint64_t file_size; - table_options.env->GetFileSize(file_name_, &file_size); - unique_ptr table_factory; - s = table_options.table_factory->GetTableReader(table_options, soptions_, - std::move(file), file_size, - &table_reader); - if(!s.ok()) { - return s; + const std::string& to_key) { + if (!table_reader_) { + return init_result_; } - Iterator* iter = table_reader->NewIterator(ReadOptions(verify_checksum_, + Iterator* iter = table_reader_->NewIterator(ReadOptions(verify_checksum_, false)); uint64_t i = 0; if (has_from) { @@ -113,21 +125,29 @@ Status SstFileReader::ReadSequential(bool print_kv, } if (print_kv) { - std::cout << ikey.DebugString(output_hex_) - << " => " - << value.ToString(output_hex_) << 
"\n"; + fprintf(stdout, "%s => %s\n", + ikey.DebugString(output_hex_).c_str(), + value.ToString(output_hex_).c_str()); } + } - } + read_num_ += i; + + Status ret = iter->status(); + delete iter; + return ret; +} - read_num_ += i; +Status SstFileReader::ReadTableProperties(TableProperties* table_properties) { + if (!table_reader_) { + return init_result_; + } - Status ret = iter->status(); - delete iter; - return ret; + *table_properties = table_reader_->GetTableProperties(); + return init_result_; } -} // namespace rocksdb +} // namespace rocksdb static void print_help() { fprintf(stderr, @@ -137,7 +157,8 @@ static void print_help() { " [--input_key_hex]" " [--from=]" " [--to=]" - " [--read_num=NUM]\n"); + " [--read_num=NUM]" + " [--show_properties]\n"); } string HexToString(const string& str) { @@ -158,7 +179,6 @@ string HexToString(const string& str) { } int main(int argc, char** argv) { - const char* dir_or_file = nullptr; uint64_t read_num = -1; std::string command; @@ -170,10 +190,10 @@ int main(int argc, char** argv) { bool input_key_hex = false; bool has_from = false; bool has_to = false; + bool show_properties = false; std::string from_key; std::string to_key; - for (int i = 1; i < argc; i++) - { + for (int i = 1; i < argc; i++) { if (strncmp(argv[i], "--file=", 7) == 0) { dir_or_file = argv[i] + 7; } else if (strcmp(argv[i], "--output_hex") == 0) { @@ -194,7 +214,9 @@ int main(int argc, char** argv) { } else if (strncmp(argv[i], "--to=", 5) == 0) { to_key = argv[i] + 5; has_to = true; - }else { + } else if (strcmp(argv[i], "--show_properties") == 0) { + show_properties = true; + } else { print_help(); exit(1); } @@ -210,7 +232,7 @@ int main(int argc, char** argv) { } } - if(dir_or_file == nullptr) { + if (dir_or_file == nullptr) { print_help(); exit(1); } @@ -225,18 +247,19 @@ int main(int argc, char** argv) { dir = false; } - std::cout << "from [" << rocksdb::Slice(from_key).ToString(true) - << "] to [" << rocksdb::Slice(to_key).ToString(true) << "]\n"; + 
fprintf(stdout, "from [%s] to [%s]\n", + rocksdb::Slice(from_key).ToString(true).c_str(), + rocksdb::Slice(to_key).ToString(true).c_str()); uint64_t total_read = 0; for (size_t i = 0; i < filenames.size(); i++) { std::string filename = filenames.at(i); if (filename.length() <= 4 || filename.rfind(".sst") != filename.length() - 4) { - //ignore + // ignore continue; } - if(dir) { + if (dir) { filename = std::string(dir_or_file) + "/" + filename; } rocksdb::SstFileReader reader(filename, verify_checksum, @@ -257,5 +280,20 @@ int main(int argc, char** argv) { break; } } + if (show_properties) { + rocksdb::TableProperties table_properties; + st = reader.ReadTableProperties(&table_properties); + if (!st.ok()) { + fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str()); + } else { + fprintf(stdout, + "Table Properties:\n" + "------------------------------\n" + " %s", table_properties.ToString("\n ", ": ").c_str()); + fprintf(stdout, "# deleted keys: %zd\n", + rocksdb::GetDeletedKeys( + table_properties.user_collected_properties)); + } + } } } diff --git a/util/arena_impl.cc b/util/arena.cc similarity index 82% rename from util/arena_impl.cc rename to util/arena.cc index 5125e2364..dffc8b88e 100644 --- a/util/arena_impl.cc +++ b/util/arena.cc @@ -7,19 +7,19 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "util/arena_impl.h" +#include "util/arena.h" #include namespace rocksdb { -const size_t ArenaImpl::kMinBlockSize = 4096; -const size_t ArenaImpl::kMaxBlockSize = 2 << 30; +const size_t Arena::kMinBlockSize = 4096; +const size_t Arena::kMaxBlockSize = 2 << 30; static const int kAlignUnit = sizeof(void*); size_t OptimizeBlockSize(size_t block_size) { // Make sure block_size is in optimal range - block_size = std::max(ArenaImpl::kMinBlockSize, block_size); - block_size = std::min(ArenaImpl::kMaxBlockSize, block_size); + block_size = std::max(Arena::kMinBlockSize, block_size); + block_size = std::min(Arena::kMaxBlockSize, block_size); // make sure block_size is the multiple of kAlignUnit if (block_size % kAlignUnit != 0) { @@ -29,19 +29,18 @@ size_t OptimizeBlockSize(size_t block_size) { return block_size; } -ArenaImpl::ArenaImpl(size_t block_size) - : kBlockSize(OptimizeBlockSize(block_size)) { +Arena::Arena(size_t block_size) : kBlockSize(OptimizeBlockSize(block_size)) { assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize && kBlockSize % kAlignUnit == 0); } -ArenaImpl::~ArenaImpl() { +Arena::~Arena() { for (const auto& block : blocks_) { delete[] block; } } -char* ArenaImpl::AllocateFallback(size_t bytes, bool aligned) { +char* Arena::AllocateFallback(size_t bytes, bool aligned) { if (bytes > kBlockSize / 4) { // Object is more than a quarter of our block size. Allocate it separately // to avoid wasting too much space in leftover bytes. 
@@ -63,7 +62,7 @@ char* ArenaImpl::AllocateFallback(size_t bytes, bool aligned) { } } -char* ArenaImpl::AllocateAligned(size_t bytes) { +char* Arena::AllocateAligned(size_t bytes) { assert((kAlignUnit & (kAlignUnit - 1)) == 0); // Pointer size should be a power of 2 size_t current_mod = @@ -83,7 +82,7 @@ char* ArenaImpl::AllocateAligned(size_t bytes) { return result; } -char* ArenaImpl::AllocateNewBlock(size_t block_bytes) { +char* Arena::AllocateNewBlock(size_t block_bytes) { char* block = new char[block_bytes]; blocks_memory_ += block_bytes; blocks_.push_back(block); diff --git a/util/arena_impl.h b/util/arena.h similarity index 81% rename from util/arena_impl.h rename to util/arena.h index 538385ccc..4c45417f4 100644 --- a/util/arena_impl.h +++ b/util/arena.h @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -// ArenaImpl is an implementation of Arena class. For a request of small size, +// Arena is an implementation of Arena class. For a request of small size, // it allocates a block with pre-defined block size. For a request of big // size, it uses malloc to directly get the requested size. 
@@ -16,37 +16,35 @@ #include #include #include -#include "rocksdb/arena.h" +#include "util/arena.h" namespace rocksdb { -class ArenaImpl : public Arena { +class Arena { public: // No copying allowed - ArenaImpl(const ArenaImpl&) = delete; - void operator=(const ArenaImpl&) = delete; + Arena(const Arena&) = delete; + void operator=(const Arena&) = delete; static const size_t kMinBlockSize; static const size_t kMaxBlockSize; - explicit ArenaImpl(size_t block_size = kMinBlockSize); - virtual ~ArenaImpl(); + explicit Arena(size_t block_size = kMinBlockSize); + ~Arena(); - virtual char* Allocate(size_t bytes) override; + char* Allocate(size_t bytes); - virtual char* AllocateAligned(size_t bytes) override; + char* AllocateAligned(size_t bytes); // Returns an estimate of the total memory usage of data allocated // by the arena (exclude the space allocated but not yet used for future // allocations). - virtual const size_t ApproximateMemoryUsage() { + const size_t ApproximateMemoryUsage() { return blocks_memory_ + blocks_.capacity() * sizeof(char*) - alloc_bytes_remaining_; } - virtual const size_t MemoryAllocatedBytes() override { - return blocks_memory_; - } + const size_t MemoryAllocatedBytes() { return blocks_memory_; } private: // Number of bytes allocated in one block @@ -72,7 +70,7 @@ class ArenaImpl : public Arena { size_t blocks_memory_ = 0; }; -inline char* ArenaImpl::Allocate(size_t bytes) { +inline char* Arena::Allocate(size_t bytes) { // The semantics of what to return are a bit messy if we allow // 0-byte allocations, so we disallow them here (we don't need // them for our internal use). diff --git a/util/arena_test.cc b/util/arena_test.cc index ca6dfc99d..1b2b53175 100644 --- a/util/arena_test.cc +++ b/util/arena_test.cc @@ -7,34 +7,32 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "util/arena_impl.h" +#include "util/arena.h" #include "util/random.h" #include "util/testharness.h" namespace rocksdb { -class ArenaImplTest { }; +class ArenaTest {}; -TEST(ArenaImplTest, Empty) { - ArenaImpl arena0; -} +TEST(ArenaTest, Empty) { Arena arena0; } -TEST(ArenaImplTest, MemoryAllocatedBytes) { +TEST(ArenaTest, MemoryAllocatedBytes) { const int N = 17; - size_t req_sz; //requested size + size_t req_sz; // requested size size_t bsz = 8192; // block size size_t expected_memory_allocated; - ArenaImpl arena_impl(bsz); + Arena arena(bsz); // requested size > quarter of a block: // allocate requested size separately req_sz = 3001; for (int i = 0; i < N; i++) { - arena_impl.Allocate(req_sz); + arena.Allocate(req_sz); } expected_memory_allocated = req_sz * N; - ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated); + ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); // requested size < quarter of a block: // allocate a block with the default size, then try to use unused part @@ -42,28 +40,28 @@ TEST(ArenaImplTest, MemoryAllocatedBytes) { // Allocate(99) call. All the remaining calls won't lead to new allocation. 
req_sz = 99; for (int i = 0; i < N; i++) { - arena_impl.Allocate(req_sz); + arena.Allocate(req_sz); } expected_memory_allocated += bsz; - ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated); + ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); // requested size > quarter of a block: // allocate requested size separately req_sz = 99999999; for (int i = 0; i < N; i++) { - arena_impl.Allocate(req_sz); + arena.Allocate(req_sz); } expected_memory_allocated += req_sz * N; - ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated); + ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); } // Make sure we didn't count the allocate but not used memory space in // Arena::ApproximateMemoryUsage() -TEST(ArenaImplTest, ApproximateMemoryUsageTest) { +TEST(ArenaTest, ApproximateMemoryUsageTest) { const size_t kBlockSize = 4096; const size_t kEntrySize = kBlockSize / 8; - const size_t kZero = 0; - ArenaImpl arena(kBlockSize); + const size_t kZero = 0; + Arena arena(kBlockSize); ASSERT_EQ(kZero, arena.ApproximateMemoryUsage()); auto num_blocks = kBlockSize / kEntrySize; @@ -83,9 +81,9 @@ TEST(ArenaImplTest, ApproximateMemoryUsageTest) { ASSERT_GT(usage, mem_usage); } -TEST(ArenaImplTest, Simple) { +TEST(ArenaTest, Simple) { std::vector> allocated; - ArenaImpl arena_impl; + Arena arena; const int N = 100000; size_t bytes = 0; Random rnd(301); @@ -104,9 +102,9 @@ TEST(ArenaImplTest, Simple) { } char* r; if (rnd.OneIn(10)) { - r = arena_impl.AllocateAligned(s); + r = arena.AllocateAligned(s); } else { - r = arena_impl.Allocate(s); + r = arena.Allocate(s); } for (unsigned int b = 0; b < s; b++) { @@ -115,9 +113,9 @@ TEST(ArenaImplTest, Simple) { } bytes += s; allocated.push_back(std::make_pair(s, r)); - ASSERT_GE(arena_impl.ApproximateMemoryUsage(), bytes); + ASSERT_GE(arena.ApproximateMemoryUsage(), bytes); if (i > N / 10) { - ASSERT_LE(arena_impl.ApproximateMemoryUsage(), bytes * 1.10); + 
ASSERT_LE(arena.ApproximateMemoryUsage(), bytes * 1.10); } } for (unsigned int i = 0; i < allocated.size(); i++) { @@ -132,6 +130,4 @@ TEST(ArenaImplTest, Simple) { } // namespace rocksdb -int main(int argc, char** argv) { - return rocksdb::test::RunAllTests(); -} +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/util/autovector.h b/util/autovector.h index 9998e2956..812a61795 100644 --- a/util/autovector.h +++ b/util/autovector.h @@ -57,11 +57,9 @@ class autovector { typedef std::random_access_iterator_tag iterator_category; iterator_impl(TAutoVector* vect, size_t index) - : vect_(vect) - , index_(index) { - }; + : vect_(vect), index_(index) {}; iterator_impl(const iterator_impl&) = default; - ~iterator_impl() { } + ~iterator_impl() {} iterator_impl& operator=(const iterator_impl&) = default; // -- Advancement @@ -130,9 +128,7 @@ class autovector { return index_ == other.index_; } - bool operator!=(const self_type& other) const { - return !(*this == other); - } + bool operator!=(const self_type& other) const { return !(*this == other); } bool operator>(const self_type& other) const { assert(vect_ == other.vect_); @@ -174,13 +170,9 @@ class autovector { return vect_.capacity() == 0; } - size_type size() const { - return num_stack_items_ + vect_.size(); - } + size_type size() const { return num_stack_items_ + vect_.size(); } - bool empty() const { - return size() == 0; - } + bool empty() const { return size() == 0; } // will not check boundry const_reference operator[](size_type n) const { @@ -235,11 +227,9 @@ class autovector { } } - void push_back(const T& item) { - push_back(value_type(item)); - } + void push_back(const T& item) { push_back(value_type(item)); } - template + template void emplace_back(Args&&... 
args) { push_back(value_type(args...)); } @@ -261,13 +251,9 @@ class autovector { // -- Copy and Assignment autovector& assign(const autovector& other); - autovector(const autovector& other) { - assign(other); - } + autovector(const autovector& other) { assign(other); } - autovector& operator=(const autovector& other) { - return assign(other); - } + autovector& operator=(const autovector& other) { return assign(other); } // move operation are disallowed since it is very hard to make sure both // autovectors are allocated from the same function stack. @@ -275,41 +261,29 @@ class autovector { autovector(autovector&& other) = delete; // -- Iterator Operations - iterator begin() { - return iterator(this, 0); - } + iterator begin() { return iterator(this, 0); } - const_iterator begin() const { - return const_iterator(this, 0); - } + const_iterator begin() const { return const_iterator(this, 0); } - iterator end() { - return iterator(this, this->size()); - } + iterator end() { return iterator(this, this->size()); } - const_iterator end() const { - return const_iterator(this, this->size()); - } + const_iterator end() const { return const_iterator(this, this->size()); } - reverse_iterator rbegin() { - return reverse_iterator(end()); - } + reverse_iterator rbegin() { return reverse_iterator(end()); } const_reverse_iterator rbegin() const { return const_reverse_iterator(end()); } - reverse_iterator rend() { - return reverse_iterator(begin()); - } + reverse_iterator rend() { return reverse_iterator(begin()); } const_reverse_iterator rend() const { return const_reverse_iterator(begin()); } private: - size_type num_stack_items_ = 0; // current number of items - value_type values_[kSize]; // the first `kSize` items + size_type num_stack_items_ = 0; // current number of items + value_type values_[kSize]; // the first `kSize` items // used only if there are more than `kSize` items. 
std::vector vect_; }; diff --git a/util/bloom_test.cc b/util/bloom_test.cc index 9dbd5d2cc..2c430e203 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -7,12 +7,16 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include + #include "rocksdb/filter_policy.h" #include "util/logging.h" #include "util/testharness.h" #include "util/testutil.h" +DEFINE_int32(bits_per_key, 10, ""); + namespace rocksdb { static const int kVerbose = 1; @@ -29,7 +33,7 @@ class BloomTest { std::vector keys_; public: - BloomTest() : policy_(NewBloomFilterPolicy(10)) { } + BloomTest() : policy_(NewBloomFilterPolicy(FLAGS_bits_per_key)) { } ~BloomTest() { delete policy_; @@ -160,5 +164,7 @@ TEST(BloomTest, VaryingLengths) { } // namespace rocksdb int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + return rocksdb::test::RunAllTests(); } diff --git a/util/cache.cc b/util/cache.cc index 4707eac94..8f7deaaa8 100644 --- a/util/cache.cc +++ b/util/cache.cc @@ -10,10 +10,10 @@ #include #include #include -#include #include "rocksdb/cache.h" #include "port/port.h" +#include "util/autovector.h" #include "util/hash.h" #include "util/mutexlock.h" @@ -156,6 +156,13 @@ class LRUCache { Cache::Handle* Lookup(const Slice& key, uint32_t hash); void Release(Cache::Handle* handle); void Erase(const Slice& key, uint32_t hash); + // Although in some platforms the update of size_t is atomic, to make sure + // GetUsage() works correctly under any platforms, we'll protect this + // function with mutex. + size_t GetUsage() const { + MutexLock l(&mutex_); + return usage_; + } private: void LRU_Remove(LRUHandle* e); @@ -171,7 +178,9 @@ class LRUCache { uint32_t remove_scan_count_limit_; // mutex_ protects the following state. 
- port::Mutex mutex_; + // We don't count mutex_ as the cache's internal state so semantically we + // don't mind mutex_ invoking the non-const actions. + mutable port::Mutex mutex_; size_t usage_; // Dummy head of LRU list. @@ -255,8 +264,7 @@ Cache::Handle* LRUCache::Insert( LRUHandle* e = reinterpret_cast( malloc(sizeof(LRUHandle)-1 + key.size())); - std::vector last_reference_list; - last_reference_list.reserve(1); + autovector last_reference_list; e->value = value; e->deleter = deleter; @@ -342,10 +350,10 @@ static int kRemoveScanCountLimit = 0; // default values, can be overridden class ShardedLRUCache : public Cache { private: - LRUCache* shard_; + LRUCache* shards_; port::Mutex id_mutex_; uint64_t last_id_; - int numShardBits; + int num_shard_bits_; size_t capacity_; static inline uint32_t HashSlice(const Slice& s) { @@ -354,18 +362,18 @@ class ShardedLRUCache : public Cache { uint32_t Shard(uint32_t hash) { // Note, hash >> 32 yields hash in gcc, not the zero we expect! - return (numShardBits > 0) ? (hash >> (32 - numShardBits)) : 0; + return (num_shard_bits_ > 0) ? 
(hash >> (32 - num_shard_bits_)) : 0; } void init(size_t capacity, int numbits, int removeScanCountLimit) { - numShardBits = numbits; + num_shard_bits_ = numbits; capacity_ = capacity; - int numShards = 1 << numShardBits; - shard_ = new LRUCache[numShards]; - const size_t per_shard = (capacity + (numShards - 1)) / numShards; - for (int s = 0; s < numShards; s++) { - shard_[s].SetCapacity(per_shard); - shard_[s].SetRemoveScanCountLimit(removeScanCountLimit); + int num_shards = 1 << num_shard_bits_; + shards_ = new LRUCache[num_shards]; + const size_t per_shard = (capacity + (num_shards - 1)) / num_shards; + for (int s = 0; s < num_shards; s++) { + shards_[s].SetCapacity(per_shard); + shards_[s].SetRemoveScanCountLimit(removeScanCountLimit); } } @@ -374,30 +382,30 @@ class ShardedLRUCache : public Cache { : last_id_(0) { init(capacity, kNumShardBits, kRemoveScanCountLimit); } - ShardedLRUCache(size_t capacity, int numShardBits, + ShardedLRUCache(size_t capacity, int num_shard_bits, int removeScanCountLimit) : last_id_(0) { - init(capacity, numShardBits, removeScanCountLimit); + init(capacity, num_shard_bits, removeScanCountLimit); } virtual ~ShardedLRUCache() { - delete[] shard_; + delete[] shards_; } virtual Handle* Insert(const Slice& key, void* value, size_t charge, void (*deleter)(const Slice& key, void* value)) { const uint32_t hash = HashSlice(key); - return shard_[Shard(hash)].Insert(key, hash, value, charge, deleter); + return shards_[Shard(hash)].Insert(key, hash, value, charge, deleter); } virtual Handle* Lookup(const Slice& key) { const uint32_t hash = HashSlice(key); - return shard_[Shard(hash)].Lookup(key, hash); + return shards_[Shard(hash)].Lookup(key, hash); } virtual void Release(Handle* handle) { LRUHandle* h = reinterpret_cast(handle); - shard_[Shard(h->hash)].Release(handle); + shards_[Shard(h->hash)].Release(handle); } virtual void Erase(const Slice& key) { const uint32_t hash = HashSlice(key); - shard_[Shard(hash)].Erase(key, hash); + 
shards_[Shard(hash)].Erase(key, hash); } virtual void* Value(Handle* handle) { return reinterpret_cast(handle)->value; @@ -406,11 +414,23 @@ class ShardedLRUCache : public Cache { MutexLock l(&id_mutex_); return ++(last_id_); } - virtual size_t GetCapacity() { + virtual size_t GetCapacity() const { return capacity_; } + + virtual size_t GetUsage() const { + // We will not lock the cache when getting the usage from shards. + // for (size_t i = 0; i < num_shard_bits_; ++i) + int num_shards = 1 << num_shard_bits_; + size_t usage = 0; + for (int s = 0; s < num_shards; s++) { + usage += shards_[s].GetUsage(); + } + return usage; + } + virtual void DisownData() { - shard_ = nullptr; + shards_ = nullptr; } }; @@ -420,17 +440,17 @@ shared_ptr NewLRUCache(size_t capacity) { return NewLRUCache(capacity, kNumShardBits); } -shared_ptr NewLRUCache(size_t capacity, int numShardBits) { - return NewLRUCache(capacity, numShardBits, kRemoveScanCountLimit); +shared_ptr NewLRUCache(size_t capacity, int num_shard_bits) { + return NewLRUCache(capacity, num_shard_bits, kRemoveScanCountLimit); } -shared_ptr NewLRUCache(size_t capacity, int numShardBits, +shared_ptr NewLRUCache(size_t capacity, int num_shard_bits, int removeScanCountLimit) { - if (numShardBits >= 20) { + if (num_shard_bits >= 20) { return nullptr; // the cache cannot be sharded into too many fine pieces } return std::make_shared(capacity, - numShardBits, + num_shard_bits, removeScanCountLimit); } diff --git a/util/cache_test.cc b/util/cache_test.cc index 87ab91389..b99f47b38 100644 --- a/util/cache_test.cc +++ b/util/cache_test.cc @@ -107,6 +107,39 @@ class CacheTest { }; CacheTest* CacheTest::current_; +void dumbDeleter(const Slice& key, void* value) { } + +TEST(CacheTest, UsageTest) { + // cache is shared_ptr and will be automatically cleaned up. 
+ const uint64_t kCapacity = 100000; + auto cache = NewLRUCache(kCapacity, 8, 200); + + size_t usage = 0; + const char* value = "abcdef"; + // make sure everything will be cached + for (int i = 1; i < 100; ++i) { + std::string key(i, 'a'); + auto kv_size = key.size() + 5; + cache->Release( + cache->Insert(key, (void*)value, kv_size, dumbDeleter) + ); + usage += kv_size; + ASSERT_EQ(usage, cache->GetUsage()); + } + + // make sure the cache will be overloaded + for (uint64_t i = 1; i < kCapacity; ++i) { + auto key = std::to_string(i); + cache->Release( + cache->Insert(key, (void*)value, key.size() + 5, dumbDeleter) + ); + } + + // the usage should be close to the capacity + ASSERT_GT(kCapacity, cache->GetUsage()); + ASSERT_LT(kCapacity * 0.95, cache->GetUsage()); +} + TEST(CacheTest, HitAndMiss) { ASSERT_EQ(-1, Lookup(100)); @@ -353,7 +386,6 @@ void deleter(const Slice& key, void* value) { delete (Value *)value; } - TEST(CacheTest, BadEviction) { int n = 10; diff --git a/util/coding.cc b/util/coding.cc index 6cf67efad..31ae0e356 100644 --- a/util/coding.cc +++ b/util/coding.cc @@ -9,131 +9,41 @@ #include "util/coding.h" +#include #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" - -#include - namespace rocksdb { -void EncodeFixed32(char* buf, uint32_t value) { -#if __BYTE_ORDER == __LITTLE_ENDIAN - memcpy(buf, &value, sizeof(value)); -#else - buf[0] = value & 0xff; - buf[1] = (value >> 8) & 0xff; - buf[2] = (value >> 16) & 0xff; - buf[3] = (value >> 24) & 0xff; -#endif -} - -void EncodeFixed64(char* buf, uint64_t value) { -#if __BYTE_ORDER == __LITTLE_ENDIAN - memcpy(buf, &value, sizeof(value)); -#else - buf[0] = value & 0xff; - buf[1] = (value >> 8) & 0xff; - buf[2] = (value >> 16) & 0xff; - buf[3] = (value >> 24) & 0xff; - buf[4] = (value >> 32) & 0xff; - buf[5] = (value >> 40) & 0xff; - buf[6] = (value >> 48) & 0xff; - buf[7] = (value >> 56) & 0xff; -#endif -} - -void PutFixed32(std::string* dst, uint32_t value) { - char buf[sizeof(value)]; - 
EncodeFixed32(buf, value); - dst->append(buf, sizeof(buf)); -} - -void PutFixed64(std::string* dst, uint64_t value) { - char buf[sizeof(value)]; - EncodeFixed64(buf, value); - dst->append(buf, sizeof(buf)); -} - char* EncodeVarint32(char* dst, uint32_t v) { // Operate on characters as unsigneds unsigned char* ptr = reinterpret_cast(dst); static const int B = 128; - if (v < (1<<7)) { + if (v < (1 << 7)) { *(ptr++) = v; - } else if (v < (1<<14)) { + } else if (v < (1 << 14)) { *(ptr++) = v | B; - *(ptr++) = v>>7; - } else if (v < (1<<21)) { + *(ptr++) = v >> 7; + } else if (v < (1 << 21)) { *(ptr++) = v | B; - *(ptr++) = (v>>7) | B; - *(ptr++) = v>>14; - } else if (v < (1<<28)) { + *(ptr++) = (v >> 7) | B; + *(ptr++) = v >> 14; + } else if (v < (1 << 28)) { *(ptr++) = v | B; - *(ptr++) = (v>>7) | B; - *(ptr++) = (v>>14) | B; - *(ptr++) = v>>21; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = v >> 21; } else { *(ptr++) = v | B; - *(ptr++) = (v>>7) | B; - *(ptr++) = (v>>14) | B; - *(ptr++) = (v>>21) | B; - *(ptr++) = v>>28; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = (v >> 21) | B; + *(ptr++) = v >> 28; } return reinterpret_cast(ptr); } -void PutVarint32(std::string* dst, uint32_t v) { - char buf[5]; - char* ptr = EncodeVarint32(buf, v); - dst->append(buf, ptr - buf); -} - -char* EncodeVarint64(char* dst, uint64_t v) { - static const unsigned int B = 128; - unsigned char* ptr = reinterpret_cast(dst); - while (v >= B) { - *(ptr++) = (v & (B-1)) | B; - v >>= 7; - } - *(ptr++) = static_cast(v); - return reinterpret_cast(ptr); -} - -void PutVarint64(std::string* dst, uint64_t v) { - char buf[10]; - char* ptr = EncodeVarint64(buf, v); - dst->append(buf, ptr - buf); -} - -void PutLengthPrefixedSlice(std::string* dst, const Slice& value) { - PutVarint32(dst, value.size()); - dst->append(value.data(), value.size()); -} - -void PutLengthPrefixedSliceParts(std::string* dst, - const SliceParts& slice_parts) { - uint32_t total_bytes = 0; 
- for (int i = 0; i < slice_parts.num_parts; ++i) { - total_bytes += slice_parts.parts[i].size(); - } - PutVarint32(dst, total_bytes); - for (int i = 0; i < slice_parts.num_parts; ++i) { - dst->append(slice_parts.parts[i].data(), slice_parts.parts[i].size()); - } -} - -int VarintLength(uint64_t v) { - int len = 1; - while (v >= 128) { - v >>= 7; - len++; - } - return len; -} - -const char* GetVarint32PtrFallback(const char* p, - const char* limit, +const char* GetVarint32PtrFallback(const char* p, const char* limit, uint32_t* value) { uint32_t result = 0; for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) { @@ -151,18 +61,6 @@ const char* GetVarint32PtrFallback(const char* p, return nullptr; } -bool GetVarint32(Slice* input, uint32_t* value) { - const char* p = input->data(); - const char* limit = p + input->size(); - const char* q = GetVarint32Ptr(p, limit, value); - if (q == nullptr) { - return false; - } else { - *input = Slice(q, limit - q); - return true; - } -} - const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) { uint64_t result = 0; for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) { @@ -180,58 +78,6 @@ const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) { return nullptr; } -bool GetVarint64(Slice* input, uint64_t* value) { - const char* p = input->data(); - const char* limit = p + input->size(); - const char* q = GetVarint64Ptr(p, limit, value); - if (q == nullptr) { - return false; - } else { - *input = Slice(q, limit - q); - return true; - } -} - -const char* GetLengthPrefixedSlice(const char* p, const char* limit, - Slice* result) { - uint32_t len; - p = GetVarint32Ptr(p, limit, &len); - if (p == nullptr) return nullptr; - if (p + len > limit) return nullptr; - *result = Slice(p, len); - return p + len; -} - -bool GetLengthPrefixedSlice(Slice* input, Slice* result) { - uint32_t len; - if (GetVarint32(input, &len) && - input->size() >= len) { - *result = Slice(input->data(), 
len); - input->remove_prefix(len); - return true; - } else { - return false; - } -} - -Slice GetLengthPrefixedSlice(const char* data) { - uint32_t len; - const char* p = data; - p = GetVarint32Ptr(p, p + 5, &len); // +5: we assume "p" is not corrupted - return Slice(p, len); -} - -Slice GetSliceUntil(Slice* slice, char delimiter) { - uint32_t len; - for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) { - // nothing - } - - Slice ret(slice->data(), len); - slice->remove_prefix(len + ((len < slice->size()) ? 1 : 0)); - return ret; -} - void BitStreamPutInt(char* dst, size_t dstlen, size_t offset, uint32_t bits, uint64_t value) { assert((offset + bits + 7)/8 <= dstlen); @@ -320,14 +166,4 @@ void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits, BitStreamGetInt(dst, offset, bits)); } -uint64_t BitStreamGetInt(const std::string* src, size_t offset, - uint32_t bits) { - return BitStreamGetInt(src->data(), src->size(), offset, bits); -} - -uint64_t BitStreamGetInt(const Slice* src, size_t offset, - uint32_t bits) { - return BitStreamGetInt(src->data(), src->size(), offset, bits); -} - } // namespace rocksdb diff --git a/util/coding.h b/util/coding.h index c6a6b203d..8ffba51cb 100644 --- a/util/coding.h +++ b/util/coding.h @@ -13,6 +13,7 @@ // * Strings are encoded prefixed by their length in varint format #pragma once +#include #include #include #include @@ -40,6 +41,7 @@ extern void PutLengthPrefixedSliceParts(std::string* dst, extern bool GetVarint32(Slice* input, uint32_t* value); extern bool GetVarint64(Slice* input, uint64_t* value); extern bool GetLengthPrefixedSlice(Slice* input, Slice* result); +// This function assumes data is well-formed. 
extern Slice GetLengthPrefixedSlice(const char* data); extern Slice GetSliceUntil(Slice* slice, char delimiter); @@ -138,4 +140,155 @@ extern uint64_t BitStreamGetInt(const std::string* src, size_t offset, extern uint64_t BitStreamGetInt(const Slice* src, size_t offset, uint32_t bits); +// -- Implementation of the functions declared above +inline void EncodeFixed32(char* buf, uint32_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; +#endif +} + +inline void EncodeFixed64(char* buf, uint64_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; + buf[4] = (value >> 32) & 0xff; + buf[5] = (value >> 40) & 0xff; + buf[6] = (value >> 48) & 0xff; + buf[7] = (value >> 56) & 0xff; +#endif +} + +inline void PutFixed32(std::string* dst, uint32_t value) { + char buf[sizeof(value)]; + EncodeFixed32(buf, value); + dst->append(buf, sizeof(buf)); +} + +inline void PutFixed64(std::string* dst, uint64_t value) { + char buf[sizeof(value)]; + EncodeFixed64(buf, value); + dst->append(buf, sizeof(buf)); +} + +inline void PutVarint32(std::string* dst, uint32_t v) { + char buf[5]; + char* ptr = EncodeVarint32(buf, v); + dst->append(buf, ptr - buf); +} + +inline char* EncodeVarint64(char* dst, uint64_t v) { + static const unsigned int B = 128; + unsigned char* ptr = reinterpret_cast(dst); + while (v >= B) { + *(ptr++) = (v & (B - 1)) | B; + v >>= 7; + } + *(ptr++) = static_cast(v); + return reinterpret_cast(ptr); +} + +inline void PutVarint64(std::string* dst, uint64_t v) { + char buf[10]; + char* ptr = EncodeVarint64(buf, v); + dst->append(buf, ptr - buf); +} + +inline void PutLengthPrefixedSlice(std::string* dst, const Slice& value) { + PutVarint32(dst, 
value.size()); + dst->append(value.data(), value.size()); +} + +inline void PutLengthPrefixedSliceParts(std::string* dst, + const SliceParts& slice_parts) { + uint32_t total_bytes = 0; + for (int i = 0; i < slice_parts.num_parts; ++i) { + total_bytes += slice_parts.parts[i].size(); + } + PutVarint32(dst, total_bytes); + for (int i = 0; i < slice_parts.num_parts; ++i) { + dst->append(slice_parts.parts[i].data(), slice_parts.parts[i].size()); + } +} + +inline int VarintLength(uint64_t v) { + int len = 1; + while (v >= 128) { + v >>= 7; + len++; + } + return len; +} + +inline bool GetVarint32(Slice* input, uint32_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint32Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +inline bool GetVarint64(Slice* input, uint64_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint64Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +inline bool GetLengthPrefixedSlice(Slice* input, Slice* result) { + uint32_t len = 0; + if (GetVarint32(input, &len) && input->size() >= len) { + *result = Slice(input->data(), len); + input->remove_prefix(len); + return true; + } else { + return false; + } +} + +inline Slice GetLengthPrefixedSlice(const char* data) { + uint32_t len = 0; + // +5: we assume "data" is not corrupted + auto p = GetVarint32Ptr(data, data + 5 /* limit */, &len); + return Slice(p, len); +} + +inline Slice GetSliceUntil(Slice* slice, char delimiter) { + uint32_t len = 0; + for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) { + // nothing + } + + Slice ret(slice->data(), len); + slice->remove_prefix(len + ((len < slice->size()) ? 
1 : 0)); + return ret; +} + +inline uint64_t BitStreamGetInt(const std::string* src, size_t offset, + uint32_t bits) { + return BitStreamGetInt(src->data(), src->size(), offset, bits); +} + +inline uint64_t BitStreamGetInt(const Slice* src, size_t offset, + uint32_t bits) { + return BitStreamGetInt(src->data(), src->size(), offset, bits); +} + } // namespace rocksdb diff --git a/util/coding_test.cc b/util/coding_test.cc index fb0613238..ed542d6bf 100644 --- a/util/coding_test.cc +++ b/util/coding_test.cc @@ -41,7 +41,7 @@ TEST(Coding, Fixed64) { const char* p = s.data(); for (int power = 0; power <= 63; power++) { uint64_t v = static_cast(1) << power; - uint64_t actual; + uint64_t actual = 0; actual = DecodeFixed64(p); ASSERT_EQ(v-1, actual); p += sizeof(uint64_t); @@ -90,7 +90,7 @@ TEST(Coding, Varint32) { const char* limit = p + s.size(); for (uint32_t i = 0; i < (32 * 32); i++) { uint32_t expected = (i / 32) << (i % 32); - uint32_t actual; + uint32_t actual = 0; const char* start = p; p = GetVarint32Ptr(p, limit, &actual); ASSERT_TRUE(p != nullptr); @@ -125,7 +125,7 @@ TEST(Coding, Varint64) { const char* limit = p + s.size(); for (unsigned int i = 0; i < values.size(); i++) { ASSERT_TRUE(p < limit); - uint64_t actual; + uint64_t actual = 0; const char* start = p; p = GetVarint64Ptr(p, limit, &actual); ASSERT_TRUE(p != nullptr); diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc new file mode 100644 index 000000000..94df660ef --- /dev/null +++ b/util/dynamic_bloom.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "dynamic_bloom.h" + +#include "rocksdb/slice.h" +#include "util/hash.h" + +namespace rocksdb { + +namespace { +static uint32_t BloomHash(const Slice& key) { + return Hash(key.data(), key.size(), 0xbc9f1d34); +} +} + +DynamicBloom::DynamicBloom(uint32_t total_bits, + uint32_t (*hash_func)(const Slice& key), + uint32_t num_probes) + : hash_func_(hash_func), + kTotalBits((total_bits + 7) / 8 * 8), + kNumProbes(num_probes) { + assert(hash_func_); + assert(kNumProbes > 0); + assert(kTotalBits > 0); + data_.reset(new unsigned char[kTotalBits / 8]()); +} + +DynamicBloom::DynamicBloom(uint32_t total_bits, + uint32_t num_probes) + : DynamicBloom(total_bits, &BloomHash, num_probes) { +} + +} // rocksdb diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h new file mode 100644 index 000000000..0851becbf --- /dev/null +++ b/util/dynamic_bloom.h @@ -0,0 +1,72 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include + +namespace rocksdb { + +class Slice; + +class DynamicBloom { + public: + // total_bits: fixed total bits for the bloom + // hash_func: customized hash function + // num_probes: number of hash probes for a single key + DynamicBloom(uint32_t total_bits, + uint32_t (*hash_func)(const Slice& key), + uint32_t num_probes = 6); + + explicit DynamicBloom(uint32_t total_bits, uint32_t num_probes = 6); + + // Assuming single threaded access to this function. + void Add(const Slice& key); + + // Assuming single threaded access to this function. 
+ void AddHash(uint32_t hash); + + // Multithreaded access to this function is OK + bool MayContain(const Slice& key); + + // Multithreaded access to this function is OK + bool MayContainHash(uint32_t hash); + + private: + uint32_t (*hash_func_)(const Slice& key); + const uint32_t kTotalBits; + const uint32_t kNumProbes; + std::unique_ptr data_; +}; + +inline void DynamicBloom::Add(const Slice& key) { AddHash(hash_func_(key)); } + +inline bool DynamicBloom::MayContain(const Slice& key) { + return (MayContainHash(hash_func_(key))); +} + +inline bool DynamicBloom::MayContainHash(uint32_t h) { + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (uint32_t i = 0; i < kNumProbes; i++) { + const uint32_t bitpos = h % kTotalBits; + if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) { + return false; + } + h += delta; + } + return true; +} + +inline void DynamicBloom::AddHash(uint32_t h) { + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (uint32_t i = 0; i < kNumProbes; i++) { + const uint32_t bitpos = h % kTotalBits; + data_[bitpos / 8] |= (1 << (bitpos % 8)); + h += delta; + } +} + +} // rocksdb diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc new file mode 100644 index 000000000..58f05ae50 --- /dev/null +++ b/util/dynamic_bloom_test.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include + +#include "dynamic_bloom.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +DEFINE_int32(bits_per_key, 10, ""); +DEFINE_int32(num_probes, 6, ""); + +namespace rocksdb { + +static Slice Key(int i, char* buffer) { + memcpy(buffer, &i, sizeof(i)); + return Slice(buffer, sizeof(i)); +} + +class DynamicBloomTest { +}; + +TEST(DynamicBloomTest, EmptyFilter) { + DynamicBloom bloom(100, 2); + ASSERT_TRUE(! bloom.MayContain("hello")); + ASSERT_TRUE(! bloom.MayContain("world")); +} + +TEST(DynamicBloomTest, Small) { + DynamicBloom bloom(100, 2); + bloom.Add("hello"); + bloom.Add("world"); + ASSERT_TRUE(bloom.MayContain("hello")); + ASSERT_TRUE(bloom.MayContain("world")); + ASSERT_TRUE(! bloom.MayContain("x")); + ASSERT_TRUE(! bloom.MayContain("foo")); +} + +static int NextLength(int length) { + if (length < 10) { + length += 1; + } else if (length < 100) { + length += 10; + } else if (length < 1000) { + length += 100; + } else { + length += 1000; + } + return length; +} + +TEST(DynamicBloomTest, VaryingLengths) { + char buffer[sizeof(int)]; + + // Count number of filters that significantly exceed the false positive rate + int mediocre_filters = 0; + int good_filters = 0; + + fprintf(stderr, "bits_per_key: %d num_probes: %d\n", + FLAGS_bits_per_key, FLAGS_num_probes); + + for (int length = 1; length <= 10000; length = NextLength(length)) { + uint32_t bloom_bits = std::max(length * FLAGS_bits_per_key, 64); + DynamicBloom bloom(bloom_bits, FLAGS_num_probes); + for (int i = 0; i < length; i++) { + bloom.Add(Key(i, buffer)); + ASSERT_TRUE(bloom.MayContain(Key(i, buffer))); + } + + // All added keys must match + for (int i = 0; i < length; i++) { + ASSERT_TRUE(bloom.MayContain(Key(i, buffer))) + << "Length " << length << "; key " << i; + } + + // Check false positive rate + + int result = 0; + for (int i = 0; i < 10000; i++) { + if (bloom.MayContain(Key(i + 1000000000, buffer))) { + result++; + } + } + double rate = result / 
10000.0; + + fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; \n", + rate*100.0, length); + + //ASSERT_LE(rate, 0.02); // Must not be over 2% + if (rate > 0.0125) + mediocre_filters++; // Allowed, but not too often + else + good_filters++; + } + + fprintf(stderr, "Filters: %d good, %d mediocre\n", + good_filters, mediocre_filters); + + ASSERT_LE(mediocre_filters, good_filters/5); +} + +// Different bits-per-byte + +} // namespace rocksdb + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + + return rocksdb::test::RunAllTests(); +} diff --git a/util/env_posix.cc b/util/env_posix.cc index 638b6c906..b53cd0103 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -306,7 +306,13 @@ class PosixMmapReadableFile: public RandomAccessFile { assert(options.use_mmap_reads); assert(options.use_os_buffer); } - virtual ~PosixMmapReadableFile() { munmap(mmapped_region_, length_); } + virtual ~PosixMmapReadableFile() { + int ret = munmap(mmapped_region_, length_); + if (ret != 0) { + fprintf(stdout, "failed to munmap %p length %zu \n", + mmapped_region_, length_); + } + } virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { diff --git a/util/hash_linklist_rep.cc b/util/hash_linklist_rep.cc new file mode 100644 index 000000000..83f0f3d5a --- /dev/null +++ b/util/hash_linklist_rep.cc @@ -0,0 +1,470 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// + +#include "util/hash_linklist_rep.h" + +#include "rocksdb/memtablerep.h" +#include "util/arena.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "port/port.h" +#include "port/atomic_pointer.h" +#include "util/murmurhash.h" +#include "db/memtable.h" +#include "db/skiplist.h" + +namespace rocksdb { +namespace { + +typedef const char* Key; + +struct Node { + explicit Node(const Key& k) : + key(k) { + } + + Key const key; + + // Accessors/mutators for links. Wrapped in methods so we can + // add the appropriate barriers as necessary. + Node* Next() { + // Use an 'acquire load' so that we observe a fully initialized + // version of the returned Node. + return reinterpret_cast(next_.Acquire_Load()); + } + void SetNext(Node* x) { + // Use a 'release store' so that anybody who reads through this + // pointer observes a fully initialized version of the inserted node. + next_.Release_Store(x); + } + + // No-barrier variants that can be safely used in a few locations. 
+ Node* NoBarrier_Next() { + return reinterpret_cast(next_.NoBarrier_Load()); + } + void NoBarrier_SetNext(Node* x) { + next_.NoBarrier_Store(x); + } + +private: + port::AtomicPointer next_; +}; + +class HashLinkListRep : public MemTableRep { + public: + HashLinkListRep(MemTableRep::KeyComparator& compare, Arena* arena, + const SliceTransform* transform, size_t bucket_size); + + virtual void Insert(const char* key) override; + + virtual bool Contains(const char* key) const override; + + virtual size_t ApproximateMemoryUsage() override; + + virtual ~HashLinkListRep(); + + virtual MemTableRep::Iterator* GetIterator() override; + + virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override; + + virtual MemTableRep::Iterator* GetPrefixIterator(const Slice& prefix) + override; + + virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override; + + private: + friend class DynamicIterator; + typedef SkipList FullList; + + size_t bucket_size_; + + // Maps slices (which are transformed user keys) to buckets of keys sharing + // the same transform. + port::AtomicPointer* buckets_; + + // The user-supplied transform whose domain is the user keys. 
+ const SliceTransform* transform_; + + MemTableRep::KeyComparator& compare_; + // immutable after construction + Arena* const arena_; + + bool BucketContains(Node* head, const Slice& key) const; + + Slice GetPrefix(const Slice& internal_key) const { + return transform_->Transform(ExtractUserKey(internal_key)); + } + + size_t GetHash(const Slice& slice) const { + return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_; + } + + Node* GetBucket(size_t i) const { + return static_cast(buckets_[i].Acquire_Load()); + } + + Node* GetBucket(const Slice& slice) const { + return GetBucket(GetHash(slice)); + } + + Node* NewNode(const Key& key) { + char* mem = arena_->AllocateAligned(sizeof(Node)); + return new (mem) Node(key); + } + + bool Equal(const Slice& a, const Key& b) const { + return (compare_(b, a) == 0); + } + + + bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } + + bool KeyIsAfterNode(const Slice& internal_key, const Node* n) const { + // nullptr n is considered infinite + return (n != nullptr) && (compare_(n->key, internal_key) < 0); + } + + bool KeyIsAfterNode(const Key& key, const Node* n) const { + // nullptr n is considered infinite + return (n != nullptr) && (compare_(n->key, key) < 0); + } + + + Node* FindGreaterOrEqualInBucket(Node* head, const Slice& key) const; + + class FullListIterator : public MemTableRep::Iterator { + public: + explicit FullListIterator(FullList* list) + : iter_(list), full_list_(list) {} + + virtual ~FullListIterator() { + } + + // Returns true iff the iterator is positioned at a valid node. + virtual bool Valid() const { + return iter_.Valid(); + } + + // Returns the key at the current position. + // REQUIRES: Valid() + virtual const char* key() const { + assert(Valid()); + return iter_.key(); + } + + // Advances to the next position. + // REQUIRES: Valid() + virtual void Next() { + assert(Valid()); + iter_.Next(); + } + + // Advances to the previous position. 
+ // REQUIRES: Valid() + virtual void Prev() { + assert(Valid()); + iter_.Prev(); + } + + // Advance to the first entry with a key >= target + virtual void Seek(const Slice& internal_key, const char* memtable_key) { + const char* encoded_key = + (memtable_key != nullptr) ? + memtable_key : EncodeKey(&tmp_, internal_key); + iter_.Seek(encoded_key); + } + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToFirst() { + iter_.SeekToFirst(); + } + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() { + iter_.SeekToLast(); + } + private: + FullList::Iterator iter_; + // To destruct with the iterator. + std::unique_ptr full_list_; + std::string tmp_; // For passing to EncodeKey + }; + + class Iterator : public MemTableRep::Iterator { + public: + explicit Iterator(const HashLinkListRep* const hash_link_list_rep, + Node* head) : + hash_link_list_rep_(hash_link_list_rep), head_(head), node_(nullptr) { + } + + virtual ~Iterator() { + } + + // Returns true iff the iterator is positioned at a valid node. + virtual bool Valid() const { + return node_ != nullptr; + } + + // Returns the key at the current position. + // REQUIRES: Valid() + virtual const char* key() const { + assert(Valid()); + return node_->key; + } + + // Advances to the next position. + // REQUIRES: Valid() + virtual void Next() { + assert(Valid()); + node_ = node_->Next(); + } + + // Advances to the previous position. + // REQUIRES: Valid() + virtual void Prev() { + // Prefix iterator does not support total order. 
+ // We simply set the iterator to invalid state + Reset(nullptr); + } + + // Advance to the first entry with a key >= target + virtual void Seek(const Slice& internal_key, const char* memtable_key) { + node_ = hash_link_list_rep_->FindGreaterOrEqualInBucket(head_, + internal_key); + } + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToFirst() { + // Prefix iterator does not support total order. + // We simply set the iterator to invalid state + Reset(nullptr); + } + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() { + // Prefix iterator does not support total order. + // We simply set the iterator to invalid state + Reset(nullptr); + } + + protected: + void Reset(Node* head) { + head_ = head; + node_ = nullptr; + } + private: + friend class HashLinkListRep; + const HashLinkListRep* const hash_link_list_rep_; + Node* head_; + Node* node_; + std::string tmp_; // For passing to EncodeKey + + virtual void SeekToHead() { + node_ = head_; + } + }; + + class DynamicIterator : public HashLinkListRep::Iterator { + public: + explicit DynamicIterator(HashLinkListRep& memtable_rep) + : HashLinkListRep::Iterator(&memtable_rep, nullptr), + memtable_rep_(memtable_rep) {} + + // Advance to the first entry with a key >= target + virtual void Seek(const Slice& k, const char* memtable_key) { + auto transformed = memtable_rep_.GetPrefix(k); + Reset(memtable_rep_.GetBucket(transformed)); + HashLinkListRep::Iterator::Seek(k, memtable_key); + } + + private: + // the underlying memtable + const HashLinkListRep& memtable_rep_; + }; + + class EmptyIterator : public MemTableRep::Iterator { + // This is used when there wasn't a bucket. It is cheaper than + // instantiating an empty bucket over which to iterate. 
+ public: + EmptyIterator() { } + virtual bool Valid() const { + return false; + } + virtual const char* key() const { + assert(false); + return nullptr; + } + virtual void Next() { } + virtual void Prev() { } + virtual void Seek(const Slice& user_key, const char* memtable_key) { } + virtual void SeekToFirst() { } + virtual void SeekToLast() { } + private: + }; +}; + +HashLinkListRep::HashLinkListRep(MemTableRep::KeyComparator& compare, + Arena* arena, const SliceTransform* transform, + size_t bucket_size) + : bucket_size_(bucket_size), + transform_(transform), + compare_(compare), + arena_(arena) { + char* mem = arena_->AllocateAligned( + sizeof(port::AtomicPointer) * bucket_size); + + buckets_ = new (mem) port::AtomicPointer[bucket_size]; + + for (size_t i = 0; i < bucket_size_; ++i) { + buckets_[i].NoBarrier_Store(nullptr); + } +} + +HashLinkListRep::~HashLinkListRep() { +} + +void HashLinkListRep::Insert(const char* key) { + assert(!Contains(key)); + Slice internal_key = GetLengthPrefixedSlice(key); + auto transformed = GetPrefix(internal_key); + auto& bucket = buckets_[GetHash(transformed)]; + Node* head = static_cast(bucket.Acquire_Load()); + + if (!head) { + Node* x = NewNode(key); + // NoBarrier_SetNext() suffices since we will add a barrier when + // we publish a pointer to "x" in prev[i]. + x->NoBarrier_SetNext(nullptr); + bucket.Release_Store(static_cast(x)); + return; + } + + Node* cur = head; + Node* prev = nullptr; + while (true) { + if (cur == nullptr) { + break; + } + Node* next = cur->Next(); + // Make sure the lists are sorted. + // If x points to head_ or next points nullptr, it is trivially satisfied. 
+    assert((cur == head) || (next == nullptr) ||
+           KeyIsAfterNode(next->key, cur));
+    if (KeyIsAfterNode(internal_key, cur)) {
+      // Keep searching in this list
+      prev = cur;
+      cur = next;
+    } else {
+      break;
+    }
+  }
+
+  // Our data structure does not allow duplicate insertion
+  assert(cur == nullptr || !Equal(key, cur->key));
+
+  Node* x = NewNode(key);
+
+  // NoBarrier_SetNext() suffices since we will add a barrier when
+  // we publish a pointer to "x" in prev[i].
+  x->NoBarrier_SetNext(cur);
+
+  if (prev) {
+    prev->SetNext(x);
+  } else {
+    bucket.Release_Store(static_cast<void*>(x));
+  }
+}
+
+bool HashLinkListRep::Contains(const char* key) const {
+  Slice internal_key = GetLengthPrefixedSlice(key);
+
+  auto transformed = GetPrefix(internal_key);
+  auto bucket = GetBucket(transformed);
+  if (bucket == nullptr) {
+    return false;
+  }
+  return BucketContains(bucket, internal_key);
+}
+
+size_t HashLinkListRep::ApproximateMemoryUsage() {
+  // Memory is always allocated from the arena.
+  return 0;
+}
+
+MemTableRep::Iterator* HashLinkListRep::GetIterator() {
+  auto list = new FullList(compare_, arena_);
+  for (size_t i = 0; i < bucket_size_; ++i) {
+    auto bucket = GetBucket(i);
+    if (bucket != nullptr) {
+      Iterator itr(this, bucket);
+      for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
+        list->Insert(itr.key());
+      }
+    }
+  }
+  return new FullListIterator(list);
+}
+
+MemTableRep::Iterator* HashLinkListRep::GetPrefixIterator(
+  const Slice& prefix) {
+  auto bucket = GetBucket(prefix);
+  if (bucket == nullptr) {
+    return new EmptyIterator();
+  }
+  return new Iterator(this, bucket);
+}
+
+MemTableRep::Iterator* HashLinkListRep::GetIterator(const Slice& slice) {
+  return GetPrefixIterator(transform_->Transform(slice));
+}
+
+MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator() {
+  return new DynamicIterator(*this);
+}
+
+bool HashLinkListRep::BucketContains(Node* head, const Slice& user_key) const {
+  Node* x = FindGreaterOrEqualInBucket(head, user_key);
+  return (x !=
nullptr && Equal(user_key, x->key)); +} + +Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head, + const Slice& key) const { + Node* x = head; + while (true) { + if (x == nullptr) { + return x; + } + Node* next = x->Next(); + // Make sure the lists are sorted. + // If x points to head_ or next points nullptr, it is trivially satisfied. + assert((x == head) || (next == nullptr) || KeyIsAfterNode(next->key, x)); + if (KeyIsAfterNode(key, x)) { + // Keep searching in this list + x = next; + } else { + break; + } + } + return x; +} + +} // anon namespace + +MemTableRep* HashLinkListRepFactory::CreateMemTableRep( + MemTableRep::KeyComparator& compare, Arena* arena) { + return new HashLinkListRep(compare, arena, transform_, bucket_count_); +} + +MemTableRepFactory* NewHashLinkListRepFactory( + const SliceTransform* transform, size_t bucket_count) { + return new HashLinkListRepFactory(transform, bucket_count); +} + +} // namespace rocksdb diff --git a/util/hash_linklist_rep.h b/util/hash_linklist_rep.h new file mode 100644 index 000000000..efa9d8f2e --- /dev/null +++ b/util/hash_linklist_rep.h @@ -0,0 +1,39 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include "rocksdb/slice_transform.h" +#include "rocksdb/memtablerep.h" + +namespace rocksdb { + +class HashLinkListRepFactory : public MemTableRepFactory { + public: + explicit HashLinkListRepFactory( + const SliceTransform* transform, + size_t bucket_count) + : transform_(transform), + bucket_count_(bucket_count) { } + + virtual ~HashLinkListRepFactory() { delete transform_; } + + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator& compare, + Arena* arena) override; + + virtual const char* Name() const override { + return "HashLinkListRepFactory"; + } + + const SliceTransform* GetTransform() { return transform_; } + + private: + const SliceTransform* transform_; + const size_t bucket_count_; +}; + +} diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc index e9fe1573a..aa070bc8b 100644 --- a/util/hash_skiplist_rep.cc +++ b/util/hash_skiplist_rep.cc @@ -7,12 +7,13 @@ #include "util/hash_skiplist_rep.h" #include "rocksdb/memtablerep.h" -#include "rocksdb/arena.h" +#include "util/arena.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "port/port.h" #include "port/atomic_pointer.h" #include "util/murmurhash.h" +#include "db/memtable.h" #include "db/skiplist.h" namespace rocksdb { @@ -21,7 +22,8 @@ namespace { class HashSkipListRep : public MemTableRep { public: HashSkipListRep(MemTableRep::KeyComparator& compare, Arena* arena, - const SliceTransform* transform, size_t bucket_size); + const SliceTransform* transform, size_t bucket_size, + int32_t skiplist_height, int32_t skiplist_branching_factor); virtual void Insert(const char* key) override; @@ -46,6 +48,9 @@ class HashSkipListRep : public MemTableRep { size_t bucket_size_; + const int32_t skiplist_height_; + const int32_t skiplist_branching_factor_; + // Maps slices (which are transformed user keys) to buckets of keys sharing // the same transform. 
port::AtomicPointer* buckets_; @@ -112,9 +117,12 @@ class HashSkipListRep : public MemTableRep { } // Advance to the first entry with a key >= target - virtual void Seek(const char* target) { + virtual void Seek(const Slice& internal_key, const char* memtable_key) { if (list_ != nullptr) { - iter_.Seek(target); + const char* encoded_key = + (memtable_key != nullptr) ? + memtable_key : EncodeKey(&tmp_, internal_key); + iter_.Seek(encoded_key); } } @@ -151,6 +159,7 @@ class HashSkipListRep : public MemTableRep { // here we track if we own list_. If we own it, we are also // responsible for it's cleaning. This is a poor man's shared_ptr bool own_list_; + std::string tmp_; // For passing to EncodeKey }; class DynamicIterator : public HashSkipListRep::Iterator { @@ -160,11 +169,10 @@ class HashSkipListRep : public MemTableRep { memtable_rep_(memtable_rep) {} // Advance to the first entry with a key >= target - virtual void Seek(const char* target) { - auto transformed = memtable_rep_.transform_->Transform( - memtable_rep_.UserKey(target)); + virtual void Seek(const Slice& k, const char* memtable_key) { + auto transformed = memtable_rep_.transform_->Transform(ExtractUserKey(k)); Reset(memtable_rep_.GetBucket(transformed)); - HashSkipListRep::Iterator::Seek(target); + HashSkipListRep::Iterator::Seek(k, memtable_key); } // Position at the first entry in collection. 
@@ -201,7 +209,8 @@ class HashSkipListRep : public MemTableRep {
     }
     virtual void Next() { }
     virtual void Prev() { }
-    virtual void Seek(const char* target) { }
+    virtual void Seek(const Slice& internal_key,
+                      const char* memtable_key) { }
     virtual void SeekToFirst() { }
     virtual void SeekToLast() { }
   private:
@@ -210,8 +219,11 @@
 
 HashSkipListRep::HashSkipListRep(MemTableRep::KeyComparator& compare,
                                  Arena* arena, const SliceTransform* transform,
-                                 size_t bucket_size)
+                                 size_t bucket_size, int32_t skiplist_height,
+                                 int32_t skiplist_branching_factor)
     : bucket_size_(bucket_size),
+      skiplist_height_(skiplist_height),
+      skiplist_branching_factor_(skiplist_branching_factor),
       transform_(transform),
       compare_(compare),
       arena_(arena) {
@@ -232,7 +244,8 @@ HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket(
   auto bucket = GetBucket(hash);
   if (bucket == nullptr) {
     auto addr = arena_->AllocateAligned(sizeof(Bucket));
-    bucket = new (addr) Bucket(compare_, arena_);
+    bucket = new (addr) Bucket(compare_, arena_, skiplist_height_,
+                               skiplist_branching_factor_);
     buckets_[hash].Release_Store(static_cast<void*>(bucket));
   }
   return bucket;
@@ -292,12 +305,15 @@ MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator() {
 
 MemTableRep* HashSkipListRepFactory::CreateMemTableRep(
     MemTableRep::KeyComparator& compare, Arena* arena) {
-  return new HashSkipListRep(compare, arena, transform_, bucket_count_);
+  return new HashSkipListRep(compare, arena, transform_, bucket_count_,
+                             skiplist_height_, skiplist_branching_factor_);
 }
 
 MemTableRepFactory* NewHashSkipListRepFactory(
-    const SliceTransform* transform, size_t bucket_count) {
-  return new HashSkipListRepFactory(transform, bucket_count);
+    const SliceTransform* transform, size_t bucket_count,
+    int32_t skiplist_height, int32_t skiplist_branching_factor) {
+  return new HashSkipListRepFactory(transform, bucket_count,
+                                    skiplist_height, skiplist_branching_factor);
 }
 
 } // namespace rocksdb
diff --git a/util/hash_skiplist_rep.h b/util/hash_skiplist_rep.h
index 7b8414c88..1ea844eda 100644
--- a/util/hash_skiplist_rep.h
+++ b/util/hash_skiplist_rep.h
@@ -14,10 +14,15 @@ namespace rocksdb {
 class HashSkipListRepFactory : public MemTableRepFactory {
  public:
-  explicit HashSkipListRepFactory(const SliceTransform* transform,
-                                  size_t bucket_count = 1000000)
-    : transform_(transform),
-      bucket_count_(bucket_count) { }
+  explicit HashSkipListRepFactory(
+    const SliceTransform* transform,
+    size_t bucket_count,
+    int32_t skiplist_height,
+    int32_t skiplist_branching_factor)
+      : transform_(transform),
+        bucket_count_(bucket_count),
+        skiplist_height_(skiplist_height),
+        skiplist_branching_factor_(skiplist_branching_factor) { }
 
   virtual ~HashSkipListRepFactory() { delete transform_; }
 
@@ -33,6 +38,8 @@ class HashSkipListRepFactory : public MemTableRepFactory {
  private:
   const SliceTransform* transform_;
   const size_t bucket_count_;
+  const int32_t skiplist_height_;
+  const int32_t skiplist_branching_factor_;
 };
 
 }
diff --git a/util/options.cc b/util/options.cc
index 212dc4653..2a2807155 100644
--- a/util/options.cc
+++ b/util/options.cc
@@ -16,10 +16,11 @@
 #include "rocksdb/comparator.h"
 #include "rocksdb/env.h"
 #include "rocksdb/filter_policy.h"
-#include "rocksdb/merge_operator.h"
 #include "rocksdb/memtablerep.h"
+#include "rocksdb/merge_operator.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
 #include "rocksdb/table_properties.h"
 #include "table/block_based_table_factory.h"
@@ -73,6 +74,9 @@
           std::shared_ptr<TableFactory>(new BlockBasedTableFactory())),
       inplace_update_support(false),
       inplace_update_num_locks(10000),
+      inplace_callback(nullptr),
+      memtable_prefix_bloom_bits(0),
+      memtable_prefix_bloom_probes(6),
       max_successive_merges(0) {
   assert(memtable_factory.get() != nullptr);
 }
@@ -131,6 +135,9 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
table_properties_collectors(options.table_properties_collectors), inplace_update_support(options.inplace_update_support), inplace_update_num_locks(options.inplace_update_num_locks), + inplace_callback(options.inplace_callback), + memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits), + memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes), max_successive_merges(options.max_successive_merges) { assert(memtable_factory.get() != nullptr); } @@ -396,6 +403,11 @@ Options::Dump(Logger* log) const inplace_update_support); Log(log, " Options.inplace_update_num_locks: %zd", inplace_update_num_locks); + // TODO: easier config for bloom (maybe based on avg key/value size) + Log(log, " Options.memtable_prefix_bloom_bits: %d", + memtable_prefix_bloom_bits); + Log(log, " Options.memtable_prefix_bloom_probes: %d", + memtable_prefix_bloom_probes); Log(log, " Options.max_successive_merges: %zd", max_successive_merges); } // Options::Dump diff --git a/util/perf_context.cc b/util/perf_context.cc index 1e8ddfb5e..6833f6836 100644 --- a/util/perf_context.cc +++ b/util/perf_context.cc @@ -22,7 +22,20 @@ void PerfContext::Reset() { block_decompress_time = 0; internal_key_skipped_count = 0; internal_delete_skipped_count = 0; - wal_write_time = 0; + write_wal_time = 0; + + get_snapshot_time = 0; + get_from_memtable_time = 0; + get_from_memtable_count = 0; + get_post_process_time = 0; + get_from_output_files_time = 0; + seek_child_seek_time = 0; + seek_child_seek_count = 0; + seek_min_heap_time = 0; + seek_internal_seek_time = 0; + find_next_user_entry_time = 0; + write_pre_and_post_process_time = 0; + write_memtable_time = 0; } __thread PerfContext perf_context; diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc index a5b072ad1..6f1fb1a15 100644 --- a/util/skiplistrep.cc +++ b/util/skiplistrep.cc @@ -70,8 +70,13 @@ public: } // Advance to the first entry with a key >= target - virtual void Seek(const char* target) override { - iter_.Seek(target); + virtual void 
Seek(const Slice& user_key, const char* memtable_key)
+      override {
+    if (memtable_key != nullptr) {
+      iter_.Seek(memtable_key);
+    } else {
+      iter_.Seek(EncodeKey(&tmp_, user_key));
+    }
   }
 
   // Position at the first entry in list.
@@ -85,6 +90,8 @@ public:
   virtual void SeekToLast() override {
     iter_.SeekToLast();
   }
+ protected:
+  std::string tmp_;       // For passing to EncodeKey
 };
 
 // Unhide default implementations of GetIterator
diff --git a/util/testutil.h b/util/testutil.h
index c73210fec..4fc8c0f5b 100644
--- a/util/testutil.h
+++ b/util/testutil.h
@@ -8,6 +8,8 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #pragma once
+#include <string>
+#include "db/dbformat.h"
 #include "rocksdb/env.h"
 #include "rocksdb/slice.h"
 #include "util/random.h"
@@ -51,5 +53,28 @@ class ErrorEnv : public EnvWrapper {
   }
 };
 
+// An internal comparator that just forward comparing results from the
+// user comparator in it. Can be used to test entities that have no dependency
+// on internal key structure but consumes InternalKeyComparator, like
+// BlockBasedTable.
+class PlainInternalKeyComparator : public InternalKeyComparator { + public: + explicit PlainInternalKeyComparator(const Comparator* c) + : InternalKeyComparator(c) {} + + virtual ~PlainInternalKeyComparator() {} + + virtual int Compare(const Slice& a, const Slice& b) const override { + return user_comparator()->Compare(a, b); + } + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const override { + user_comparator()->FindShortestSeparator(start, limit); + } + virtual void FindShortSuccessor(std::string* key) const override { + user_comparator()->FindShortSuccessor(key); + } +}; + } // namespace test } // namespace rocksdb diff --git a/util/vectorrep.cc b/util/vectorrep.cc index 87fae4bc7..4b8b3d552 100644 --- a/util/vectorrep.cc +++ b/util/vectorrep.cc @@ -11,7 +11,8 @@ #include #include -#include "rocksdb/arena.h" +#include "util/arena.h" +#include "db/memtable.h" #include "port/port.h" #include "util/mutexlock.h" #include "util/stl_wrappers.h" @@ -45,6 +46,7 @@ class VectorRep : public MemTableRep { std::shared_ptr> bucket_; typename std::vector::const_iterator mutable cit_; const KeyComparator& compare_; + std::string tmp_; // For passing to EncodeKey bool mutable sorted_; void DoSort() const; public: @@ -73,7 +75,7 @@ class VectorRep : public MemTableRep { virtual void Prev() override; // Advance to the first entry with a key >= target - virtual void Seek(const char* target) override; + virtual void Seek(const Slice& user_key, const char* memtable_key) override; // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. 
@@ -200,12 +202,15 @@ void VectorRep::Iterator::Prev() { } // Advance to the first entry with a key >= target -void VectorRep::Iterator::Seek(const char* target) { +void VectorRep::Iterator::Seek(const Slice& user_key, + const char* memtable_key) { DoSort(); // Do binary search to find first value not less than the target + const char* encoded_key = + (memtable_key != nullptr) ? memtable_key : EncodeKey(&tmp_, user_key); cit_ = std::equal_range(bucket_->begin(), bucket_->end(), - target, + encoded_key, [this] (const char* a, const char* b) { return compare_(a, b) < 0; }).first;