You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
rocksdb/db/version_set.h

633 lines
23 KiB

// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// The representation of a DBImpl consists of a set of Versions. The
// newest version is called "current". Older versions may be kept
// around to provide a consistent view to live iterators.
//
// Each Version keeps track of a set of Table files per level. The
// entire set of versions is maintained in a VersionSet.
//
// Version,VersionSet are thread-compatible, but require external
// synchronization on all accesses.
#ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_
#define STORAGE_LEVELDB_DB_VERSION_SET_H_
#include <map>
#include <memory>
#include <set>
#include <vector>
#include <deque>
#include "db/dbformat.h"
#include "db/version_edit.h"
#include "port/port.h"
#include "db/table_cache.h"
namespace rocksdb {
namespace log { class Writer; }
class Compaction;
class Iterator;
class MemTable;
class TableBuilder;
class TableCache;
class Version;
class VersionSet;
class WritableFile;
// Return the smallest index i such that files[i]->largest >= key.
// Return files.size() if there is no such file.
// REQUIRES: "files" contains a sorted list of non-overlapping files.
extern int FindFile(const InternalKeyComparator& icmp,
const std::vector<FileMetaData*>& files,
const Slice& key);
// Returns true iff some file in "files" overlaps the user key range
// [*smallest,*largest].
// smallest==nullptr represents a key smaller than all keys in the DB.
// largest==nullptr represents a key largest than all keys in the DB.
// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges
// in sorted order.
extern bool SomeFileOverlapsRange(
const InternalKeyComparator& icmp,
bool disjoint_sorted_files,
const std::vector<FileMetaData*>& files,
const Slice* smallest_user_key,
const Slice* largest_user_key);
class Version {
public:
// Append to *iters a sequence of iterators that will
// yield the contents of this Version when merged together.
// REQUIRES: This version has been saved (see VersionSet::SaveTo)
void AddIterators(const ReadOptions&, const EnvOptions& soptions,
std::vector<Iterator*>* iters);
// Lookup the value for key. If found, store it in *val and
// return OK. Else return a non-OK status. Fills *stats.
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences. Summary: Here are the major changes to the Merge Interface. It has been expanded to handle cases where the MergeOperator is not associative. It does so by stacking up merge operations while scanning through the key history (i.e.: during Get() or Compaction), until a valid Put/Delete/end-of-history is encountered; it then applies all of the merge operations in the correct sequence starting with the base/sentinel value. I have also introduced an "AssociativeMerge" function which allows the user to take advantage of associative merge operations (such as in the case of counters). The implementation will always attempt to merge the operations/operands themselves together when they are encountered, and will resort to the "stacking" method if and only if the "associative-merge" fails. This implementation is conjectured to allow MergeOperator to handle the general case, while still providing the user with the ability to take advantage of certain efficiencies in their own merge-operator / data-structure. NOTE: This is a preliminary diff. This must still go through a lot of review, revision, and testing. Feedback welcome! Test Plan: -This is a preliminary diff. I have only just begun testing/debugging it. -I will be testing this with the existing MergeOperator use-cases and unit-tests (counters, string-append, and redis-lists) -I will be "desk-checking" and walking through the code with the help gdb. -I will find a way of stress-testing the new interface / implementation using db_bench, db_test, merge_test, and/or db_stress. -I will ensure that my tests cover all cases: Get-Memtable, Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0, Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found, end-of-history, end-of-file, etc. -A lot of feedback from the reviewers. Reviewers: haobo, dhruba, zshao, emayanke Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D11499
11 years ago
// Uses *operands to store merge_operator operations to apply later
// REQUIRES: lock is not held
struct GetStats {
FileMetaData* seek_file;
int seek_file_level;
};
void Get(const ReadOptions&, const LookupKey& key, std::string* val,
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences. Summary: Here are the major changes to the Merge Interface. It has been expanded to handle cases where the MergeOperator is not associative. It does so by stacking up merge operations while scanning through the key history (i.e.: during Get() or Compaction), until a valid Put/Delete/end-of-history is encountered; it then applies all of the merge operations in the correct sequence starting with the base/sentinel value. I have also introduced an "AssociativeMerge" function which allows the user to take advantage of associative merge operations (such as in the case of counters). The implementation will always attempt to merge the operations/operands themselves together when they are encountered, and will resort to the "stacking" method if and only if the "associative-merge" fails. This implementation is conjectured to allow MergeOperator to handle the general case, while still providing the user with the ability to take advantage of certain efficiencies in their own merge-operator / data-structure. NOTE: This is a preliminary diff. This must still go through a lot of review, revision, and testing. Feedback welcome! Test Plan: -This is a preliminary diff. I have only just begun testing/debugging it. -I will be testing this with the existing MergeOperator use-cases and unit-tests (counters, string-append, and redis-lists) -I will be "desk-checking" and walking through the code with the help gdb. -I will find a way of stress-testing the new interface / implementation using db_bench, db_test, merge_test, and/or db_stress. -I will ensure that my tests cover all cases: Get-Memtable, Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0, Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found, end-of-history, end-of-file, etc. -A lot of feedback from the reviewers. Reviewers: haobo, dhruba, zshao, emayanke Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D11499
11 years ago
Status* status, std::deque<std::string>* operands, GetStats* stats,
const Options& db_option,
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences. Summary: Here are the major changes to the Merge Interface. It has been expanded to handle cases where the MergeOperator is not associative. It does so by stacking up merge operations while scanning through the key history (i.e.: during Get() or Compaction), until a valid Put/Delete/end-of-history is encountered; it then applies all of the merge operations in the correct sequence starting with the base/sentinel value. I have also introduced an "AssociativeMerge" function which allows the user to take advantage of associative merge operations (such as in the case of counters). The implementation will always attempt to merge the operations/operands themselves together when they are encountered, and will resort to the "stacking" method if and only if the "associative-merge" fails. This implementation is conjectured to allow MergeOperator to handle the general case, while still providing the user with the ability to take advantage of certain efficiencies in their own merge-operator / data-structure. NOTE: This is a preliminary diff. This must still go through a lot of review, revision, and testing. Feedback welcome! Test Plan: -This is a preliminary diff. I have only just begun testing/debugging it. -I will be testing this with the existing MergeOperator use-cases and unit-tests (counters, string-append, and redis-lists) -I will be "desk-checking" and walking through the code with the help gdb. -I will find a way of stress-testing the new interface / implementation using db_bench, db_test, merge_test, and/or db_stress. -I will ensure that my tests cover all cases: Get-Memtable, Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0, Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found, end-of-history, end-of-file, etc. -A lot of feedback from the reviewers. Reviewers: haobo, dhruba, zshao, emayanke Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D11499
11 years ago
bool* value_found = nullptr);
// Adds "stats" into the current state. Returns true if a new
// compaction may need to be triggered, false otherwise.
// REQUIRES: lock is held
bool UpdateStats(const GetStats& stats);
// Reference count management (so Versions do not disappear out from
// under live iterators)
void Ref();
void Unref();
void GetOverlappingInputs(
int level,
const InternalKey* begin, // nullptr means before all keys
const InternalKey* end, // nullptr means after all keys
std::vector<FileMetaData*>* inputs,
int hint_index = -1, // index of overlap file
int* file_index = nullptr); // return index of overlap file
void GetOverlappingInputsBinarySearch(
int level,
const Slice& begin, // nullptr means before all keys
const Slice& end, // nullptr means after all keys
std::vector<FileMetaData*>* inputs,
int hint_index, // index of overlap file
int* file_index); // return index of overlap file
void ExtendOverlappingInputs(
int level,
const Slice& begin, // nullptr means before all keys
const Slice& end, // nullptr means after all keys
std::vector<FileMetaData*>* inputs,
unsigned int index); // start extending from this index
// Returns true iff some file in the specified level overlaps
// some part of [*smallest_user_key,*largest_user_key].
// smallest_user_key==NULL represents a key smaller than all keys in the DB.
// largest_user_key==NULL represents a key largest than all keys in the DB.
bool OverlapInLevel(int level,
const Slice* smallest_user_key,
const Slice* largest_user_key);
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences. Summary: Here are the major changes to the Merge Interface. It has been expanded to handle cases where the MergeOperator is not associative. It does so by stacking up merge operations while scanning through the key history (i.e.: during Get() or Compaction), until a valid Put/Delete/end-of-history is encountered; it then applies all of the merge operations in the correct sequence starting with the base/sentinel value. I have also introduced an "AssociativeMerge" function which allows the user to take advantage of associative merge operations (such as in the case of counters). The implementation will always attempt to merge the operations/operands themselves together when they are encountered, and will resort to the "stacking" method if and only if the "associative-merge" fails. This implementation is conjectured to allow MergeOperator to handle the general case, while still providing the user with the ability to take advantage of certain efficiencies in their own merge-operator / data-structure. NOTE: This is a preliminary diff. This must still go through a lot of review, revision, and testing. Feedback welcome! Test Plan: -This is a preliminary diff. I have only just begun testing/debugging it. -I will be testing this with the existing MergeOperator use-cases and unit-tests (counters, string-append, and redis-lists) -I will be "desk-checking" and walking through the code with the help gdb. -I will find a way of stress-testing the new interface / implementation using db_bench, db_test, merge_test, and/or db_stress. -I will ensure that my tests cover all cases: Get-Memtable, Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0, Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found, end-of-history, end-of-file, etc. -A lot of feedback from the reviewers. Reviewers: haobo, dhruba, zshao, emayanke Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D11499
11 years ago
// Returns true iff the first or last file in inputs contains
// an overlapping user key to the file "just outside" of it (i.e.
// just after the last file, or just before the first file)
// REQUIRES: "*inputs" is a sorted list of non-overlapping files
bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
int level);
// Return the level at which we should place a new memtable compaction
// result that covers the range [smallest_user_key,largest_user_key].
int PickLevelForMemTableOutput(const Slice& smallest_user_key,
const Slice& largest_user_key);
int NumFiles(int level) const { return files_[level].size(); }
// Return a human readable string that describes this version's contents.
std::string DebugString(bool hex = false) const;
// Returns the version nuber of this version
uint64_t GetVersionNumber() {
return version_number_;
}
private:
friend class Compaction;
friend class VersionSet;
friend class DBImpl;
class LevelFileNumIterator;
Iterator* NewConcatenatingIterator(const ReadOptions&,
const EnvOptions& soptions,
int level) const;
bool PrefixMayMatch(const ReadOptions& options, const EnvOptions& soptions,
const Slice& internal_prefix, Iterator* level_iter) const;
VersionSet* vset_; // VersionSet to which this Version belongs
Version* next_; // Next version in linked list
Version* prev_; // Previous version in linked list
int refs_; // Number of live refs to this version
// List of files per level, files in each level are arranged
// in increasing order of keys
std::vector<FileMetaData*>* files_;
// A list for the same set of files that are stored in files_,
// but files in each level are now sorted based on file
// size. The file with the largest size is at the front.
// This vector stores the index of the file from files_.
std::vector< std::vector<int> > files_by_size_;
// An index into files_by_size_ that specifies the first
// file that is not yet compacted
std::vector<int> next_file_to_compact_by_size_;
// Only the first few entries of files_by_size_ are sorted.
// There is no need to sort all the files because it is likely
// that on a running system, we need to look at only the first
// few largest files because a new version is created every few
// seconds/minutes (because of concurrent compactions).
static const int number_of_files_to_sort_ = 50;
// Next file to compact based on seek stats.
FileMetaData* file_to_compact_;
int file_to_compact_level_;
// Level that should be compacted next and its compaction score.
// Score < 1 means compaction is not strictly needed. These fields
// are initialized by Finalize().
// The most critical level to be compacted is listed first
// These are used to pick the best compaction level
std::vector<double> compaction_score_;
std::vector<int> compaction_level_;
double max_compaction_score_; // max score in l1 to ln-1
int max_compaction_score_level_; // level on which max score occurs
// The offset in the manifest file where this version is stored.
uint64_t offset_manifest_file_;
// A version number that uniquely represents this version. This is
// used for debugging and logging purposes only.
uint64_t version_number_;
explicit Version(VersionSet* vset, uint64_t version_number = 0);
~Version();
// re-initializes the index that is used to offset into files_by_size_
// to find the next compaction candidate file.
void ResetNextCompactionIndex(int level) {
next_file_to_compact_by_size_[level] = 0;
}
// No copying allowed
Version(const Version&);
void operator=(const Version&);
};
class VersionSet {
public:
VersionSet(const std::string& dbname,
const Options* options,
const EnvOptions& storage_options,
TableCache* table_cache,
const InternalKeyComparator*);
~VersionSet();
// Apply *edit to the current version to form a new descriptor that
// is both saved to persistent state and installed as the new
// current version. Will release *mu while actually writing to the file.
// REQUIRES: *mu is held on entry.
// REQUIRES: no other thread concurrently calls LogAndApply()
Status LogAndApply(VersionEdit* edit, port::Mutex* mu,
bool new_descriptor_log = false);
// Recover the last saved descriptor from persistent storage.
Status Recover();
// Try to reduce the number of levels. This call is valid when
// only one level from the new max level to the old
// max level containing files.
// For example, a db currently has 7 levels [0-6], and a call to
// to reduce to 5 [0-4] can only be executed when only one level
// among [4-6] contains files.
Status ReduceNumberOfLevels(int new_levels, port::Mutex* mu);
// Return the current version.
Version* current() const { return current_; }
// Return the current manifest file number
uint64_t ManifestFileNumber() const { return manifest_file_number_; }
// Allocate and return a new file number
uint64_t NewFileNumber() { return next_file_number_++; }
// Arrange to reuse "file_number" unless a newer file number has
// already been allocated.
// REQUIRES: "file_number" was returned by a call to NewFileNumber().
void ReuseFileNumber(uint64_t file_number) {
if (next_file_number_ == file_number + 1) {
next_file_number_ = file_number;
}
}
// Return the number of Table files at the specified level.
int NumLevelFiles(int level) const;
// Return the combined file size of all files at the specified level.
int64_t NumLevelBytes(int level) const;
// Return the last sequence number.
uint64_t LastSequence() const { return last_sequence_; }
// Set the last sequence number to s.
void SetLastSequence(uint64_t s) {
assert(s >= last_sequence_);
last_sequence_ = s;
}
// Mark the specified file number as used.
void MarkFileNumberUsed(uint64_t number);
// Return the current log file number.
uint64_t LogNumber() const { return log_number_; }
// Return the log file number for the log file that is currently
// being compacted, or zero if there is no such log file.
uint64_t PrevLogNumber() const { return prev_log_number_; }
int NumberLevels() const { return num_levels_; }
// Pick level and inputs for a new compaction.
// Returns nullptr if there is no compaction to be done.
// Otherwise returns a pointer to a heap-allocated object that
// describes the compaction. Caller should delete the result.
Compaction* PickCompaction();
// Return a compaction object for compacting the range [begin,end] in
// the specified level. Returns nullptr if there is nothing in that
// level that overlaps the specified range. Caller should delete
// the result.
Compaction* CompactRange(
int level,
const InternalKey* begin,
const InternalKey* end);
// Return the maximum overlapping data (in bytes) at next level for any
// file at a level >= 1.
int64_t MaxNextLevelOverlappingBytes();
// Create an iterator that reads over the compaction inputs for "*c".
// The caller should delete the iterator when no longer needed.
Iterator* MakeInputIterator(Compaction* c);
// Returns true iff some level needs a compaction because it has
// exceeded its target size.
bool NeedsSizeCompaction() const {
for (int i = 0; i < NumberLevels()-1; i++) {
if (current_->compaction_score_[i] >= 1) {
return true;
}
}
return false;
}
// Returns true iff some level needs a compaction.
bool NeedsCompaction() const {
return ((current_->file_to_compact_ != nullptr) ||
NeedsSizeCompaction());
}
// Returns the maxmimum compaction score for levels 1 to max
double MaxCompactionScore() const {
return current_->max_compaction_score_;
}
// See field declaration
int MaxCompactionScoreLevel() const {
return current_->max_compaction_score_level_;
}
// Add all files listed in any live version to *live.
void AddLiveFiles(std::vector<uint64_t>* live_list);
// Add all files listed in the current version to *live.
void AddLiveFilesCurrentVersion(std::set<uint64_t>* live);
// Return the approximate offset in the database of the data for
// "key" as of version "v".
uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
// Return a human-readable short (single-line) summary of the number
// of files per level. Uses *scratch as backing store.
struct LevelSummaryStorage {
char buffer[100];
};
struct FileSummaryStorage {
char buffer[1000];
};
const char* LevelSummary(LevelSummaryStorage* scratch) const;
Utility to dump manifest contents. Summary: ./manifest_dump --file=/tmp/dbbench/MANIFEST-000002 Output looks like manifest_file_number 30 next_file_number 31 last_sequence 388082 log_number 28 prev_log_number 0 --- level 0 --- --- level 1 --- --- level 2 --- 5:3244155['0000000000000000' @ 1 : 1 .. '0000000000028220' @ 28221 : 1] 7:3244177['0000000000028221' @ 28222 : 1 .. '0000000000056441' @ 56442 : 1] 9:3244156['0000000000056442' @ 56443 : 1 .. '0000000000084662' @ 84663 : 1] 11:3244178['0000000000084663' @ 84664 : 1 .. '0000000000112883' @ 112884 : 1] 13:3244158['0000000000112884' @ 112885 : 1 .. '0000000000141104' @ 141105 : 1] 15:3244176['0000000000141105' @ 141106 : 1 .. '0000000000169325' @ 169326 : 1] 17:3244156['0000000000169326' @ 169327 : 1 .. '0000000000197546' @ 197547 : 1] 19:3244178['0000000000197547' @ 197548 : 1 .. '0000000000225767' @ 225768 : 1] 21:3244155['0000000000225768' @ 225769 : 1 .. '0000000000253988' @ 253989 : 1] 23:3244179['0000000000253989' @ 253990 : 1 .. '0000000000282209' @ 282210 : 1] 25:3244157['0000000000282210' @ 282211 : 1 .. '0000000000310430' @ 310431 : 1] 27:3244176['0000000000310431' @ 310432 : 1 .. '0000000000338651' @ 338652 : 1] 29:3244156['0000000000338652' @ 338653 : 1 .. '0000000000366872' @ 366873 : 1] --- level 3 --- --- level 4 --- --- level 5 --- --- level 6 --- Test Plan: run on test directory created by dbbench Reviewers: heyongqiang Reviewed By: heyongqiang CC: hustliubo Differential Revision: https://reviews.facebook.net/D4743
12 years ago
// printf contents (for debugging)
Status DumpManifest(Options& options, std::string& manifestFileName,
bool verbose, bool hex = false);
Utility to dump manifest contents. Summary: ./manifest_dump --file=/tmp/dbbench/MANIFEST-000002 Output looks like manifest_file_number 30 next_file_number 31 last_sequence 388082 log_number 28 prev_log_number 0 --- level 0 --- --- level 1 --- --- level 2 --- 5:3244155['0000000000000000' @ 1 : 1 .. '0000000000028220' @ 28221 : 1] 7:3244177['0000000000028221' @ 28222 : 1 .. '0000000000056441' @ 56442 : 1] 9:3244156['0000000000056442' @ 56443 : 1 .. '0000000000084662' @ 84663 : 1] 11:3244178['0000000000084663' @ 84664 : 1 .. '0000000000112883' @ 112884 : 1] 13:3244158['0000000000112884' @ 112885 : 1 .. '0000000000141104' @ 141105 : 1] 15:3244176['0000000000141105' @ 141106 : 1 .. '0000000000169325' @ 169326 : 1] 17:3244156['0000000000169326' @ 169327 : 1 .. '0000000000197546' @ 197547 : 1] 19:3244178['0000000000197547' @ 197548 : 1 .. '0000000000225767' @ 225768 : 1] 21:3244155['0000000000225768' @ 225769 : 1 .. '0000000000253988' @ 253989 : 1] 23:3244179['0000000000253989' @ 253990 : 1 .. '0000000000282209' @ 282210 : 1] 25:3244157['0000000000282210' @ 282211 : 1 .. '0000000000310430' @ 310431 : 1] 27:3244176['0000000000310431' @ 310432 : 1 .. '0000000000338651' @ 338652 : 1] 29:3244156['0000000000338652' @ 338653 : 1 .. '0000000000366872' @ 366873 : 1] --- level 3 --- --- level 4 --- --- level 5 --- --- level 6 --- Test Plan: run on test directory created by dbbench Reviewers: heyongqiang Reviewed By: heyongqiang CC: hustliubo Differential Revision: https://reviews.facebook.net/D4743
12 years ago
// Return a human-readable short (single-line) summary of the data size
// of files per level. Uses *scratch as backing store.
const char* LevelDataSizeSummary(LevelSummaryStorage* scratch) const;
// Return a human-readable short (single-line) summary of files
// in a specified level. Uses *scratch as backing store.
const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;
// Return the size of the current manifest file
const uint64_t ManifestFileSize() { return current_->offset_manifest_file_; }
// For the specfied level, pick a compaction.
// Returns nullptr if there is no compaction to be done.
// If level is 0 and there is already a compaction on that level, this
// function will return nullptr.
Compaction* PickCompactionBySize(int level, double score);
// Pick files to compact in Universal mode
Compaction* PickCompactionUniversal(int level, double score);
// Pick Universal compaction to limit read amplification
Compaction* PickCompactionUniversalReadAmp(int level, double score,
unsigned int ratio, unsigned int num_files);
// Pick Universal compaction to limit space amplification.
Compaction* PickCompactionUniversalSizeAmp(int level, double score);
// Free up the files that were participated in a compaction
void ReleaseCompactionFiles(Compaction* c, Status status);
// verify that the files that we started with for a compaction
// still exist in the current version and in the same original level.
// This ensures that a concurrent compaction did not erroneously
// pick the same files to compact.
bool VerifyCompactionFileConsistency(Compaction* c);
// used to sort files by size
typedef struct fsize {
int index;
FileMetaData* file;
} Fsize;
// Sort all files for this version based on their file size and
// record results in files_by_size_. The largest files are listed first.
void UpdateFilesBySize(Version *v);
// Get the max file size in a given level.
uint64_t MaxFileSizeForLevel(int level);
double MaxBytesForLevel(int level);
Status GetMetadataForFile(
uint64_t number, int *filelevel, FileMetaData *metadata);
void GetLiveFilesMetaData(
std::vector<LiveFileMetaData> *metadata);
private:
class Builder;
struct ManifestWriter;
friend class Compaction;
friend class Version;
void Init(int num_levels);
Prevent segfault because SizeUnderCompaction was called without any locks. Summary: SizeBeingCompacted was called without any lock protection. This causes crashes, especially when running db_bench with value_size=128K. The fix is to compute SizeUnderCompaction while holding the mutex and passing in these values into the call to Finalize. (gdb) where #4 leveldb::VersionSet::SizeBeingCompacted (this=this@entry=0x7f0b490931c0, level=level@entry=4) at db/version_set.cc:1827 #5 0x000000000043a3c8 in leveldb::VersionSet::Finalize (this=this@entry=0x7f0b490931c0, v=v@entry=0x7f0b3b86b480) at db/version_set.cc:1420 #6 0x00000000004418d1 in leveldb::VersionSet::LogAndApply (this=0x7f0b490931c0, edit=0x7f0b3dc8c200, mu=0x7f0b490835b0, new_descriptor_log=<optimized out>) at db/version_set.cc:1016 #7 0x00000000004222b2 in leveldb::DBImpl::InstallCompactionResults (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1473 #8 0x0000000000426027 in leveldb::DBImpl::DoCompactionWork (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1757 #9 0x0000000000426690 in leveldb::DBImpl::BackgroundCompaction (this=this@entry=0x7f0b49083400, madeProgress=madeProgress@entry=0x7f0b41bf2d1e, deletion_state=...) at db/db_impl.cc:1268 #10 0x0000000000428f42 in leveldb::DBImpl::BackgroundCall (this=0x7f0b49083400) at db/db_impl.cc:1170 #11 0x000000000045348e in BGThread (this=0x7f0b49023100) at util/env_posix.cc:941 #12 leveldb::(anonymous namespace)::PosixEnv::BGThreadWrapper (arg=0x7f0b49023100) at util/env_posix.cc:874 #13 0x00007f0b4a7cf10d in start_thread (arg=0x7f0b41bf3700) at pthread_create.c:301 #14 0x00007f0b49b4b11d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115 Test Plan: make check I am running db_bench with a value size of 128K to see if the segfault is fixed. Reviewers: MarkCallaghan, sheki, emayanke Reviewed By: sheki CC: leveldb Differential Revision: https://reviews.facebook.net/D9279
12 years ago
void Finalize(Version* v, std::vector<uint64_t>&);
void GetRange(const std::vector<FileMetaData*>& inputs,
InternalKey* smallest,
InternalKey* largest);
void GetRange2(const std::vector<FileMetaData*>& inputs1,
const std::vector<FileMetaData*>& inputs2,
InternalKey* smallest,
InternalKey* largest);
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences. Summary: Here are the major changes to the Merge Interface. It has been expanded to handle cases where the MergeOperator is not associative. It does so by stacking up merge operations while scanning through the key history (i.e.: during Get() or Compaction), until a valid Put/Delete/end-of-history is encountered; it then applies all of the merge operations in the correct sequence starting with the base/sentinel value. I have also introduced an "AssociativeMerge" function which allows the user to take advantage of associative merge operations (such as in the case of counters). The implementation will always attempt to merge the operations/operands themselves together when they are encountered, and will resort to the "stacking" method if and only if the "associative-merge" fails. This implementation is conjectured to allow MergeOperator to handle the general case, while still providing the user with the ability to take advantage of certain efficiencies in their own merge-operator / data-structure. NOTE: This is a preliminary diff. This must still go through a lot of review, revision, and testing. Feedback welcome! Test Plan: -This is a preliminary diff. I have only just begun testing/debugging it. -I will be testing this with the existing MergeOperator use-cases and unit-tests (counters, string-append, and redis-lists) -I will be "desk-checking" and walking through the code with the help gdb. -I will find a way of stress-testing the new interface / implementation using db_bench, db_test, merge_test, and/or db_stress. -I will ensure that my tests cover all cases: Get-Memtable, Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0, Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found, end-of-history, end-of-file, etc. -A lot of feedback from the reviewers. Reviewers: haobo, dhruba, zshao, emayanke Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D11499
11 years ago
void ExpandWhileOverlapping(Compaction* c);
void SetupOtherInputs(Compaction* c);
// Save current contents to *log
Status WriteSnapshot(log::Writer* log);
void AppendVersion(Version* v);
bool ManifestContains(const std::string& record) const;
uint64_t ExpandedCompactionByteSizeLimit(int level);
uint64_t MaxGrandParentOverlapBytes(int level);
Env* const env_;
const std::string dbname_;
const Options* const options_;
TableCache* const table_cache_;
const InternalKeyComparator icmp_;
uint64_t next_file_number_;
uint64_t manifest_file_number_;
uint64_t last_sequence_;
uint64_t log_number_;
uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
int num_levels_;
// Opened lazily
unique_ptr<log::Writer> descriptor_log_;
Version dummy_versions_; // Head of circular doubly-linked list of versions.
Version* current_; // == dummy_versions_.prev_
// Per-level key at which the next compaction at that level should start.
// Either an empty string, or a valid InternalKey.
std::string* compact_pointer_;
// Per-level target file size.
uint64_t* max_file_size_;
// Per-level max bytes
uint64_t* level_max_bytes_;
// record all the ongoing compactions for all levels
std::vector<std::set<Compaction*> > compactions_in_progress_;
// generates a increasing version number for every new version
uint64_t current_version_number_;
// Queue of writers to the manifest file
std::deque<ManifestWriter*> manifest_writers_;
// Store the manifest file size when it is checked.
// Save us the cost of checking file size twice in LogAndApply
uint64_t last_observed_manifest_size_;
// storage options for all reads and writes except compactions
const EnvOptions& storage_options_;
// storage options used for compactions. This is a copy of
// storage_options_ but with readaheads set to readahead_compactions_.
const EnvOptions storage_options_compactions_;
// No copying allowed
VersionSet(const VersionSet&);
void operator=(const VersionSet&);
// Return the total amount of data that is undergoing
Prevent segfault because SizeUnderCompaction was called without any locks. Summary: SizeBeingCompacted was called without any lock protection. This causes crashes, especially when running db_bench with value_size=128K. The fix is to compute SizeUnderCompaction while holding the mutex and passing in these values into the call to Finalize. (gdb) where #4 leveldb::VersionSet::SizeBeingCompacted (this=this@entry=0x7f0b490931c0, level=level@entry=4) at db/version_set.cc:1827 #5 0x000000000043a3c8 in leveldb::VersionSet::Finalize (this=this@entry=0x7f0b490931c0, v=v@entry=0x7f0b3b86b480) at db/version_set.cc:1420 #6 0x00000000004418d1 in leveldb::VersionSet::LogAndApply (this=0x7f0b490931c0, edit=0x7f0b3dc8c200, mu=0x7f0b490835b0, new_descriptor_log=<optimized out>) at db/version_set.cc:1016 #7 0x00000000004222b2 in leveldb::DBImpl::InstallCompactionResults (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1473 #8 0x0000000000426027 in leveldb::DBImpl::DoCompactionWork (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1757 #9 0x0000000000426690 in leveldb::DBImpl::BackgroundCompaction (this=this@entry=0x7f0b49083400, madeProgress=madeProgress@entry=0x7f0b41bf2d1e, deletion_state=...) at db/db_impl.cc:1268 #10 0x0000000000428f42 in leveldb::DBImpl::BackgroundCall (this=0x7f0b49083400) at db/db_impl.cc:1170 #11 0x000000000045348e in BGThread (this=0x7f0b49023100) at util/env_posix.cc:941 #12 leveldb::(anonymous namespace)::PosixEnv::BGThreadWrapper (arg=0x7f0b49023100) at util/env_posix.cc:874 #13 0x00007f0b4a7cf10d in start_thread (arg=0x7f0b41bf3700) at pthread_create.c:301 #14 0x00007f0b49b4b11d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115 Test Plan: make check I am running db_bench with a value size of 128K to see if the segfault is fixed. Reviewers: MarkCallaghan, sheki, emayanke Reviewed By: sheki CC: leveldb Differential Revision: https://reviews.facebook.net/D9279
12 years ago
// compactions per level
void SizeBeingCompacted(std::vector<uint64_t>&);
// Returns true if any one of the parent files are being compacted
bool ParentRangeInCompaction(const InternalKey* smallest,
const InternalKey* largest, int level, int* index);
// Returns true if any one of the specified files are being compacted
bool FilesInCompaction(std::vector<FileMetaData*>& files);
void LogAndApplyHelper(Builder*b, Version* v,
VersionEdit* edit, port::Mutex* mu);
};
// A Compaction encapsulates information about a compaction.
class Compaction {
public:
~Compaction();
// Return the level that is being compacted. Inputs from "level"
// will be merged.
int level() const { return level_; }
// Outputs will go to this level
int output_level() const { return out_level_; }
// Return the object that holds the edits to the descriptor done
// by this compaction.
VersionEdit* edit() { return edit_; }
// "which" must be either 0 or 1
int num_input_files(int which) const { return inputs_[which].size(); }
// Return the ith input file at "level()+which" ("which" must be 0 or 1).
FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
// Maximum size of files to build during this compaction.
uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
// Is this a trivial compaction that can be implemented by just
// moving a single input file to the next level (no merging or splitting)
bool IsTrivialMove() const;
// Add all inputs to this compaction as delete operations to *edit.
void AddInputDeletions(VersionEdit* edit);
// Returns true if the information we have available guarantees that
// the compaction is producing data in "level+1" for which no data exists
// in levels greater than "level+1".
bool IsBaseLevelForKey(const Slice& user_key);
// Returns true iff we should stop building the current output
// before processing "internal_key".
bool ShouldStopBefore(const Slice& internal_key);
// Release the input version for the compaction, once the compaction
// is successful.
void ReleaseInputs();
void Summary(char* output, int len);
// Return the score that was used to pick this compaction run.
double score() const { return score_; }
// Is this compaction creating a file in the bottom most level?
bool BottomMostLevel() { return bottommost_level_; }
private:
friend class Version;
friend class VersionSet;
explicit Compaction(int level, int out_level, uint64_t target_file_size,
uint64_t max_grandparent_overlap_bytes, int number_levels,
bool seek_compaction = false);
int level_;
int out_level_; // levels to which output files are stored
uint64_t max_output_file_size_;
uint64_t maxGrandParentOverlapBytes_;
Version* input_version_;
VersionEdit* edit_;
int number_levels_;
bool seek_compaction_;
// Each compaction reads inputs from "level_" and "level_+1"
std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs
// State used to check for number of of overlapping grandparent files
// (parent == level_ + 1, grandparent == level_ + 2)
std::vector<FileMetaData*> grandparents_;
size_t grandparent_index_; // Index in grandparent_starts_
bool seen_key_; // Some output key has been seen
uint64_t overlapped_bytes_; // Bytes of overlap between current output
// and grandparent files
int base_index_; // index of the file in files_[level_]
int parent_index_; // index of some file with same range in files_[level_+1]
double score_; // score that was used to pick this compaction.
// Is this compaction creating a file in the bottom most level?
bool bottommost_level_;
// level_ptrs_ holds indices into input_version_->levels_: our state
// is that we are positioned at one of the file ranges for each
// higher level than the ones involved in this compaction (i.e. for
// all L >= level_ + 2).
std::vector<size_t> level_ptrs_;
// mark (or clear) all files that are being compacted
void MarkFilesBeingCompacted(bool);
// Initialize whether compaction producing files at the bottommost level
void SetupBottomMostLevel(bool isManual);
// In case of compaction error, reset the nextIndex that is used
// to pick up the next file to be compacted from files_by_size_
void ResetNextCompactionIndex();
};
} // namespace rocksdb
#endif // STORAGE_LEVELDB_DB_VERSION_SET_H_