A number of bugfixes:

- Added DB::CompactRange() method.

  Changed manual compaction code so it breaks up compactions of
  big ranges into smaller compactions.

  Changed the code that pushes the output of memtable compactions
  to higher levels to obey the grandparent constraint: i.e., we
  must never have a single file in level L that overlaps too
  much data in level L+1 (to avoid very expensive L-1 compactions).

  Added code to pretty-print internal keys.

- Fixed bug where we would not detect overlap with files in
  level-0 because we were incorrectly using binary search
  on an array of files with overlapping ranges.

  Added "leveldb.sstables" property that can be used to dump
  all of the sstables and ranges that make up the db state.

- Removing post_write_snapshot support.  Email to leveldb mailing
  list brought up no users, just confusion from one person about
  what it meant.

- Fixing static_cast char to unsigned on BIG_ENDIAN platforms.

  Fixes	Issue 35 and Issue 36.

- Comment clarification to address leveldb Issue 37.

- Change license in posix_logger.h to match other files.

- A build problem where uint32 was used instead of uint32_t.

Sync with upstream @24408625
main
Gabor Cselle 13 years ago
parent 26db4d971a
commit 299ccedfec
  1. 5
      build_detect_platform
  2. 6
      db/corruption_test.cc
  3. 15
      db/db_bench.cc
  4. 98
      db/db_impl.cc
  5. 14
      db/db_impl.h
  6. 159
      db/db_test.cc
  7. 12
      db/dbformat.cc
  8. 2
      db/dbformat.h
  9. 14
      db/version_edit.cc
  10. 205
      db/version_set.cc
  11. 40
      db/version_set.h
  12. 57
      db/version_set_test.cc
  13. 22
      doc/index.html
  14. 15
      include/leveldb/db.h
  15. 8
      include/leveldb/env.h
  16. 15
      include/leveldb/options.h
  17. 8
      util/coding.h
  18. 5
      util/posix_logger.h

@ -35,11 +35,6 @@ case `uname -s` in
echo "PLATFORM_CFLAGS=-D_REENTRANT -DOS_FREEBSD" >> build_config.mk
echo "PLATFORM_LDFLAGS=-lpthread" >> build_config.mk
;;
GNU/kFreeBSD)
PLATFORM=OS_FREEBSD
echo "PLATFORM_CFLAGS=-pthread -DOS_FREEBSD" >> build_config.mk
echo "PLATFORM_LDFLAGS=-lpthread -lrt" >> build_config.mk
;;
*)
echo "Unknown platform!"
exit 1

@ -229,8 +229,8 @@ TEST(CorruptionTest, TableFile) {
Build(100);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
dbi->TEST_CompactRange(0, "", "~");
dbi->TEST_CompactRange(1, "", "~");
dbi->TEST_CompactRange(0, NULL, NULL);
dbi->TEST_CompactRange(1, NULL, NULL);
Corrupt(kTableFile, 100, 1);
Check(99, 99);
@ -278,7 +278,7 @@ TEST(CorruptionTest, CorruptedDescriptor) {
ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
dbi->TEST_CompactRange(0, "", "~");
dbi->TEST_CompactRange(0, NULL, NULL);
Corrupt(kDescriptorFile, 0, 1000);
Status s = TryReopen();

@ -796,20 +796,7 @@ class Benchmark {
}
void Compact(ThreadState* thread) {
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
int max_level_with_files = 1;
for (int level = 1; level < config::kNumLevels; level++) {
std::string property;
char name[100];
snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level);
if (db_->GetProperty(name, &property) && atoi(property.c_str()) > 0) {
max_level_with_files = level;
}
}
for (int level = 0; level < max_level_with_files; level++) {
dbi->TEST_CompactRange(level, "", "~");
}
db_->CompactRange(NULL, NULL);
}
void PrintStats() {

@ -454,13 +454,8 @@ Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit,
if (s.ok() && meta.file_size > 0) {
const Slice min_user_key = meta.smallest.user_key();
const Slice max_user_key = meta.largest.user_key();
if (base != NULL && !base->OverlapInLevel(0, min_user_key, max_user_key)) {
// Push the new sstable to a higher level if possible to reduce
// expensive manifest file ops.
while (level < config::kMaxMemCompactLevel &&
!base->OverlapInLevel(level + 1, min_user_key, max_user_key)) {
level++;
}
if (base != NULL) {
level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
}
edit->AddFile(level, meta.number, meta.file_size,
meta.smallest, meta.largest);
@ -506,25 +501,55 @@ Status DBImpl::CompactMemTable() {
return s;
}
void DBImpl::TEST_CompactRange(
int level,
const std::string& begin,
const std::string& end) {
void DBImpl::CompactRange(const Slice* begin, const Slice* end) {
int max_level_with_files = 1;
{
MutexLock l(&mutex_);
Version* base = versions_->current();
for (int level = 1; level < config::kNumLevels; level++) {
if (base->OverlapInLevel(level, begin, end)) {
max_level_with_files = level;
}
}
}
TEST_CompactMemTable(); // TODO(sanjay): Skip if memtable does not overlap
for (int level = 0; level < max_level_with_files; level++) {
TEST_CompactRange(level, begin, end);
}
}
void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) {
assert(level >= 0);
assert(level + 1 < config::kNumLevels);
MutexLock l(&mutex_);
while (manual_compaction_ != NULL) {
bg_cv_.Wait();
}
InternalKey begin_storage, end_storage;
ManualCompaction manual;
manual.level = level;
manual.begin = begin;
manual.end = end;
manual_compaction_ = &manual;
MaybeScheduleCompaction();
while (manual_compaction_ == &manual) {
bg_cv_.Wait();
manual.done = false;
if (begin == NULL) {
manual.begin = NULL;
} else {
begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek);
manual.begin = &begin_storage;
}
if (end == NULL) {
manual.end = NULL;
} else {
end_storage = InternalKey(*end, 0, static_cast<ValueType>(0));
manual.end = &end_storage;
}
MutexLock l(&mutex_);
while (!manual.done) {
while (manual_compaction_ != NULL) {
bg_cv_.Wait();
}
manual_compaction_ = &manual;
MaybeScheduleCompaction();
while (manual_compaction_ == &manual) {
bg_cv_.Wait();
}
}
}
@ -590,12 +615,20 @@ void DBImpl::BackgroundCompaction() {
Compaction* c;
bool is_manual = (manual_compaction_ != NULL);
InternalKey manual_end;
if (is_manual) {
const ManualCompaction* m = manual_compaction_;
c = versions_->CompactRange(
ManualCompaction* m = manual_compaction_;
c = versions_->CompactRange(m->level, m->begin, m->end);
m->done = (c == NULL);
if (c != NULL) {
manual_end = c->input(0, c->num_input_files(0) - 1)->largest;
}
Log(options_.info_log,
"Manual compaction at level-%d from %s .. %s; will stop at %s\n",
m->level,
InternalKey(m->begin, kMaxSequenceNumber, kValueTypeForSeek),
InternalKey(m->end, 0, static_cast<ValueType>(0)));
(m->begin ? m->begin->DebugString().c_str() : "(begin)"),
(m->end ? m->end->DebugString().c_str() : "(end)"),
(m->done ? "(end)" : manual_end.DebugString().c_str()));
} else {
c = versions_->PickCompaction();
}
@ -638,7 +671,13 @@ void DBImpl::BackgroundCompaction() {
}
if (is_manual) {
// Mark it as done
ManualCompaction* m = manual_compaction_;
if (!m->done) {
// We only compacted part of the requested range. Update *m
// to the range that is left to be compacted.
m->tmp_storage = manual_end;
m->begin = &m->tmp_storage;
}
manual_compaction_ = NULL;
}
}
@ -1109,10 +1148,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
versions_->SetLastSequence(last_sequence);
}
if (options.post_write_snapshot != NULL) {
*options.post_write_snapshot =
status.ok() ? snapshots_.New(last_sequence) : NULL;
}
ReleaseLoggingResponsibility(&self);
return status;
}
@ -1225,6 +1260,9 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) {
}
}
return true;
} else if (in == "sstables") {
*value = versions_->current()->DebugString();
return true;
}
return false;

@ -38,14 +38,12 @@ class DBImpl : public DB {
virtual void ReleaseSnapshot(const Snapshot* snapshot);
virtual bool GetProperty(const Slice& property, std::string* value);
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
virtual void CompactRange(const Slice* begin, const Slice* end);
// Extra methods (for testing) that are not in the public DB interface
// Compact any files in the named level that overlap [begin,end]
void TEST_CompactRange(
int level,
const std::string& begin,
const std::string& end);
// Compact any files in the named level that overlap [*begin,*end]
void TEST_CompactRange(int level, const Slice* begin, const Slice* end);
// Force current memtable contents to be compacted.
Status TEST_CompactMemTable();
@ -145,8 +143,10 @@ class DBImpl : public DB {
// Information for a manual compaction
struct ManualCompaction {
int level;
std::string begin;
std::string end;
bool done;
const InternalKey* begin; // NULL means beginning of key range
const InternalKey* end; // NULL means end of key range
InternalKey tmp_storage; // Used to keep track of compaction progress
};
ManualCompaction* manual_compaction_;

@ -195,6 +195,23 @@ class DBTest {
return result;
}
// Return spread of files per level
std::string FilesPerLevel() {
std::string result;
int last_non_zero_offset = 0;
for (int level = 0; level < config::kNumLevels; level++) {
int f = NumTableFilesAtLevel(level);
char buf[100];
snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
result += buf;
if (f > 0) {
last_non_zero_offset = result.size();
}
}
result.resize(last_non_zero_offset);
return result;
}
uint64_t Size(const Slice& start, const Slice& limit) {
Range r(start, limit);
uint64_t size;
@ -203,26 +220,23 @@ class DBTest {
}
void Compact(const Slice& start, const Slice& limit) {
dbfull()->TEST_CompactMemTable();
int max_level_with_files = 1;
for (int level = 1; level < config::kNumLevels; level++) {
if (NumTableFilesAtLevel(level) > 0) {
max_level_with_files = level;
}
}
for (int level = 0; level < max_level_with_files; level++) {
dbfull()->TEST_CompactRange(level, "", "~");
db_->CompactRange(&start, &limit);
}
// Do n memtable compactions, each of which produces an sstable
// covering the range [small,large].
void MakeTables(int n, const std::string& small, const std::string& large) {
for (int i = 0; i < n; i++) {
Put(small, "begin");
Put(large, "end");
dbfull()->TEST_CompactMemTable();
}
}
// Prevent pushing of new sstables into deeper levels by adding
// tables that cover a specified range to all levels.
void FillLevels(const std::string& smallest, const std::string& largest) {
for (int level = 0; level < config::kNumLevels; level++) {
Put(smallest, "begin");
Put(largest, "end");
dbfull()->TEST_CompactMemTable();
}
MakeTables(config::kNumLevels, smallest, largest);
}
void DumpFileCounts(const char* label) {
@ -238,6 +252,12 @@ class DBTest {
}
}
std::string DumpSSTableList() {
std::string property;
db_->GetProperty("leveldb.sstables", &property);
return property;
}
std::string IterStatus(Iterator* iter) {
std::string result;
if (iter->Valid()) {
@ -367,7 +387,7 @@ TEST(DBTest, GetEncountersEmptyLevel) {
}
// Step 2: clear level 1 if necessary.
dbfull()->TEST_CompactRange(1, "a", "z");
dbfull()->TEST_CompactRange(1, NULL, NULL);
ASSERT_EQ(NumTableFilesAtLevel(0), 1);
ASSERT_EQ(NumTableFilesAtLevel(1), 0);
ASSERT_EQ(NumTableFilesAtLevel(2), 1);
@ -693,7 +713,7 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) {
// Reopening moves updates to level-0
Reopen(&options);
dbfull()->TEST_CompactRange(0, "", Key(100000));
dbfull()->TEST_CompactRange(0, NULL, NULL);
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
ASSERT_GT(NumTableFilesAtLevel(1), 1);
@ -744,7 +764,7 @@ TEST(DBTest, SparseMerge) {
}
Put("C", "vc");
dbfull()->TEST_CompactMemTable();
dbfull()->TEST_CompactRange(0, "A", "Z");
dbfull()->TEST_CompactRange(0, NULL, NULL);
// Make sparse update
Put("A", "va2");
@ -755,9 +775,9 @@ TEST(DBTest, SparseMerge) {
// Compactions should not cause us to create a situation where
// a file overlaps too much data at the next level.
ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
dbfull()->TEST_CompactRange(0, "", "z");
dbfull()->TEST_CompactRange(0, NULL, NULL);
ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
dbfull()->TEST_CompactRange(1, "", "z");
dbfull()->TEST_CompactRange(1, NULL, NULL);
ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
}
@ -808,9 +828,11 @@ TEST(DBTest, ApproximateSizes) {
ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 5010000));
ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000));
dbfull()->TEST_CompactRange(0,
Key(compact_start),
Key(compact_start + 9));
std::string cstart_str = Key(compact_start);
std::string cend_str = Key(compact_start + 9);
Slice cstart = cstart_str;
Slice cend = cend_str;
dbfull()->TEST_CompactRange(0, &cstart, &cend);
}
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
@ -850,7 +872,7 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000));
dbfull()->TEST_CompactRange(0, Key(0), Key(100));
dbfull()->TEST_CompactRange(0, NULL, NULL);
}
}
@ -921,11 +943,12 @@ TEST(DBTest, HiddenValuesAreRemoved) {
ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000));
db_->ReleaseSnapshot(snapshot);
ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]");
dbfull()->TEST_CompactRange(0, "", "x");
Slice x("x");
dbfull()->TEST_CompactRange(0, NULL, &x);
ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
ASSERT_GE(NumTableFilesAtLevel(1), 1);
dbfull()->TEST_CompactRange(1, "", "x");
dbfull()->TEST_CompactRange(1, NULL, &x);
ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
@ -949,11 +972,12 @@ TEST(DBTest, DeletionMarkers1) {
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
dbfull()->TEST_CompactRange(last-2, "", "z");
Slice z("z");
dbfull()->TEST_CompactRange(last-2, NULL, &z);
// DEL eliminated, but v1 remains because we aren't compacting that level
// (DEL can be eliminated because v2 hides v1).
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]");
dbfull()->TEST_CompactRange(last-1, "", "z");
dbfull()->TEST_CompactRange(last-1, NULL, NULL);
// Merging last-1 w/ last, so we are the base level for "foo", so
// DEL is removed. (as is v1).
ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]");
@ -976,15 +1000,54 @@ TEST(DBTest, DeletionMarkers2) {
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
dbfull()->TEST_CompactRange(last-2, "", "z");
dbfull()->TEST_CompactRange(last-2, NULL, NULL);
// DEL kept: "last" file overlaps
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
dbfull()->TEST_CompactRange(last-1, "", "z");
dbfull()->TEST_CompactRange(last-1, NULL, NULL);
// Merging last-1 w/ last, so we are the base level for "foo", so
// DEL is removed. (as is v1).
ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
}
TEST(DBTest, OverlapInLevel0) {
ASSERT_EQ(config::kMaxMemCompactLevel, 2) << "Fix test to match config";
// Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0.
ASSERT_OK(Put("100", "v100"));
ASSERT_OK(Put("999", "v999"));
dbfull()->TEST_CompactMemTable();
ASSERT_OK(Delete("100"));
ASSERT_OK(Delete("999"));
dbfull()->TEST_CompactMemTable();
ASSERT_EQ("0,1,1", FilesPerLevel());
// Make files spanning the following ranges in level-0:
// files[0] 200 .. 900
// files[1] 300 .. 500
// Note that files are sorted by smallest key.
ASSERT_OK(Put("300", "v300"));
ASSERT_OK(Put("500", "v500"));
dbfull()->TEST_CompactMemTable();
ASSERT_OK(Put("200", "v200"));
ASSERT_OK(Put("600", "v600"));
ASSERT_OK(Put("900", "v900"));
dbfull()->TEST_CompactMemTable();
ASSERT_EQ("2,1,1", FilesPerLevel());
// Compact away the placeholder files we created initially
dbfull()->TEST_CompactRange(1, NULL, NULL);
dbfull()->TEST_CompactRange(2, NULL, NULL);
ASSERT_EQ("2", FilesPerLevel());
// Do a memtable compaction. Before bug-fix, the compaction would
// not detect the overlap with level-0 files and would incorrectly place
// the deletion in a deeper level.
ASSERT_OK(Delete("600"));
dbfull()->TEST_CompactMemTable();
ASSERT_EQ("3", FilesPerLevel());
ASSERT_EQ("NOT_FOUND", Get("600"));
}
TEST(DBTest, ComparatorCheck) {
class NewComparator : public Comparator {
public:
@ -1008,6 +1071,40 @@ TEST(DBTest, ComparatorCheck) {
<< s.ToString();
}
TEST(DBTest, ManualCompaction) {
ASSERT_EQ(config::kMaxMemCompactLevel, 2)
<< "Need to update this test to match kMaxMemCompactLevel";
MakeTables(3, "p", "q");
ASSERT_EQ("1,1,1", FilesPerLevel());
// Compaction range falls before files
Compact("", "c");
ASSERT_EQ("1,1,1", FilesPerLevel());
// Compaction range falls after files
Compact("r", "z");
ASSERT_EQ("1,1,1", FilesPerLevel());
// Compaction range overlaps files
Compact("p1", "p9");
ASSERT_EQ("0,0,1", FilesPerLevel());
// Populate a different range
MakeTables(3, "c", "e");
ASSERT_EQ("1,1,2", FilesPerLevel());
// Compact just the new range
Compact("b", "f");
ASSERT_EQ("0,0,2", FilesPerLevel());
// Compact all
MakeTables(1, "a", "z");
ASSERT_EQ("0,1,2", FilesPerLevel());
db_->CompactRange(NULL, NULL);
ASSERT_EQ("0,0,1", FilesPerLevel());
}
TEST(DBTest, DBOpen_Options) {
std::string dbname = test::TmpDir() + "/db_options_test";
DestroyDB(dbname, Options());
@ -1187,7 +1284,6 @@ class ModelDB: public DB {
delete reinterpret_cast<const ModelSnapshot*>(snapshot);
}
virtual Status Write(const WriteOptions& options, WriteBatch* batch) {
assert(options.post_write_snapshot == NULL); // Not supported
class Handler : public WriteBatch::Handler {
public:
KVMap* map_;
@ -1211,6 +1307,9 @@ class ModelDB: public DB {
sizes[i] = 0;
}
}
virtual void CompactRange(const Slice* start, const Slice* end) {
}
private:
class ModelIter: public Iterator {
public:

@ -31,6 +31,18 @@ std::string ParsedInternalKey::DebugString() const {
return result;
}
std::string InternalKey::DebugString() const {
std::string result;
ParsedInternalKey parsed;
if (ParseInternalKey(rep_, &parsed)) {
result = parsed.DebugString();
} else {
result = "(bad)";
result.append(EscapeString(rep_));
}
return result;
}
const char* InternalKeyComparator::Name() const {
return "leveldb.InternalKeyComparator";
}

@ -149,6 +149,8 @@ class InternalKey {
}
void Clear() { rep_.clear(); }
std::string DebugString() const;
};
inline int InternalKeyComparator::Compare(

@ -235,9 +235,8 @@ std::string VersionEdit::DebugString() const {
for (size_t i = 0; i < compact_pointers_.size(); i++) {
r.append("\n CompactPointer: ");
AppendNumberTo(&r, compact_pointers_[i].first);
r.append(" '");
AppendEscapedStringTo(&r, compact_pointers_[i].second.Encode());
r.append("'");
r.append(" ");
r.append(compact_pointers_[i].second.DebugString());
}
for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
iter != deleted_files_.end();
@ -255,11 +254,10 @@ std::string VersionEdit::DebugString() const {
AppendNumberTo(&r, f.number);
r.append(" ");
AppendNumberTo(&r, f.file_size);
r.append(" '");
AppendEscapedStringTo(&r, f.smallest.Encode());
r.append("' .. '");
AppendEscapedStringTo(&r, f.largest.Encode());
r.append("'");
r.append(" ");
r.append(f.smallest.DebugString());
r.append(" .. ");
r.append(f.largest.DebugString());
}
r.append("\n}\n");
return r;

@ -41,6 +41,14 @@ static uint64_t MaxFileSizeForLevel(int level) {
return kTargetFileSize; // We could vary per level to reduce number of files?
}
static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
int64_t sum = 0;
for (size_t i = 0; i < files.size(); i++) {
sum += files[i]->file_size;
}
return sum;
}
namespace {
std::string IntSetToString(const std::set<uint64_t>& s) {
std::string result = "{";
@ -96,17 +104,55 @@ int FindFile(const InternalKeyComparator& icmp,
return right;
}
static bool AfterFile(const Comparator* ucmp,
const Slice* user_key, const FileMetaData* f) {
// NULL user_key occurs before all keys and is therefore never after *f
return (user_key != NULL &&
ucmp->Compare(*user_key, f->largest.user_key()) > 0);
}
static bool BeforeFile(const Comparator* ucmp,
const Slice* user_key, const FileMetaData* f) {
// NULL user_key occurs after all keys and is therefore never before *f
return (user_key != NULL &&
ucmp->Compare(*user_key, f->smallest.user_key()) < 0);
}
bool SomeFileOverlapsRange(
const InternalKeyComparator& icmp,
bool disjoint_sorted_files,
const std::vector<FileMetaData*>& files,
const Slice& smallest_user_key,
const Slice& largest_user_key) {
// Find the earliest possible internal key for smallest_user_key
InternalKey small(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek);
const uint32_t index = FindFile(icmp, files, small.Encode());
return ((index < files.size()) &&
icmp.user_comparator()->Compare(
largest_user_key, files[index]->smallest.user_key()) >= 0);
const Slice* smallest_user_key,
const Slice* largest_user_key) {
const Comparator* ucmp = icmp.user_comparator();
if (!disjoint_sorted_files) {
// Need to check against all files
for (int i = 0; i < files.size(); i++) {
const FileMetaData* f = files[i];
if (AfterFile(ucmp, smallest_user_key, f) ||
BeforeFile(ucmp, largest_user_key, f)) {
// No overlap
} else {
return true; // Overlap
}
}
return false;
}
// Binary search over file list
uint32_t index = 0;
if (smallest_user_key != NULL) {
// Find the earliest possible internal key for smallest_user_key
InternalKey small(*smallest_user_key, kMaxSequenceNumber,kValueTypeForSeek);
index = FindFile(icmp, files, small.Encode());
}
if (index >= files.size()) {
// beginning of range is after all files, so no overlap.
return false;
}
return !BeforeFile(ucmp, largest_user_key, files[index]);
}
// An internal iterator. For a given version/level pair, yields
@ -358,11 +404,64 @@ void Version::Unref() {
}
bool Version::OverlapInLevel(int level,
const Slice& smallest_user_key,
const Slice& largest_user_key) {
return SomeFileOverlapsRange(vset_->icmp_, files_[level],
smallest_user_key,
largest_user_key);
const Slice* smallest_user_key,
const Slice* largest_user_key) {
return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level],
smallest_user_key, largest_user_key);
}
int Version::PickLevelForMemTableOutput(
const Slice& smallest_user_key,
const Slice& largest_user_key) {
int level = 0;
if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) {
// Push to next level if there is no overlap in next level,
// and the #bytes overlapping in the level after that are limited.
InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek);
InternalKey limit(largest_user_key, 0, static_cast<ValueType>(0));
std::vector<FileMetaData*> overlaps;
while (level < config::kMaxMemCompactLevel) {
if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
break;
}
GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
const int64_t sum = TotalFileSize(overlaps);
if (sum > kMaxGrandParentOverlapBytes) {
break;
}
level++;
}
}
return level;
}
// Store in "*inputs" all files in "level" that overlap [begin,end]
void Version::GetOverlappingInputs(
int level,
const InternalKey* begin,
const InternalKey* end,
std::vector<FileMetaData*>* inputs) {
inputs->clear();
Slice user_begin, user_end;
if (begin != NULL) {
user_begin = begin->user_key();
}
if (end != NULL) {
user_end = end->user_key();
}
const Comparator* user_cmp = vset_->icmp_.user_comparator();
for (size_t i = 0; i < files_[level].size(); i++) {
FileMetaData* f = files_[level][i];
if (begin != NULL &&
user_cmp->Compare(f->largest.user_key(), user_begin) < 0) {
// "f" is completely before specified range; skip it
} else if (end != NULL &&
user_cmp->Compare(f->smallest.user_key(), user_end) > 0) {
// "f" is completely after specified range; skip it
} else {
inputs->push_back(f);
}
}
}
std::string Version::DebugString() const {
@ -381,11 +480,11 @@ std::string Version::DebugString() const {
AppendNumberTo(&r, files[i]->number);
r.push_back(':');
AppendNumberTo(&r, files[i]->file_size);
r.append("['");
AppendEscapedStringTo(&r, files[i]->smallest.Encode());
r.append("' .. '");
AppendEscapedStringTo(&r, files[i]->largest.Encode());
r.append("']\n");
r.append("[");
r.append(files[i]->smallest.DebugString());
r.append(" .. ");
r.append(files[i]->largest.DebugString());
r.append("]\n");
}
}
return r;
@ -540,8 +639,8 @@ class VersionSet::Builder {
const InternalKey& this_begin = v->files_[level][i]->smallest;
if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) {
fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
EscapeString(prev_end.Encode()).c_str(),
EscapeString(this_begin.Encode()).c_str());
prev_end.DebugString().c_str(),
this_begin.DebugString().c_str());
abort();
}
}
@ -814,14 +913,6 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) {
}
}
static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
int64_t sum = 0;
for (size_t i = 0; i < files.size(); i++) {
sum += files[i]->file_size;
}
return sum;
}
void VersionSet::Finalize(Version* v) {
// Precomputed best level for next compaction
int best_level = -1;
@ -967,7 +1058,8 @@ int64_t VersionSet::MaxNextLevelOverlappingBytes() {
for (int level = 1; level < config::kNumLevels - 1; level++) {
for (size_t i = 0; i < current_->files_[level].size(); i++) {
const FileMetaData* f = current_->files_[level][i];
GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps);
current_->GetOverlappingInputs(level+1, &f->smallest, &f->largest,
&overlaps);
const int64_t sum = TotalFileSize(overlaps);
if (sum > result) {
result = sum;
@ -977,27 +1069,6 @@ int64_t VersionSet::MaxNextLevelOverlappingBytes() {
return result;
}
// Store in "*inputs" all files in "level" that overlap [begin,end]
void VersionSet::GetOverlappingInputs(
int level,
const InternalKey& begin,
const InternalKey& end,
std::vector<FileMetaData*>* inputs) {
inputs->clear();
Slice user_begin = begin.user_key();
Slice user_end = end.user_key();
const Comparator* user_cmp = icmp_.user_comparator();
for (size_t i = 0; i < current_->files_[level].size(); i++) {
FileMetaData* f = current_->files_[level][i];
if (user_cmp->Compare(f->largest.user_key(), user_begin) < 0 ||
user_cmp->Compare(f->smallest.user_key(), user_end) > 0) {
// Either completely before or after range; skip it
} else {
inputs->push_back(f);
}
}
}
// Stores the minimal range that covers all entries in inputs in
// *smallest, *largest.
// REQUIRES: inputs is not empty
@ -1113,7 +1184,7 @@ Compaction* VersionSet::PickCompaction() {
// Note that the next call will discard the file we placed in
// c->inputs_[0] earlier and replace it with an overlapping set
// which will include the picked file.
GetOverlappingInputs(0, smallest, largest, &c->inputs_[0]);
current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]);
assert(!c->inputs_[0].empty());
}
@ -1127,7 +1198,7 @@ void VersionSet::SetupOtherInputs(Compaction* c) {
InternalKey smallest, largest;
GetRange(c->inputs_[0], &smallest, &largest);
GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]);
current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1]);
// Get entire range covered by compaction
InternalKey all_start, all_limit;
@ -1137,12 +1208,13 @@ void VersionSet::SetupOtherInputs(Compaction* c) {
// changing the number of "level+1" files we pick up.
if (!c->inputs_[1].empty()) {
std::vector<FileMetaData*> expanded0;
GetOverlappingInputs(level, all_start, all_limit, &expanded0);
current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0);
if (expanded0.size() > c->inputs_[0].size()) {
InternalKey new_start, new_limit;
GetRange(expanded0, &new_start, &new_limit);
std::vector<FileMetaData*> expanded1;
GetOverlappingInputs(level+1, new_start, new_limit, &expanded1);
current_->GetOverlappingInputs(level+1, &new_start, &new_limit,
&expanded1);
if (expanded1.size() == c->inputs_[1].size()) {
Log(options_->info_log,
"Expanding@%d %d+%d to %d+%d\n",
@ -1163,14 +1235,15 @@ void VersionSet::SetupOtherInputs(Compaction* c) {
// Compute the set of grandparent files that overlap this compaction
// (parent == level+1; grandparent == level+2)
if (level + 2 < config::kNumLevels) {
GetOverlappingInputs(level + 2, all_start, all_limit, &c->grandparents_);
current_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
&c->grandparents_);
}
if (false) {
Log(options_->info_log, "Compacting %d '%s' .. '%s'",
level,
EscapeString(smallest.Encode()).c_str(),
EscapeString(largest.Encode()).c_str());
smallest.DebugString().c_str(),
largest.DebugString().c_str());
}
// Update the place where we will do the next compaction for this level.
@ -1183,14 +1256,26 @@ void VersionSet::SetupOtherInputs(Compaction* c) {
Compaction* VersionSet::CompactRange(
int level,
const InternalKey& begin,
const InternalKey& end) {
const InternalKey* begin,
const InternalKey* end) {
std::vector<FileMetaData*> inputs;
GetOverlappingInputs(level, begin, end, &inputs);
current_->GetOverlappingInputs(level, begin, end, &inputs);
if (inputs.empty()) {
return NULL;
}
// Avoid compacting too much in one shot in case the range is large.
const uint64_t limit = MaxFileSizeForLevel(level);
uint64_t total = 0;
for (int i = 0; i < inputs.size(); i++) {
uint64_t s = inputs[i]->file_size;
total += s;
if (total >= limit) {
inputs.resize(i + 1);
break;
}
}
Compaction* c = new Compaction(level);
c->input_version_ = current_;
c->input_version_->Ref();

@ -43,12 +43,17 @@ extern int FindFile(const InternalKeyComparator& icmp,
const Slice& key);
// Returns true iff some file in "files" overlaps the user key range
// [smallest,largest].
// [*smallest,*largest].
// smallest==NULL represents a key smaller than all keys in the DB.
// largest==NULL represents a key largest than all keys in the DB.
// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges
// in sorted order.
extern bool SomeFileOverlapsRange(
const InternalKeyComparator& icmp,
bool disjoint_sorted_files,
const std::vector<FileMetaData*>& files,
const Slice& smallest_user_key,
const Slice& largest_user_key);
const Slice* smallest_user_key,
const Slice* largest_user_key);
class Version {
public:
@ -77,11 +82,24 @@ class Version {
void Ref();
void Unref();
void GetOverlappingInputs(
int level,
const InternalKey* begin, // NULL means before all keys
const InternalKey* end, // NULL means after all keys
std::vector<FileMetaData*>* inputs);
// Returns true iff some file in the specified level overlaps
// some part of [smallest_user_key,largest_user_key].
// some part of [*smallest_user_key,*largest_user_key].
// smallest_user_key==NULL represents a key smaller than all keys in the DB.
// largest_user_key==NULL represents a key largest than all keys in the DB.
bool OverlapInLevel(int level,
const Slice& smallest_user_key,
const Slice& largest_user_key);
const Slice* smallest_user_key,
const Slice* largest_user_key);
// Return the level at which we should place a new memtable compaction
// result that covers the range [smallest_user_key,largest_user_key].
int PickLevelForMemTableOutput(const Slice& smallest_user_key,
const Slice& largest_user_key);
int NumFiles(int level) const { return files_[level].size(); }
@ -192,8 +210,8 @@ class VersionSet {
// the result.
Compaction* CompactRange(
int level,
const InternalKey& begin,
const InternalKey& end);
const InternalKey* begin,
const InternalKey* end);
// Return the maximum overlapping data (in bytes) at next level for any
// file at a level >= 1.
@ -232,12 +250,6 @@ class VersionSet {
void Finalize(Version* v);
void GetOverlappingInputs(
int level,
const InternalKey& begin,
const InternalKey& end,
std::vector<FileMetaData*>* inputs);
void GetRange(const std::vector<FileMetaData*>& inputs,
InternalKey* smallest,
InternalKey* largest);

@ -12,6 +12,9 @@ namespace leveldb {
class FindFileTest {
public:
std::vector<FileMetaData*> files_;
bool disjoint_sorted_files_;
FindFileTest() : disjoint_sorted_files_(true) { }
~FindFileTest() {
for (int i = 0; i < files_.size(); i++) {
@ -37,13 +40,20 @@ class FindFileTest {
bool Overlaps(const char* smallest, const char* largest) {
InternalKeyComparator cmp(BytewiseComparator());
return SomeFileOverlapsRange(cmp, files_, smallest, largest);
Slice s(smallest != NULL ? smallest : "");
Slice l(largest != NULL ? largest : "");
return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_,
(smallest != NULL ? &s : NULL),
(largest != NULL ? &l : NULL));
}
};
TEST(FindFileTest, Empty) {
ASSERT_EQ(0, Find("foo"));
ASSERT_TRUE(! Overlaps("a", "z"));
ASSERT_TRUE(! Overlaps(NULL, "z"));
ASSERT_TRUE(! Overlaps("a", NULL));
ASSERT_TRUE(! Overlaps(NULL, NULL));
}
TEST(FindFileTest, Single) {
@ -67,6 +77,13 @@ TEST(FindFileTest, Single) {
ASSERT_TRUE(Overlaps("p1", "z"));
ASSERT_TRUE(Overlaps("q", "q"));
ASSERT_TRUE(Overlaps("q", "q1"));
ASSERT_TRUE(! Overlaps(NULL, "j"));
ASSERT_TRUE(! Overlaps("r", NULL));
ASSERT_TRUE(Overlaps(NULL, "p"));
ASSERT_TRUE(Overlaps(NULL, "p1"));
ASSERT_TRUE(Overlaps("q", NULL));
ASSERT_TRUE(Overlaps(NULL, NULL));
}
@ -108,6 +125,26 @@ TEST(FindFileTest, Multiple) {
ASSERT_TRUE(Overlaps("450", "500"));
}
TEST(FindFileTest, MultipleNullBoundaries) {
Add("150", "200");
Add("200", "250");
Add("300", "350");
Add("400", "450");
ASSERT_TRUE(! Overlaps(NULL, "149"));
ASSERT_TRUE(! Overlaps("451", NULL));
ASSERT_TRUE(Overlaps(NULL, NULL));
ASSERT_TRUE(Overlaps(NULL, "150"));
ASSERT_TRUE(Overlaps(NULL, "199"));
ASSERT_TRUE(Overlaps(NULL, "200"));
ASSERT_TRUE(Overlaps(NULL, "201"));
ASSERT_TRUE(Overlaps(NULL, "400"));
ASSERT_TRUE(Overlaps(NULL, "800"));
ASSERT_TRUE(Overlaps("100", NULL));
ASSERT_TRUE(Overlaps("200", NULL));
ASSERT_TRUE(Overlaps("449", NULL));
ASSERT_TRUE(Overlaps("450", NULL));
}
TEST(FindFileTest, OverlapSequenceChecks) {
Add("200", "200", 5000, 3000);
ASSERT_TRUE(! Overlaps("199", "199"));
@ -117,6 +154,24 @@ TEST(FindFileTest, OverlapSequenceChecks) {
ASSERT_TRUE(Overlaps("200", "210"));
}
TEST(FindFileTest, OverlappingFiles) {
Add("150", "600");
Add("400", "500");
disjoint_sorted_files_ = false;
ASSERT_TRUE(! Overlaps("100", "149"));
ASSERT_TRUE(! Overlaps("601", "700"));
ASSERT_TRUE(Overlaps("100", "150"));
ASSERT_TRUE(Overlaps("100", "200"));
ASSERT_TRUE(Overlaps("100", "300"));
ASSERT_TRUE(Overlaps("100", "400"));
ASSERT_TRUE(Overlaps("100", "500"));
ASSERT_TRUE(Overlaps("375", "400"));
ASSERT_TRUE(Overlaps("450", "450"));
ASSERT_TRUE(Overlaps("450", "500"));
ASSERT_TRUE(Overlaps("450", "700"));
ASSERT_TRUE(Overlaps("600", "700"));
}
}
int main(int argc, char** argv) {

@ -193,7 +193,7 @@ that a read should operate on a particular version of the DB state.
If <code>ReadOptions::snapshot</code> is NULL, the read will operate on an
implicit snapshot of the current state.
<p>
Snapshots typically are created by the DB::GetSnapshot() method:
Snapshots are created by the DB::GetSnapshot() method:
<p>
<pre>
leveldb::ReadOptions options;
@ -208,26 +208,6 @@ Note that when a snapshot is no longer needed, it should be released
using the DB::ReleaseSnapshot interface. This allows the
implementation to get rid of state that was being maintained just to
support reading as of that snapshot.
<p>
A Write operation can also return a snapshot that
represents the state of the database just after applying a particular
set of updates:
<p>
<pre>
leveldb::Snapshot* snapshot;
leveldb::WriteOptions write_options;
write_options.post_write_snapshot = &amp;snapshot;
leveldb::Status status = db-&gt;Write(write_options, ...);
... perform other mutations to db ...
leveldb::ReadOptions read_options;
read_options.snapshot = snapshot;
leveldb::Iterator* iter = db-&gt;NewIterator(read_options);
... read as of the state just after the Write call returned ...
delete iter;
db-&gt;ReleaseSnapshot(snapshot);
</pre>
<h1>Slice</h1>
<p>
The return value of the <code>it->key()</code> and <code>it->value()</code> calls above

@ -112,6 +112,8 @@ class DB {
// where <N> is an ASCII representation of a level number (e.g. "0").
// "leveldb.stats" - returns a multi-line string that describes statistics
// about the internal operation of the DB.
// "leveldb.sstables" - returns a multi-line string that describes all
// of the sstables that make up the db contents.
virtual bool GetProperty(const Slice& property, std::string* value) = 0;
// For each i in [0,n-1], store in "sizes[i]", the approximate
@ -125,8 +127,17 @@ class DB {
virtual void GetApproximateSizes(const Range* range, int n,
uint64_t* sizes) = 0;
// Possible extensions:
// (1) Add a method to compact a range of keys
// Compact the underlying storage for the key range [*begin,*end].
// In particular, deleted and overwritten versions are discarded,
// and the data is rearranged to reduce the cost of operations
// needed to access the data. This operation should typically only
// be invoked by users who understand the underlying implementation.
//
// begin==NULL is treated as a key before all keys in the database.
// end==NULL is treated as a key after all keys in the database.
// Therefore the following call will compact the entire database:
// db->CompactRange(NULL, NULL);
virtual void CompactRange(const Slice* begin, const Slice* end) = 0;
private:
// No copying allowed

@ -160,6 +160,8 @@ class SequentialFile {
// Read up to "n" bytes from the file. "scratch[0..n-1]" may be
// written by this routine. Sets "*result" to the data that was
// read (including if fewer than "n" bytes were successfully read).
// May set "*result" to point at data in "scratch[0..n-1]", so
// "scratch[0..n-1]" must be live when "*result" is used.
// If an error was encountered, returns a non-OK status.
//
// REQUIRES: External synchronization
@ -184,8 +186,10 @@ class RandomAccessFile {
// Read up to "n" bytes from the file starting at "offset".
// "scratch[0..n-1]" may be written by this routine. Sets "*result"
// to the data that was read (including if fewer than "n" bytes were
// successfully read). If an error was encountered, returns a
// non-OK status.
// successfully read). May set "*result" to point at data in
// "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
// "*result" is used. If an error was encountered, returns a non-OK
// status.
//
// Safe for concurrent use by multiple threads.
virtual Status Read(uint64_t offset, size_t n, Slice* result,

@ -177,21 +177,8 @@ struct WriteOptions {
// Default: false
bool sync;
// If "post_write_snapshot" is non-NULL, and the write succeeds,
// *post_write_snapshot will be modified to point to a snapshot of
// the DB state immediately after this write. The caller must call
// DB::ReleaseSnapshot(*post_write_snapshotsnapshot) when the
// snapshot is no longer needed.
//
// If "post_write_snapshot" is non-NULL, and the write fails,
// *post_write_snapshot will be set to NULL.
//
// Default: NULL
const Snapshot** post_write_snapshot;
WriteOptions()
: sync(false),
post_write_snapshot(NULL) {
: sync(false) {
}
};

@ -62,10 +62,10 @@ inline uint32_t DecodeFixed32(const char* ptr) {
memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load
return result;
} else {
return ((static_cast<uint32_t>(ptr[0]))
| (static_cast<uint32_t>(ptr[1]) << 8)
| (static_cast<uint32_t>(ptr[2]) << 16)
| (static_cast<uint32_t>(ptr[3]) << 24));
return ((static_cast<uint32_t>(static_cast<unsigned char>(ptr[0])))
| (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 8)
| (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 16)
| (static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])) << 24));
}
}

@ -1,5 +1,6 @@
// Copyright 2011 Google Inc. All Rights Reserved.
// Author: sanjay@google.com (Sanjay Ghemawat)
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Logger implementation that can be shared by all environments
// where enough posix functionality is available.

Loading…
Cancel
Save