sync with upstream @ 21409451

Check the NEWS file for details of what changed.

git-svn-id: https://leveldb.googlecode.com/svn/trunk@28 62dab493-f737-651d-591e-8d6aee1b9529
main
dgrogan@chromium.org 14 years ago
parent 3c111335a7
commit da79909507
  1. 17
      NEWS
  2. 64
      db/db_bench.cc
  3. 64
      db/db_impl.cc
  4. 107
      db/db_test.cc
  5. 10
      db/dbformat.h
  6. 116
      db/log_reader.cc
  7. 37
      db/log_reader.h
  8. 143
      db/log_test.cc
  9. 3
      db/log_writer.cc
  10. 14
      db/memtable.cc
  11. 18
      db/memtable.h
  12. 12
      db/repair.cc
  13. 22
      db/snapshot.h
  14. 279
      db/version_set.cc
  15. 38
      db/version_set.h
  16. 136
      db/write_batch.cc
  17. 24
      db/write_batch_internal.h
  18. 8
      db/write_batch_test.cc
  19. 26
      doc/impl.html
  20. 16
      doc/index.html
  21. 4
      include/leveldb/comparator.h
  22. 20
      include/leveldb/db.h
  23. 12
      include/leveldb/env.h
  24. 5
      include/leveldb/iterator.h
  25. 5
      include/leveldb/slice.h
  26. 36
      include/leveldb/status.h
  27. 3
      include/leveldb/table.h
  28. 5
      include/leveldb/table_builder.h
  29. 15
      include/leveldb/write_batch.h
  30. 2
      table/block_builder.cc
  31. 14
      table/table_test.cc
  32. 7
      util/env_chromium.cc
  33. 7
      util/env_posix.cc
  34. 36
      util/status.cc

17
NEWS

@ -0,0 +1,17 @@
Release 1.2 2011-05-16
----------------------
Fixes for larger databases (tested up to one billion 100-byte entries,
i.e., ~100GB).
(1) Place hard limit on number of level-0 files. This fixes errors
of the form "too many open files".
(2) Fixed memtable management. Before the fix, a heavy write burst
could cause unbounded memory usage.
A fix for a logging bug where the reader would incorrectly complain
about corruption.
Allow public access to WriteBatch contents so that users can easily
wrap a DB.

@ -24,9 +24,10 @@
// overwrite -- overwrite N values in random key order in async mode // overwrite -- overwrite N values in random key order in async mode
// fillsync -- write N/100 values in random key order in sync mode // fillsync -- write N/100 values in random key order in sync mode
// fill100K -- write N/1000 100K values in random order in async mode // fill100K -- write N/1000 100K values in random order in async mode
// readseq -- read N values sequentially // readseq -- read N times sequentially
// readreverse -- read N values in reverse order // readreverse -- read N times in reverse order
// readrandom -- read N values in random order // readrandom -- read N times in random order
// readhot -- read N times in random order from 1% section of DB
// crc32c -- repeated crc32c of 4K of data // crc32c -- repeated crc32c of 4K of data
// Meta operations: // Meta operations:
// compact -- Compact the entire DB // compact -- Compact the entire DB
@ -54,6 +55,9 @@ static const char* FLAGS_benchmarks =
// Number of key/values to place in database // Number of key/values to place in database
static int FLAGS_num = 1000000; static int FLAGS_num = 1000000;
// Number of read operations to do. If negative, do FLAGS_num reads.
static int FLAGS_reads = -1;
// Size of each value // Size of each value
static int FLAGS_value_size = 100; static int FLAGS_value_size = 100;
@ -72,6 +76,14 @@ static int FLAGS_write_buffer_size = 0;
// Negative means use default settings. // Negative means use default settings.
static int FLAGS_cache_size = -1; static int FLAGS_cache_size = -1;
// Maximum number of files to keep open at the same time (use default if == 0)
static int FLAGS_open_files = 0;
// If true, do not destroy the existing database. If you set this
// flag and also specify a benchmark that wants a fresh database, that
// benchmark will fail.
static bool FLAGS_use_existing_db = false;
namespace leveldb { namespace leveldb {
// Helper for quickly generating random data. // Helper for quickly generating random data.
@ -126,6 +138,7 @@ class Benchmark {
Cache* cache_; Cache* cache_;
DB* db_; DB* db_;
int num_; int num_;
int reads_;
int heap_counter_; int heap_counter_;
double start_; double start_;
double last_op_finish_; double last_op_finish_;
@ -298,6 +311,7 @@ class Benchmark {
: cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL), : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
db_(NULL), db_(NULL),
num_(FLAGS_num), num_(FLAGS_num),
reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
heap_counter_(0), heap_counter_(0),
bytes_(0), bytes_(0),
rand_(301) { rand_(301) {
@ -308,8 +322,10 @@ class Benchmark {
Env::Default()->DeleteFile("/tmp/dbbench/" + files[i]); Env::Default()->DeleteFile("/tmp/dbbench/" + files[i]);
} }
} }
if (!FLAGS_use_existing_db) {
DestroyDB("/tmp/dbbench", Options()); DestroyDB("/tmp/dbbench", Options());
} }
}
~Benchmark() { ~Benchmark() {
delete db_; delete db_;
@ -355,11 +371,13 @@ class Benchmark {
ReadReverse(); ReadReverse();
} else if (name == Slice("readrandom")) { } else if (name == Slice("readrandom")) {
ReadRandom(); ReadRandom();
} else if (name == Slice("readhot")) {
ReadHot();
} else if (name == Slice("readrandomsmall")) { } else if (name == Slice("readrandomsmall")) {
int n = num_; int n = reads_;
num_ /= 1000; reads_ /= 1000;
ReadRandom(); ReadRandom();
num_ = n; reads_ = n;
} else if (name == Slice("compact")) { } else if (name == Slice("compact")) {
Compact(); Compact();
} else if (name == Slice("crc32c")) { } else if (name == Slice("crc32c")) {
@ -449,7 +467,7 @@ class Benchmark {
void Open() { void Open() {
assert(db_ == NULL); assert(db_ == NULL);
Options options; Options options;
options.create_if_missing = true; options.create_if_missing = !FLAGS_use_existing_db;
options.block_cache = cache_; options.block_cache = cache_;
options.write_buffer_size = FLAGS_write_buffer_size; options.write_buffer_size = FLAGS_write_buffer_size;
Status s = DB::Open(options, "/tmp/dbbench", &db_); Status s = DB::Open(options, "/tmp/dbbench", &db_);
@ -462,6 +480,10 @@ class Benchmark {
void Write(const WriteOptions& options, Order order, DBState state, void Write(const WriteOptions& options, Order order, DBState state,
int num_entries, int value_size, int entries_per_batch) { int num_entries, int value_size, int entries_per_batch) {
if (state == FRESH) { if (state == FRESH) {
if (FLAGS_use_existing_db) {
message_ = "skipping (--use_existing_db is true)";
return;
}
delete db_; delete db_;
db_ = NULL; db_ = NULL;
DestroyDB("/tmp/dbbench", Options()); DestroyDB("/tmp/dbbench", Options());
@ -499,7 +521,7 @@ class Benchmark {
void ReadSequential() { void ReadSequential() {
Iterator* iter = db_->NewIterator(ReadOptions()); Iterator* iter = db_->NewIterator(ReadOptions());
int i = 0; int i = 0;
for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) { for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
bytes_ += iter->key().size() + iter->value().size(); bytes_ += iter->key().size() + iter->value().size();
FinishedSingleOp(); FinishedSingleOp();
++i; ++i;
@ -510,7 +532,7 @@ class Benchmark {
void ReadReverse() { void ReadReverse() {
Iterator* iter = db_->NewIterator(ReadOptions()); Iterator* iter = db_->NewIterator(ReadOptions());
int i = 0; int i = 0;
for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) { for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
bytes_ += iter->key().size() + iter->value().size(); bytes_ += iter->key().size() + iter->value().size();
FinishedSingleOp(); FinishedSingleOp();
++i; ++i;
@ -521,7 +543,7 @@ class Benchmark {
void ReadRandom() { void ReadRandom() {
ReadOptions options; ReadOptions options;
std::string value; std::string value;
for (int i = 0; i < num_; i++) { for (int i = 0; i < reads_; i++) {
char key[100]; char key[100];
const int k = rand_.Next() % FLAGS_num; const int k = rand_.Next() % FLAGS_num;
snprintf(key, sizeof(key), "%016d", k); snprintf(key, sizeof(key), "%016d", k);
@ -530,6 +552,19 @@ class Benchmark {
} }
} }
void ReadHot() {
ReadOptions options;
std::string value;
const int range = (FLAGS_num + 99) / 100;
for (int i = 0; i < reads_; i++) {
char key[100];
const int k = rand_.Next() % range;
snprintf(key, sizeof(key), "%016d", k);
db_->Get(options, key, &value);
FinishedSingleOp();
}
}
void Compact() { void Compact() {
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable(); dbi->TEST_CompactMemTable();
@ -582,6 +617,8 @@ class Benchmark {
int main(int argc, char** argv) { int main(int argc, char** argv) {
FLAGS_write_buffer_size = leveldb::Options().write_buffer_size; FLAGS_write_buffer_size = leveldb::Options().write_buffer_size;
FLAGS_open_files = leveldb::Options().max_open_files;
for (int i = 1; i < argc; i++) { for (int i = 1; i < argc; i++) {
double d; double d;
int n; int n;
@ -593,14 +630,21 @@ int main(int argc, char** argv) {
} else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 && } else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) { (n == 0 || n == 1)) {
FLAGS_histogram = n; FLAGS_histogram = n;
} else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_use_existing_db = n;
} else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) { } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
FLAGS_num = n; FLAGS_num = n;
} else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) {
FLAGS_reads = n;
} else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) { } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) {
FLAGS_value_size = n; FLAGS_value_size = n;
} else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) { } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
FLAGS_write_buffer_size = n; FLAGS_write_buffer_size = n;
} else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) { } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) {
FLAGS_cache_size = n; FLAGS_cache_size = n;
} else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) {
FLAGS_open_files = n;
} else { } else {
fprintf(stderr, "Invalid flag '%s'\n", argv[i]); fprintf(stderr, "Invalid flag '%s'\n", argv[i]);
exit(1); exit(1);

@ -126,6 +126,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
log_(NULL), log_(NULL),
bg_compaction_scheduled_(false), bg_compaction_scheduled_(false),
compacting_(false) { compacting_(false) {
mem_->Ref();
has_imm_.Release_Store(NULL); has_imm_.Release_Store(NULL);
// Reserve ten files or so for other uses and give the rest to TableCache. // Reserve ten files or so for other uses and give the rest to TableCache.
@ -152,8 +153,8 @@ DBImpl::~DBImpl() {
} }
delete versions_; delete versions_;
delete mem_; if (mem_ != NULL) mem_->Unref();
delete imm_; if (imm_ != NULL) imm_->Unref();
delete log_; delete log_;
delete logfile_; delete logfile_;
delete table_cache_; delete table_cache_;
@ -344,7 +345,8 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
// paranoid_checks==false so that corruptions cause entire commits // paranoid_checks==false so that corruptions cause entire commits
// to be skipped instead of propagating bad information (like overly // to be skipped instead of propagating bad information (like overly
// large sequence numbers). // large sequence numbers).
log::Reader reader(file, &reporter, true/*checksum*/); log::Reader reader(file, &reporter, true/*checksum*/,
0/*initial_offset*/);
Log(env_, options_.info_log, "Recovering log #%llu", Log(env_, options_.info_log, "Recovering log #%llu",
(unsigned long long) log_number); (unsigned long long) log_number);
@ -364,6 +366,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
if (mem == NULL) { if (mem == NULL) {
mem = new MemTable(internal_comparator_); mem = new MemTable(internal_comparator_);
mem->Ref();
} }
status = WriteBatchInternal::InsertInto(&batch, mem); status = WriteBatchInternal::InsertInto(&batch, mem);
MaybeIgnoreError(&status); MaybeIgnoreError(&status);
@ -384,7 +387,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
// file-systems cause the DB::Open() to fail. // file-systems cause the DB::Open() to fail.
break; break;
} }
delete mem; mem->Unref();
mem = NULL; mem = NULL;
} }
} }
@ -395,7 +398,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
// file-systems cause the DB::Open() to fail. // file-systems cause the DB::Open() to fail.
} }
delete mem; if (mem != NULL) mem->Unref();
delete file; delete file;
return status; return status;
} }
@ -443,11 +446,12 @@ Status DBImpl::CompactMemTable() {
// Replace immutable memtable with the generated Table // Replace immutable memtable with the generated Table
if (s.ok()) { if (s.ok()) {
edit.SetPrevLogNumber(0); edit.SetPrevLogNumber(0);
s = versions_->LogAndApply(&edit, imm_); s = versions_->LogAndApply(&edit);
} }
if (s.ok()) { if (s.ok()) {
// Commit to the new state // Commit to the new state
imm_->Unref();
imm_ = NULL; imm_ = NULL;
has_imm_.Release_Store(NULL); has_imm_.Release_Store(NULL);
DeleteObsoleteFiles(); DeleteObsoleteFiles();
@ -556,7 +560,7 @@ void DBImpl::BackgroundCompaction() {
c->edit()->DeleteFile(c->level(), f->number); c->edit()->DeleteFile(c->level(), f->number);
c->edit()->AddFile(c->level() + 1, f->number, f->file_size, c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
f->smallest, f->largest); f->smallest, f->largest);
status = versions_->LogAndApply(c->edit(), NULL); status = versions_->LogAndApply(c->edit());
Log(env_, options_.info_log, "Moved #%lld to level-%d %lld bytes %s\n", Log(env_, options_.info_log, "Moved #%lld to level-%d %lld bytes %s\n",
static_cast<unsigned long long>(f->number), static_cast<unsigned long long>(f->number),
c->level() + 1, c->level() + 1,
@ -697,7 +701,7 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) {
} }
compact->outputs.clear(); compact->outputs.clear();
Status s = versions_->LogAndApply(compact->compaction->edit(), NULL); Status s = versions_->LogAndApply(compact->compaction->edit());
if (s.ok()) { if (s.ok()) {
compact->compaction->ReleaseInputs(); compact->compaction->ReleaseInputs();
DeleteObsoleteFiles(); DeleteObsoleteFiles();
@ -754,9 +758,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
} }
Slice key = input->key(); Slice key = input->key();
InternalKey tmp_internal_key; if (compact->compaction->ShouldStopBefore(key) &&
tmp_internal_key.DecodeFrom(key);
if (compact->compaction->ShouldStopBefore(tmp_internal_key) &&
compact->builder != NULL) { compact->builder != NULL) {
status = FinishCompactionOutputFile(compact, input); status = FinishCompactionOutputFile(compact, input);
if (!status.ok()) { if (!status.ok()) {
@ -867,6 +869,9 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
} }
compacting_ = false; compacting_ = false;
compacting_cv_.SignalAll(); compacting_cv_.SignalAll();
VersionSet::LevelSummaryStorage tmp;
Log(env_, options_.info_log,
"compacted to: %s", versions_->LevelSummary(&tmp));
return status; return status;
} }
@ -925,10 +930,11 @@ Status DBImpl::Get(const ReadOptions& options,
Iterator* DBImpl::NewIterator(const ReadOptions& options) { Iterator* DBImpl::NewIterator(const ReadOptions& options) {
SequenceNumber latest_snapshot; SequenceNumber latest_snapshot;
Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot); Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
SequenceNumber sequence = return NewDBIterator(
(options.snapshot ? options.snapshot->number_ : latest_snapshot); &dbname_, env_, user_comparator(), internal_iter,
return NewDBIterator(&dbname_, env_, (options.snapshot != NULL
user_comparator(), internal_iter, sequence); ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
: latest_snapshot));
} }
void DBImpl::Unref(void* arg1, void* arg2) { void DBImpl::Unref(void* arg1, void* arg2) {
@ -945,7 +951,7 @@ const Snapshot* DBImpl::GetSnapshot() {
void DBImpl::ReleaseSnapshot(const Snapshot* s) { void DBImpl::ReleaseSnapshot(const Snapshot* s) {
MutexLock l(&mutex_); MutexLock l(&mutex_);
snapshots_.Delete(s); snapshots_.Delete(reinterpret_cast<const SnapshotImpl*>(s));
} }
// Convenience methods // Convenience methods
@ -985,12 +991,26 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
Status DBImpl::MakeRoomForWrite(bool force) { Status DBImpl::MakeRoomForWrite(bool force) {
mutex_.AssertHeld(); mutex_.AssertHeld();
bool allow_delay = !force;
Status s; Status s;
while (true) { while (true) {
if (!bg_error_.ok()) { if (!bg_error_.ok()) {
// Yield previous error // Yield previous error
s = bg_error_; s = bg_error_;
break; break;
} else if (
allow_delay &&
versions_->NumLevelFiles(0) >= config::kL0_SlowdownWritesTrigger) {
// We are getting close to hitting a hard limit on the number of
// L0 files. Rather than delaying a single write by several
// seconds when we hit the hard limit, start delaying each
// individual write by 1ms to reduce latency variance. Also,
// this delay hands over some CPU to the compaction thread in
// case it is sharing the same core as the writer.
mutex_.Unlock();
env_->SleepForMicroseconds(1000);
allow_delay = false; // Do not delay a single write more than once
mutex_.Lock();
} else if (!force && } else if (!force &&
(mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) { (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
// There is room in current memtable // There is room in current memtable
@ -999,6 +1019,9 @@ Status DBImpl::MakeRoomForWrite(bool force) {
// We have filled up the current memtable, but the previous // We have filled up the current memtable, but the previous
// one is still being compacted, so we wait. // one is still being compacted, so we wait.
compacting_cv_.Wait(); compacting_cv_.Wait();
} else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) {
// There are too many level-0 files.
compacting_cv_.Wait();
} else { } else {
// Attempt to switch to a new memtable and trigger compaction of old // Attempt to switch to a new memtable and trigger compaction of old
assert(versions_->PrevLogNumber() == 0); assert(versions_->PrevLogNumber() == 0);
@ -1011,7 +1034,7 @@ Status DBImpl::MakeRoomForWrite(bool force) {
VersionEdit edit; VersionEdit edit;
edit.SetPrevLogNumber(versions_->LogNumber()); edit.SetPrevLogNumber(versions_->LogNumber());
edit.SetLogNumber(new_log_number); edit.SetLogNumber(new_log_number);
s = versions_->LogAndApply(&edit, NULL); s = versions_->LogAndApply(&edit);
if (!s.ok()) { if (!s.ok()) {
delete lfile; delete lfile;
env_->DeleteFile(LogFileName(dbname_, new_log_number)); env_->DeleteFile(LogFileName(dbname_, new_log_number));
@ -1024,6 +1047,7 @@ Status DBImpl::MakeRoomForWrite(bool force) {
imm_ = mem_; imm_ = mem_;
has_imm_.Release_Store(imm_); has_imm_.Release_Store(imm_);
mem_ = new MemTable(internal_comparator_); mem_ = new MemTable(internal_comparator_);
mem_->Ref();
force = false; // Do not force another compaction if have room force = false; // Do not force another compaction if have room
MaybeScheduleCompaction(); MaybeScheduleCompaction();
} }
@ -1141,10 +1165,11 @@ Status DB::Open(const Options& options, const std::string& dbname,
edit.SetLogNumber(new_log_number); edit.SetLogNumber(new_log_number);
impl->logfile_ = lfile; impl->logfile_ = lfile;
impl->log_ = new log::Writer(lfile); impl->log_ = new log::Writer(lfile);
s = impl->versions_->LogAndApply(&edit, NULL); s = impl->versions_->LogAndApply(&edit);
} }
if (s.ok()) { if (s.ok()) {
impl->DeleteObsoleteFiles(); impl->DeleteObsoleteFiles();
impl->MaybeScheduleCompaction();
} }
} }
impl->mutex_.Unlock(); impl->mutex_.Unlock();
@ -1156,6 +1181,9 @@ Status DB::Open(const Options& options, const std::string& dbname,
return s; return s;
} }
// Definition of Snapshot's destructor. The body is empty: nothing is
// released here.
Snapshot::~Snapshot() {
}
Status DestroyDB(const std::string& dbname, const Options& options) { Status DestroyDB(const std::string& dbname, const Options& options) {
Env* env = options.env; Env* env = options.env;
std::vector<std::string> filenames; std::vector<std::string> filenames;

@ -3,7 +3,6 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "leveldb/db.h" #include "leveldb/db.h"
#include "db/db_impl.h" #include "db/db_impl.h"
#include "db/filename.h" #include "db/filename.h"
#include "db/version_set.h" #include "db/version_set.h"
@ -802,8 +801,17 @@ TEST(DBTest, DBOpen_Options) {
db = NULL; db = NULL;
} }
namespace {
typedef std::map<std::string, std::string> KVMap;
}
class ModelDB: public DB { class ModelDB: public DB {
public: public:
class ModelSnapshot : public Snapshot {
public:
KVMap map_;
};
explicit ModelDB(const Options& options): options_(options) { } explicit ModelDB(const Options& options): options_(options) { }
~ModelDB() { } ~ModelDB() { }
virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) { virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) {
@ -824,35 +832,34 @@ class ModelDB: public DB {
return new ModelIter(saved, true); return new ModelIter(saved, true);
} else { } else {
const KVMap* snapshot_state = const KVMap* snapshot_state =
reinterpret_cast<const KVMap*>(options.snapshot->number_); &(reinterpret_cast<const ModelSnapshot*>(options.snapshot)->map_);
return new ModelIter(snapshot_state, false); return new ModelIter(snapshot_state, false);
} }
} }
virtual const Snapshot* GetSnapshot() { virtual const Snapshot* GetSnapshot() {
KVMap* saved = new KVMap; ModelSnapshot* snapshot = new ModelSnapshot;
*saved = map_; snapshot->map_ = map_;
return snapshots_.New( return snapshot;
reinterpret_cast<SequenceNumber>(saved));
} }
virtual void ReleaseSnapshot(const Snapshot* snapshot) { virtual void ReleaseSnapshot(const Snapshot* snapshot) {
const KVMap* saved = reinterpret_cast<const KVMap*>(snapshot->number_); delete reinterpret_cast<const ModelSnapshot*>(snapshot);
delete saved;
snapshots_.Delete(snapshot);
} }
virtual Status Write(const WriteOptions& options, WriteBatch* batch) { virtual Status Write(const WriteOptions& options, WriteBatch* batch) {
assert(options.post_write_snapshot == NULL); // Not supported assert(options.post_write_snapshot == NULL); // Not supported
for (WriteBatchInternal::Iterator it(*batch); !it.Done(); it.Next()) { class Handler : public WriteBatch::Handler {
switch (it.op()) { public:
case kTypeValue: KVMap* map_;
map_[it.key().ToString()] = it.value().ToString(); virtual void Put(const Slice& key, const Slice& value) {
break; (*map_)[key.ToString()] = value.ToString();
case kTypeDeletion:
map_.erase(it.key().ToString());
break;
} }
virtual void Delete(const Slice& key) {
map_->erase(key.ToString());
} }
return Status::OK(); };
Handler handler;
handler.map_ = &map_;
return batch->Iterate(&handler);
} }
virtual bool GetProperty(const Slice& property, std::string* value) { virtual bool GetProperty(const Slice& property, std::string* value) {
@ -864,7 +871,6 @@ class ModelDB: public DB {
} }
} }
private: private:
typedef std::map<std::string, std::string> KVMap;
class ModelIter: public Iterator { class ModelIter: public Iterator {
public: public:
ModelIter(const KVMap* map, bool owned) ModelIter(const KVMap* map, bool owned)
@ -897,7 +903,6 @@ class ModelDB: public DB {
}; };
const Options options_; const Options options_;
KVMap map_; KVMap map_;
SnapshotList snapshots_;
}; };
static std::string RandomKey(Random* rnd) { static std::string RandomKey(Random* rnd) {
@ -1023,8 +1028,70 @@ TEST(DBTest, Randomized) {
if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
} }
// Renders num as a decimal string left-padded with zeros to 16 characters,
// e.g. 42 -> "0000000000000042".
std::string MakeKey(unsigned int num) {
  char digits[30];
  // snprintf returns the number of characters written (excluding the NUL),
  // which lets the string be built without a second length scan.
  const int written = snprintf(digits, sizeof(digits), "%016u", num);
  return std::string(digits, static_cast<size_t>(written));
}
void BM_LogAndApply(int iters, int num_base_files) {
std::string dbname = test::TmpDir() + "/leveldb_test_benchmark";
DestroyDB(dbname, Options());
DB* db = NULL;
Options opts;
opts.create_if_missing = true;
Status s = DB::Open(opts, dbname, &db);
ASSERT_OK(s);
ASSERT_TRUE(db != NULL);
delete db;
db = NULL;
Env* env = Env::Default();
InternalKeyComparator cmp(BytewiseComparator());
Options options;
VersionSet vset(dbname, &options, NULL, &cmp);
ASSERT_OK(vset.Recover());
VersionEdit vbase;
uint64_t fnum = 1;
for (int i = 0; i < num_base_files; i++) {
InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
vbase.AddFile(2, fnum++, 1 /* file size */, start, limit);
}
ASSERT_OK(vset.LogAndApply(&vbase));
uint64_t start_micros = env->NowMicros();
for (int i = 0; i < iters; i++) {
VersionEdit vedit;
vedit.DeleteFile(2, fnum);
InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
vedit.AddFile(2, fnum++, 1 /* file size */, start, limit);
vset.LogAndApply(&vedit);
}
uint64_t stop_micros = env->NowMicros();
unsigned int us = stop_micros - start_micros;
char buf[16];
snprintf(buf, sizeof(buf), "%d", num_base_files);
fprintf(stderr,
"BM_LogAndApply/%-6s %8d iters : %9u us (%7.0f us / iter)\n",
buf, iters, us, ((float)us) / iters);
}
} }
int main(int argc, char** argv) { int main(int argc, char** argv) {
if (argc > 1 && std::string(argv[1]) == "--benchmark") {
leveldb::BM_LogAndApply(1000, 1);
leveldb::BM_LogAndApply(1000, 100);
leveldb::BM_LogAndApply(1000, 10000);
leveldb::BM_LogAndApply(100, 100000);
return 0;
}
return leveldb::test::RunAllTests(); return leveldb::test::RunAllTests();
} }

@ -19,6 +19,16 @@ namespace leveldb {
// parameters set via options. // parameters set via options.
namespace config { namespace config {
static const int kNumLevels = 7; static const int kNumLevels = 7;
// Level-0 compaction is started when we hit this many files.
static const int kL0_CompactionTrigger = 4;
// Soft limit on number of level-0 files. We slow down writes at this point.
static const int kL0_SlowdownWritesTrigger = 8;
// Maximum number of level-0 files. We stop writes at this point.
static const int kL0_StopWritesTrigger = 12;
} }
class InternalKey; class InternalKey;

@ -4,7 +4,6 @@
#include "db/log_reader.h" #include "db/log_reader.h"
#include <stdint.h>
#include "leveldb/env.h" #include "leveldb/env.h"
#include "util/coding.h" #include "util/coding.h"
#include "util/crc32c.h" #include "util/crc32c.h"
@ -15,46 +14,104 @@ namespace log {
Reader::Reporter::~Reporter() { Reader::Reporter::~Reporter() {
} }
Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum) Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum,
uint64_t initial_offset)
: file_(file), : file_(file),
reporter_(reporter), reporter_(reporter),
checksum_(checksum), checksum_(checksum),
backing_store_(new char[kBlockSize]), backing_store_(new char[kBlockSize]),
buffer_(), buffer_(),
eof_(false) { eof_(false),
last_record_offset_(0),
end_of_buffer_offset_(0),
initial_offset_(initial_offset) {
} }
Reader::~Reader() { Reader::~Reader() {
delete[] backing_store_; delete[] backing_store_;
} }
bool Reader::SkipToInitialBlock() {
size_t offset_in_block = initial_offset_ % kBlockSize;
uint64_t block_start_location = initial_offset_ - offset_in_block;
// Don't search a block if we'd be in the trailer
if (offset_in_block > kBlockSize - 6) {
offset_in_block = 0;
block_start_location += kBlockSize;
}
end_of_buffer_offset_ = block_start_location;
// Skip to start of first block that can contain the initial record
if (block_start_location > 0) {
Status skip_status = file_->Skip(block_start_location);
if (!skip_status.ok()) {
ReportDrop(block_start_location, skip_status);
return false;
}
}
return true;
}
bool Reader::ReadRecord(Slice* record, std::string* scratch) { bool Reader::ReadRecord(Slice* record, std::string* scratch) {
if (last_record_offset_ < initial_offset_) {
if (!SkipToInitialBlock()) {
return false;
}
}
scratch->clear(); scratch->clear();
record->clear(); record->clear();
bool in_fragmented_record = false; bool in_fragmented_record = false;
// Record offset of the logical record that we're reading
// 0 is a dummy value to make compilers happy
uint64_t prospective_record_offset = 0;
Slice fragment; Slice fragment;
while (true) { while (true) {
uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
switch (ReadPhysicalRecord(&fragment)) { switch (ReadPhysicalRecord(&fragment)) {
case kFullType: case kFullType:
if (in_fragmented_record) { if (in_fragmented_record) {
ReportDrop(scratch->size(), "partial record without end"); // Handle bug in earlier versions of log::Writer where
// it could emit an empty kFirstType record at the tail end
// of a block followed by a kFullType or kFirstType record
// at the beginning of the next block.
if (scratch->empty()) {
in_fragmented_record = false;
} else {
ReportCorruption(scratch->size(), "partial record without end(1)");
} }
}
prospective_record_offset = physical_record_offset;
scratch->clear(); scratch->clear();
*record = fragment; *record = fragment;
last_record_offset_ = prospective_record_offset;
return true; return true;
case kFirstType: case kFirstType:
if (in_fragmented_record) { if (in_fragmented_record) {
ReportDrop(scratch->size(), "partial record without end"); // Handle bug in earlier versions of log::Writer where
// it could emit an empty kFirstType record at the tail end
// of a block followed by a kFullType or kFirstType record
// at the beginning of the next block.
if (scratch->empty()) {
in_fragmented_record = false;
} else {
ReportCorruption(scratch->size(), "partial record without end(2)");
}
} }
prospective_record_offset = physical_record_offset;
scratch->assign(fragment.data(), fragment.size()); scratch->assign(fragment.data(), fragment.size());
in_fragmented_record = true; in_fragmented_record = true;
break; break;
case kMiddleType: case kMiddleType:
if (!in_fragmented_record) { if (!in_fragmented_record) {
ReportDrop(fragment.size(), "missing start of fragmented record"); ReportCorruption(fragment.size(),
"missing start of fragmented record(1)");
} else { } else {
scratch->append(fragment.data(), fragment.size()); scratch->append(fragment.data(), fragment.size());
} }
@ -62,31 +119,33 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {
case kLastType: case kLastType:
if (!in_fragmented_record) { if (!in_fragmented_record) {
ReportDrop(fragment.size(), "missing start of fragmented record"); ReportCorruption(fragment.size(),
"missing start of fragmented record(2)");
} else { } else {
scratch->append(fragment.data(), fragment.size()); scratch->append(fragment.data(), fragment.size());
*record = Slice(*scratch); *record = Slice(*scratch);
last_record_offset_ = prospective_record_offset;
return true; return true;
} }
break; break;
case kEof: case kEof:
if (in_fragmented_record) { if (in_fragmented_record) {
ReportDrop(scratch->size(), "partial record without end"); ReportCorruption(scratch->size(), "partial record without end(3)");
scratch->clear(); scratch->clear();
} }
return false; return false;
case kBadRecord: case kBadRecord:
if (in_fragmented_record) { if (in_fragmented_record) {
ReportDrop(scratch->size(), "error in middle of record"); ReportCorruption(scratch->size(), "error in middle of record");
in_fragmented_record = false; in_fragmented_record = false;
scratch->clear(); scratch->clear();
} }
break; break;
default: default:
ReportDrop( ReportCorruption(
(fragment.size() + (in_fragmented_record ? scratch->size() : 0)), (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
"unknown record type"); "unknown record type");
in_fragmented_record = false; in_fragmented_record = false;
@ -97,9 +156,18 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {
return false; return false;
} }
void Reader::ReportDrop(size_t bytes, const char* reason) { uint64_t Reader::LastRecordOffset() {
if (reporter_ != NULL) { return last_record_offset_;
reporter_->Corruption(bytes, Status::Corruption(reason)); }
void Reader::ReportCorruption(size_t bytes, const char* reason) {
ReportDrop(bytes, Status::Corruption(reason));
}
void Reader::ReportDrop(size_t bytes, const Status& reason) {
if (reporter_ != NULL &&
end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
reporter_->Corruption(bytes, reason);
} }
} }
@ -110,11 +178,10 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
// Last read was a full read, so this is a trailer to skip // Last read was a full read, so this is a trailer to skip
buffer_.clear(); buffer_.clear();
Status status = file_->Read(kBlockSize, &buffer_, backing_store_); Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
end_of_buffer_offset_ += buffer_.size();
if (!status.ok()) { if (!status.ok()) {
if (reporter_ != NULL) {
reporter_->Corruption(kBlockSize, status);
}
buffer_.clear(); buffer_.clear();
ReportDrop(kBlockSize, status);
eof_ = true; eof_ = true;
return kEof; return kEof;
} else if (buffer_.size() < kBlockSize) { } else if (buffer_.size() < kBlockSize) {
@ -125,8 +192,9 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
// End of file // End of file
return kEof; return kEof;
} else { } else {
ReportDrop(buffer_.size(), "truncated record at end of file"); size_t drop_size = buffer_.size();
buffer_.clear(); buffer_.clear();
ReportCorruption(drop_size, "truncated record at end of file");
return kEof; return kEof;
} }
} }
@ -138,8 +206,9 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
const unsigned int type = header[6]; const unsigned int type = header[6];
const uint32_t length = a | (b << 8); const uint32_t length = a | (b << 8);
if (kHeaderSize + length > buffer_.size()) { if (kHeaderSize + length > buffer_.size()) {
ReportDrop(buffer_.size(), "bad record length"); size_t drop_size = buffer_.size();
buffer_.clear(); buffer_.clear();
ReportCorruption(drop_size, "bad record length");
return kBadRecord; return kBadRecord;
} }
@ -160,13 +229,22 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
// been corrupted and if we trust it, we could find some // been corrupted and if we trust it, we could find some
// fragment of a real log record that just happens to look // fragment of a real log record that just happens to look
// like a valid log record. // like a valid log record.
ReportDrop(buffer_.size(), "checksum mismatch"); size_t drop_size = buffer_.size();
buffer_.clear(); buffer_.clear();
ReportCorruption(drop_size, "checksum mismatch");
return kBadRecord; return kBadRecord;
} }
} }
buffer_.remove_prefix(kHeaderSize + length); buffer_.remove_prefix(kHeaderSize + length);
// Skip physical record that started before initial_offset_
if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
initial_offset_) {
result->clear();
return kBadRecord;
}
*result = Slice(header + kHeaderSize, length); *result = Slice(header + kHeaderSize, length);
return type; return type;
} }

@ -5,6 +5,8 @@
#ifndef STORAGE_LEVELDB_DB_LOG_READER_H_ #ifndef STORAGE_LEVELDB_DB_LOG_READER_H_
#define STORAGE_LEVELDB_DB_LOG_READER_H_ #define STORAGE_LEVELDB_DB_LOG_READER_H_
#include <stdint.h>
#include "db/log_format.h" #include "db/log_format.h"
#include "leveldb/slice.h" #include "leveldb/slice.h"
#include "leveldb/status.h" #include "leveldb/status.h"
@ -35,7 +37,11 @@ class Reader {
// live while this Reader is in use. // live while this Reader is in use.
// //
// If "checksum" is true, verify checksums if available. // If "checksum" is true, verify checksums if available.
Reader(SequentialFile* file, Reporter* reporter, bool checksum); //
// The Reader will start reading at the first record located at physical
// position >= initial_offset within the file.
Reader(SequentialFile* file, Reporter* reporter, bool checksum,
uint64_t initial_offset);
~Reader(); ~Reader();
@ -46,6 +52,11 @@ class Reader {
// reader or the next mutation to *scratch. // reader or the next mutation to *scratch.
bool ReadRecord(Slice* record, std::string* scratch); bool ReadRecord(Slice* record, std::string* scratch);
// Returns the physical offset of the last record returned by ReadRecord.
//
// Undefined before the first call to ReadRecord.
uint64_t LastRecordOffset();
private: private:
SequentialFile* const file_; SequentialFile* const file_;
Reporter* const reporter_; Reporter* const reporter_;
@ -54,15 +65,37 @@ class Reader {
Slice buffer_; Slice buffer_;
bool eof_; // Last Read() indicated EOF by returning < kBlockSize bool eof_; // Last Read() indicated EOF by returning < kBlockSize
// Offset of the last record returned by ReadRecord.
uint64_t last_record_offset_;
// Offset of the first location past the end of buffer_.
uint64_t end_of_buffer_offset_;
// Offset at which to start looking for the first record to return
uint64_t const initial_offset_;
// Extend record types with the following special values // Extend record types with the following special values
enum { enum {
kEof = kMaxRecordType + 1, kEof = kMaxRecordType + 1,
// Returned whenever we find an invalid physical record.
// Currently there are three situations in which this happens:
// * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
// * The record is a 0-length record (No drop is reported)
// * The record is below constructor's initial_offset (No drop is reported)
kBadRecord = kMaxRecordType + 2 kBadRecord = kMaxRecordType + 2
}; };
// Skips all blocks that are completely before "initial_offset_".
//
// Returns true on success. Handles reporting.
bool SkipToInitialBlock();
// Return type, or one of the preceding special values // Return type, or one of the preceding special values
unsigned int ReadPhysicalRecord(Slice* result); unsigned int ReadPhysicalRecord(Slice* result);
void ReportDrop(size_t bytes, const char* reason);
// Reports dropped bytes to the reporter.
// buffer_ must be updated to remove the dropped bytes prior to invocation.
void ReportCorruption(size_t bytes, const char* reason);
void ReportDrop(size_t bytes, const Status& reason);
// No copying allowed // No copying allowed
Reader(const Reader&); Reader(const Reader&);

@ -60,7 +60,6 @@ class LogTest {
virtual Status Read(size_t n, Slice* result, char* scratch) { virtual Status Read(size_t n, Slice* result, char* scratch) {
ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error"; ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
ASSERT_EQ(kBlockSize, n);
if (force_error_) { if (force_error_) {
force_error_ = false; force_error_ = false;
@ -76,6 +75,17 @@ class LogTest {
contents_.remove_prefix(n); contents_.remove_prefix(n);
return Status::OK(); return Status::OK();
} }
// Advances the in-memory file by "n" bytes.
//
// If fewer than "n" bytes remain, everything that is left is discarded and
// a NotFound status is returned. Otherwise the first "n" bytes are dropped
// and OK is returned.
virtual Status Skip(size_t n) {
  if (n > contents_.size()) {
    contents_.clear();
    return Status::NotFound("in-memory file skipped past end");
  }

  contents_.remove_prefix(n);

  return Status::OK();
}
}; };
class ReportCollector : public Reader::Reporter { class ReportCollector : public Reader::Reporter {
@ -97,10 +107,15 @@ class LogTest {
Writer writer_; Writer writer_;
Reader reader_; Reader reader_;
// Record metadata for testing initial offset functionality
static size_t initial_offset_record_sizes_[];
static uint64_t initial_offset_last_record_offsets_[];
public: public:
LogTest() : reading_(false), LogTest() : reading_(false),
writer_(&dest_), writer_(&dest_),
reader_(&source_, &report_, true/*checksum*/) { reader_(&source_, &report_, true/*checksum*/,
0/*initial_offset*/) {
} }
void Write(const std::string& msg) { void Write(const std::string& msg) {
@ -153,6 +168,10 @@ class LogTest {
return report_.dropped_bytes_; return report_.dropped_bytes_;
} }
// Returns a copy of the message text currently held by the report collector.
std::string ReportMessage() const {
  const std::string& collected = report_.message_;
  return collected;
}
// Returns OK iff recorded error message contains "msg" // Returns OK iff recorded error message contains "msg"
std::string MatchError(const std::string& msg) const { std::string MatchError(const std::string& msg) const {
if (report_.message_.find(msg) == std::string::npos) { if (report_.message_.find(msg) == std::string::npos) {
@ -161,8 +180,61 @@ class LogTest {
return "OK"; return "OK";
} }
} }
// Writes the four records described by initial_offset_record_sizes_.
// Record i consists solely of the character 'a' + i, so a test can tell
// which record it got back by looking at a single byte.
void WriteInitialOffsetLog() {
  int index = 0;
  while (index < 4) {
    const char fill = static_cast<char>('a' + index);
    const std::string payload(initial_offset_record_sizes_[index], fill);
    Write(payload);
    ++index;
  }
}
// Verifies that a Reader whose initial offset is "offset_past_end" bytes
// beyond WrittenBytes() (i.e. at or past the end of the log written by
// WriteInitialOffsetLog()) returns no records.
void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
  WriteInitialOffsetLog();
  reading_ = true;
  source_.contents_ = Slice(dest_.contents_);

  // Automatic storage: the reader is destroyed on every exit path, so no
  // explicit delete is required.
  Reader offset_reader(&source_, &report_, true/*checksum*/,
                       WrittenBytes() + offset_past_end);
  Slice record;
  std::string scratch;
  ASSERT_TRUE(!offset_reader.ReadRecord(&record, &scratch));
}
// Verifies that a Reader started at "initial_offset" returns, as its first
// record, record number "expected_record_offset" of the log written by
// WriteInitialOffsetLog().
//
// Checks the record's size, the physical offset reported by
// LastRecordOffset(), and the record's first byte (record i is filled with
// 'a' + i).
void CheckInitialOffsetRecord(uint64_t initial_offset,
                              int expected_record_offset) {
  WriteInitialOffsetLog();
  reading_ = true;
  source_.contents_ = Slice(dest_.contents_);

  // Automatic storage: the reader is destroyed on every exit path, so no
  // explicit delete is required.
  Reader offset_reader(&source_, &report_, true/*checksum*/,
                       initial_offset);
  Slice record;
  std::string scratch;
  ASSERT_TRUE(offset_reader.ReadRecord(&record, &scratch));
  ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset],
            record.size());
  ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
            offset_reader.LastRecordOffset());
  ASSERT_EQ(static_cast<char>('a' + expected_record_offset),
            record.data()[0]);
}
}; };
// Payload sizes, in write order, of the four records emitted by
// WriteInitialOffsetLog(). Indexed by record number.
size_t LogTest::initial_offset_record_sizes_[] =
{10000, // Two sizable records in first block
10000,
2 * log::kBlockSize - 1000, // Span three blocks
1};
// Physical offset at which each record written by WriteInitialOffsetLog()
// begins. CheckInitialOffsetRecord() compares these values against
// LastRecordOffset() after reading the record with the same index.
uint64_t LogTest::initial_offset_last_record_offsets_[] =
{0,
kHeaderSize + 10000,
2 * (kHeaderSize + 10000),
// Fourth record: follows the third record's payload plus three headers
// (presumably one header per block the third record spans -- confirm
// against the writer).
2 * (kHeaderSize + 10000) +
(2 * log::kBlockSize - 1000) + 3 * kHeaderSize};
TEST(LogTest, Empty) { TEST(LogTest, Empty) {
ASSERT_EQ("EOF", Read()); ASSERT_EQ("EOF", Read());
} }
@ -213,6 +285,19 @@ TEST(LogTest, MarginalTrailer) {
ASSERT_EQ("EOF", Read()); ASSERT_EQ("EOF", Read());
} }
TEST(LogTest, MarginalTrailer2) {
  // Size the first record so that the space left in the block is exactly
  // one header, i.e. the same length as an empty record.
  const int first_len = kBlockSize - 2*kHeaderSize;
  Write(BigString("foo", first_len));
  ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes());
  Write("bar");

  // Both records must come back intact ...
  ASSERT_EQ(BigString("foo", first_len), Read());
  ASSERT_EQ("bar", Read());
  ASSERT_EQ("EOF", Read());

  // ... and nothing may have been reported as dropped.
  ASSERT_EQ(0, DroppedBytes());
  ASSERT_EQ("", ReportMessage());
}
TEST(LogTest, ShortTrailer) { TEST(LogTest, ShortTrailer) {
const int n = kBlockSize - 2*kHeaderSize + 4; const int n = kBlockSize - 2*kHeaderSize + 4;
Write(BigString("foo", n)); Write(BigString("foo", n));
@ -353,6 +438,60 @@ TEST(LogTest, ErrorJoinsRecords) {
ASSERT_GE(dropped, 2*kBlockSize); ASSERT_GE(dropped, 2*kBlockSize);
} }
// Initial-offset tests. Each one calls CheckInitialOffsetRecord(offset, index),
// which asserts that the first record returned by a Reader started at
// "offset" is record "index" of the log written by WriteInitialOffsetLog().

// Offset 0 is the start of record 0.
TEST(LogTest, ReadStart) {
CheckInitialOffsetRecord(0, 0);
}
// One byte past the start of record 0: the next whole record is record 1.
TEST(LogTest, ReadSecondOneOff) {
CheckInitialOffsetRecord(1, 1);
}
// Still inside record 0 (its payload alone is 10000 bytes).
TEST(LogTest, ReadSecondTenThousand) {
CheckInitialOffsetRecord(10000, 1);
}
// Presumably kHeaderSize + 10000, i.e. exactly the start of record 1
// (assumes kHeaderSize is 7 -- confirm against log_format.h).
TEST(LogTest, ReadSecondStart) {
CheckInitialOffsetRecord(10007, 1);
}
// One byte past the start of record 1: the next whole record is record 2.
TEST(LogTest, ReadThirdOneOff) {
CheckInitialOffsetRecord(10008, 2);
}
// Presumably 2 * (kHeaderSize + 10000), i.e. exactly the start of record 2.
TEST(LogTest, ReadThirdStart) {
CheckInitialOffsetRecord(20014, 2);
}
// One byte past the start of record 2: the next whole record is record 3.
TEST(LogTest, ReadFourthOneOff) {
CheckInitialOffsetRecord(20015, 3);
}
// Four bytes before the end of the first block; record 3 is expected.
TEST(LogTest, ReadFourthFirstBlockTrailer) {
CheckInitialOffsetRecord(log::kBlockSize - 4, 3);
}
// Just inside the second block; record 3 is expected.
TEST(LogTest, ReadFourthMiddleBlock) {
CheckInitialOffsetRecord(log::kBlockSize + 1, 3);
}
// Just inside the third block; record 3 is expected.
TEST(LogTest, ReadFourthLastBlock) {
CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3);
}
// Starting exactly at the first byte of the fourth record must return that
// record. The offset is the same expression as the last entry of
// initial_offset_last_record_offsets_: two 10000-byte records with their
// headers, then the third record's payload and its three headers.
TEST(LogTest, ReadFourthStart) {
  CheckInitialOffsetRecord(
      2 * (kHeaderSize + 10000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
      3);
}
// An initial offset equal to the number of bytes written yields no records.
TEST(LogTest, ReadEnd) {
CheckOffsetPastEndReturnsNoRecords(0);
}
// An initial offset five bytes beyond the written data yields no records.
TEST(LogTest, ReadPastEnd) {
CheckOffsetPastEndReturnsNoRecords(5);
}
} }
} }

@ -32,6 +32,7 @@ Status Writer::AddRecord(const Slice& slice) {
// is empty, we still want to iterate once to emit a single // is empty, we still want to iterate once to emit a single
// zero-length record // zero-length record
Status s; Status s;
bool begin = true;
do { do {
const int leftover = kBlockSize - block_offset_; const int leftover = kBlockSize - block_offset_;
assert(leftover >= 0); assert(leftover >= 0);
@ -52,7 +53,6 @@ Status Writer::AddRecord(const Slice& slice) {
const size_t fragment_length = (left < avail) ? left : avail; const size_t fragment_length = (left < avail) ? left : avail;
RecordType type; RecordType type;
const bool begin = (ptr == slice.data());
const bool end = (left == fragment_length); const bool end = (left == fragment_length);
if (begin && end) { if (begin && end) {
type = kFullType; type = kFullType;
@ -67,6 +67,7 @@ Status Writer::AddRecord(const Slice& slice) {
s = EmitPhysicalRecord(type, ptr, fragment_length); s = EmitPhysicalRecord(type, ptr, fragment_length);
ptr += fragment_length; ptr += fragment_length;
left -= fragment_length; left -= fragment_length;
begin = false;
} while (s.ok() && left > 0); } while (s.ok() && left > 0);
return s; return s;
} }

@ -20,10 +20,12 @@ static Slice GetLengthPrefixedSlice(const char* data) {
MemTable::MemTable(const InternalKeyComparator& cmp) MemTable::MemTable(const InternalKeyComparator& cmp)
: comparator_(cmp), : comparator_(cmp),
refs_(0),
table_(comparator_, &arena_) { table_(comparator_, &arena_) {
} }
MemTable::~MemTable() { MemTable::~MemTable() {
assert(refs_ == 0);
} }
size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); } size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); }
@ -48,10 +50,15 @@ static const char* EncodeKey(std::string* scratch, const Slice& target) {
class MemTableIterator: public Iterator { class MemTableIterator: public Iterator {
public: public:
explicit MemTableIterator(MemTable::Table* table) { explicit MemTableIterator(MemTable* mem, MemTable::Table* table) {
mem_ = mem;
iter_ = new MemTable::Table::Iterator(table); iter_ = new MemTable::Table::Iterator(table);
mem->Ref();
}
virtual ~MemTableIterator() {
delete iter_;
mem_->Unref();
} }
virtual ~MemTableIterator() { delete iter_; }
virtual bool Valid() const { return iter_->Valid(); } virtual bool Valid() const { return iter_->Valid(); }
virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); } virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); }
@ -68,6 +75,7 @@ class MemTableIterator: public Iterator {
virtual Status status() const { return Status::OK(); } virtual Status status() const { return Status::OK(); }
private: private:
MemTable* mem_;
MemTable::Table::Iterator* iter_; MemTable::Table::Iterator* iter_;
std::string tmp_; // For passing to EncodeKey std::string tmp_; // For passing to EncodeKey
@ -77,7 +85,7 @@ class MemTableIterator: public Iterator {
}; };
Iterator* MemTable::NewIterator() { Iterator* MemTable::NewIterator() {
return new MemTableIterator(&table_); return new MemTableIterator(this, &table_);
} }
void MemTable::Add(SequenceNumber s, ValueType type, void MemTable::Add(SequenceNumber s, ValueType type,

@ -19,8 +19,21 @@ class MemTableIterator;
class MemTable { class MemTable {
public: public:
// MemTables are reference counted. The initial reference count
// is zero and the caller must call Ref() at least once.
explicit MemTable(const InternalKeyComparator& comparator); explicit MemTable(const InternalKeyComparator& comparator);
~MemTable();
// Increase reference count.
void Ref() { ++refs_; }
// Releases one reference. When the count is no longer positive the object
// deletes itself, so callers must not touch it after their final Unref().
void Unref() {
  refs_ -= 1;
  assert(refs_ >= 0);
  if (refs_ > 0) {
    return;
  }
  delete this;
}
// Returns an estimate of the number of bytes of data in use by this // Returns an estimate of the number of bytes of data in use by this
// data structure. // data structure.
@ -45,6 +58,8 @@ class MemTable {
const Slice& value); const Slice& value);
private: private:
~MemTable(); // Private since only Unref() should be used to delete it
struct KeyComparator { struct KeyComparator {
const InternalKeyComparator comparator; const InternalKeyComparator comparator;
explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { } explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
@ -56,6 +71,7 @@ class MemTable {
typedef SkipList<const char*, KeyComparator> Table; typedef SkipList<const char*, KeyComparator> Table;
KeyComparator comparator_; KeyComparator comparator_;
int refs_;
Arena arena_; Arena arena_;
Table table_; Table table_;

@ -183,13 +183,15 @@ class Repairer {
// corruptions cause entire commits to be skipped instead of // corruptions cause entire commits to be skipped instead of
// propagating bad information (like overly large sequence // propagating bad information (like overly large sequence
// numbers). // numbers).
log::Reader reader(lfile, &reporter, false/*do not checksum*/); log::Reader reader(lfile, &reporter, false/*do not checksum*/,
0/*initial_offset*/);
// Read all the records and add to a memtable // Read all the records and add to a memtable
std::string scratch; std::string scratch;
Slice record; Slice record;
WriteBatch batch; WriteBatch batch;
MemTable mem(icmp_); MemTable* mem = new MemTable(icmp_);
mem->Ref();
int counter = 0; int counter = 0;
while (reader.ReadRecord(&record, &scratch)) { while (reader.ReadRecord(&record, &scratch)) {
if (record.size() < 12) { if (record.size() < 12) {
@ -198,7 +200,7 @@ class Repairer {
continue; continue;
} }
WriteBatchInternal::SetContents(&batch, record); WriteBatchInternal::SetContents(&batch, record);
status = WriteBatchInternal::InsertInto(&batch, &mem); status = WriteBatchInternal::InsertInto(&batch, mem);
if (status.ok()) { if (status.ok()) {
counter += WriteBatchInternal::Count(&batch); counter += WriteBatchInternal::Count(&batch);
} else { } else {
@ -215,10 +217,12 @@ class Repairer {
VersionEdit skipped; VersionEdit skipped;
FileMetaData meta; FileMetaData meta;
meta.number = next_file_number_++; meta.number = next_file_number_++;
Iterator* iter = mem.NewIterator(); Iterator* iter = mem->NewIterator();
status = BuildTable(dbname_, env_, options_, table_cache_, iter, status = BuildTable(dbname_, env_, options_, table_cache_, iter,
&meta, &skipped); &meta, &skipped);
delete iter; delete iter;
mem->Unref();
mem = NULL;
if (status.ok()) { if (status.ok()) {
if (meta.file_size > 0) { if (meta.file_size > 0) {
table_numbers_.push_back(meta.number); table_numbers_.push_back(meta.number);

@ -12,17 +12,17 @@ namespace leveldb {
class SnapshotList; class SnapshotList;
// Snapshots are kept in a doubly-linked list in the DB. // Snapshots are kept in a doubly-linked list in the DB.
// Each Snapshot corresponds to a particular sequence number. // Each SnapshotImpl corresponds to a particular sequence number.
class Snapshot { class SnapshotImpl : public Snapshot {
public: public:
SequenceNumber number_; // const after creation SequenceNumber number_; // const after creation
private: private:
friend class SnapshotList; friend class SnapshotList;
// Snapshot is kept in a doubly-linked circular list // SnapshotImpl is kept in a doubly-linked circular list
Snapshot* prev_; SnapshotImpl* prev_;
Snapshot* next_; SnapshotImpl* next_;
SnapshotList* list_; // just for sanity checks SnapshotList* list_; // just for sanity checks
}; };
@ -35,11 +35,11 @@ class SnapshotList {
} }
bool empty() const { return list_.next_ == &list_; } bool empty() const { return list_.next_ == &list_; }
Snapshot* oldest() const { assert(!empty()); return list_.next_; } SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
Snapshot* newest() const { assert(!empty()); return list_.prev_; } SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }
const Snapshot* New(SequenceNumber seq) { const SnapshotImpl* New(SequenceNumber seq) {
Snapshot* s = new Snapshot; SnapshotImpl* s = new SnapshotImpl;
s->number_ = seq; s->number_ = seq;
s->list_ = this; s->list_ = this;
s->next_ = &list_; s->next_ = &list_;
@ -49,7 +49,7 @@ class SnapshotList {
return s; return s;
} }
void Delete(const Snapshot* s) { void Delete(const SnapshotImpl* s) {
assert(s->list_ == this); assert(s->list_ == this);
s->prev_->next_ = s->next_; s->prev_->next_ = s->next_;
s->next_->prev_ = s->prev_; s->next_->prev_ = s->prev_;
@ -58,7 +58,7 @@ class SnapshotList {
private: private:
// Dummy head of doubly-linked list of snapshots // Dummy head of doubly-linked list of snapshots
Snapshot list_; SnapshotImpl list_;
}; };
} }

@ -57,17 +57,22 @@ std::string IntSetToString(const std::set<uint64_t>& s) {
Version::~Version() { Version::~Version() {
assert(refs_ == 0); assert(refs_ == 0);
// Remove from linked list
prev_->next_ = next_;
next_->prev_ = prev_;
// Drop references to files
for (int level = 0; level < config::kNumLevels; level++) { for (int level = 0; level < config::kNumLevels; level++) {
for (size_t i = 0; i < files_[level].size(); i++) { for (size_t i = 0; i < files_[level].size(); i++) {
FileMetaData* f = files_[level][i]; FileMetaData* f = files_[level][i];
assert(f->refs >= 0); assert(f->refs > 0);
f->refs--; f->refs--;
if (f->refs <= 0) { if (f->refs <= 0) {
delete f; delete f;
} }
} }
} }
delete cleanup_mem_;
} }
// An internal iterator. For a given version/level pair, yields // An internal iterator. For a given version/level pair, yields
@ -77,9 +82,9 @@ Version::~Version() {
// encoded using EncodeFixed64. // encoded using EncodeFixed64.
class Version::LevelFileNumIterator : public Iterator { class Version::LevelFileNumIterator : public Iterator {
public: public:
LevelFileNumIterator(const Version* version, LevelFileNumIterator(const InternalKeyComparator& icmp,
const std::vector<FileMetaData*>* flist) const std::vector<FileMetaData*>* flist)
: icmp_(version->vset_->icmp_.user_comparator()), : icmp_(icmp),
flist_(flist), flist_(flist),
index_(flist->size()) { // Marks as invalid index_(flist->size()) { // Marks as invalid
} }
@ -157,7 +162,7 @@ static Iterator* GetFileIterator(void* arg,
Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
int level) const { int level) const {
return NewTwoLevelIterator( return NewTwoLevelIterator(
new LevelFileNumIterator(this, &files_[level]), new LevelFileNumIterator(vset_->icmp_, &files_[level]),
&GetFileIterator, vset_->table_cache_, options); &GetFileIterator, vset_->table_cache_, options);
} }
@ -185,11 +190,11 @@ void Version::Ref() {
} }
void Version::Unref() { void Version::Unref() {
assert(this != &vset_->dummy_versions_);
assert(refs_ >= 1); assert(refs_ >= 1);
--refs_; --refs_;
if (refs_ == 0) { if (refs_ == 0) {
vset_->MaybeDeleteOldVersions(); delete this;
// TODO: try to delete obsolete files
} }
} }
@ -222,37 +227,58 @@ std::string Version::DebugString() const {
// Versions that contain full copies of the intermediate state. // Versions that contain full copies of the intermediate state.
class VersionSet::Builder { class VersionSet::Builder {
private: private:
typedef std::map<uint64_t, FileMetaData*> FileMap; // Helper to sort by v->files_[file_number].smallest
struct BySmallestKey {
  // Comparator used to order the files' smallest keys.
  const InternalKeyComparator* internal_comparator;

  // Strict ordering: primarily by smallest key, with equal smallest keys
  // ordered by ascending file number.
  bool operator()(FileMetaData* f1, FileMetaData* f2) const {
    const int order =
        internal_comparator->Compare(f1->smallest, f2->smallest);
    if (order == 0) {
      // Break ties by file number
      return f1->number < f2->number;
    }
    return order < 0;
  }
};
typedef std::set<FileMetaData*, BySmallestKey> FileSet;
// Per-level changes accumulated by the builder.
struct LevelState {
// Numbers of the files recorded as deleted at this level.
std::set<uint64_t> deleted_files;
// Files added at this level, kept in BySmallestKey order.
FileSet* added_files;
};
VersionSet* vset_; VersionSet* vset_;
FileMap files_[config::kNumLevels]; Version* base_;
LevelState levels_[config::kNumLevels];
public: public:
// Initialize a builder with the files from *base and other info from *vset // Initialize a builder with the files from *base and other info from *vset
Builder(VersionSet* vset, Version* base) Builder(VersionSet* vset, Version* base)
: vset_(vset) { : vset_(vset),
base_(base) {
base_->Ref();
BySmallestKey cmp;
cmp.internal_comparator = &vset_->icmp_;
for (int level = 0; level < config::kNumLevels; level++) { for (int level = 0; level < config::kNumLevels; level++) {
const std::vector<FileMetaData*>& files = base->files_[level]; levels_[level].added_files = new FileSet(cmp);
for (size_t i = 0; i < files.size(); i++) {
FileMetaData* f = files[i];
f->refs++;
files_[level].insert(std::make_pair(f->number, f));
}
} }
} }
~Builder() { ~Builder() {
for (int level = 0; level < config::kNumLevels; level++) { for (int level = 0; level < config::kNumLevels; level++) {
const FileMap& fmap = files_[level]; std::vector<FileMetaData*> to_unref(levels_[level].added_files->begin(),
for (FileMap::const_iterator iter = fmap.begin(); levels_[level].added_files->end());
iter != fmap.end(); delete levels_[level].added_files;
++iter) { for (int i = 0; i < to_unref.size(); i++) {
FileMetaData* f = iter->second; FileMetaData* f = to_unref[i];
f->refs--; f->refs--;
if (f->refs <= 0) { if (f->refs <= 0) {
delete f; delete f;
} }
} }
} }
base_->Unref();
} }
// Apply all of the edits in *edit to the current state. // Apply all of the edits in *edit to the current state.
@ -271,16 +297,7 @@ class VersionSet::Builder {
++iter) { ++iter) {
const int level = iter->first; const int level = iter->first;
const uint64_t number = iter->second; const uint64_t number = iter->second;
FileMap::iterator fiter = files_[level].find(number); levels_[level].deleted_files.insert(number);
assert(fiter != files_[level].end()); // Sanity check for debug mode
if (fiter != files_[level].end()) {
FileMetaData* f = fiter->second;
f->refs--;
if (f->refs <= 0) {
delete f;
}
files_[level].erase(fiter);
}
} }
// Add new files // Add new files
@ -288,24 +305,68 @@ class VersionSet::Builder {
const int level = edit->new_files_[i].first; const int level = edit->new_files_[i].first;
FileMetaData* f = new FileMetaData(edit->new_files_[i].second); FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
f->refs = 1; f->refs = 1;
assert(files_[level].count(f->number) == 0); levels_[level].deleted_files.erase(f->number);
files_[level].insert(std::make_pair(f->number, f)); levels_[level].added_files->insert(f);
} }
} }
// Save the current state in *v. // Save the current state in *v.
void SaveTo(Version* v) { void SaveTo(Version* v) {
BySmallestKey cmp;
cmp.internal_comparator = &vset_->icmp_;
for (int level = 0; level < config::kNumLevels; level++) { for (int level = 0; level < config::kNumLevels; level++) {
const FileMap& fmap = files_[level]; // Merge the set of added files with the set of pre-existing files.
for (FileMap::const_iterator iter = fmap.begin(); // Drop any deleted files. Store the result in *v.
iter != fmap.end(); const std::vector<FileMetaData*>& base_files = base_->files_[level];
++iter) { std::vector<FileMetaData*>::const_iterator base_iter = base_files.begin();
FileMetaData* f = iter->second; std::vector<FileMetaData*>::const_iterator base_end = base_files.end();
const FileSet* added = levels_[level].added_files;
v->files_[level].reserve(base_files.size() + added->size());
for (FileSet::const_iterator added_iter = added->begin();
added_iter != added->end();
++added_iter) {
// Add all smaller files listed in base_
for (std::vector<FileMetaData*>::const_iterator bpos
= std::upper_bound(base_iter, base_end, *added_iter, cmp);
base_iter != bpos;
++base_iter) {
MaybeAddFile(v, level, *base_iter);
}
MaybeAddFile(v, level, *added_iter);
}
// Add remaining base files
for (; base_iter != base_end; ++base_iter) {
MaybeAddFile(v, level, *base_iter);
}
#ifndef NDEBUG
// Make sure there is no overlap in levels > 0
if (level > 0) {
for (int i = 1; i < v->files_[level].size(); i++) {
const InternalKey& prev_end = v->files_[level][i-1]->largest;
const InternalKey& this_begin = v->files_[level][i]->smallest;
if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) {
fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
EscapeString(prev_end.Encode()).c_str(),
EscapeString(this_begin.Encode()).c_str());
abort();
}
}
}
#endif
}
}
void MaybeAddFile(Version* v, int level, FileMetaData* f) {
if (levels_[level].deleted_files.count(f->number) > 0) {
// File is deleted: do nothing
} else {
f->refs++; f->refs++;
v->files_[level].push_back(f); v->files_[level].push_back(f);
} }
} }
}
}; };
VersionSet::VersionSet(const std::string& dbname, VersionSet::VersionSet(const std::string& dbname,
@ -324,22 +385,36 @@ VersionSet::VersionSet(const std::string& dbname,
prev_log_number_(0), prev_log_number_(0),
descriptor_file_(NULL), descriptor_file_(NULL),
descriptor_log_(NULL), descriptor_log_(NULL),
current_(new Version(this)), dummy_versions_(this),
oldest_(current_) { current_(NULL) {
AppendVersion(new Version(this));
} }
VersionSet::~VersionSet() { VersionSet::~VersionSet() {
for (Version* v = oldest_; v != NULL; ) { current_->Unref();
Version* next = v->next_; assert(dummy_versions_.next_ == &dummy_versions_); // List must be empty
assert(v->refs_ == 0);
delete v;
v = next;
}
delete descriptor_log_; delete descriptor_log_;
delete descriptor_file_; delete descriptor_file_;
} }
Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) { void VersionSet::AppendVersion(Version* v) {
// Make "v" current
assert(v->refs_ == 0);
assert(v != current_);
if (current_ != NULL) {
current_->Unref();
}
current_ = v;
v->Ref();
// Append to linked list
v->prev_ = dummy_versions_.prev_;
v->next_ = &dummy_versions_;
v->prev_->next_ = v;
v->next_->prev_ = v;
}
Status VersionSet::LogAndApply(VersionEdit* edit) {
if (edit->has_log_number_) { if (edit->has_log_number_) {
assert(edit->log_number_ >= log_number_); assert(edit->log_number_ >= log_number_);
assert(edit->log_number_ < next_file_number_); assert(edit->log_number_ < next_file_number_);
@ -360,13 +435,12 @@ Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
builder.Apply(edit); builder.Apply(edit);
builder.SaveTo(v); builder.SaveTo(v);
} }
Finalize(v);
std::string new_manifest_file;
Status s = Finalize(v);
// Initialize new descriptor log file if necessary by creating // Initialize new descriptor log file if necessary by creating
// a temporary file that contains a snapshot of the current version. // a temporary file that contains a snapshot of the current version.
if (s.ok()) { std::string new_manifest_file;
Status s;
if (descriptor_log_ == NULL) { if (descriptor_log_ == NULL) {
assert(descriptor_file_ == NULL); assert(descriptor_file_ == NULL);
new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_); new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
@ -377,7 +451,6 @@ Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
s = WriteSnapshot(descriptor_log_); s = WriteSnapshot(descriptor_log_);
} }
} }
}
// Write new record to MANIFEST log // Write new record to MANIFEST log
if (s.ok()) { if (s.ok()) {
@ -397,12 +470,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
// Install the new version // Install the new version
if (s.ok()) { if (s.ok()) {
assert(current_->next_ == NULL); AppendVersion(v);
assert(current_->cleanup_mem_ == NULL);
current_->cleanup_mem_ = cleanup_mem;
v->next_ = NULL;
current_->next_ = v;
current_ = v;
log_number_ = edit->log_number_; log_number_ = edit->log_number_;
prev_log_number_ = edit->prev_log_number_; prev_log_number_ = edit->prev_log_number_;
} else { } else {
@ -458,7 +526,7 @@ Status VersionSet::Recover() {
{ {
LogReporter reporter; LogReporter reporter;
reporter.status = &s; reporter.status = &s;
log::Reader reader(file, &reporter, true/*checksum*/); log::Reader reader(file, &reporter, true/*checksum*/, 0/*initial_offset*/);
Slice record; Slice record;
std::string scratch; std::string scratch;
while (reader.ReadRecord(&record, &scratch) && s.ok()) { while (reader.ReadRecord(&record, &scratch) && s.ok()) {
@ -518,21 +586,15 @@ Status VersionSet::Recover() {
if (s.ok()) { if (s.ok()) {
Version* v = new Version(this); Version* v = new Version(this);
builder.SaveTo(v); builder.SaveTo(v);
s = Finalize(v);
if (!s.ok()) {
delete v;
} else {
// Install recovered version // Install recovered version
v->next_ = NULL; Finalize(v);
current_->next_ = v; AppendVersion(v);
current_ = v;
manifest_file_number_ = next_file; manifest_file_number_ = next_file;
next_file_number_ = next_file + 1; next_file_number_ = next_file + 1;
last_sequence_ = last_sequence; last_sequence_ = last_sequence;
log_number_ = log_number; log_number_ = log_number;
prev_log_number_ = prev_log_number; prev_log_number_ = prev_log_number;
} }
}
return s; return s;
} }
@ -545,15 +607,12 @@ static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
return sum; return sum;
} }
Status VersionSet::Finalize(Version* v) { void VersionSet::Finalize(Version* v) {
// Precomputed best level for next compaction // Precomputed best level for next compaction
int best_level = -1; int best_level = -1;
double best_score = -1; double best_score = -1;
Status s; for (int level = 0; level < config::kNumLevels-1; level++) {
for (int level = 0; s.ok() && level < config::kNumLevels-1; level++) {
s = SortLevel(v, level);
double score; double score;
if (level == 0) { if (level == 0) {
// We treat level-0 specially by bounding the number of files // We treat level-0 specially by bounding the number of files
@ -567,7 +626,8 @@ Status VersionSet::Finalize(Version* v) {
// file size is small (perhaps because of a small write-buffer // file size is small (perhaps because of a small write-buffer
// setting, or very high compression ratios, or lots of // setting, or very high compression ratios, or lots of
// overwrites/deletions). // overwrites/deletions).
score = v->files_[level].size() / 4.0; score = v->files_[level].size() /
static_cast<double>(config::kL0_CompactionTrigger);
} else { } else {
// Compute the ratio of current size to size limit. // Compute the ratio of current size to size limit.
const uint64_t level_bytes = TotalFileSize(v->files_[level]); const uint64_t level_bytes = TotalFileSize(v->files_[level]);
@ -582,7 +642,6 @@ Status VersionSet::Finalize(Version* v) {
v->compaction_level_ = best_level; v->compaction_level_ = best_level;
v->compaction_score_ = best_score; v->compaction_score_ = best_score;
return s;
} }
Status VersionSet::WriteSnapshot(log::Writer* log) { Status VersionSet::WriteSnapshot(log::Writer* log) {
@ -615,44 +674,27 @@ Status VersionSet::WriteSnapshot(log::Writer* log) {
return log->AddRecord(record); return log->AddRecord(record);
} }
// Comparison functor used when sorting the files of a level: returns true
// when the first file's smallest key orders before the second file's
// smallest key under internal_comparator.
struct VersionSet::BySmallestKey {
  const InternalKeyComparator* internal_comparator;

  bool operator()(FileMetaData* lhs, FileMetaData* rhs) const {
    const int order =
        internal_comparator->Compare(lhs->smallest, rhs->smallest);
    return order < 0;
  }
};
// Sorts the files of "level" in version "v" by their smallest key.  For any
// level other than 0, also checks that neighbouring files do not overlap and
// returns a Corruption status naming the first offending pair of keys.
// Returns an OK status otherwise.
Status VersionSet::SortLevel(Version* v, uint64_t level) {
  std::vector<FileMetaData*>& files = v->files_[level];

  BySmallestKey by_smallest;
  by_smallest.internal_comparator = &icmp_;
  std::sort(files.begin(), files.end(), by_smallest);

  if (level == 0) {
    // Level 0 is exempt from the overlap check.
    return Status::OK();
  }

  // After sorting, each file must end strictly before the next one begins.
  for (size_t idx = 1; idx < files.size(); idx++) {
    const InternalKey& left_largest = files[idx - 1]->largest;
    const InternalKey& right_smallest = files[idx]->smallest;
    if (icmp_.Compare(left_largest, right_smallest) >= 0) {
      return Status::Corruption(
          "overlapping ranges in same level",
          (EscapeString(left_largest.Encode()) + " vs. " +
           EscapeString(right_smallest.Encode())));
    }
  }
  return Status::OK();
}
int VersionSet::NumLevelFiles(int level) const { int VersionSet::NumLevelFiles(int level) const {
assert(level >= 0); assert(level >= 0);
assert(level < config::kNumLevels); assert(level < config::kNumLevels);
return current_->files_[level].size(); return current_->files_[level].size();
} }
// Formats a short single-line summary of the number of files at each level
// of the current version into scratch->buffer and returns a pointer to that
// buffer, e.g. "files[ 1 0 3 0 0 0 0 ]".
//
// The per-level counts are emitted by looping over config::kNumLevels, so
// the summary follows the configured number of levels instead of relying on
// a hand-maintained format string.  If the buffer is too small the summary
// is truncated; it is always left NUL-terminated.
const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const {
  char* const buf = scratch->buffer;
  const size_t capacity = sizeof(scratch->buffer);
  buf[0] = '\0';  // Keep the result a valid string even if formatting fails.

  size_t used = 0;
  int written = snprintf(buf, capacity, "files[");
  for (int level = 0; level < config::kNumLevels; level++) {
    if (written < 0 || used + static_cast<size_t>(written) >= capacity) {
      return buf;  // Formatting error or truncation: stop appending.
    }
    used += static_cast<size_t>(written);
    written = snprintf(buf + used, capacity - used, " %d",
                       int(current_->files_[level].size()));
  }
  if (written >= 0 && used + static_cast<size_t>(written) < capacity) {
    used += static_cast<size_t>(written);
    snprintf(buf + used, capacity - used, " ]");
  }
  return buf;
}
uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
uint64_t result = 0; uint64_t result = 0;
for (int level = 0; level < config::kNumLevels; level++) { for (int level = 0; level < config::kNumLevels; level++) {
@ -685,19 +727,10 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
return result; return result;
} }
void VersionSet::MaybeDeleteOldVersions() {
// Note: it is important to delete versions in order since a newer
// version with zero refs may be holding a pointer to a memtable
// that is used by somebody who has a ref on an older version.
while (oldest_ != current_ && oldest_->refs_ == 0) {
Version* next = oldest_->next_;
delete oldest_;
oldest_ = next;
}
}
void VersionSet::AddLiveFiles(std::set<uint64_t>* live) { void VersionSet::AddLiveFiles(std::set<uint64_t>* live) {
for (Version* v = oldest_; v != NULL; v = v->next_) { for (Version* v = dummy_versions_.next_;
v != &dummy_versions_;
v = v->next_) {
for (int level = 0; level < config::kNumLevels; level++) { for (int level = 0; level < config::kNumLevels; level++) {
const std::vector<FileMetaData*>& files = v->files_[level]; const std::vector<FileMetaData*>& files = v->files_[level];
for (size_t i = 0; i < files.size(); i++) { for (size_t i = 0; i < files.size(); i++) {
@ -809,8 +842,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
} else { } else {
// Create concatenating iterator for the files from this level // Create concatenating iterator for the files from this level
list[num++] = NewTwoLevelIterator( list[num++] = NewTwoLevelIterator(
new Version::LevelFileNumIterator( new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]),
c->input_version_, &c->inputs_[which]),
&GetFileIterator, table_cache_, options); &GetFileIterator, table_cache_, options);
} }
} }
@ -996,11 +1028,12 @@ bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
return true; return true;
} }
bool Compaction::ShouldStopBefore(const InternalKey& key) { bool Compaction::ShouldStopBefore(const Slice& internal_key) {
// Scan to find earliest grandparent file that contains key. // Scan to find earliest grandparent file that contains key.
const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; const InternalKeyComparator* icmp = &input_version_->vset_->icmp_;
while (grandparent_index_ < grandparents_.size() && while (grandparent_index_ < grandparents_.size() &&
icmp->Compare(key, grandparents_[grandparent_index_]->largest) > 0) { icmp->Compare(internal_key,
grandparents_[grandparent_index_]->largest.Encode()) > 0) {
if (seen_key_) { if (seen_key_) {
overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; overlapped_bytes_ += grandparents_[grandparent_index_]->file_size;
} }

@ -59,8 +59,8 @@ class Version {
VersionSet* vset_; // VersionSet to which this Version belongs VersionSet* vset_; // VersionSet to which this Version belongs
Version* next_; // Next version in linked list Version* next_; // Next version in linked list
Version* prev_; // Previous version in linked list
int refs_; // Number of live refs to this version int refs_; // Number of live refs to this version
MemTable* cleanup_mem_; // NULL, or table to delete when version dropped
// List of files per level // List of files per level
std::vector<FileMetaData*> files_[config::kNumLevels]; std::vector<FileMetaData*> files_[config::kNumLevels];
@ -72,8 +72,7 @@ class Version {
int compaction_level_; int compaction_level_;
explicit Version(VersionSet* vset) explicit Version(VersionSet* vset)
: vset_(vset), next_(NULL), refs_(0), : vset_(vset), next_(this), prev_(this), refs_(0),
cleanup_mem_(NULL),
compaction_score_(-1), compaction_score_(-1),
compaction_level_(-1) { compaction_level_(-1) {
} }
@ -95,10 +94,8 @@ class VersionSet {
// Apply *edit to the current version to form a new descriptor that // Apply *edit to the current version to form a new descriptor that
// is both saved to persistent state and installed as the new // is both saved to persistent state and installed as the new
// current version. Iff Apply() returns OK, arrange to delete // current version.
// cleanup_mem (if cleanup_mem != NULL) when it is no longer needed Status LogAndApply(VersionEdit* edit);
// by older versions.
Status LogAndApply(VersionEdit* edit, MemTable* cleanup_mem);
// Recover the last saved descriptor from persistent storage. // Recover the last saved descriptor from persistent storage.
Status Recover(); Status Recover();
@ -171,19 +168,20 @@ class VersionSet {
// "key" as of version "v". // "key" as of version "v".
uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
// Return a human-readable short (single-line) summary of the number
// of files per level. Uses *scratch as backing store.
struct LevelSummaryStorage {
char buffer[100];
};
const char* LevelSummary(LevelSummaryStorage* scratch) const;
private: private:
class Builder; class Builder;
friend class Compaction; friend class Compaction;
friend class Version; friend class Version;
Status Finalize(Version* v); void Finalize(Version* v);
// Delete any old versions that are no longer needed.
void MaybeDeleteOldVersions();
struct BySmallestKey;
Status SortLevel(Version* v, uint64_t level);
void GetOverlappingInputs( void GetOverlappingInputs(
int level, int level,
@ -202,6 +200,8 @@ class VersionSet {
void SetupOtherInputs(Compaction* c); void SetupOtherInputs(Compaction* c);
void AppendVersion(Version* v);
Env* const env_; Env* const env_;
const std::string dbname_; const std::string dbname_;
const Options* const options_; const Options* const options_;
@ -216,10 +216,8 @@ class VersionSet {
// Opened lazily // Opened lazily
WritableFile* descriptor_file_; WritableFile* descriptor_file_;
log::Writer* descriptor_log_; log::Writer* descriptor_log_;
Version dummy_versions_; // Head of circular doubly-linked list of versions.
// Versions are kept in a singly linked list that is never empty Version* current_; // == dummy_versions_.prev_
Version* current_; // Pointer to the last (newest) list entry
Version* oldest_; // Pointer to the first (oldest) list entry
// Per-level key at which the next compaction at that level should start. // Per-level key at which the next compaction at that level should start.
// Either an empty string, or a valid InternalKey. // Either an empty string, or a valid InternalKey.
@ -265,8 +263,8 @@ class Compaction {
bool IsBaseLevelForKey(const Slice& user_key); bool IsBaseLevelForKey(const Slice& user_key);
// Returns true iff we should stop building the current output // Returns true iff we should stop building the current output
// before processing "key". // before processing "internal_key".
bool ShouldStopBefore(const InternalKey& key); bool ShouldStopBefore(const Slice& internal_key);
// Release the input version for the compaction, once the compaction // Release the input version for the compaction, once the compaction
// is successful. // is successful.

@ -29,11 +29,53 @@ WriteBatch::WriteBatch() {
WriteBatch::~WriteBatch() { } WriteBatch::~WriteBatch() { }
WriteBatch::Handler::~Handler() { }
void WriteBatch::Clear() { void WriteBatch::Clear() {
rep_.clear(); rep_.clear();
rep_.resize(12); rep_.resize(12);
} }
// Walks the records stored in this batch in order, invoking handler->Put()
// for each value record and handler->Delete() for each deletion record.
//
// Returns a Corruption status if the representation is shorter than the
// 12-byte header, a record cannot be decoded, an unknown tag is found, or
// the number of records does not match the count stored in the batch.
// Handler callbacks for records preceding the error have already run.
Status WriteBatch::Iterate(Handler* handler) const {
  Slice remaining(rep_);
  if (remaining.size() < 12) {
    return Status::Corruption("malformed WriteBatch (too small)");
  }
  remaining.remove_prefix(12);  // Skip the header.

  int num_records = 0;
  Slice k, v;
  for (; !remaining.empty(); num_records++) {
    const char tag = remaining[0];
    remaining.remove_prefix(1);

    if (tag == kTypeValue) {
      if (!GetLengthPrefixedSlice(&remaining, &k) ||
          !GetLengthPrefixedSlice(&remaining, &v)) {
        return Status::Corruption("bad WriteBatch Put");
      }
      handler->Put(k, v);
    } else if (tag == kTypeDeletion) {
      if (!GetLengthPrefixedSlice(&remaining, &k)) {
        return Status::Corruption("bad WriteBatch Delete");
      }
      handler->Delete(k);
    } else {
      return Status::Corruption("unknown WriteBatch tag");
    }
  }

  if (num_records == WriteBatchInternal::Count(this)) {
    return Status::OK();
  }
  return Status::Corruption("WriteBatch has wrong count");
}
int WriteBatchInternal::Count(const WriteBatch* b) { int WriteBatchInternal::Count(const WriteBatch* b) {
return DecodeFixed32(b->rep_.data() + 8); return DecodeFixed32(b->rep_.data() + 8);
} }
@ -63,28 +105,29 @@ void WriteBatch::Delete(const Slice& key) {
PutLengthPrefixedSlice(&rep_, key); PutLengthPrefixedSlice(&rep_, key);
} }
Status WriteBatchInternal::InsertInto(const WriteBatch* b, namespace {
MemTable* memtable) { class MemTableInserter : public WriteBatch::Handler {
const int count = WriteBatchInternal::Count(b); public:
int found = 0; SequenceNumber sequence_;
Iterator it(*b); MemTable* mem_;
for (; !it.Done(); it.Next()) {
switch (it.op()) { virtual void Put(const Slice& key, const Slice& value) {
case kTypeDeletion: mem_->Add(sequence_, kTypeValue, key, value);
memtable->Add(it.sequence_number(), kTypeDeletion, it.key(), Slice()); sequence_++;
break;
case kTypeValue:
memtable->Add(it.sequence_number(), kTypeValue, it.key(), it.value());
break;
}
found++;
} }
if (!it.status().ok()) { virtual void Delete(const Slice& key) {
return it.status(); mem_->Add(sequence_, kTypeDeletion, key, Slice());
} else if (found != count) { sequence_++;
return Status::Corruption("wrong count in WriteBatch");
} }
return Status::OK(); };
}
Status WriteBatchInternal::InsertInto(const WriteBatch* b,
MemTable* memtable) {
MemTableInserter inserter;
inserter.sequence_ = WriteBatchInternal::Sequence(b);
inserter.mem_ = memtable;
return b->Iterate(&inserter);
} }
void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
@ -92,57 +135,4 @@ void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
b->rep_.assign(contents.data(), contents.size()); b->rep_.assign(contents.data(), contents.size());
} }
// Positions the iterator on the first record of "batch".
//
// If the batch contents are shorter than the 12-byte header the iterator
// starts out Done().  Otherwise the starting sequence number is read from
// the batch, the header is skipped, and the first record is decoded.
//
// op_ and seq_ are given defined initial values so that the accessors never
// read indeterminate memory when the batch is too short or the first record
// fails to parse.
WriteBatchInternal::Iterator::Iterator(const WriteBatch& batch)
    : input_(WriteBatchInternal::Contents(&batch)),
      done_(false),
      op_(kTypeDeletion),
      seq_(0) {
  if (input_.size() < 12) {
    done_ = true;
  } else {
    // This statement previously ended in a comma, silently chaining it to
    // the next expression through the comma operator.
    seq_ = WriteBatchInternal::Sequence(&batch);
    input_.remove_prefix(12);
    GetNextEntry();
  }
}
// Moves to the following record.  Must not be called once Done() is true.
// The sequence number is bumped by one for every record stepped over.
void WriteBatchInternal::Iterator::Next() {
  assert(!done_);
  ++seq_;
  GetNextEntry();
}
void WriteBatchInternal::Iterator::GetNextEntry() {
if (input_.empty()) {
done_ = true;
return;
}
char tag = input_[0];
input_.remove_prefix(1);
switch (tag) {
case kTypeValue:
if (GetLengthPrefixedSlice(&input_, &key_) &&
GetLengthPrefixedSlice(&input_, &value_)) {
op_ = static_cast<ValueType>(tag);
} else {
status_ = Status::Corruption("bad WriteBatch Put");
done_ = true;
input_.clear();
}
break;
case kTypeDeletion:
if (GetLengthPrefixedSlice(&input_, &key_)) {
op_ = kTypeDeletion;
} else {
status_ = Status::Corruption("bad WriteBatch Delete");
done_ = true;
input_.clear();
}
break;
default:
status_ = Status::Corruption("unknown WriteBatch tag");
done_ = true;
input_.clear();
break;
}
}
} }

@ -37,30 +37,6 @@ class WriteBatchInternal {
static void SetContents(WriteBatch* batch, const Slice& contents); static void SetContents(WriteBatch* batch, const Slice& contents);
static Status InsertInto(const WriteBatch* batch, MemTable* memtable); static Status InsertInto(const WriteBatch* batch, MemTable* memtable);
// Iterate over the contents of a write batch.
//
// Typical use: construct from a batch, then loop while !Done(), reading
// op()/key()/value()/sequence_number() and calling Next().  After the loop,
// status() reports whether iteration stopped because of a decoding error.
class Iterator {
public:
// Positions the iterator on the first record of "batch" (or marks it done
// if the batch is too short to hold a header).
explicit Iterator(const WriteBatch& batch);
// True once there are no more records or a decoding error occurred.
bool Done() const { return done_; }
// Advances to the next record.  REQUIRES: !Done()
void Next();
// Type of the current record (value or deletion).
ValueType op() const { return op_; }
// Key of the current record.
const Slice& key() const { return key_; }
// Value of the current record; only assigned for kTypeValue records.
const Slice& value() const { return value_; }
// Sequence number of the current record: the batch's starting sequence
// number plus the number of Next() calls made so far.
SequenceNumber sequence_number() const { return seq_; }
// Corruption error if a record failed to decode; OK otherwise.
Status status() const { return status_; }
private:
// Decodes the record at the front of input_, or marks the iterator done.
void GetNextEntry();
// NOTE: the constructor initializes members in declaration order.
Slice input_;  // Not-yet-consumed portion of the batch contents
bool done_;
ValueType op_;
Slice key_;
Slice value_;
SequenceNumber seq_;
Status status_;
};
}; };
} }

@ -14,10 +14,11 @@ namespace leveldb {
static std::string PrintContents(WriteBatch* b) { static std::string PrintContents(WriteBatch* b) {
InternalKeyComparator cmp(BytewiseComparator()); InternalKeyComparator cmp(BytewiseComparator());
MemTable mem(cmp); MemTable* mem = new MemTable(cmp);
mem->Ref();
std::string state; std::string state;
Status s = WriteBatchInternal::InsertInto(b, &mem); Status s = WriteBatchInternal::InsertInto(b, mem);
Iterator* iter = mem.NewIterator(); Iterator* iter = mem->NewIterator();
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ParsedInternalKey ikey; ParsedInternalKey ikey;
ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
@ -42,6 +43,7 @@ static std::string PrintContents(WriteBatch* b) {
if (!s.ok()) { if (!s.ok()) {
state.append("ParseError()"); state.append("ParseError()");
} }
mem->Unref();
return state; return state;
} }

@ -17,14 +17,14 @@ However the organization of the files that make up the representation
is somewhat different and is explained below. is somewhat different and is explained below.
<p> <p>
Each database is represented by a set of file stored in a directory. Each database is represented by a set of files stored in a directory.
There are several different types of files as documented below: There are several different types of files as documented below:
<p> <p>
<h2>Log files</h2> <h2>Log files</h2>
<p> <p>
A log file (*.log) stores a sequence of recent updates. Each update A log file (*.log) stores a sequence of recent updates. Each update
is appended to the current log file. When the log file reaches a is appended to the current log file. When the log file reaches a
pre-determined size (approximately 1MB by default), it is converted pre-determined size (approximately 4MB by default), it is converted
to a sorted table (see below) and a new log file is created for future to a sorted table (see below) and a new log file is created for future
updates. updates.
<p> <p>
@ -83,19 +83,15 @@ Other files used for miscellaneous purposes may also be present
<h1>Level 0</h1> <h1>Level 0</h1>
When the log file grows above a certain size (1MB by default): When the log file grows above a certain size (1MB by default):
<ul> <ul>
<li>Write the contents of the current memtable to an sstable <li>Create a brand new memtable and log file and direct future updates here
<li>Replace the current memtable by a brand new empty memtable <li>In the background:
<li>Switch to a new log file <ul>
<li>Write the contents of the previous memtable to an sstable
<li>Discard the memtable
<li>Delete the old log file and the old memtable <li>Delete the old log file and the old memtable
<li>Add the new sstable to the young (level-0) level.
</ul>
</ul> </ul>
Experimental measurements show that generating an sstable from a 1MB
log file takes ~12ms, which seems like an acceptable latency hiccup to
add infrequently to a log write.
<p>
The new sstable is added to a special level-0 level. level-0 contains
a set of files (up to 4 by default). However unlike other levels,
these files do not cover disjoint ranges, but may overlap each other.
<h1>Compactions</h1> <h1>Compactions</h1>
@ -162,8 +158,8 @@ read.
<p> <p>
Solution 1: To reduce this problem, we might want to increase the log Solution 1: To reduce this problem, we might want to increase the log
switching threshold when the number of level-0 files is large. Though switching threshold when the number of level-0 files is large. Though
the downside is that the larger this threshold, the larger the delay the downside is that the larger this threshold, the more memory we will
that we will add to write latency when a write triggers a log switch. need to hold the corresponding memtable.
<p> <p>
Solution 2: We might want to decrease write rate artificially when the Solution 2: We might want to decrease write rate artificially when the

@ -141,10 +141,18 @@ the batch.
<p> <p>
<h1>Concurrency</h1> <h1>Concurrency</h1>
<p> <p>
A database may only be opened by one process at a time. The <code>leveldb</code> A database may only be opened by one process at a time.
implementation acquires a lock from the operating system to prevent The <code>leveldb</code> implementation acquires a lock from the
misuse. Within a single process, the same <code>leveldb::DB</code> object may operating system to prevent misuse. Within a single process, the
be safely used by multiple concurrent threads. same <code>leveldb::DB</code> object may be safely shared by multiple
concurrent threads. I.e., different threads may write into or fetch
iterators or call <code>Get</code> on the same database without any
external synchronization (the leveldb implementation will
automatically do the required synchronization). However other objects
(like Iterator and WriteBatch) may require external synchronization.
If two threads share such an object, they must protect access to it
using their own locking protocol. More details are available in
the public header files.
<p> <p>
<h1>Iteration</h1> <h1>Iteration</h1>
<p> <p>

@ -12,7 +12,9 @@ namespace leveldb {
class Slice; class Slice;
// A Comparator object provides a total order across slices that are // A Comparator object provides a total order across slices that are
// used as keys in an sstable or a database. // used as keys in an sstable or a database. A Comparator implementation
// must be thread-safe since leveldb may invoke its methods concurrently
// from multiple threads.
class Comparator { class Comparator {
public: public:
virtual ~Comparator(); virtual ~Comparator();

@ -13,26 +13,32 @@
namespace leveldb { namespace leveldb {
static const int kMajorVersion = 1; static const int kMajorVersion = 1;
static const int kMinorVersion = 1; static const int kMinorVersion = 2;
struct Options; struct Options;
struct ReadOptions; struct ReadOptions;
struct WriteOptions; struct WriteOptions;
class Snapshot;
class WriteBatch; class WriteBatch;
// Some internal types. Clients should ignore. // Abstract handle to particular state of a DB.
class WriteBatchInternal; // A Snapshot is an immutable object and can therefore be safely
// accessed from multiple threads without any external synchronization.
class Snapshot {
protected:
virtual ~Snapshot();
};
// A range of keys
struct Range { struct Range {
Slice start; Slice start; // Included in the range
Slice limit; Slice limit; // Not included in the range
Range(const Slice& s, const Slice& l) : start(s), limit(l) { } Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
}; };
// A DB is a persistent ordered map from keys to values. // A DB is a persistent ordered map from keys to values.
// A DB is safe for concurrent access from multiple threads without
// any external synchronization.
class DB { class DB {
public: public:
// Open the database with the specified "name". // Open the database with the specified "name".

@ -6,6 +6,9 @@
// operating system functionality like the filesystem etc. Callers // operating system functionality like the filesystem etc. Callers
// may wish to provide a custom Env object when opening a database to // may wish to provide a custom Env object when opening a database to
// get fine-grained control; e.g., to rate limit file system operations. // get fine-grained control; e.g., to rate limit file system operations.
//
// All Env implementations are safe for concurrent access from
// multiple threads without any external synchronization.
#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_ #ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
#define STORAGE_LEVELDB_INCLUDE_ENV_H_ #define STORAGE_LEVELDB_INCLUDE_ENV_H_
@ -160,6 +163,15 @@ class SequentialFile {
// //
// REQUIRES: External synchronization // REQUIRES: External synchronization
virtual Status Read(size_t n, Slice* result, char* scratch) = 0; virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
// Skip "n" bytes from the file. This is guaranteed to be no
// slower than reading the same data, but may be faster.
//
// If end of file is reached, skipping will stop at the end of the
// file, and Skip will return OK.
//
// REQUIRES: External synchronization
virtual Status Skip(uint64_t n) = 0;
}; };
// A file abstraction for randomly reading the contents of a file. // A file abstraction for randomly reading the contents of a file.

@ -6,6 +6,11 @@
// The following class defines the interface. Multiple implementations // The following class defines the interface. Multiple implementations
// are provided by this library. In particular, iterators are provided // are provided by this library. In particular, iterators are provided
// to access the contents of a Table or a DB. // to access the contents of a Table or a DB.
//
// Multiple threads can invoke const methods on an Iterator without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same Iterator must use
// external synchronization.
#ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ #ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
#define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ #define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_

@ -6,6 +6,11 @@
// storage and a size. The user of a Slice must ensure that the slice // storage and a size. The user of a Slice must ensure that the slice
// is not used after the corresponding external storage has been // is not used after the corresponding external storage has been
// deallocated. // deallocated.
//
// Multiple threads can invoke const methods on a Slice without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same Slice must use
// external synchronization.
#ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_ #ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_
#define STORAGE_LEVELDB_INCLUDE_SLICE_H_ #define STORAGE_LEVELDB_INCLUDE_SLICE_H_

@ -4,12 +4,16 @@
// //
// A Status encapsulates the result of an operation. It may indicate success, // A Status encapsulates the result of an operation. It may indicate success,
// or it may indicate an error with an associated error message. // or it may indicate an error with an associated error message.
//
// Multiple threads can invoke const methods on a Status without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same Status must use
// external synchronization.
#ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_ #ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_
#define STORAGE_LEVELDB_INCLUDE_STATUS_H_ #define STORAGE_LEVELDB_INCLUDE_STATUS_H_
#include <string> #include <string>
#include <utility>
#include "leveldb/slice.h" #include "leveldb/slice.h"
namespace leveldb { namespace leveldb {
@ -18,7 +22,7 @@ class Status {
public: public:
// Create a success status. // Create a success status.
Status() : state_(NULL) { } Status() : state_(NULL) { }
~Status() { delete state_; } ~Status() { delete[] state_; }
// Copy the specified status. // Copy the specified status.
Status(const Status& s); Status(const Status& s);
@ -29,7 +33,7 @@ class Status {
// Return error status of an appropriate type. // Return error status of an appropriate type.
static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kNotFound, msg, Slice()); return Status(kNotFound, msg, msg2);
} }
static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kCorruption, msg, msg2); return Status(kCorruption, msg, msg2);
@ -55,6 +59,13 @@ class Status {
std::string ToString() const; std::string ToString() const;
private: private:
// OK status has a NULL state_. Otherwise, state_ is a new[] array
// of the following form:
// state_[0..3] == length of message
// state_[4] == code
// state_[5..] == message
const char* state_;
enum Code { enum Code {
kOk = 0, kOk = 0,
kNotFound = 1, kNotFound = 1,
@ -63,21 +74,24 @@ class Status {
kInvalidArgument = 4, kInvalidArgument = 4,
kIOError = 5, kIOError = 5,
}; };
Code code() const { return (state_ == NULL) ? kOk : state_->first; }
Status(Code code, const Slice& msg, const Slice& msg2); Code code() const {
return (state_ == NULL) ? kOk : static_cast<Code>(state_[4]);
}
typedef std::pair<Code, std::string> State; Status(Code code, const Slice& msg, const Slice& msg2);
State* state_; static const char* CopyState(const char* s);
}; };
inline Status::Status(const Status& s) { inline Status::Status(const Status& s) {
state_ = (s.state_ == NULL) ? NULL : new State(*s.state_); state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_);
} }
inline void Status::operator=(const Status& s) { inline void Status::operator=(const Status& s) {
if (this != &s) { // The following condition catches both aliasing (when this == &s),
delete state_; // and the common case where both s and *this are ok.
state_ = (s.state_ == NULL) ? NULL : new State(*s.state_); if (state_ != s.state_) {
delete[] state_;
state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_);
} }
} }

@ -17,7 +17,8 @@ class RandomAccessFile;
struct ReadOptions; struct ReadOptions;
// A Table is a sorted map from strings to strings. Tables are // A Table is a sorted map from strings to strings. Tables are
// immutable and persistent. // immutable and persistent. A Table may be safely accessed from
// multiple threads without external synchronization.
class Table { class Table {
public: public:
// Attempt to open the table that is stored in bytes [0..file_size) // Attempt to open the table that is stored in bytes [0..file_size)

@ -4,6 +4,11 @@
// //
// TableBuilder provides the interface used to build a Table // TableBuilder provides the interface used to build a Table
// (an immutable and sorted map from keys to values). // (an immutable and sorted map from keys to values).
//
// Multiple threads can invoke const methods on a TableBuilder without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same TableBuilder must use
// external synchronization.
#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ #ifndef STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
#define STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ #define STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_

@ -12,11 +12,17 @@
// batch.Delete("key"); // batch.Delete("key");
// batch.Put("key", "v2"); // batch.Put("key", "v2");
// batch.Put("key", "v3"); // batch.Put("key", "v3");
//
// Multiple threads can invoke const methods on a WriteBatch without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same WriteBatch must use
// external synchronization.
#ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ #ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
#define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ #define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
#include <string> #include <string>
#include "leveldb/status.h"
namespace leveldb { namespace leveldb {
@ -36,6 +42,15 @@ class WriteBatch {
// Clear all updates buffered in this batch. // Clear all updates buffered in this batch.
void Clear(); void Clear();
// Support for iterating over the contents of a batch.
//
// Callers subclass Handler and pass an instance to Iterate(), which invokes
// one callback per record stored in the batch.
class Handler {
public:
virtual ~Handler();
// Called for each Put record, with the key and value stored in the batch.
virtual void Put(const Slice& key, const Slice& value) = 0;
// Called for each Delete record, with the key stored in the batch.
virtual void Delete(const Slice& key) = 0;
};
// Invokes the matching Handler callback for every record in this batch, in
// stored order.  Returns a Corruption status if the batch contents are
// malformed (callbacks for earlier records may already have run); returns
// OK otherwise.
Status Iterate(Handler* handler) const;
private: private:
friend class WriteBatchInternal; friend class WriteBatchInternal;

@ -80,7 +80,7 @@ void BlockBuilder::Add(const Slice& key, const Slice& value) {
if (counter_ < options_->block_restart_interval) { if (counter_ < options_->block_restart_interval) {
// See how much sharing to do with previous string // See how much sharing to do with previous string
const size_t min_length = std::min(last_key_piece.size(), key.size()); const size_t min_length = std::min(last_key_piece.size(), key.size());
while ((shared < min_length) && (last_key_[shared] == key[shared])) { while ((shared < min_length) && (last_key_piece[shared] == key[shared])) {
shared++; shared++;
} }
} else { } else {

@ -319,13 +319,15 @@ class MemTableConstructor: public Constructor {
: Constructor(cmp), : Constructor(cmp),
internal_comparator_(cmp) { internal_comparator_(cmp) {
memtable_ = new MemTable(internal_comparator_); memtable_ = new MemTable(internal_comparator_);
memtable_->Ref();
} }
~MemTableConstructor() { ~MemTableConstructor() {
delete memtable_; memtable_->Unref();
} }
virtual Status FinishImpl(const Options& options, const KVMap& data) { virtual Status FinishImpl(const Options& options, const KVMap& data) {
delete memtable_; memtable_->Unref();
memtable_ = new MemTable(internal_comparator_); memtable_ = new MemTable(internal_comparator_);
memtable_->Ref();
int seq = 1; int seq = 1;
for (KVMap::const_iterator it = data.begin(); for (KVMap::const_iterator it = data.begin();
it != data.end(); it != data.end();
@ -736,16 +738,17 @@ class MemTableTest { };
TEST(MemTableTest, Simple) { TEST(MemTableTest, Simple) {
InternalKeyComparator cmp(BytewiseComparator()); InternalKeyComparator cmp(BytewiseComparator());
MemTable memtable(cmp); MemTable* memtable = new MemTable(cmp);
memtable->Ref();
WriteBatch batch; WriteBatch batch;
WriteBatchInternal::SetSequence(&batch, 100); WriteBatchInternal::SetSequence(&batch, 100);
batch.Put(std::string("k1"), std::string("v1")); batch.Put(std::string("k1"), std::string("v1"));
batch.Put(std::string("k2"), std::string("v2")); batch.Put(std::string("k2"), std::string("v2"));
batch.Put(std::string("k3"), std::string("v3")); batch.Put(std::string("k3"), std::string("v3"));
batch.Put(std::string("largekey"), std::string("vlarge")); batch.Put(std::string("largekey"), std::string("vlarge"));
ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &memtable).ok()); ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, memtable).ok());
Iterator* iter = memtable.NewIterator(); Iterator* iter = memtable->NewIterator();
iter->SeekToFirst(); iter->SeekToFirst();
while (iter->Valid()) { while (iter->Valid()) {
fprintf(stderr, "key: '%s' -> '%s'\n", fprintf(stderr, "key: '%s' -> '%s'\n",
@ -755,6 +758,7 @@ TEST(MemTableTest, Simple) {
} }
delete iter; delete iter;
memtable->Unref();
} }
static bool Between(uint64_t val, uint64_t low, uint64_t high) { static bool Between(uint64_t val, uint64_t low, uint64_t high) {

@ -141,6 +141,13 @@ class ChromiumSequentialFile: public SequentialFile {
} }
return s; return s;
} }
// Advances the read position of file_ by "n" bytes relative to the current
// position.
//
// Returns Status::OK() on success, or Status::IOError carrying filename_ and
// the strerror(errno) text if any underlying fseek() call fails.
virtual Status Skip(uint64_t n) {
  // fseek() takes a signed long offset.  Passing a uint64_t directly would
  // silently narrow it, so a value above LONG_MAX could truncate or even turn
  // negative and seek backwards.  Seek in steps that always fit in a long.
  const uint64_t kMaxStep = static_cast<uint64_t>(~0UL >> 1);  // == LONG_MAX
  // do/while so that n == 0 still issues exactly one fseek(), as before.
  do {
    const uint64_t step = (n < kMaxStep) ? n : kMaxStep;
    if (fseek(file_, static_cast<long>(step), SEEK_CUR)) {
      return Status::IOError(filename_, strerror(errno));
    }
    n -= step;
  } while (n > 0);
  return Status::OK();
}
}; };
class ChromiumRandomAccessFile: public RandomAccessFile { class ChromiumRandomAccessFile: public RandomAccessFile {

@ -52,6 +52,13 @@ class PosixSequentialFile: public SequentialFile {
} }
return s; return s;
} }
// Advances the read position of file_ by "n" bytes relative to the current
// position.
//
// Returns Status::OK() on success, or Status::IOError carrying filename_ and
// the strerror(errno) text if any underlying fseek() call fails.
virtual Status Skip(uint64_t n) {
  // fseek() takes a signed long offset.  Passing a uint64_t directly would
  // silently narrow it, so a value above LONG_MAX could truncate or even turn
  // negative and seek backwards.  Seek in steps that always fit in a long.
  const uint64_t kMaxStep = static_cast<uint64_t>(~0UL >> 1);  // == LONG_MAX
  // do/while so that n == 0 still issues exactly one fseek(), as before.
  do {
    const uint64_t step = (n < kMaxStep) ? n : kMaxStep;
    if (fseek(file_, static_cast<long>(step), SEEK_CUR)) {
      return Status::IOError(filename_, strerror(errno));
    }
    n -= step;
  } while (n > 0);
  return Status::OK();
}
}; };
class PosixRandomAccessFile: public RandomAccessFile { class PosixRandomAccessFile: public RandomAccessFile {

@ -8,13 +8,29 @@
namespace leveldb { namespace leveldb {
// Duplicates an encoded status buffer.  The first four bytes of "state" are
// read as a uint32_t length; the copy spans that many bytes plus the five
// bytes in front of them.  The result is allocated with new[] and handed to
// the caller.
const char* Status::CopyState(const char* state) {
  uint32_t payload_len;
  memcpy(&payload_len, state, sizeof(payload_len));
  // Total footprint: five leading bytes followed by payload_len bytes.
  const uint32_t total_bytes = payload_len + 5;
  char* const copy = new char[total_bytes];
  memcpy(copy, state, total_bytes);
  return copy;
}
Status::Status(Code code, const Slice& msg, const Slice& msg2) { Status::Status(Code code, const Slice& msg, const Slice& msg2) {
assert(code != kOk); assert(code != kOk);
state_ = new State(make_pair(code, std::string(msg.data(), msg.size()))); const uint32_t len1 = msg.size();
if (!msg2.empty()) { const uint32_t len2 = msg2.size();
state_->second.append(": "); const uint32_t size = len1 + (len2 ? (2 + len2) : 0);
state_->second.append(msg2.data(), msg2.size()); char* result = new char[size + 5];
memcpy(result, &size, sizeof(size));
result[4] = static_cast<char>(code);
memcpy(result + 5, msg.data(), len1);
if (len2) {
result[5 + len1] = ':';
result[6 + len1] = ' ';
memcpy(result + 7 + len1, msg2.data(), len2);
} }
state_ = result;
} }
std::string Status::ToString() const { std::string Status::ToString() const {
@ -23,12 +39,12 @@ std::string Status::ToString() const {
} else { } else {
char tmp[30]; char tmp[30];
const char* type; const char* type;
switch (state_->first) { switch (code()) {
case kOk: case kOk:
type = "OK"; type = "OK";
break; break;
case kNotFound: case kNotFound:
type = "NotFound"; type = "NotFound: ";
break; break;
case kCorruption: case kCorruption:
type = "Corruption: "; type = "Corruption: ";
@ -44,14 +60,14 @@ std::string Status::ToString() const {
break; break;
default: default:
snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", snprintf(tmp, sizeof(tmp), "Unknown code(%d): ",
static_cast<int>(state_->first)); static_cast<int>(code()));
type = tmp; type = tmp;
break; break;
} }
std::string result(type); std::string result(type);
if (!state_->second.empty()) { uint32_t length;
result.append(state_->second); memcpy(&length, state_, sizeof(length));
} result.append(state_ + 5, length);
return result; return result;
} }
} }

Loading…
Cancel
Save