Bugfix for issue 33; reduce lock contention in Get(), parallel benchmarks.

- Fix for issue 33 (non-null-terminated result from
  leveldb_property_value())

- Support for running multiple instances of a benchmark in parallel.

- Reduce lock contention on Get():
  (1) Do not hold the lock while searching memtables.
  (2) Shard block and table caches 16-ways.

  Benchmark for evaluating this change:
  $ db_bench --benchmarks=fillseq1,readrandom --threads=$n
  (fillseq1 is a small hack to make sure fillseq runs once regardless
  of number of threads specified on the command line).



git-svn-id: https://leveldb.googlecode.com/svn/trunk@49 62dab493-f737-651d-591e-8d6aee1b9529
main
gabor@google.com 14 years ago
parent ab323f7e1e
commit e3584f9c28
  1. 3
      db/c.cc
  2. 530
      db/db_bench.cc
  3. 36
      db/db_impl.cc
  4. 149
      util/cache.cc
  5. 39
      util/cache_test.cc
  6. 11
      util/histogram.cc
  7. 1
      util/histogram.h

@ -196,7 +196,8 @@ char* leveldb_property_value(
const char* propname) { const char* propname) {
std::string tmp; std::string tmp;
if (db->rep->GetProperty(Slice(propname), &tmp)) { if (db->rep->GetProperty(Slice(propname), &tmp)) {
return CopyString(tmp); // We use strdup() since we expect human readable output.
return strdup(tmp.c_str());
} else { } else {
return NULL; return NULL;
} }

@ -14,6 +14,7 @@
#include "port/port.h" #include "port/port.h"
#include "util/crc32c.h" #include "util/crc32c.h"
#include "util/histogram.h" #include "util/histogram.h"
#include "util/mutexlock.h"
#include "util/random.h" #include "util/random.h"
#include "util/testutil.h" #include "util/testutil.h"
@ -60,6 +61,9 @@ static int FLAGS_num = 1000000;
// Number of read operations to do. If negative, do FLAGS_num reads. // Number of read operations to do. If negative, do FLAGS_num reads.
static int FLAGS_reads = -1; static int FLAGS_reads = -1;
// Number of concurrent threads to run.
static int FLAGS_threads = 1;
// Size of each value // Size of each value
static int FLAGS_value_size = 100; static int FLAGS_value_size = 100;
@ -91,8 +95,9 @@ static const char* FLAGS_db = "/tmp/dbbench";
namespace leveldb { namespace leveldb {
// Helper for quickly generating random data.
namespace { namespace {
// Helper for quickly generating random data.
class RandomGenerator { class RandomGenerator {
private: private:
std::string data_; std::string data_;
@ -136,6 +141,152 @@ static Slice TrimSpace(Slice s) {
return Slice(s.data() + start, limit - start); return Slice(s.data() + start, limit - start);
} }
static void AppendWithSpace(std::string* str, Slice msg) {
if (msg.empty()) return;
if (!str->empty()) {
str->push_back(' ');
}
str->append(msg.data(), msg.size());
}
class Stats {
private:
double start_;
double finish_;
double seconds_;
int done_;
int next_report_;
int64_t bytes_;
double last_op_finish_;
Histogram hist_;
std::string message_;
public:
Stats() { Start(); }
void Start() {
next_report_ = 100;
last_op_finish_ = start_;
hist_.Clear();
done_ = 0;
bytes_ = 0;
seconds_ = 0;
start_ = Env::Default()->NowMicros();
finish_ = start_;
message_.clear();
}
void Merge(const Stats& other) {
hist_.Merge(other.hist_);
done_ += other.done_;
bytes_ += other.bytes_;
seconds_ += other.seconds_;
if (other.start_ < start_) start_ = other.start_;
if (other.finish_ > finish_) finish_ = other.finish_;
// Just keep the messages from one thread
if (message_.empty()) message_ = other.message_;
}
void Stop() {
finish_ = Env::Default()->NowMicros();
seconds_ = (finish_ - start_) * 1e-6;
}
void AddMessage(Slice msg) {
AppendWithSpace(&message_, msg);
}
void FinishedSingleOp() {
if (FLAGS_histogram) {
double now = Env::Default()->NowMicros();
double micros = now - last_op_finish_;
hist_.Add(micros);
if (micros > 20000) {
fprintf(stderr, "long op: %.1f micros%30s\r", micros, "");
fflush(stderr);
}
last_op_finish_ = now;
}
done_++;
if (done_ >= next_report_) {
if (next_report_ < 1000) next_report_ += 100;
else if (next_report_ < 5000) next_report_ += 500;
else if (next_report_ < 10000) next_report_ += 1000;
else if (next_report_ < 50000) next_report_ += 5000;
else if (next_report_ < 100000) next_report_ += 10000;
else if (next_report_ < 500000) next_report_ += 50000;
else next_report_ += 100000;
fprintf(stderr, "... finished %d ops%30s\r", done_, "");
fflush(stderr);
}
}
void AddBytes(int64_t n) {
bytes_ += n;
}
void Report(const Slice& name) {
// Pretend at least one op was done in case we are running a benchmark
// that does not call FinishedSingleOp().
if (done_ < 1) done_ = 1;
std::string extra;
if (bytes_ > 0) {
// Rate is computed on actual elapsed time, not the sum of per-thread
// elapsed times.
double elapsed = (finish_ - start_) * 1e-6;
char rate[100];
snprintf(rate, sizeof(rate), "%6.1f MB/s",
(bytes_ / 1048576.0) / elapsed);
extra = rate;
}
AppendWithSpace(&extra, message_);
fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n",
name.ToString().c_str(),
seconds_ * 1e6 / done_,
(extra.empty() ? "" : " "),
extra.c_str());
if (FLAGS_histogram) {
fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
}
fflush(stdout);
}
};
// State shared by all concurrent executions of the same benchmark.
struct SharedState {
port::Mutex mu;
port::CondVar cv;
int total;
// Each thread goes through the following states:
// (1) initializing
// (2) waiting for others to be initialized
// (3) running
// (4) done
int num_initialized;
int num_done;
bool start;
SharedState() : cv(&mu) { }
};
// Per-thread state for concurrent executions of the same benchmark.
struct ThreadState {
int tid; // 0..n-1 when running in n threads
Random rand; // Has different seeds for different threads
Stats stats;
ThreadState(int index)
: tid(index),
rand(1000 + index) {
}
};
} }
class Benchmark { class Benchmark {
@ -143,20 +294,11 @@ class Benchmark {
Cache* cache_; Cache* cache_;
DB* db_; DB* db_;
int num_; int num_;
int value_size_;
int entries_per_batch_;
WriteOptions write_options_;
int reads_; int reads_;
int heap_counter_; int heap_counter_;
double start_;
double last_op_finish_;
int64_t bytes_;
std::string message_;
std::string post_message_;
Histogram hist_;
RandomGenerator gen_;
Random rand_;
// State kept for progress messages
int done_;
int next_report_; // When to report next
void PrintHeader() { void PrintHeader() {
const int kKeySize = 16; const int kKeySize = 16;
@ -232,94 +374,15 @@ class Benchmark {
#endif #endif
} }
void Start() {
start_ = Env::Default()->NowMicros() * 1e-6;
bytes_ = 0;
message_.clear();
last_op_finish_ = start_;
hist_.Clear();
done_ = 0;
next_report_ = 100;
}
void FinishedSingleOp() {
if (FLAGS_histogram) {
double now = Env::Default()->NowMicros() * 1e-6;
double micros = (now - last_op_finish_) * 1e6;
hist_.Add(micros);
if (micros > 20000) {
fprintf(stderr, "long op: %.1f micros%30s\r", micros, "");
fflush(stderr);
}
last_op_finish_ = now;
}
done_++;
if (done_ >= next_report_) {
if (next_report_ < 1000) next_report_ += 100;
else if (next_report_ < 5000) next_report_ += 500;
else if (next_report_ < 10000) next_report_ += 1000;
else if (next_report_ < 50000) next_report_ += 5000;
else if (next_report_ < 100000) next_report_ += 10000;
else if (next_report_ < 500000) next_report_ += 50000;
else next_report_ += 100000;
fprintf(stderr, "... finished %d ops%30s\r", done_, "");
fflush(stderr);
}
}
void Stop(const Slice& name) {
double finish = Env::Default()->NowMicros() * 1e-6;
// Pretend at least one op was done in case we are running a benchmark
// that does nto call FinishedSingleOp().
if (done_ < 1) done_ = 1;
if (bytes_ > 0) {
char rate[100];
snprintf(rate, sizeof(rate), "%6.1f MB/s",
(bytes_ / 1048576.0) / (finish - start_));
if (!message_.empty()) {
message_ = std::string(rate) + " " + message_;
} else {
message_ = rate;
}
}
fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n",
name.ToString().c_str(),
(finish - start_) * 1e6 / done_,
(message_.empty() ? "" : " "),
message_.c_str());
if (FLAGS_histogram) {
fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
}
fflush(stdout);
if (!post_message_.empty()) {
fprintf(stdout, "\n%s\n", post_message_.c_str());
post_message_.clear();
}
}
public: public:
enum Order {
SEQUENTIAL,
RANDOM
};
enum DBState {
FRESH,
EXISTING
};
Benchmark() Benchmark()
: cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL), : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
db_(NULL), db_(NULL),
num_(FLAGS_num), num_(FLAGS_num),
value_size_(FLAGS_value_size),
entries_per_batch_(1),
reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads), reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
heap_counter_(0), heap_counter_(0) {
bytes_(0),
rand_(301) {
std::vector<std::string> files; std::vector<std::string> files;
Env::Default()->GetChildren(FLAGS_db, &files); Env::Default()->GetChildren(FLAGS_db, &files);
for (int i = 0; i < files.size(); i++) { for (int i = 0; i < files.size(); i++) {
@ -353,98 +416,203 @@ class Benchmark {
benchmarks = sep + 1; benchmarks = sep + 1;
} }
Start(); // Reset parameters that may be overriddden bwlow
num_ = FLAGS_num;
reads_ = num_;
value_size_ = FLAGS_value_size;
entries_per_batch_ = 1;
write_options_ = WriteOptions();
void (Benchmark::*method)(ThreadState*) = NULL;
bool fresh_db = false;
WriteOptions write_options;
bool known = true;
if (name == Slice("fillseq")) { if (name == Slice("fillseq")) {
Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1); fresh_db = true;
method = &Benchmark::WriteSeq;
} else if (name == Slice("fillbatch")) { } else if (name == Slice("fillbatch")) {
Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1000); fresh_db = true;
entries_per_batch_ = 1000;
method = &Benchmark::WriteSeq;
} else if (name == Slice("fillrandom")) { } else if (name == Slice("fillrandom")) {
Write(write_options, RANDOM, FRESH, num_, FLAGS_value_size, 1); fresh_db = true;
method = &Benchmark::WriteRandom;
} else if (name == Slice("overwrite")) { } else if (name == Slice("overwrite")) {
Write(write_options, RANDOM, EXISTING, num_, FLAGS_value_size, 1); fresh_db = false;
method = &Benchmark::WriteRandom;
} else if (name == Slice("fillsync")) { } else if (name == Slice("fillsync")) {
write_options.sync = true; fresh_db = true;
Write(write_options, RANDOM, FRESH, num_ / 1000, FLAGS_value_size, 1); num_ /= 1000;
write_options_.sync = true;
method = &Benchmark::WriteRandom;
} else if (name == Slice("fill100K")) { } else if (name == Slice("fill100K")) {
Write(write_options, RANDOM, FRESH, num_ / 1000, 100 * 1000, 1); fresh_db = true;
num_ /= 1000;
value_size_ = 100 * 1000;
method = &Benchmark::WriteRandom;
} else if (name == Slice("readseq")) { } else if (name == Slice("readseq")) {
ReadSequential(); method = &Benchmark::ReadSequential;
} else if (name == Slice("readreverse")) { } else if (name == Slice("readreverse")) {
ReadReverse(); method = &Benchmark::ReadReverse;
} else if (name == Slice("readrandom")) { } else if (name == Slice("readrandom")) {
ReadRandom(); method = &Benchmark::ReadRandom;
} else if (name == Slice("readhot")) { } else if (name == Slice("readhot")) {
ReadHot(); method = &Benchmark::ReadHot;
} else if (name == Slice("readrandomsmall")) { } else if (name == Slice("readrandomsmall")) {
int n = reads_;
reads_ /= 1000; reads_ /= 1000;
ReadRandom(); method = &Benchmark::ReadRandom;
reads_ = n;
} else if (name == Slice("compact")) { } else if (name == Slice("compact")) {
Compact(); method = &Benchmark::Compact;
} else if (name == Slice("crc32c")) { } else if (name == Slice("crc32c")) {
Crc32c(4096, "(4K per op)"); method = &Benchmark::Crc32c;
} else if (name == Slice("acquireload")) { } else if (name == Slice("acquireload")) {
AcquireLoad(); method = &Benchmark::AcquireLoad;
} else if (name == Slice("snappycomp")) { } else if (name == Slice("snappycomp")) {
SnappyCompress(); method = &Benchmark::SnappyCompress;
} else if (name == Slice("snappyuncomp")) { } else if (name == Slice("snappyuncomp")) {
SnappyUncompress(); method = &Benchmark::SnappyUncompress;
} else if (name == Slice("heapprofile")) { } else if (name == Slice("heapprofile")) {
HeapProfile(); HeapProfile();
} else if (name == Slice("stats")) { } else if (name == Slice("stats")) {
PrintStats(); PrintStats();
} else { } else {
known = false;
if (name != Slice()) { // No error message for empty name if (name != Slice()) { // No error message for empty name
fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
} }
} }
if (known) {
Stop(name); if (fresh_db) {
if (FLAGS_use_existing_db) {
fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n",
name.ToString().c_str());
method = NULL;
} else {
delete db_;
db_ = NULL;
DestroyDB(FLAGS_db, Options());
Open();
}
}
if (method != NULL) {
RunBenchmark(name, method);
} }
} }
} }
private: private:
void Crc32c(int size, const char* label) { struct ThreadArg {
Benchmark* bm;
SharedState* shared;
ThreadState* thread;
void (Benchmark::*method)(ThreadState*);
};
static void ThreadBody(void* v) {
ThreadArg* arg = reinterpret_cast<ThreadArg*>(v);
SharedState* shared = arg->shared;
ThreadState* thread = arg->thread;
{
MutexLock l(&shared->mu);
shared->num_initialized++;
if (shared->num_initialized >= shared->total) {
shared->cv.SignalAll();
}
while (!shared->start) {
shared->cv.Wait();
}
}
thread->stats.Start();
(arg->bm->*(arg->method))(thread);
thread->stats.Stop();
{
MutexLock l(&shared->mu);
shared->num_done++;
if (shared->num_done >= shared->total) {
shared->cv.SignalAll();
}
}
}
void RunBenchmark(Slice name, void (Benchmark::*method)(ThreadState*)) {
const int n = FLAGS_threads;
SharedState shared;
shared.total = n;
shared.num_initialized = 0;
shared.num_done = 0;
shared.start = false;
ThreadArg* arg = new ThreadArg[n];
for (int i = 0; i < n; i++) {
arg[i].bm = this;
arg[i].method = method;
arg[i].shared = &shared;
arg[i].thread = new ThreadState(i);
Env::Default()->StartThread(ThreadBody, &arg[i]);
}
shared.mu.Lock();
while (shared.num_initialized < n) {
shared.cv.Wait();
}
shared.start = true;
shared.cv.SignalAll();
while (shared.num_done < n) {
shared.cv.Wait();
}
shared.mu.Unlock();
for (int i = 1; i < n; i++) {
arg[0].thread->stats.Merge(arg[i].thread->stats);
}
arg[0].thread->stats.Report(name);
for (int i = 0; i < n; i++) {
delete arg[i].thread;
}
delete[] arg;
}
void Crc32c(ThreadState* thread) {
// Checksum about 500MB of data total // Checksum about 500MB of data total
const int size = 4096;
const char* label = "(4K per op)";
std::string data(size, 'x'); std::string data(size, 'x');
int64_t bytes = 0; int64_t bytes = 0;
uint32_t crc = 0; uint32_t crc = 0;
while (bytes < 500 * 1048576) { while (bytes < 500 * 1048576) {
crc = crc32c::Value(data.data(), size); crc = crc32c::Value(data.data(), size);
FinishedSingleOp(); thread->stats.FinishedSingleOp();
bytes += size; bytes += size;
} }
// Print so result is not dead // Print so result is not dead
fprintf(stderr, "... crc=0x%x\r", static_cast<unsigned int>(crc)); fprintf(stderr, "... crc=0x%x\r", static_cast<unsigned int>(crc));
bytes_ = bytes; thread->stats.AddBytes(bytes);
message_ = label; thread->stats.AddMessage(label);
} }
void AcquireLoad() { void AcquireLoad(ThreadState* thread) {
int dummy; int dummy;
port::AtomicPointer ap(&dummy); port::AtomicPointer ap(&dummy);
int count = 0; int count = 0;
void *ptr = NULL; void *ptr = NULL;
message_ = "(each op is 1000 loads)"; thread->stats.AddMessage("(each op is 1000 loads)");
while (count < 100000) { while (count < 100000) {
for (int i = 0; i < 1000; i++) { for (int i = 0; i < 1000; i++) {
ptr = ap.Acquire_Load(); ptr = ap.Acquire_Load();
} }
count++; count++;
FinishedSingleOp(); thread->stats.FinishedSingleOp();
} }
if (ptr == NULL) exit(1); // Disable unused variable warning. if (ptr == NULL) exit(1); // Disable unused variable warning.
} }
void SnappyCompress() { void SnappyCompress(ThreadState* thread) {
Slice input = gen_.Generate(Options().block_size); RandomGenerator gen;
Slice input = gen.Generate(Options().block_size);
int64_t bytes = 0; int64_t bytes = 0;
int64_t produced = 0; int64_t produced = 0;
bool ok = true; bool ok = true;
@ -453,22 +621,23 @@ class Benchmark {
ok = port::Snappy_Compress(input.data(), input.size(), &compressed); ok = port::Snappy_Compress(input.data(), input.size(), &compressed);
produced += compressed.size(); produced += compressed.size();
bytes += input.size(); bytes += input.size();
FinishedSingleOp(); thread->stats.FinishedSingleOp();
} }
if (!ok) { if (!ok) {
message_ = "(snappy failure)"; thread->stats.AddMessage("(snappy failure)");
} else { } else {
char buf[100]; char buf[100];
snprintf(buf, sizeof(buf), "(output: %.1f%%)", snprintf(buf, sizeof(buf), "(output: %.1f%%)",
(produced * 100.0) / bytes); (produced * 100.0) / bytes);
message_ = buf; thread->stats.AddMessage(buf);
bytes_ = bytes; thread->stats.AddBytes(bytes);
} }
} }
void SnappyUncompress() { void SnappyUncompress(ThreadState* thread) {
Slice input = gen_.Generate(Options().block_size); RandomGenerator gen;
Slice input = gen.Generate(Options().block_size);
std::string compressed; std::string compressed;
bool ok = port::Snappy_Compress(input.data(), input.size(), &compressed); bool ok = port::Snappy_Compress(input.data(), input.size(), &compressed);
int64_t bytes = 0; int64_t bytes = 0;
@ -477,14 +646,14 @@ class Benchmark {
ok = port::Snappy_Uncompress(compressed.data(), compressed.size(), ok = port::Snappy_Uncompress(compressed.data(), compressed.size(),
uncompressed); uncompressed);
bytes += input.size(); bytes += input.size();
FinishedSingleOp(); thread->stats.FinishedSingleOp();
} }
delete[] uncompressed; delete[] uncompressed;
if (!ok) { if (!ok) {
message_ = "(snappy failure)"; thread->stats.AddMessage("(snappy failure)");
} else { } else {
bytes_ = bytes; thread->stats.AddBytes(bytes);
} }
} }
@ -501,95 +670,97 @@ class Benchmark {
} }
} }
void Write(const WriteOptions& options, Order order, DBState state, void WriteSeq(ThreadState* thread) {
int num_entries, int value_size, int entries_per_batch) { DoWrite(thread, true);
if (state == FRESH) { }
if (FLAGS_use_existing_db) {
message_ = "skipping (--use_existing_db is true)";
return;
}
delete db_;
db_ = NULL;
DestroyDB(FLAGS_db, Options());
Open();
Start(); // Do not count time taken to destroy/open
}
if (num_entries != num_) { void WriteRandom(ThreadState* thread) {
DoWrite(thread, false);
}
void DoWrite(ThreadState* thread, bool seq) {
if (num_ != FLAGS_num) {
char msg[100]; char msg[100];
snprintf(msg, sizeof(msg), "(%d ops)", num_entries); snprintf(msg, sizeof(msg), "(%d ops)", num_);
message_ = msg; thread->stats.AddMessage(msg);
} }
RandomGenerator gen;
WriteBatch batch; WriteBatch batch;
Status s; Status s;
std::string val; std::string val;
for (int i = 0; i < num_entries; i += entries_per_batch) { int64_t bytes = 0;
for (int i = 0; i < num_; i += entries_per_batch_) {
batch.Clear(); batch.Clear();
for (int j = 0; j < entries_per_batch; j++) { for (int j = 0; j < entries_per_batch_; j++) {
const int k = (order == SEQUENTIAL) ? i+j : (rand_.Next() % FLAGS_num); const int k = seq ? i+j : (thread->rand.Next() % FLAGS_num);
char key[100]; char key[100];
snprintf(key, sizeof(key), "%016d", k); snprintf(key, sizeof(key), "%016d", k);
batch.Put(key, gen_.Generate(value_size)); batch.Put(key, gen.Generate(value_size_));
bytes_ += value_size + strlen(key); bytes += value_size_ + strlen(key);
FinishedSingleOp(); thread->stats.FinishedSingleOp();
} }
s = db_->Write(options, &batch); s = db_->Write(write_options_, &batch);
if (!s.ok()) { if (!s.ok()) {
fprintf(stderr, "put error: %s\n", s.ToString().c_str()); fprintf(stderr, "put error: %s\n", s.ToString().c_str());
exit(1); exit(1);
} }
} }
thread->stats.AddBytes(bytes);
} }
void ReadSequential() { void ReadSequential(ThreadState* thread) {
Iterator* iter = db_->NewIterator(ReadOptions()); Iterator* iter = db_->NewIterator(ReadOptions());
int i = 0; int i = 0;
int64_t bytes = 0;
for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) { for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
bytes_ += iter->key().size() + iter->value().size(); bytes += iter->key().size() + iter->value().size();
FinishedSingleOp(); thread->stats.FinishedSingleOp();
++i; ++i;
} }
delete iter; delete iter;
thread->stats.AddBytes(bytes);
} }
void ReadReverse() { void ReadReverse(ThreadState* thread) {
Iterator* iter = db_->NewIterator(ReadOptions()); Iterator* iter = db_->NewIterator(ReadOptions());
int i = 0; int i = 0;
int64_t bytes = 0;
for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) { for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
bytes_ += iter->key().size() + iter->value().size(); bytes += iter->key().size() + iter->value().size();
FinishedSingleOp(); thread->stats.FinishedSingleOp();
++i; ++i;
} }
delete iter; delete iter;
thread->stats.AddBytes(bytes);
} }
void ReadRandom() { void ReadRandom(ThreadState* thread) {
ReadOptions options; ReadOptions options;
std::string value; std::string value;
for (int i = 0; i < reads_; i++) { for (int i = 0; i < reads_; i++) {
char key[100]; char key[100];
const int k = rand_.Next() % FLAGS_num; const int k = thread->rand.Next() % FLAGS_num;
snprintf(key, sizeof(key), "%016d", k); snprintf(key, sizeof(key), "%016d", k);
db_->Get(options, key, &value); db_->Get(options, key, &value);
FinishedSingleOp(); thread->stats.FinishedSingleOp();
} }
} }
void ReadHot() { void ReadHot(ThreadState* thread) {
ReadOptions options; ReadOptions options;
std::string value; std::string value;
const int range = (FLAGS_num + 99) / 100; const int range = (FLAGS_num + 99) / 100;
for (int i = 0; i < reads_; i++) { for (int i = 0; i < reads_; i++) {
char key[100]; char key[100];
const int k = rand_.Next() % range; const int k = thread->rand.Next() % range;
snprintf(key, sizeof(key), "%016d", k); snprintf(key, sizeof(key), "%016d", k);
db_->Get(options, key, &value); db_->Get(options, key, &value);
FinishedSingleOp(); thread->stats.FinishedSingleOp();
} }
} }
void Compact() { void Compact(ThreadState* thread) {
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable(); dbi->TEST_CompactMemTable();
int max_level_with_files = 1; int max_level_with_files = 1;
@ -609,10 +780,9 @@ class Benchmark {
void PrintStats() { void PrintStats() {
std::string stats; std::string stats;
if (!db_->GetProperty("leveldb.stats", &stats)) { if (!db_->GetProperty("leveldb.stats", &stats)) {
message_ = "(failed)"; stats = "(failed)";
} else {
post_message_ = stats;
} }
fprintf(stdout, "\n%s\n", stats.c_str());
} }
static void WriteToFile(void* arg, const char* buf, int n) { static void WriteToFile(void* arg, const char* buf, int n) {
@ -625,13 +795,13 @@ class Benchmark {
WritableFile* file; WritableFile* file;
Status s = Env::Default()->NewWritableFile(fname, &file); Status s = Env::Default()->NewWritableFile(fname, &file);
if (!s.ok()) { if (!s.ok()) {
message_ = s.ToString(); fprintf(stderr, "%s\n", s.ToString().c_str());
return; return;
} }
bool ok = port::GetHeapProfile(WriteToFile, file); bool ok = port::GetHeapProfile(WriteToFile, file);
delete file; delete file;
if (!ok) { if (!ok) {
message_ = "not supported"; fprintf(stderr, "heap profiling not supported\n");
Env::Default()->DeleteFile(fname); Env::Default()->DeleteFile(fname);
} }
} }
@ -661,6 +831,8 @@ int main(int argc, char** argv) {
FLAGS_num = n; FLAGS_num = n;
} else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) { } else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) {
FLAGS_reads = n; FLAGS_reads = n;
} else if (sscanf(argv[i], "--threads=%d%c", &n, &junk) == 1) {
FLAGS_threads = n;
} else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) { } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) {
FLAGS_value_size = n; FLAGS_value_size = n;
} else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) { } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {

@ -989,27 +989,37 @@ Status DBImpl::Get(const ReadOptions& options,
snapshot = versions_->LastSequence(); snapshot = versions_->LastSequence();
} }
// First look in the memtable, then in the immutable memtable (if any). MemTable* mem = mem_;
LookupKey lkey(key, snapshot); MemTable* imm = imm_;
if (mem_->Get(lkey, value, &s)) {
return s;
}
if (imm_ != NULL && imm_->Get(lkey, value, &s)) {
return s;
}
// Not in memtable(s); try live files in level order
Version* current = versions_->current(); Version* current = versions_->current();
mem->Ref();
if (imm != NULL) imm->Ref();
current->Ref(); current->Ref();
bool have_stat_update = false;
Version::GetStats stats; Version::GetStats stats;
{ // Unlock while reading from files
// Unlock while reading from files and memtables
{
mutex_.Unlock(); mutex_.Unlock();
s = current->Get(options, lkey, value, &stats); // First look in the memtable, then in the immutable memtable (if any).
LookupKey lkey(key, snapshot);
if (mem_->Get(lkey, value, &s)) {
// Done
} else if (imm_ != NULL && imm_->Get(lkey, value, &s)) {
// Done
} else {
s = current->Get(options, lkey, value, &stats);
have_stat_update = true;
}
mutex_.Lock(); mutex_.Lock();
} }
if (current->UpdateStats(stats)) {
if (have_stat_update && current->UpdateStats(stats)) {
MaybeScheduleCompaction(); MaybeScheduleCompaction();
} }
mem->Unref();
if (imm != NULL) imm->Unref();
current->Unref(); current->Unref();
return s; return s;
} }

@ -30,7 +30,8 @@ struct LRUHandle {
LRUHandle* prev; LRUHandle* prev;
size_t charge; // TODO(opt): Only allow uint32_t? size_t charge; // TODO(opt): Only allow uint32_t?
size_t key_length; size_t key_length;
size_t refs; // TODO(opt): Pack with "key_length"? uint32_t refs;
uint32_t hash; // Hash of key(); used for fast sharding and comparisons
char key_data[1]; // Beginning of key char key_data[1]; // Beginning of key
Slice key() const { Slice key() const {
@ -54,12 +55,12 @@ class HandleTable {
HandleTable() : length_(0), elems_(0), list_(NULL) { Resize(); } HandleTable() : length_(0), elems_(0), list_(NULL) { Resize(); }
~HandleTable() { delete[] list_; } ~HandleTable() { delete[] list_; }
LRUHandle* Lookup(LRUHandle* h) { LRUHandle* Lookup(const Slice& key, uint32_t hash) {
return *FindPointer(h); return *FindPointer(key, hash);
} }
LRUHandle* Insert(LRUHandle* h) { LRUHandle* Insert(LRUHandle* h) {
LRUHandle** ptr = FindPointer(h); LRUHandle** ptr = FindPointer(h->key(), h->hash);
LRUHandle* old = *ptr; LRUHandle* old = *ptr;
h->next_hash = (old == NULL ? NULL : old->next_hash); h->next_hash = (old == NULL ? NULL : old->next_hash);
*ptr = h; *ptr = h;
@ -74,8 +75,8 @@ class HandleTable {
return old; return old;
} }
LRUHandle* Remove(LRUHandle* h) { LRUHandle* Remove(const Slice& key, uint32_t hash) {
LRUHandle** ptr = FindPointer(h); LRUHandle** ptr = FindPointer(key, hash);
LRUHandle* result = *ptr; LRUHandle* result = *ptr;
if (result != NULL) { if (result != NULL) {
*ptr = result->next_hash; *ptr = result->next_hash;
@ -92,13 +93,12 @@ class HandleTable {
LRUHandle** list_; LRUHandle** list_;
// Return a pointer to slot that points to a cache entry that // Return a pointer to slot that points to a cache entry that
// matches *h. If there is no such cache entry, return a pointer to // matches key/hash. If there is no such cache entry, return a
// the trailing slot in the corresponding linked list. // pointer to the trailing slot in the corresponding linked list.
LRUHandle** FindPointer(LRUHandle* h) { LRUHandle** FindPointer(const Slice& key, uint32_t hash) {
Slice key = h->key();
uint32_t hash = Hash(key.data(), key.size(), 0);
LRUHandle** ptr = &list_[hash & (length_ - 1)]; LRUHandle** ptr = &list_[hash & (length_ - 1)];
while (*ptr != NULL && key != (*ptr)->key()) { while (*ptr != NULL &&
((*ptr)->hash != hash || key != (*ptr)->key())) {
ptr = &(*ptr)->next_hash; ptr = &(*ptr)->next_hash;
} }
return ptr; return ptr;
@ -117,7 +117,7 @@ class HandleTable {
while (h != NULL) { while (h != NULL) {
LRUHandle* next = h->next_hash; LRUHandle* next = h->next_hash;
Slice key = h->key(); Slice key = h->key();
uint32_t hash = Hash(key.data(), key.size(), 0); uint32_t hash = h->hash;
LRUHandle** ptr = &new_list[hash & (new_length - 1)]; LRUHandle** ptr = &new_list[hash & (new_length - 1)];
h->next_hash = *ptr; h->next_hash = *ptr;
*ptr = h; *ptr = h;
@ -132,26 +132,30 @@ class HandleTable {
} }
}; };
class LRUCache : public Cache { // A single shard of sharded cache.
class LRUCache {
public: public:
explicit LRUCache(size_t capacity); LRUCache();
virtual ~LRUCache(); ~LRUCache();
virtual Handle* Insert(const Slice& key, void* value, size_t charge, // Separate from constructor so caller can easily make an array of LRUCache
void (*deleter)(const Slice& key, void* value)); void SetCapacity(size_t capacity) { capacity_ = capacity; }
virtual Handle* Lookup(const Slice& key);
virtual void Release(Handle* handle); // Like Cache methods, but with an extra "hash" parameter.
virtual void* Value(Handle* handle); Cache::Handle* Insert(const Slice& key, uint32_t hash,
virtual void Erase(const Slice& key); void* value, size_t charge,
virtual uint64_t NewId(); void (*deleter)(const Slice& key, void* value));
Cache::Handle* Lookup(const Slice& key, uint32_t hash);
void Release(Cache::Handle* handle);
void Erase(const Slice& key, uint32_t hash);
private: private:
void LRU_Remove(LRUHandle* e); void LRU_Remove(LRUHandle* e);
void LRU_Append(LRUHandle* e); void LRU_Append(LRUHandle* e);
void Unref(LRUHandle* e); void Unref(LRUHandle* e);
// Constructor parameters // Initialized before use.
const size_t capacity_; size_t capacity_;
// mutex_ protects the following state. // mutex_ protects the following state.
port::Mutex mutex_; port::Mutex mutex_;
@ -165,9 +169,8 @@ class LRUCache : public Cache {
HandleTable table_; HandleTable table_;
}; };
LRUCache::LRUCache(size_t capacity) LRUCache::LRUCache()
: capacity_(capacity), : usage_(0),
usage_(0),
last_id_(0) { last_id_(0) {
// Make empty circular linked list // Make empty circular linked list
lru_.next = &lru_; lru_.next = &lru_;
@ -206,32 +209,25 @@ void LRUCache::LRU_Append(LRUHandle* e) {
e->next->prev = e; e->next->prev = e;
} }
Cache::Handle* LRUCache::Lookup(const Slice& key) { Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) {
MutexLock l(&mutex_); MutexLock l(&mutex_);
LRUHandle* e = table_.Lookup(key, hash);
LRUHandle dummy;
dummy.next = &dummy;
dummy.value = const_cast<Slice*>(&key);
LRUHandle* e = table_.Lookup(&dummy);
if (e != NULL) { if (e != NULL) {
e->refs++; e->refs++;
LRU_Remove(e); LRU_Remove(e);
LRU_Append(e); LRU_Append(e);
} }
return reinterpret_cast<Handle*>(e); return reinterpret_cast<Cache::Handle*>(e);
} }
void* LRUCache::Value(Handle* handle) { void LRUCache::Release(Cache::Handle* handle) {
return reinterpret_cast<LRUHandle*>(handle)->value;
}
void LRUCache::Release(Handle* handle) {
MutexLock l(&mutex_); MutexLock l(&mutex_);
Unref(reinterpret_cast<LRUHandle*>(handle)); Unref(reinterpret_cast<LRUHandle*>(handle));
} }
Cache::Handle* LRUCache::Insert(const Slice& key, void* value, size_t charge, Cache::Handle* LRUCache::Insert(
void (*deleter)(const Slice& key, void* value)) { const Slice& key, uint32_t hash, void* value, size_t charge,
void (*deleter)(const Slice& key, void* value)) {
MutexLock l(&mutex_); MutexLock l(&mutex_);
LRUHandle* e = reinterpret_cast<LRUHandle*>( LRUHandle* e = reinterpret_cast<LRUHandle*>(
@ -240,6 +236,7 @@ Cache::Handle* LRUCache::Insert(const Slice& key, void* value, size_t charge,
e->deleter = deleter; e->deleter = deleter;
e->charge = charge; e->charge = charge;
e->key_length = key.size(); e->key_length = key.size();
e->hash = hash;
e->refs = 2; // One from LRUCache, one for the returned handle e->refs = 2; // One from LRUCache, one for the returned handle
memcpy(e->key_data, key.data(), key.size()); memcpy(e->key_data, key.data(), key.size());
LRU_Append(e); LRU_Append(e);
@ -254,35 +251,77 @@ Cache::Handle* LRUCache::Insert(const Slice& key, void* value, size_t charge,
while (usage_ > capacity_ && lru_.next != &lru_) { while (usage_ > capacity_ && lru_.next != &lru_) {
LRUHandle* old = lru_.next; LRUHandle* old = lru_.next;
LRU_Remove(old); LRU_Remove(old);
table_.Remove(old); table_.Remove(old->key(), old->hash);
Unref(old); Unref(old);
} }
return reinterpret_cast<Handle*>(e); return reinterpret_cast<Cache::Handle*>(e);
} }
void LRUCache::Erase(const Slice& key) { void LRUCache::Erase(const Slice& key, uint32_t hash) {
MutexLock l(&mutex_); MutexLock l(&mutex_);
LRUHandle* e = table_.Remove(key, hash);
LRUHandle dummy;
dummy.next = &dummy;
dummy.value = const_cast<Slice*>(&key);
LRUHandle* e = table_.Remove(&dummy);
if (e != NULL) { if (e != NULL) {
LRU_Remove(e); LRU_Remove(e);
Unref(e); Unref(e);
} }
} }
uint64_t LRUCache::NewId() { static const int kNumShardBits = 4;
MutexLock l(&mutex_); static const int kNumShards = 1 << kNumShardBits;
return ++(last_id_);
} class ShardedLRUCache : public Cache {
private:
LRUCache shard_[kNumShards];
port::Mutex id_mutex_;
uint64_t last_id_;
static inline uint32_t HashSlice(const Slice& s) {
return Hash(s.data(), s.size(), 0);
}
static uint32_t Shard(uint32_t hash) {
return hash >> (32 - kNumShardBits);
}
public:
explicit ShardedLRUCache(size_t capacity) {
const size_t per_shard = (capacity + (kNumShards - 1)) / kNumShards;
for (int s = 0; s < kNumShards; s++) {
shard_[s].SetCapacity(per_shard);
}
}
virtual ~ShardedLRUCache() { }
virtual Handle* Insert(const Slice& key, void* value, size_t charge,
void (*deleter)(const Slice& key, void* value)) {
const uint32_t hash = HashSlice(key);
return shard_[Shard(hash)].Insert(key, hash, value, charge, deleter);
}
virtual Handle* Lookup(const Slice& key) {
const uint32_t hash = HashSlice(key);
return shard_[Shard(hash)].Lookup(key, hash);
}
virtual void Release(Handle* handle) {
LRUHandle* h = reinterpret_cast<LRUHandle*>(handle);
shard_[Shard(h->hash)].Release(handle);
}
virtual void Erase(const Slice& key) {
const uint32_t hash = HashSlice(key);
shard_[Shard(hash)].Erase(key, hash);
}
virtual void* Value(Handle* handle) {
return reinterpret_cast<LRUHandle*>(handle)->value;
}
virtual uint64_t NewId() {
MutexLock l(&id_mutex_);
return ++(last_id_);
}
};
} // end anonymous namespace } // end anonymous namespace
Cache* NewLRUCache(size_t capacity) { Cache* NewLRUCache(size_t capacity) {
return new LRUCache(capacity); return new ShardedLRUCache(capacity);
} }
} }

@ -32,7 +32,7 @@ class CacheTest {
current_->deleted_values_.push_back(DecodeValue(v)); current_->deleted_values_.push_back(DecodeValue(v));
} }
static const int kCacheSize = 100; static const int kCacheSize = 1000;
std::vector<int> deleted_keys_; std::vector<int> deleted_keys_;
std::vector<int> deleted_values_; std::vector<int> deleted_values_;
Cache* cache_; Cache* cache_;
@ -137,23 +137,40 @@ TEST(CacheTest, EvictionPolicy) {
Insert(200, 201); Insert(200, 201);
// Frequently used entry must be kept around // Frequently used entry must be kept around
for (int i = 0; i < kCacheSize; i++) { for (int i = 0; i < kCacheSize + 100; i++) {
Insert(1000+i, 2000+i); Insert(1000+i, 2000+i);
ASSERT_EQ(2000+i, Lookup(1000+i)); ASSERT_EQ(2000+i, Lookup(1000+i));
ASSERT_EQ(101, Lookup(100)); ASSERT_EQ(101, Lookup(100));
} }
ASSERT_EQ(101, Lookup(100)); ASSERT_EQ(101, Lookup(100));
ASSERT_EQ(2, deleted_keys_.size()); ASSERT_EQ(-1, Lookup(200));
ASSERT_EQ(200, deleted_keys_[0]);
ASSERT_EQ(201, deleted_values_[0]);
} }
TEST(CacheTest, HeavyEntry) { TEST(CacheTest, HeavyEntries) {
Insert(100, 101); // Add a bunch of light and heavy entries and then count the combined
Insert(200, 201, kCacheSize); // size of items still in the cache, which must be approximately the
ASSERT_EQ(1, deleted_keys_.size()); // same as the total capacity.
ASSERT_EQ(100, deleted_keys_[0]); const int kLight = 1;
ASSERT_EQ(101, deleted_values_[0]); const int kHeavy = 10;
int added = 0;
int index = 0;
while (added < 2*kCacheSize) {
const int weight = (index & 1) ? kLight : kHeavy;
Insert(index, 1000+index, weight);
added += weight;
index++;
}
int cached_weight = 0;
for (int i = 0; i < index; i++) {
const int weight = (i & 1 ? kLight : kHeavy);
int r = Lookup(i);
if (r >= 0) {
cached_weight += weight;
ASSERT_EQ(1000+i, r);
}
}
ASSERT_LE(cached_weight, kCacheSize + kCacheSize/10);
} }
TEST(CacheTest, NewId) { TEST(CacheTest, NewId) {

@ -55,6 +55,17 @@ void Histogram::Add(double value) {
sum_squares_ += (value * value); sum_squares_ += (value * value);
} }
void Histogram::Merge(const Histogram& other) {
if (other.min_ < min_) min_ = other.min_;
if (other.max_ > max_) max_ = other.max_;
num_ += other.num_;
sum_ += other.sum_;
sum_squares_ += other.sum_squares_;
for (int b = 0; b < kNumBuckets; b++) {
buckets_[b] += other.buckets_[b];
}
}
double Histogram::Median() const { double Histogram::Median() const {
return Percentile(50.0); return Percentile(50.0);
} }

@ -16,6 +16,7 @@ class Histogram {
void Clear(); void Clear();
void Add(double value); void Add(double value);
void Merge(const Histogram& other);
std::string ToString() const; std::string ToString() const;

Loading…
Cancel
Save