reverting disastrous MOE commit, returning to r21

git-svn-id: https://leveldb.googlecode.com/svn/trunk@23 62dab493-f737-651d-591e-8d6aee1b9529
main
dgrogan@chromium.org 14 years ago
parent b743906eea
commit 69c6d38342
  1. 0
      AUTHORS
  2. 0
      LICENSE
  3. 5
      Makefile
  4. 2
      README
  5. 4
      TODO
  6. 9
      db/builder.cc
  7. 6
      db/builder.h
  8. 26
      db/corruption_test.cc
  9. 22
      db/db_bench.cc
  10. 191
      db/db_impl.cc
  11. 23
      db/db_impl.h
  12. 101
      db/db_iter.cc
  13. 0
      db/db_iter.h
  14. 185
      db/db_test.cc
  15. 65
      db/dbformat.cc
  16. 53
      db/dbformat.h
  17. 15
      db/dbformat_test.cc
  18. 19
      db/filename.cc
  19. 16
      db/filename.h
  20. 156
      db/filename_test.cc
  21. 0
      db/log_format.h
  22. 0
      db/log_reader.cc
  23. 0
      db/log_reader.h
  24. 0
      db/log_test.cc
  25. 4
      db/log_writer.cc
  26. 0
      db/log_writer.h
  27. 0
      db/memtable.cc
  28. 0
      db/memtable.h
  29. 40
      db/repair.cc
  30. 0
      db/skiplist.h
  31. 0
      db/skiplist_test.cc
  32. 0
      db/snapshot.h
  33. 0
      db/table_cache.cc
  34. 0
      db/table_cache.h
  35. 43
      db/version_edit.cc
  36. 18
      db/version_edit.h
  37. 6
      db/version_edit_test.cc
  38. 129
      db/version_set.cc
  39. 28
      db/version_set.h
  40. 16
      db/write_batch.cc
  41. 4
      db/write_batch_internal.h
  42. 23
      db/write_batch_test.cc
  43. 0
      doc/doc.css
  44. 13
      doc/impl.html
  45. 11
      doc/index.html
  46. 0
      doc/log_format.txt
  47. 0
      doc/table_format.txt
  48. 0
      include/leveldb/cache.h
  49. 0
      include/leveldb/comparator.h
  50. 0
      include/leveldb/db.h
  51. 0
      include/leveldb/env.h
  52. 0
      include/leveldb/iterator.h
  53. 12
      include/leveldb/options.h
  54. 0
      include/leveldb/slice.h
  55. 0
      include/leveldb/status.h
  56. 0
      include/leveldb/table.h
  57. 0
      include/leveldb/table_builder.h
  58. 0
      include/leveldb/write_batch.h
  59. 12
      leveldb.gyp
  60. 122
      leveldb/db/filename_test.cc
  61. 0
      port/README
  62. 0
      port/port.h
  63. 0
      port/port_android.cc
  64. 8
      port/port_android.h
  65. 0
      port/port_chromium.cc
  66. 7
      port/port_chromium.h
  67. 5
      port/port_example.h
  68. 0
      port/port_posix.cc
  69. 5
      port/port_posix.h
  70. 298
      port/sha1_portable.cc
  71. 25
      port/sha1_portable.h
  72. 39
      port/sha1_test.cc
  73. 0
      port/win/stdint.h
  74. 4
      table/block.cc
  75. 0
      table/block.h
  76. 2
      table/block_builder.cc
  77. 0
      table/block_builder.h
  78. 4
      table/format.cc
  79. 0
      table/format.h
  80. 0
      table/iterator.cc
  81. 0
      table/iterator_wrapper.h
  82. 0
      table/merger.cc
  83. 0
      table/merger.h
  84. 0
      table/table.cc
  85. 0
      table/table_builder.cc
  86. 0
      table/table_test.cc
  87. 0
      table/two_level_iterator.cc
  88. 0
      table/two_level_iterator.h
  89. 2
      util/arena.cc
  90. 0
      util/arena.h
  91. 0
      util/arena_test.cc
  92. 0
      util/cache.cc
  93. 0
      util/cache_test.cc
  94. 2
      util/coding.cc
  95. 0
      util/coding.h
  96. 0
      util/coding_test.cc
  97. 2
      util/comparator.cc
  98. 0
      util/crc32c.cc
  99. 0
      util/crc32c.h
  100. 0
      util/crc32c_test.cc
  101. Some files were not shown because too many files have changed in this diff Show More

@ -27,6 +27,7 @@ LIBOBJECTS = \
./db/version_set.o \ ./db/version_set.o \
./db/write_batch.o \ ./db/write_batch.o \
./port/port_posix.o \ ./port/port_posix.o \
./port/sha1_portable.o \
./table/block.o \ ./table/block.o \
./table/block_builder.o \ ./table/block_builder.o \
./table/format.o \ ./table/format.o \
@ -62,6 +63,7 @@ TESTS = \
env_test \ env_test \
filename_test \ filename_test \
log_test \ log_test \
sha1_test \
skiplist_test \ skiplist_test \
table_test \ table_test \
version_edit_test \ version_edit_test \
@ -113,6 +115,9 @@ log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CC) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(CC) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
sha1_test: port/sha1_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CC) $(LDFLAGS) port/sha1_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CC) $(LDFLAGS) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(CC) $(LDFLAGS) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@

@ -5,7 +5,7 @@ The code under this directory implements a system for maintaining a
persistent key/value store. persistent key/value store.
See doc/index.html for more explanation. See doc/index.html for more explanation.
See doc/impl.html for a brief overview of the implementation. See doc/db_layout.txt for a brief overview of the implementation.
The public interface is in include/*.h. Callers should not include or The public interface is in include/*.h. Callers should not include or
rely on the details of any other header files in this package. Those rely on the details of any other header files in this package. Those

@ -8,7 +8,7 @@ db
object stores, etc. can be done in the background anyway, so object stores, etc. can be done in the background anyway, so
probably not that important. probably not that important.
api changes: api changes?
- Make it wrappable - Efficient large value reading and writing
Faster Get implementation Faster Get implementation

@ -38,6 +38,15 @@ Status BuildTable(const std::string& dbname,
for (; iter->Valid(); iter->Next()) { for (; iter->Valid(); iter->Next()) {
Slice key = iter->key(); Slice key = iter->key();
meta->largest.DecodeFrom(key); meta->largest.DecodeFrom(key);
if (ExtractValueType(key) == kTypeLargeValueRef) {
if (iter->value().size() != LargeValueRef::ByteSize()) {
s = Status::Corruption("invalid indirect reference hash value (L0)");
break;
}
edit->AddLargeValueRef(LargeValueRef::FromRef(iter->value()),
meta->number,
iter->key());
}
builder->Add(key, iter->value()); builder->Add(key, iter->value());
} }

@ -20,9 +20,9 @@ class VersionEdit;
// Build a Table file from the contents of *iter. The generated file // Build a Table file from the contents of *iter. The generated file
// will be named according to meta->number. On success, the rest of // will be named according to meta->number. On success, the rest of
// *meta will be filled with metadata about the generated table, and // *meta will be filled with metadata about the generated table, and
// the file information will be added to *edit. If no data is present // large value refs and the added file information will be added to
// in *iter, meta->file_size will be set to zero, and no Table file // *edit. If no data is present in *iter, meta->file_size will be set
// will be produced. // to zero, and no Table file will be produced.
extern Status BuildTable(const std::string& dbname, extern Status BuildTable(const std::string& dbname,
Env* env, Env* env,
const Options& options, const Options& options,

@ -121,10 +121,11 @@ class CorruptionTest {
std::vector<std::string> filenames; std::vector<std::string> filenames;
ASSERT_OK(env_.GetChildren(dbname_, &filenames)); ASSERT_OK(env_.GetChildren(dbname_, &filenames));
uint64_t number; uint64_t number;
LargeValueRef large_ref;
FileType type; FileType type;
std::vector<std::string> candidates; std::vector<std::string> candidates;
for (int i = 0; i < filenames.size(); i++) { for (int i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type) && if (ParseFileName(filenames[i], &number, &large_ref, &type) &&
type == filetype) { type == filetype) {
candidates.push_back(dbname_ + "/" + filenames[i]); candidates.push_back(dbname_ + "/" + filenames[i]);
} }
@ -275,6 +276,29 @@ TEST(CorruptionTest, SequenceNumberRecovery) {
ASSERT_EQ("v6", v); ASSERT_EQ("v6", v);
} }
// Writes one 100000-byte value with large_value_threshold set to 10000, then
// checks that the value reads back unchanged before a repair, after a repair,
// and after one further reopen.
TEST(CorruptionTest, LargeValueRecovery) {
  Options opts;
  opts.large_value_threshold = 10000;
  Reopen(&opts);

  // Store the random payload and remember it in "expected".
  Random rnd(301);
  std::string expected;
  ASSERT_OK(db_->Put(WriteOptions(),
                     "foo", test::RandomString(&rnd, 100000, &expected)));

  // Baseline read before any repair.
  std::string actual;
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &actual));
  ASSERT_EQ(expected, actual);

  // Read after repairing and reopening the database.
  RepairDB();
  Reopen();
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &actual));
  ASSERT_EQ(expected, actual);

  // Read once more after a second reopen.
  Reopen();
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &actual));
  ASSERT_EQ(expected, actual);
}
TEST(CorruptionTest, CorruptedDescriptor) { TEST(CorruptionTest, CorruptedDescriptor) {
ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello")); ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);

@ -28,6 +28,7 @@
// readreverse -- read N values in reverse order // readreverse -- read N values in reverse order
// readrandom -- read N values in random order // readrandom -- read N values in random order
// crc32c -- repeated crc32c of 4K of data // crc32c -- repeated crc32c of 4K of data
// sha1 -- repeated SHA1 computation over 4K of data
// Meta operations: // Meta operations:
// compact -- Compact the entire DB // compact -- Compact the entire DB
// stats -- Print DB stats // stats -- Print DB stats
@ -47,6 +48,7 @@ static const char* FLAGS_benchmarks =
"readreverse," "readreverse,"
"fill100K," "fill100K,"
"crc32c," "crc32c,"
"sha1,"
"snappycomp," "snappycomp,"
"snappyuncomp," "snappyuncomp,"
; ;
@ -364,6 +366,8 @@ class Benchmark {
Compact(); Compact();
} else if (name == Slice("crc32c")) { } else if (name == Slice("crc32c")) {
Crc32c(4096, "(4K per op)"); Crc32c(4096, "(4K per op)");
} else if (name == Slice("sha1")) {
SHA1(4096, "(4K per op)");
} else if (name == Slice("snappycomp")) { } else if (name == Slice("snappycomp")) {
SnappyCompress(); SnappyCompress();
} else if (name == Slice("snappyuncomp")) { } else if (name == Slice("snappyuncomp")) {
@ -402,6 +406,24 @@ class Benchmark {
message_ = label; message_ = label;
} }
// Benchmark body for the "sha1" operation: hashes a buffer of "size" bytes
// (filled with 'x') with port::SHA1_Hash over and over until at least 100MB
// has been hashed, calling FinishedSingleOp() after every hash.  On return
// bytes_ holds the number of bytes hashed and message_ is set to "label".
void SHA1(int size, const char* label) {
  // SHA1 about 100MB of data total
  std::string data(size, 'x');
  int64_t bytes = 0;
  char sha1[20];
  while (bytes < 100 * 1048576) {
    port::SHA1_Hash(data.data(), size, sha1);
    FinishedSingleOp();
    bytes += size;
  }
  // Print so result is not dead.  Convert through unsigned char first:
  // where plain char is signed, a digest byte >= 0x80 would otherwise be
  // sign-extended and printed as "ffffffxx" instead of two hex digits.
  fprintf(stderr, "... sha1=%02x...\r",
          static_cast<unsigned int>(static_cast<unsigned char>(sha1[0])));
  bytes_ = bytes;
  message_ = label;
}
void SnappyCompress() { void SnappyCompress() {
Slice input = gen_.Generate(Options().block_size); Slice input = gen_.Generate(Options().block_size);
int64_t bytes = 0; int64_t bytes = 0;

@ -81,8 +81,8 @@ class NullWritableFile : public WritableFile {
// Fix user-supplied options to be reasonable // Fix user-supplied options to be reasonable
template <class T,class V> template <class T,class V>
static void ClipToRange(T* ptr, V minvalue, V maxvalue) { static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue; if (*ptr > maxvalue) *ptr = maxvalue;
if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue; if (*ptr < minvalue) *ptr = minvalue;
} }
Options SanitizeOptions(const std::string& dbname, Options SanitizeOptions(const std::string& dbname,
const InternalKeyComparator* icmp, const InternalKeyComparator* icmp,
@ -91,6 +91,7 @@ Options SanitizeOptions(const std::string& dbname,
result.comparator = icmp; result.comparator = icmp;
ClipToRange(&result.max_open_files, 20, 50000); ClipToRange(&result.max_open_files, 20, 50000);
ClipToRange(&result.write_buffer_size, 64<<10, 1<<30); ClipToRange(&result.write_buffer_size, 64<<10, 1<<30);
ClipToRange(&result.large_value_threshold, 16<<10, 1<<30);
ClipToRange(&result.block_size, 1<<10, 4<<20); ClipToRange(&result.block_size, 1<<10, 4<<20);
if (result.info_log == NULL) { if (result.info_log == NULL) {
// Open a log file in the same directory as the db // Open a log file in the same directory as the db
@ -212,12 +213,15 @@ void DBImpl::DeleteObsoleteFiles() {
std::set<uint64_t> live = pending_outputs_; std::set<uint64_t> live = pending_outputs_;
versions_->AddLiveFiles(&live); versions_->AddLiveFiles(&live);
versions_->CleanupLargeValueRefs(live);
std::vector<std::string> filenames; std::vector<std::string> filenames;
env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose
uint64_t number; uint64_t number;
LargeValueRef large_ref;
FileType type; FileType type;
for (size_t i = 0; i < filenames.size(); i++) { for (int i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type)) { if (ParseFileName(filenames[i], &number, &large_ref, &type)) {
bool keep = true; bool keep = true;
switch (type) { switch (type) {
case kLogFile: case kLogFile:
@ -237,6 +241,9 @@ void DBImpl::DeleteObsoleteFiles() {
// be recorded in pending_outputs_, which is inserted into "live" // be recorded in pending_outputs_, which is inserted into "live"
keep = (live.find(number) != live.end()); keep = (live.find(number) != live.end());
break; break;
case kLargeValueFile:
keep = versions_->LargeValueIsLive(large_ref);
break;
case kCurrentFile: case kCurrentFile:
case kDBLockFile: case kDBLockFile:
case kInfoLogFile: case kInfoLogFile:
@ -592,7 +599,7 @@ void DBImpl::CleanupCompaction(CompactionState* compact) {
assert(compact->outfile == NULL); assert(compact->outfile == NULL);
} }
delete compact->outfile; delete compact->outfile;
for (size_t i = 0; i < compact->outputs.size(); i++) { for (int i = 0; i < compact->outputs.size(); i++) {
const CompactionState::Output& out = compact->outputs[i]; const CompactionState::Output& out = compact->outputs[i];
pending_outputs_.erase(out.number); pending_outputs_.erase(out.number);
} }
@ -688,7 +695,7 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) {
// Add compaction outputs // Add compaction outputs
compact->compaction->AddInputDeletions(compact->compaction->edit()); compact->compaction->AddInputDeletions(compact->compaction->edit());
const int level = compact->compaction->level(); const int level = compact->compaction->level();
for (size_t i = 0; i < compact->outputs.size(); i++) { for (int i = 0; i < compact->outputs.size(); i++) {
const CompactionState::Output& out = compact->outputs[i]; const CompactionState::Output& out = compact->outputs[i];
compact->compaction->edit()->AddFile( compact->compaction->edit()->AddFile(
level + 1, level + 1,
@ -703,7 +710,7 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) {
DeleteObsoleteFiles(); DeleteObsoleteFiles();
} else { } else {
// Discard any files we may have created during this failed compaction // Discard any files we may have created during this failed compaction
for (size_t i = 0; i < compact->outputs.size(); i++) { for (int i = 0; i < compact->outputs.size(); i++) {
env_->DeleteFile(TableFileName(dbname_, compact->outputs[i].number)); env_->DeleteFile(TableFileName(dbname_, compact->outputs[i].number));
} }
} }
@ -804,7 +811,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
" Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, " " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
"%d smallest_snapshot: %d", "%d smallest_snapshot: %d",
ikey.user_key.ToString().c_str(), ikey.user_key.ToString().c_str(),
(int)ikey.sequence, ikey.type, kTypeValue, drop, (int)ikey.sequence, ikey.type, kTypeLargeValueRef, drop,
compact->compaction->IsBaseLevelForKey(ikey.user_key), compact->compaction->IsBaseLevelForKey(ikey.user_key),
(int)last_sequence_for_key, (int)compact->smallest_snapshot); (int)last_sequence_for_key, (int)compact->smallest_snapshot);
#endif #endif
@ -821,7 +828,26 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
compact->current_output()->smallest.DecodeFrom(key); compact->current_output()->smallest.DecodeFrom(key);
} }
compact->current_output()->largest.DecodeFrom(key); compact->current_output()->largest.DecodeFrom(key);
if (ikey.type == kTypeLargeValueRef) {
if (input->value().size() != LargeValueRef::ByteSize()) {
if (options_.paranoid_checks) {
status = Status::Corruption("invalid large value ref");
break;
} else {
Log(env_, options_.info_log,
"compaction found invalid large value ref");
}
} else {
compact->compaction->edit()->AddLargeValueRef(
LargeValueRef::FromRef(input->value()),
compact->current_output()->number,
input->key());
compact->builder->Add(key, input->value()); compact->builder->Add(key, input->value());
}
} else {
compact->builder->Add(key, input->value());
}
// Close output file if it is big enough // Close output file if it is big enough
if (compact->builder->FileSize() >= if (compact->builder->FileSize() >=
@ -855,7 +881,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
stats.bytes_read += compact->compaction->input(which, i)->file_size; stats.bytes_read += compact->compaction->input(which, i)->file_size;
} }
} }
for (size_t i = 0; i < compact->outputs.size(); i++) { for (int i = 0; i < compact->outputs.size(); i++) {
stats.bytes_written += compact->outputs[i].file_size; stats.bytes_written += compact->outputs[i].file_size;
} }
@ -959,27 +985,40 @@ Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) { Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
Status status; Status status;
WriteBatch* final = NULL;
{
MutexLock l(&mutex_); MutexLock l(&mutex_);
status = MakeRoomForWrite(false); // May temporarily release lock and wait status = MakeRoomForWrite(false); // May temporarily release lock and wait
uint64_t last_sequence = versions_->LastSequence(); uint64_t last_sequence = versions_->LastSequence();
if (status.ok()) { if (status.ok()) {
WriteBatchInternal::SetSequence(updates, last_sequence + 1); status = HandleLargeValues(last_sequence + 1, updates, &final);
last_sequence += WriteBatchInternal::Count(updates); }
if (status.ok()) {
WriteBatchInternal::SetSequence(final, last_sequence + 1);
last_sequence += WriteBatchInternal::Count(final);
versions_->SetLastSequence(last_sequence); versions_->SetLastSequence(last_sequence);
// Add to log and apply to memtable // Add to log and apply to memtable
status = log_->AddRecord(WriteBatchInternal::Contents(updates)); status = log_->AddRecord(WriteBatchInternal::Contents(final));
if (status.ok() && options.sync) { if (status.ok() && options.sync) {
status = logfile_->Sync(); status = logfile_->Sync();
} }
if (status.ok()) { if (status.ok()) {
status = WriteBatchInternal::InsertInto(updates, mem_); status = WriteBatchInternal::InsertInto(final, mem_);
} }
} }
if (options.post_write_snapshot != NULL) { if (options.post_write_snapshot != NULL) {
*options.post_write_snapshot = *options.post_write_snapshot =
status.ok() ? snapshots_.New(last_sequence) : NULL; status.ok() ? snapshots_.New(last_sequence) : NULL;
} }
}
if (final != updates) {
delete final;
}
return status; return status;
} }
@ -1031,6 +1070,124 @@ Status DBImpl::MakeRoomForWrite(bool force) {
return s; return s;
} }
bool DBImpl::HasLargeValues(const WriteBatch& batch) const {
if (WriteBatchInternal::ByteSize(&batch) >= options_.large_value_threshold) {
for (WriteBatchInternal::Iterator it(batch); !it.Done(); it.Next()) {
if (it.op() == kTypeValue &&
it.value().size() >= options_.large_value_threshold) {
return true;
}
}
}
return false;
}
// Decides how "raw_value" should be stored in its large value file.
//
// When options_.compression is kSnappyCompression and snappy shrinks the
// value to less than 7/8 of its original size, "*file_bytes" is pointed at
// the compressed bytes held in "*scratch" and "*ref" is made with
// kSnappyCompression.  In every other case "*file_bytes" is set to
// "raw_value" itself and "*ref" is made with kNoCompression.
//
// "*scratch" may be overwritten even when the uncompressed form is chosen.
void DBImpl::MaybeCompressLargeValue(
    const Slice& raw_value,
    Slice* file_bytes,
    std::string* scratch,
    LargeValueRef* ref) {
  if (options_.compression == kSnappyCompression) {
    const bool compressed =
        port::Snappy_Compress(raw_value.data(), raw_value.size(), scratch);
    // Require better than 12.5% savings before keeping the compressed form.
    if (compressed && scratch->size() < (raw_value.size() / 8) * 7) {
      *file_bytes = *scratch;
      *ref = LargeValueRef::Make(raw_value, kSnappyCompression);
      return;
    }
  }

  // Store as uncompressed data
  *file_bytes = raw_value;
  *ref = LargeValueRef::Make(raw_value, kNoCompression);
}
// Prepares "*updates" for logging.  "assigned_seq" is the sequence number of
// the first entry in "*updates".
//
// If the batch holds no large values, "*final" is set to "updates".
// Otherwise a new WriteBatch is allocated and stored in "*final": small puts
// and deletions are copied over unchanged, and every value of at least
// options_.large_value_threshold bytes is replaced with an indirect
// reference.  The large value file itself is written only when
// versions_->RegisterLargeValueRef() returns true.
//
// Returns OK on success.  On error "*final" may already point at a newly
// allocated batch; the caller deletes it whenever *final != updates.
Status DBImpl::HandleLargeValues(SequenceNumber assigned_seq,
                                 WriteBatch* updates,
                                 WriteBatch** final) {
  if (!HasLargeValues(*updates)) {
    // Fast path: no large values found
    *final = updates;
    return Status::OK();
  }

  // Copy *updates to a new WriteBatch, replacing each large value with an
  // indirect reference to it.
  *final = new WriteBatch;
  SequenceNumber seq = assigned_seq;
  for (WriteBatchInternal::Iterator it(*updates); !it.Done(); it.Next()) {
    switch (it.op()) {
      case kTypeValue:
        if (it.value().size() < options_.large_value_threshold) {
          (*final)->Put(it.key(), it.value());
        } else {
          std::string scratch;
          Slice file_bytes;
          LargeValueRef large_ref;
          MaybeCompressLargeValue(
              it.value(), &file_bytes, &scratch, &large_ref);
          InternalKey ikey(it.key(), seq, kTypeLargeValueRef);
          if (versions_->RegisterLargeValueRef(
                  large_ref, versions_->LogNumber(), ikey)) {
            // TODO(opt): avoid holding the lock here (but be careful about
            //     another thread doing a Write and switching logs or
            //     having us get a different "assigned_seq" value).

            // Write to a temporary file first and rename it into place once
            // its contents are complete.
            uint64_t tmp_number = versions_->NewFileNumber();
            pending_outputs_.insert(tmp_number);
            std::string tmp = TempFileName(dbname_, tmp_number);
            WritableFile* file;
            Status s = env_->NewWritableFile(tmp, &file);
            if (!s.ok()) {
              // Do not leave the unused file number registered as pending.
              pending_outputs_.erase(tmp_number);
              return s;     // Caller will delete *final
            }

            // Always close the file, but report the first failure: an
            // Append() error must not be hidden by a successful Close().
            s = file->Append(file_bytes);
            const Status close_status = file->Close();
            if (s.ok()) {
              s = close_status;
            }
            delete file;

            if (s.ok()) {
              const std::string fname =
                  LargeValueFileName(dbname_, large_ref);
              s = env_->RenameFile(tmp, fname);
            } else {
              Log(env_, options_.info_log, "Write large value: %s",
                  s.ToString().c_str());
            }
            pending_outputs_.erase(tmp_number);

            if (!s.ok()) {
              env_->DeleteFile(tmp);  // Cleanup; intentionally ignoring error
              return s;     // Caller will delete *final
            }
          }

          // Put an indirect reference in the write batch in place
          // of large value
          WriteBatchInternal::PutLargeValueRef(*final, it.key(), large_ref);
        }
        break;
      case kTypeLargeValueRef:
        // Incoming batches are rejected if they already contain references.
        return Status::Corruption("Corrupted write batch");
      case kTypeDeletion:
        (*final)->Delete(it.key());
        break;
    }
    seq = seq + 1;
  }
  return Status::OK();
}
bool DBImpl::GetProperty(const Slice& property, std::string* value) { bool DBImpl::GetProperty(const Slice& property, std::string* value) {
value->clear(); value->clear();
@ -1048,8 +1205,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) {
return false; return false;
} else { } else {
char buf[100]; char buf[100];
snprintf(buf, sizeof(buf), "%d", snprintf(buf, sizeof(buf), "%d", versions_->NumLevelFiles(level));
versions_->NumLevelFiles(static_cast<int>(level)));
*value = buf; *value = buf;
return true; return true;
} }
@ -1169,9 +1325,10 @@ Status DestroyDB(const std::string& dbname, const Options& options) {
Status result = env->LockFile(LockFileName(dbname), &lock); Status result = env->LockFile(LockFileName(dbname), &lock);
if (result.ok()) { if (result.ok()) {
uint64_t number; uint64_t number;
LargeValueRef large_ref;
FileType type; FileType type;
for (size_t i = 0; i < filenames.size(); i++) { for (int i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type)) { if (ParseFileName(filenames[i], &number, &large_ref, &type)) {
Status del = env->DeleteFile(dbname + "/" + filenames[i]); Status del = env->DeleteFile(dbname + "/" + filenames[i]);
if (result.ok() && !del.ok()) { if (result.ok() && !del.ok()) {
result = del; result = del;

@ -92,6 +92,29 @@ class DBImpl : public DB {
Status WriteLevel0Table(MemTable* mem, VersionEdit* edit); Status WriteLevel0Table(MemTable* mem, VersionEdit* edit);
Status MakeRoomForWrite(bool force /* compact even if there is room? */); Status MakeRoomForWrite(bool force /* compact even if there is room? */);
bool HasLargeValues(const WriteBatch& batch) const;
// Process data in "*updates" and return a status. "assigned_seq"
// is the sequence number assigned to the first mod in "*updates".
// If no large values are encountered, "*final" is set to "updates".
// If large values were encountered, registers the references of the
// large values with the VersionSet, writes the large values to
// files (if appropriate), and allocates a new WriteBatch with the
// large values replaced with indirect references and stores a
// pointer to the new WriteBatch in *final. If *final != updates on
// return, then the client should delete *final when no longer
// needed. Returns OK on success, and an appropriate error
// otherwise.
Status HandleLargeValues(SequenceNumber assigned_seq,
WriteBatch* updates,
WriteBatch** final);
// Helper routine for HandleLargeValues
void MaybeCompressLargeValue(
const Slice& raw_value,
Slice* file_bytes,
std::string* scratch,
LargeValueRef* ref);
struct CompactionState; struct CompactionState;

@ -53,11 +53,13 @@ class DBIter: public Iterator {
user_comparator_(cmp), user_comparator_(cmp),
iter_(iter), iter_(iter),
sequence_(s), sequence_(s),
large_(NULL),
direction_(kForward), direction_(kForward),
valid_(false) { valid_(false) {
} }
virtual ~DBIter() { virtual ~DBIter() {
delete iter_; delete iter_;
delete large_;
} }
virtual bool Valid() const { return valid_; } virtual bool Valid() const { return valid_; }
virtual Slice key() const { virtual Slice key() const {
@ -66,10 +68,20 @@ class DBIter: public Iterator {
} }
virtual Slice value() const { virtual Slice value() const {
assert(valid_); assert(valid_);
return (direction_ == kForward) ? iter_->value() : saved_value_; Slice raw_value = (direction_ == kForward) ? iter_->value() : saved_value_;
if (large_ == NULL) {
return raw_value;
} else {
MutexLock l(&large_->mutex);
if (!large_->produced) {
ReadIndirectValue(raw_value);
}
return large_->value;
}
} }
virtual Status status() const { virtual Status status() const {
if (status_.ok()) { if (status_.ok()) {
if (large_ != NULL && !large_->status.ok()) return large_->status;
return iter_->status(); return iter_->status();
} else { } else {
return status_; return status_;
@ -83,14 +95,29 @@ class DBIter: public Iterator {
virtual void SeekToLast(); virtual void SeekToLast();
private: private:
struct Large {
port::Mutex mutex;
std::string value;
bool produced;
Status status;
};
void FindNextUserEntry(bool skipping, std::string* skip); void FindNextUserEntry(bool skipping, std::string* skip);
void FindPrevUserEntry(); void FindPrevUserEntry();
bool ParseKey(ParsedInternalKey* key); bool ParseKey(ParsedInternalKey* key);
void ReadIndirectValue(Slice ref) const;
inline void SaveKey(const Slice& k, std::string* dst) { inline void SaveKey(const Slice& k, std::string* dst) {
dst->assign(k.data(), k.size()); dst->assign(k.data(), k.size());
} }
// Discards any large-value state attached to the current entry and resets
// large_ to NULL.
inline void ForgetLargeValue() {
  delete large_;  // deleting a NULL pointer is a no-op
  large_ = NULL;
}
inline void ClearSavedValue() { inline void ClearSavedValue() {
if (saved_value_.capacity() > 1048576) { if (saved_value_.capacity() > 1048576) {
std::string empty; std::string empty;
@ -109,6 +136,7 @@ class DBIter: public Iterator {
Status status_; Status status_;
std::string saved_key_; // == current key when direction_==kReverse std::string saved_key_; // == current key when direction_==kReverse
std::string saved_value_; // == current raw value when direction_==kReverse std::string saved_value_; // == current raw value when direction_==kReverse
Large* large_; // Non-NULL if value is an indirect reference
Direction direction_; Direction direction_;
bool valid_; bool valid_;
@ -128,6 +156,7 @@ inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
void DBIter::Next() { void DBIter::Next() {
assert(valid_); assert(valid_);
ForgetLargeValue();
if (direction_ == kReverse) { // Switch directions? if (direction_ == kReverse) { // Switch directions?
direction_ = kForward; direction_ = kForward;
@ -156,6 +185,7 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
// Loop until we hit an acceptable entry to yield // Loop until we hit an acceptable entry to yield
assert(iter_->Valid()); assert(iter_->Valid());
assert(direction_ == kForward); assert(direction_ == kForward);
assert(large_ == NULL);
do { do {
ParsedInternalKey ikey; ParsedInternalKey ikey;
if (ParseKey(&ikey) && ikey.sequence <= sequence_) { if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
@ -167,12 +197,17 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
skipping = true; skipping = true;
break; break;
case kTypeValue: case kTypeValue:
case kTypeLargeValueRef:
if (skipping && if (skipping &&
user_comparator_->Compare(ikey.user_key, *skip) <= 0) { user_comparator_->Compare(ikey.user_key, *skip) <= 0) {
// Entry hidden // Entry hidden
} else { } else {
valid_ = true; valid_ = true;
saved_key_.clear(); saved_key_.clear();
if (ikey.type == kTypeLargeValueRef) {
large_ = new Large;
large_->produced = false;
}
return; return;
} }
break; break;
@ -186,6 +221,7 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
void DBIter::Prev() { void DBIter::Prev() {
assert(valid_); assert(valid_);
ForgetLargeValue();
if (direction_ == kForward) { // Switch directions? if (direction_ == kForward) { // Switch directions?
// iter_ is pointing at the current entry. Scan backwards until // iter_ is pointing at the current entry. Scan backwards until
@ -213,6 +249,7 @@ void DBIter::Prev() {
void DBIter::FindPrevUserEntry() { void DBIter::FindPrevUserEntry() {
assert(direction_ == kReverse); assert(direction_ == kReverse);
assert(large_ == NULL);
ValueType value_type = kTypeDeletion; ValueType value_type = kTypeDeletion;
if (iter_->Valid()) { if (iter_->Valid()) {
@ -249,11 +286,16 @@ void DBIter::FindPrevUserEntry() {
direction_ = kForward; direction_ = kForward;
} else { } else {
valid_ = true; valid_ = true;
if (value_type == kTypeLargeValueRef) {
large_ = new Large;
large_->produced = false;
}
} }
} }
void DBIter::Seek(const Slice& target) { void DBIter::Seek(const Slice& target) {
direction_ = kForward; direction_ = kForward;
ForgetLargeValue();
ClearSavedValue(); ClearSavedValue();
saved_key_.clear(); saved_key_.clear();
AppendInternalKey( AppendInternalKey(
@ -268,6 +310,7 @@ void DBIter::Seek(const Slice& target) {
void DBIter::SeekToFirst() { void DBIter::SeekToFirst() {
direction_ = kForward; direction_ = kForward;
ForgetLargeValue();
ClearSavedValue(); ClearSavedValue();
iter_->SeekToFirst(); iter_->SeekToFirst();
if (iter_->Valid()) { if (iter_->Valid()) {
@ -279,11 +322,67 @@ void DBIter::SeekToFirst() {
void DBIter::SeekToLast() { void DBIter::SeekToLast() {
direction_ = kReverse; direction_ = kReverse;
ForgetLargeValue();
ClearSavedValue(); ClearSavedValue();
iter_->SeekToLast(); iter_->SeekToLast();
FindPrevUserEntry(); FindPrevUserEntry();
} }
// Loads the large value that "ref" points at into large_->value.
//
// "ref" must be exactly LargeValueRef::ByteSize() bytes long.  The value file
// named by the reference is read in full and, for kSnappyCompression
// references, uncompressed.  On any failure large_->value is cleared and the
// error is stored in large_->status.
//
// Sets large_->produced, so it must be called at most once per Large.
void DBIter::ReadIndirectValue(Slice ref) const {
  assert(!large_->produced);
  large_->produced = true;
  LargeValueRef large_ref;
  if (ref.size() != LargeValueRef::ByteSize()) {
    large_->status = Status::Corruption("malformed large value reference");
    return;
  }
  memcpy(large_ref.data, ref.data(), LargeValueRef::ByteSize());
  std::string fname = LargeValueFileName(*dbname_, large_ref);
  RandomAccessFile* file = NULL;
  Status s = env_->NewRandomAccessFile(fname, &file);
  // Remember whether the open succeeded so the file is released on every
  // path below, including a GetFileSize() failure.
  const bool opened = s.ok();
  uint64_t file_size = 0;
  if (s.ok()) {
    s = env_->GetFileSize(fname, &file_size);
  }
  if (s.ok()) {
    const uint64_t value_size = large_ref.ValueSize();
    if (file_size > value_size) {
      // The read buffer below holds value_size bytes; reading a larger file
      // into it would write past the end of the buffer.
      s = Status::Corruption("large value file is larger than its reference");
    } else {
      large_->value.resize(value_size);
      Slice result;
      s = file->Read(0, file_size, &result,
                     const_cast<char*>(large_->value.data()));
      if (s.ok()) {
        if (result.size() == file_size) {
          switch (large_ref.compression_type()) {
            case kNoCompression: {
              if (result.size() != value_size) {
                // An uncompressed file must hold exactly the referenced
                // number of bytes; do not return a padded value.
                s = Status::Corruption(
                    "large value file size does not match its reference");
              } else if (result.data() != large_->value.data()) {
                large_->value.assign(result.data(), result.size());
              }
              break;
            }
            case kSnappyCompression: {
              std::string uncompressed;
              if (port::Snappy_Uncompress(result.data(), result.size(),
                                          &uncompressed) &&
                  uncompressed.size() == large_ref.ValueSize()) {
                swap(uncompressed, large_->value);
              } else {
                s = Status::Corruption(
                    "Unable to read entire compressed large value file");
              }
              break;
            }
          }
        } else {
          s = Status::Corruption("Unable to read entire large value file");
        }
      }
    }
  }
  if (opened) {
    delete file;  // Ignore errors on closing
  }
  if (!s.ok()) {
    large_->value.clear();
    large_->status = s;
  }
}
} // anonymous namespace } // anonymous namespace
Iterator* NewDBIterator( Iterator* NewDBIterator(

@ -119,6 +119,9 @@ class DBTest {
case kTypeValue: case kTypeValue:
result += iter->value().ToString(); result += iter->value().ToString();
break; break;
case kTypeLargeValueRef:
result += "LARGEVALUE(" + EscapeString(iter->value()) + ")";
break;
case kTypeDeletion: case kTypeDeletion:
result += "DEL"; result += "DEL";
break; break;
@ -150,6 +153,26 @@ class DBTest {
return size; return size;
} }
std::set<LargeValueRef> LargeValueFiles() const {
// Return the set of large value files that exist in the database
std::vector<std::string> filenames;
env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose
uint64_t number;
LargeValueRef large_ref;
FileType type;
std::set<LargeValueRef> live;
for (int i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &large_ref, &type) &&
type == kLargeValueFile) {
fprintf(stderr, " live: %s\n",
LargeValueRefToFilenameString(large_ref).c_str());
live.insert(large_ref);
}
}
fprintf(stderr, "Found %d live large value files\n", (int)live.size());
return live;
}
void Compact(const Slice& start, const Slice& limit) { void Compact(const Slice& start, const Slice& limit) {
dbfull()->TEST_CompactMemTable(); dbfull()->TEST_CompactMemTable();
int max_level_with_files = 1; int max_level_with_files = 1;
@ -448,6 +471,7 @@ TEST(DBTest, MinorCompactionsHappen) {
TEST(DBTest, RecoverWithLargeLog) { TEST(DBTest, RecoverWithLargeLog) {
{ {
Options options; Options options;
options.large_value_threshold = 1048576;
Reopen(&options); Reopen(&options);
ASSERT_OK(Put("big1", std::string(200000, '1'))); ASSERT_OK(Put("big1", std::string(200000, '1')));
ASSERT_OK(Put("big2", std::string(200000, '2'))); ASSERT_OK(Put("big2", std::string(200000, '2')));
@ -460,6 +484,7 @@ TEST(DBTest, RecoverWithLargeLog) {
// we flush table files in the middle of a large log file. // we flush table files in the middle of a large log file.
Options options; Options options;
options.write_buffer_size = 100000; options.write_buffer_size = 100000;
options.large_value_threshold = 1048576;
Reopen(&options); Reopen(&options);
ASSERT_EQ(NumTableFilesAtLevel(0), 3); ASSERT_EQ(NumTableFilesAtLevel(0), 3);
ASSERT_EQ(std::string(200000, '1'), Get("big1")); ASSERT_EQ(std::string(200000, '1'), Get("big1"));
@ -472,6 +497,7 @@ TEST(DBTest, RecoverWithLargeLog) {
TEST(DBTest, CompactionsGenerateMultipleFiles) { TEST(DBTest, CompactionsGenerateMultipleFiles) {
Options options; Options options;
options.write_buffer_size = 100000000; // Large write buffer options.write_buffer_size = 100000000; // Large write buffer
options.large_value_threshold = 1048576;
Reopen(&options); Reopen(&options);
Random rnd(301); Random rnd(301);
@ -544,7 +570,11 @@ static bool Between(uint64_t val, uint64_t low, uint64_t high) {
} }
TEST(DBTest, ApproximateSizes) { TEST(DBTest, ApproximateSizes) {
for (int test = 0; test < 2; test++) {
// test==0: default large_value_threshold
// test==1: 1 MB large_value_threshold
Options options; Options options;
options.large_value_threshold = (test == 0) ? 65536 : 1048576;
options.write_buffer_size = 100000000; // Large write buffer options.write_buffer_size = 100000000; // Large write buffer
options.compression = kNoCompression; options.compression = kNoCompression;
DestroyAndReopen(); DestroyAndReopen();
@ -560,9 +590,15 @@ TEST(DBTest, ApproximateSizes) {
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
ASSERT_OK(Put(Key(i), RandomString(&rnd, 100000))); ASSERT_OK(Put(Key(i), RandomString(&rnd, 100000)));
} }
if (test == 1) {
// 0 because GetApproximateSizes() does not account for memtable space // 0 because GetApproximateSizes() does not account for memtable space for
// non-large values
ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
} else {
ASSERT_TRUE(Between(Size("", Key(50)), 100000*50, 100000*50 + 10000));
ASSERT_TRUE(Between(Size(Key(20), Key(30)),
100000*10, 100000*10 + 10000));
}
// Check sizes across recovery by reopening a few times // Check sizes across recovery by reopening a few times
for (int run = 0; run < 3; run++) { for (int run = 0; run < 3; run++) {
@ -588,9 +624,11 @@ TEST(DBTest, ApproximateSizes) {
ASSERT_GT(NumTableFilesAtLevel(1), 0); ASSERT_GT(NumTableFilesAtLevel(1), 0);
} }
} }
}
TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
Options options; Options options;
options.large_value_threshold = 65536;
options.compression = kNoCompression; options.compression = kNoCompression;
Reopen(); Reopen();
@ -763,6 +801,146 @@ TEST(DBTest, ComparatorCheck) {
<< s.ToString(); << s.ToString();
} }
// Returns true iff the large value files currently present in "db" are
// exactly the set "expected".  The first discrepancy found is described on
// stderr.
static bool LargeValuesOK(DBTest* db,
                          const std::set<LargeValueRef>& expected) {
  typedef std::set<LargeValueRef>::const_iterator RefIter;

  const std::set<LargeValueRef> actual = db->LargeValueFiles();
  if (actual.size() != expected.size()) {
    fprintf(stderr, "Sets differ in size: %d vs %d\n",
            (int)actual.size(), (int)expected.size());
    return false;
  }

  // Same cardinality, so it is enough to check that every expected
  // reference is present.
  for (RefIter want = expected.begin(); want != expected.end(); ++want) {
    if (actual.find(*want) == actual.end()) {
      fprintf(stderr, " key '%s' not found in actual set\n",
              LargeValueRefToFilenameString(*want).c_str());
      return false;
    }
  }
  return true;
}
// Checks the lifetime of a single large value file: it must exist after the
// Put, survive the Delete and the memtable compaction, and be gone only
// after the level-0 range compaction.
TEST(DBTest, LargeValues1) {
Options options;
options.large_value_threshold = 10000;
Reopen(&options);
Random rnd(301);
std::string big1;
test::CompressibleString(&rnd, 1.0, 100000, &big1); // Not compressible
std::set<LargeValueRef> expected;
// The 100000-byte value is expected to show up as an uncompressed large
// value file.
ASSERT_OK(Put("big1", big1));
expected.insert(LargeValueRef::Make(big1, kNoCompression));
ASSERT_TRUE(LargeValuesOK(this, expected));
// Deleting the key must not remove the file right away.
ASSERT_OK(Delete("big1"));
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(dbfull()->TEST_CompactMemTable());
// No handling of deletion markers on memtable compactions, so big1 remains
ASSERT_TRUE(LargeValuesOK(this, expected));
// After compacting level 0 the file is expected to be gone.
dbfull()->TEST_CompactRange(0, "", "z");
expected.erase(LargeValueRef::Make(big1, kNoCompression));
ASSERT_TRUE(LargeValuesOK(this, expected));
}
static bool SnappyCompressionSupported() {
std::string out;
Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
return port::Snappy_Compress(in.data(), in.size(), &out);
}
// Exercises large value files with two values (one incompressible, one
// compressible) across overwrites, a second key sharing the same value,
// deletions, compactions and a reopen.  After every step the set of large
// value files on disk is compared with "expected".
TEST(DBTest, LargeValues2) {
Options options;
options.large_value_threshold = 10000;
Reopen(&options);
Random rnd(301);
std::string big1, big2;
test::CompressibleString(&rnd, 1.0, 20000, &big1); // Not compressible
test::CompressibleString(&rnd, 0.6, 40000, &big2); // Compressible
std::set<LargeValueRef> expected;
// Nothing has been written yet, so no large value files are expected.
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(Put("big1", big1));
expected.insert(LargeValueRef::Make(big1, kNoCompression));
ASSERT_EQ(big1, Get("big1"));
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(Put("big2", big2));
ASSERT_EQ(big2, Get("big2"));
// The reference expected for big2 depends on whether this build can
// compress with snappy.
if (SnappyCompressionSupported()) {
expected.insert(LargeValueRef::Make(big2, kSnappyCompression));
} else {
expected.insert(LargeValueRef::Make(big2, kNoCompression));
}
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(dbfull()->TEST_CompactMemTable());
ASSERT_TRUE(LargeValuesOK(this, expected));
dbfull()->TEST_CompactRange(0, "", "z");
ASSERT_TRUE(LargeValuesOK(this, expected));
// Rewriting big2 and storing the same value under a second key must not
// change the set of files ("expected" is left untouched here).
ASSERT_OK(Put("big2", big2));
ASSERT_OK(Put("big2_b", big2));
ASSERT_EQ(big1, Get("big1"));
ASSERT_EQ(big2, Get("big2"));
ASSERT_EQ(big2, Get("big2_b"));
ASSERT_TRUE(LargeValuesOK(this, expected));
// Deleting big1: its file is expected to stay until the level-0 range
// compaction below.
ASSERT_OK(Delete("big1"));
ASSERT_EQ("NOT_FOUND", Get("big1"));
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(dbfull()->TEST_CompactMemTable());
ASSERT_TRUE(LargeValuesOK(this, expected));
dbfull()->TEST_CompactRange(0, "", "z");
expected.erase(LargeValueRef::Make(big1, kNoCompression));
ASSERT_TRUE(LargeValuesOK(this, expected));
dbfull()->TEST_CompactRange(1, "", "z");
// Deleting big2 while big2_b still holds the same value: the file for that
// value is expected to remain.
ASSERT_OK(Delete("big2"));
ASSERT_EQ("NOT_FOUND", Get("big2"));
ASSERT_EQ(big2, Get("big2_b"));
ASSERT_OK(dbfull()->TEST_CompactMemTable());
ASSERT_TRUE(LargeValuesOK(this, expected));
dbfull()->TEST_CompactRange(0, "", "z");
ASSERT_TRUE(LargeValuesOK(this, expected));
// Make sure the large value refs survive a reload and compactions after
// the reload.
Reopen();
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(Put("foo", "bar"));
ASSERT_OK(dbfull()->TEST_CompactMemTable());
dbfull()->TEST_CompactRange(0, "", "z");
ASSERT_TRUE(LargeValuesOK(this, expected));
}
TEST(DBTest, LargeValues3) {
// Make sure we don't compress large values if options.compression is
// kNoCompression, even when the value itself is very compressible.
Options options;
options.large_value_threshold = 10000;
options.compression = kNoCompression;
Reopen(&options);
// NOTE(review): "rnd" is not used anywhere in this test.
Random rnd(301);
std::string big1 = std::string(100000, 'x'); // Very compressible
std::set<LargeValueRef> expected;
ASSERT_OK(Put("big1", big1));
ASSERT_EQ(big1, Get("big1"));
// The file must be recorded with the uncompressed reference.
expected.insert(LargeValueRef::Make(big1, kNoCompression));
ASSERT_TRUE(LargeValuesOK(this, expected));
}
TEST(DBTest, DBOpen_Options) { TEST(DBTest, DBOpen_Options) {
std::string dbname = test::TmpDir() + "/db_options_test"; std::string dbname = test::TmpDir() + "/db_options_test";
DestroyDB(dbname, Options()); DestroyDB(dbname, Options());
@ -847,6 +1025,9 @@ class ModelDB: public DB {
case kTypeValue: case kTypeValue:
map_[it.key().ToString()] = it.value().ToString(); map_[it.key().ToString()] = it.value().ToString();
break; break;
case kTypeLargeValueRef:
assert(false); // Should not occur
break;
case kTypeDeletion: case kTypeDeletion:
map_.erase(it.key().ToString()); map_.erase(it.key().ToString());
break; break;

@ -84,4 +84,69 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
} }
} }
// Builds the reference for "value": the SHA1 hash of its bytes goes into
// data[0..19], its length as a fixed64 into data[20..27], and "ctype" into
// data[28].
LargeValueRef LargeValueRef::Make(const Slice& value, CompressionType ctype) {
  LargeValueRef ref;
  char* const hash_field = ref.data;
  char* const size_field = ref.data + 20;
  char* const type_field = ref.data + 28;

  port::SHA1_Hash(value.data(), value.size(), hash_field);
  EncodeFixed64(size_field, value.size());
  *type_field = static_cast<unsigned char>(ctype);
  return ref;
}
// Renders "h" as "<40 lowercase hex digits>-<value size>-<compression type>",
// where the hex digits are the first 20 bytes of h.data.
std::string LargeValueRefToFilenameString(const LargeValueRef& h) {
  assert(sizeof(h.data) == LargeValueRef::ByteSize());
  assert(sizeof(h.data) == 29);  // 20 hash bytes + 8 size bytes + 1 type byte
  static const char kHexDigits[] = "0123456789abcdef";

  std::string name;
  name.reserve(20 * 2);
  for (int i = 0; i < 20; i++) {
    const unsigned int byte = static_cast<unsigned char>(h.data[i]);
    name.push_back(kHexDigits[(byte >> 4) & 0xf]);
    name.push_back(kHexDigits[byte & 0xf]);
  }
  name.push_back('-');
  name.append(NumberToString(h.ValueSize()));
  name.push_back('-');
  name.append(NumberToString(static_cast<uint64_t>(h.compression_type())));
  return name;
}
// Returns the numeric value (0..15) of the hexadecimal digit "c".
// REQUIRES: "c" is one of [0-9A-Fa-f] (checked by assert only).
static uint32_t hexvalue(char c) {
  if ('0' <= c && c <= '9') {
    return c - '0';
  }
  if ('A' <= c && c <= 'F') {
    return 10 + c - 'A';
  }
  assert('a' <= c && c <= 'f');
  return 10 + c - 'a';
}
bool FilenameStringToLargeValueRef(const Slice& s, LargeValueRef* h) {
Slice in = s;
if (in.size() < 40) {
return false;
}
for (int i = 0; i < 20; i++) {
if (!isxdigit(in[i*2]) || !isxdigit(in[i*2+1])) {
return false;
}
unsigned char c = (hexvalue(in[i*2])<<4) | hexvalue(in[i*2+1]);
h->data[i] = c;
}
in.remove_prefix(40);
uint64_t value_size, ctype;
if (ConsumeChar(&in, '-') &&
ConsumeDecimalNumber(&in, &value_size) &&
ConsumeChar(&in, '-') &&
ConsumeDecimalNumber(&in, &ctype) &&
in.empty() &&
(ctype <= kSnappyCompression)) {
EncodeFixed64(&h->data[20], value_size);
h->data[28] = static_cast<unsigned char>(ctype);
return true;
} else {
return false;
}
}
} }

@ -29,6 +29,7 @@ class InternalKey;
enum ValueType { enum ValueType {
kTypeDeletion = 0x0, kTypeDeletion = 0x0,
kTypeValue = 0x1, kTypeValue = 0x1,
kTypeLargeValueRef = 0x2,
}; };
// kValueTypeForSeek defines the ValueType that should be passed when // kValueTypeForSeek defines the ValueType that should be passed when
// constructing a ParsedInternalKey object for seeking to a particular // constructing a ParsedInternalKey object for seeking to a particular
@ -36,7 +37,7 @@ enum ValueType {
// and the value type is embedded as the low 8 bits in the sequence // and the value type is embedded as the low 8 bits in the sequence
// number in internal keys, we need to use the highest-numbered // number in internal keys, we need to use the highest-numbered
// ValueType, not the lowest). // ValueType, not the lowest).
static const ValueType kValueTypeForSeek = kTypeValue; static const ValueType kValueTypeForSeek = kTypeLargeValueRef;
typedef uint64_t SequenceNumber; typedef uint64_t SequenceNumber;
@ -138,6 +139,54 @@ inline int InternalKeyComparator::Compare(
return Compare(a.Encode(), b.Encode()); return Compare(a.Encode(), b.Encode());
} }
// LargeValueRef is a 160-bit hash value (20 bytes), plus an 8 byte
// uncompressed size, and a 1 byte CompressionType code. An
// encoded form of it is embedded in the filenames of large value
// files stored in the database, and the raw binary form is stored as
// the iter->value() result for values of type kTypeLargeValueRef in
// the table and log files that make up the database.
struct LargeValueRef {
// Raw reference bytes: [0,20) hash, [20,28) fixed64 value size,
// [28] compression type.
char data[29];
// Initialize a large value ref for the given data
static LargeValueRef Make(const Slice& data,
CompressionType compression_type);
// Initialize a large value ref from a serialized, 29-byte reference value
// REQUIRES: ref.size() == sizeof(data) (checked by assert only)
static LargeValueRef FromRef(const Slice& ref) {
LargeValueRef result;
assert(ref.size() == sizeof(result.data));
memcpy(result.data, ref.data(), sizeof(result.data));
return result;
}
// Return the number of bytes in a LargeValueRef (not the
// number of bytes in the value referenced).
static size_t ByteSize() { return sizeof(LargeValueRef().data); }
// Return the number of bytes in the value referenced by "*this".
uint64_t ValueSize() const { return DecodeFixed64(&data[20]); }
// Return the compression type code stored in the last byte of "data".
// The byte is cast without validation.
CompressionType compression_type() const {
return static_cast<CompressionType>(data[28]);
}
// Bytewise equality over all of "data".
bool operator==(const LargeValueRef& b) const {
return memcmp(data, b.data, sizeof(data)) == 0;
}
// Bytewise (memcmp) ordering over all of "data".
bool operator<(const LargeValueRef& b) const {
return memcmp(data, b.data, sizeof(data)) < 0;
}
};
// Convert the large value ref to a human-readable string suitable
// for embedding in a large value filename.
extern std::string LargeValueRefToFilenameString(const LargeValueRef& h);
// Parse the large value filename string in "in" and store the result in
// "*ref". Returns true if successful, false otherwise.
extern bool FilenameStringToLargeValueRef(const Slice& in, LargeValueRef* ref);
inline bool ParseInternalKey(const Slice& internal_key, inline bool ParseInternalKey(const Slice& internal_key,
ParsedInternalKey* result) { ParsedInternalKey* result) {
const size_t n = internal_key.size(); const size_t n = internal_key.size();
@ -147,7 +196,7 @@ inline bool ParseInternalKey(const Slice& internal_key,
result->sequence = num >> 8; result->sequence = num >> 8;
result->type = static_cast<ValueType>(c); result->type = static_cast<ValueType>(c);
result->user_key = Slice(internal_key.data(), n - 8); result->user_key = Slice(internal_key.data(), n - 8);
return (c <= static_cast<unsigned char>(kTypeValue)); return (c <= static_cast<unsigned char>(kTypeLargeValueRef));
} }
} }

@ -76,6 +76,9 @@ TEST(FormatTest, InternalKeyShortSeparator) {
ASSERT_EQ(IKey("foo", 100, kTypeValue), ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue), Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 100, kTypeDeletion))); IKey("foo", 100, kTypeDeletion)));
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 100, kTypeLargeValueRef)));
// When user keys are misordered // When user keys are misordered
ASSERT_EQ(IKey("foo", 100, kTypeValue), ASSERT_EQ(IKey("foo", 100, kTypeValue),
@ -105,6 +108,18 @@ TEST(FormatTest, InternalKeyShortestSuccessor) {
ShortSuccessor(IKey("\xff\xff", 100, kTypeValue))); ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
} }
TEST(FormatTest, SHA1) {
// Check that we are computing the same value as sha1.
// Note that the last two numbers are the length of the input and the
// compression type.
// Both cases hash the same input ("hello"), so only the trailing
// compression type field differs between the two expected strings.
ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-0", // SHA1, uncompressed
LargeValueRefToFilenameString(
LargeValueRef::Make("hello", kNoCompression)));
ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-1", // SHA1, snappy
LargeValueRefToFilenameString(
LargeValueRef::Make("hello", kSnappyCompression)));
}
} }
int main(int argc, char** argv) { int main(int argc, char** argv) {

@ -30,6 +30,14 @@ std::string TableFileName(const std::string& name, uint64_t number) {
return MakeFileName(name, number, "sst"); return MakeFileName(name, number, "sst");
} }
// Returns "<name>/<filename string of large_ref>.val".
std::string LargeValueFileName(const std::string& name,
                               const LargeValueRef& large_ref) {
  std::string path(name);
  path.push_back('/');
  path.append(LargeValueRefToFilenameString(large_ref));
  path.append(".val");
  return path;
}
std::string DescriptorFileName(const std::string& dbname, uint64_t number) { std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
assert(number > 0); assert(number > 0);
char buf[100]; char buf[100];
@ -67,9 +75,11 @@ std::string OldInfoLogFileName(const std::string& dbname) {
// dbname/LOG // dbname/LOG
// dbname/LOG.old // dbname/LOG.old
// dbname/MANIFEST-[0-9]+ // dbname/MANIFEST-[0-9]+
// dbname/[0-9a-f]{40}-[0-9]+-[0-9]+.val
// dbname/[0-9]+.(log|sst) // dbname/[0-9]+.(log|sst)
bool ParseFileName(const std::string& fname, bool ParseFileName(const std::string& fname,
uint64_t* number, uint64_t* number,
LargeValueRef* large_ref,
FileType* type) { FileType* type) {
Slice rest(fname); Slice rest(fname);
if (rest == "CURRENT") { if (rest == "CURRENT") {
@ -81,6 +91,15 @@ bool ParseFileName(const std::string& fname,
} else if (rest == "LOG" || rest == "LOG.old") { } else if (rest == "LOG" || rest == "LOG.old") {
*number = 0; *number = 0;
*type = kInfoLogFile; *type = kInfoLogFile;
} else if (rest.size() >= 4 &&
Slice(rest.data() + rest.size() - 4, 4) == ".val") {
LargeValueRef h;
if (!FilenameStringToLargeValueRef(Slice(rest.data(), rest.size() - 4),
&h)) {
return false;
}
*large_ref = h;
*type = kLargeValueFile;
} else if (rest.starts_with("MANIFEST-")) { } else if (rest.starts_with("MANIFEST-")) {
rest.remove_prefix(strlen("MANIFEST-")); rest.remove_prefix(strlen("MANIFEST-"));
uint64_t num; uint64_t num;

@ -16,11 +16,13 @@
namespace leveldb { namespace leveldb {
class Env; class Env;
struct LargeValueRef;
enum FileType { enum FileType {
kLogFile, kLogFile,
kDBLockFile, kDBLockFile,
kTableFile, kTableFile,
kLargeValueFile,
kDescriptorFile, kDescriptorFile,
kCurrentFile, kCurrentFile,
kTempFile, kTempFile,
@ -37,6 +39,12 @@ extern std::string LogFileName(const std::string& dbname, uint64_t number);
// "dbname". // "dbname".
extern std::string TableFileName(const std::string& dbname, uint64_t number); extern std::string TableFileName(const std::string& dbname, uint64_t number);
// Return the name of the large value file with the specified large
// value reference in the db named by "dbname". The result will be
// prefixed with "dbname".
extern std::string LargeValueFileName(const std::string& dbname,
const LargeValueRef& large_ref);
// Return the name of the descriptor file for the db named by // Return the name of the descriptor file for the db named by
// "dbname" and the specified incarnation number. The result will be // "dbname" and the specified incarnation number. The result will be
// prefixed with "dbname". // prefixed with "dbname".
@ -63,10 +71,14 @@ extern std::string InfoLogFileName(const std::string& dbname);
extern std::string OldInfoLogFileName(const std::string& dbname); extern std::string OldInfoLogFileName(const std::string& dbname);
// If filename is a leveldb file, store the type of the file in *type. // If filename is a leveldb file, store the type of the file in *type.
// The number encoded in the filename is stored in *number. If the // If *type is kLargeValueFile, then the large value reference data
// filename was successfully parsed, returns true. Else return false. // from the filename is stored in "*large_ref. For all other types of
// files, the number encoded in the filename is stored in *number. If
// the filename was successfully parsed, returns true. Else return
// false.
extern bool ParseFileName(const std::string& filename, extern bool ParseFileName(const std::string& filename,
uint64_t* number, uint64_t* number,
LargeValueRef* large_ref,
FileType* type); FileType* type);
// Make the CURRENT file point to the descriptor file with the // Make the CURRENT file point to the descriptor file with the

@ -0,0 +1,156 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/filename.h"
#include "db/dbformat.h"
#include "port/port.h"
#include "util/logging.h"
#include "util/testharness.h"
namespace leveldb {
// Empty fixture class named by the TEST(FileNameTest, ...) cases in this file.
class FileNameTest { };
// Verifies that ParseFileName() accepts every well-formed file name below
// and reports the right type plus number (or large value reference), and
// that it rejects every malformed name.
TEST(FileNameTest, Parse) {
  FileType type;
  uint64_t number;
  LargeValueRef large_ref;

  // Successful parses
  static struct {
    const char* fname;
    uint64_t number;
    const char* large_ref;
    FileType type;
  } cases[] = {
    { "100.log",            100,   "",         kLogFile },
    { "0.log",              0,     "",         kLogFile },
    { "0.sst",              0,     "",         kTableFile },
    { "CURRENT",            0,     "",         kCurrentFile },
    { "LOCK",               0,     "",         kDBLockFile },
    { "MANIFEST-2",         2,     "",         kDescriptorFile },
    { "MANIFEST-7",         7,     "",         kDescriptorFile },
    { "LOG",                0,     "",         kInfoLogFile },
    { "LOG.old",            0,     "",         kInfoLogFile },
    { "18446744073709551615.log", 18446744073709551615ull, "",
      kLogFile },
    { "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0.val", 0,
      "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0", kLargeValueFile },
    { "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0.val", 0,
      "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0",
      kLargeValueFile },
  };
  for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
    std::string f = cases[i].fname;
    ASSERT_TRUE(ParseFileName(f, &number, &large_ref, &type)) << f;
    ASSERT_EQ(cases[i].type, type) << f;
    if (type == kLargeValueFile) {
      // Large value files carry a reference instead of a file number.
      ASSERT_EQ(cases[i].large_ref, LargeValueRefToFilenameString(large_ref))
          << f;
    } else {
      ASSERT_EQ(cases[i].number, number) << f;
    }
  }

  // Errors
  static const char* errors[] = {
    "",
    "foo",
    "foo-dx-100.log",
    ".log",
    "",
    "manifest",
    "CURREN",
    "CURRENTX",
    "MANIFES",
    "MANIFEST",
    "MANIFEST-",
    "XMANIFEST-3",
    "MANIFEST-3x",
    "LOC",
    "LOCKx",
    "LO",
    "LOGx",
    "18446744073709551616.log",
    "184467440737095516150.log",
    "100",
    "100.",
    "100.lop",
    "100.val",
    ".val",
    "123456789012345678901234567890123456789-12340.val",
    "1234567890123456789012345678901234567-123-0.val",
    "12345678901234567890123456789012345678902-100-1-.val",
    // Overflow on value size
    "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000000000000000000-1.val",
    // '-3.val': 3 is a bad compression type
    "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000-3.val" };
  for (size_t i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
    std::string f = errors[i];
    ASSERT_TRUE(!ParseFileName(f, &number, &large_ref, &type)) << f;
  }
}
// Round-trip check: every file name produced by the *FileName() builders
// must start with "<dbname>/" and the remainder must parse back, via
// ParseFileName(), to the same type and number (or large value reference).
TEST(FileNameTest, Construction) {
uint64_t number;
FileType type;
LargeValueRef large_ref;
std::string fname;
// In each case below the first 4 characters are the 3-letter db name plus
// '/', and "fname.c_str() + 4" is the bare file name that follows.
fname = CurrentFileName("foo");
ASSERT_EQ("foo/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
ASSERT_EQ(0, number);
ASSERT_EQ(kCurrentFile, type);
fname = LockFileName("foo");
ASSERT_EQ("foo/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
ASSERT_EQ(0, number);
ASSERT_EQ(kDBLockFile, type);
fname = LogFileName("foo", 192);
ASSERT_EQ("foo/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
ASSERT_EQ(192, number);
ASSERT_EQ(kLogFile, type);
fname = TableFileName("bar", 200);
ASSERT_EQ("bar/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
ASSERT_EQ(200, number);
ASSERT_EQ(kTableFile, type);
fname = DescriptorFileName("bar", 100);
ASSERT_EQ("bar/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
ASSERT_EQ(100, number);
ASSERT_EQ(kDescriptorFile, type);
fname = TempFileName("tmp", 999);
ASSERT_EQ("tmp/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
ASSERT_EQ(999, number);
ASSERT_EQ(kTempFile, type);
// Large value files: repeat the round trip for every compression type code
// from 0 up to and including kSnappyCompression.
for (int i = 0; i <= kSnappyCompression; i++) {
CompressionType ctype = static_cast<CompressionType>(i);
std::string value = "abcdef";
LargeValueRef real_large_ref = LargeValueRef::Make(Slice(value), ctype);
fname = LargeValueFileName("tmp", real_large_ref);
ASSERT_EQ("tmp/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
ASSERT_TRUE(real_large_ref == large_ref);
ASSERT_EQ(kLargeValueFile, type);
ASSERT_EQ(large_ref.compression_type(), ctype);
}
}
}
// Test binary entry point: returns the result of
// leveldb::test::RunAllTests(). The command-line arguments are not used.
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

@ -46,9 +46,9 @@ Status Writer::AddRecord(const Slice& slice) {
} }
// Invariant: we never leave < kHeaderSize bytes in a block. // Invariant: we never leave < kHeaderSize bytes in a block.
assert(kBlockSize - block_offset_ - kHeaderSize >= 0); const int avail = kBlockSize - block_offset_ - kHeaderSize;
assert(avail >= 0);
const size_t avail = kBlockSize - block_offset_ - kHeaderSize;
const size_t fragment_length = (left < avail) ? left : avail; const size_t fragment_length = (left < avail) ? left : avail;
RecordType type; RecordType type;

@ -6,7 +6,8 @@
// (1) Any log files are first converted to tables // (1) Any log files are first converted to tables
// (2) We scan every table to compute // (2) We scan every table to compute
// (a) smallest/largest for the table // (a) smallest/largest for the table
// (b) largest sequence number in the table // (b) large value refs from the table
// (c) largest sequence number in the table
// (3) We generate descriptor contents: // (3) We generate descriptor contents:
// - log number is set to zero // - log number is set to zero
// - next-file-number is set to 1 + largest file number we found // - next-file-number is set to 1 + largest file number we found
@ -21,8 +22,9 @@
// (c) For each table: if it overlaps earlier table, place in level-0, // (c) For each table: if it overlaps earlier table, place in level-0,
// else place in level-M. // else place in level-M.
// Possible optimization 2: // Possible optimization 2:
// Store per-table metadata (smallest, largest, largest-seq#, ...) // Store per-table metadata (smallest, largest, largest-seq#,
// in the table's meta section to speed up ScanTable. // large-value-refs, ...) in the table's meta section to speed up
// ScanTable.
#include "db/builder.h" #include "db/builder.h"
#include "db/db_impl.h" #include "db/db_impl.h"
@ -71,7 +73,7 @@ class Repairer {
} }
if (status.ok()) { if (status.ok()) {
unsigned long long bytes = 0; unsigned long long bytes = 0;
for (size_t i = 0; i < tables_.size(); i++) { for (int i = 0; i < tables_.size(); i++) {
bytes += tables_[i].meta.file_size; bytes += tables_[i].meta.file_size;
} }
Log(env_, options_.info_log, Log(env_, options_.info_log,
@ -117,10 +119,13 @@ class Repairer {
} }
uint64_t number; uint64_t number;
LargeValueRef large_ref;
FileType type; FileType type;
for (size_t i = 0; i < filenames.size(); i++) { for (int i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type)) { if (ParseFileName(filenames[i], &number, &large_ref, &type)) {
if (type == kDescriptorFile) { if (type == kLargeValueFile) {
// Will be picked up when we process a Table that points to it
} else if (type == kDescriptorFile) {
manifests_.push_back(filenames[i]); manifests_.push_back(filenames[i]);
} else { } else {
if (number + 1 > next_file_number_) { if (number + 1 > next_file_number_) {
@ -140,7 +145,7 @@ class Repairer {
} }
void ConvertLogFilesToTables() { void ConvertLogFilesToTables() {
for (size_t i = 0; i < logs_.size(); i++) { for (int i = 0; i < logs_.size(); i++) {
std::string logname = LogFileName(dbname_, logs_[i]); std::string logname = LogFileName(dbname_, logs_[i]);
Status status = ConvertLogToTable(logs_[i]); Status status = ConvertLogToTable(logs_[i]);
if (!status.ok()) { if (!status.ok()) {
@ -234,7 +239,7 @@ class Repairer {
void ExtractMetaData() { void ExtractMetaData() {
std::vector<TableInfo> kept; std::vector<TableInfo> kept;
for (size_t i = 0; i < table_numbers_.size(); i++) { for (int i = 0; i < table_numbers_.size(); i++) {
TableInfo t; TableInfo t;
t.meta.number = table_numbers_[i]; t.meta.number = table_numbers_[i];
Status status = ScanTable(&t); Status status = ScanTable(&t);
@ -278,6 +283,17 @@ class Repairer {
if (parsed.sequence > t->max_sequence) { if (parsed.sequence > t->max_sequence) {
t->max_sequence = parsed.sequence; t->max_sequence = parsed.sequence;
} }
if (ExtractValueType(key) == kTypeLargeValueRef) {
if (iter->value().size() != LargeValueRef::ByteSize()) {
Log(env_, options_.info_log, "Table #%llu: bad large value ref",
(unsigned long long) t->meta.number);
} else {
edit_.AddLargeValueRef(LargeValueRef::FromRef(iter->value()),
t->meta.number,
key);
}
}
} }
if (!iter->status().ok()) { if (!iter->status().ok()) {
status = iter->status(); status = iter->status();
@ -300,7 +316,7 @@ class Repairer {
} }
SequenceNumber max_sequence = 0; SequenceNumber max_sequence = 0;
for (size_t i = 0; i < tables_.size(); i++) { for (int i = 0; i < tables_.size(); i++) {
if (max_sequence < tables_[i].max_sequence) { if (max_sequence < tables_[i].max_sequence) {
max_sequence = tables_[i].max_sequence; max_sequence = tables_[i].max_sequence;
} }
@ -311,7 +327,7 @@ class Repairer {
edit_.SetNextFile(next_file_number_); edit_.SetNextFile(next_file_number_);
edit_.SetLastSequence(max_sequence); edit_.SetLastSequence(max_sequence);
for (size_t i = 0; i < tables_.size(); i++) { for (int i = 0; i < tables_.size(); i++) {
// TODO(opt): separate out into multiple levels // TODO(opt): separate out into multiple levels
const TableInfo& t = tables_[i]; const TableInfo& t = tables_[i];
edit_.AddFile(0, t.meta.number, t.meta.file_size, edit_.AddFile(0, t.meta.number, t.meta.file_size,
@ -335,7 +351,7 @@ class Repairer {
env_->DeleteFile(tmp); env_->DeleteFile(tmp);
} else { } else {
// Discard older manifests // Discard older manifests
for (size_t i = 0; i < manifests_.size(); i++) { for (int i = 0; i < manifests_.size(); i++) {
ArchiveFile(dbname_ + "/" + manifests_[i]); ArchiveFile(dbname_ + "/" + manifests_[i]);
} }

@ -19,7 +19,7 @@ enum Tag {
kCompactPointer = 5, kCompactPointer = 5,
kDeletedFile = 6, kDeletedFile = 6,
kNewFile = 7, kNewFile = 7,
// 8 was used for large value refs kLargeValueRef = 8,
kPrevLogNumber = 9, kPrevLogNumber = 9,
}; };
@ -36,6 +36,7 @@ void VersionEdit::Clear() {
has_last_sequence_ = false; has_last_sequence_ = false;
deleted_files_.clear(); deleted_files_.clear();
new_files_.clear(); new_files_.clear();
large_refs_added_.clear();
} }
void VersionEdit::EncodeTo(std::string* dst) const { void VersionEdit::EncodeTo(std::string* dst) const {
@ -60,7 +61,7 @@ void VersionEdit::EncodeTo(std::string* dst) const {
PutVarint64(dst, last_sequence_); PutVarint64(dst, last_sequence_);
} }
for (size_t i = 0; i < compact_pointers_.size(); i++) { for (int i = 0; i < compact_pointers_.size(); i++) {
PutVarint32(dst, kCompactPointer); PutVarint32(dst, kCompactPointer);
PutVarint32(dst, compact_pointers_[i].first); // level PutVarint32(dst, compact_pointers_[i].first); // level
PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode()); PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode());
@ -74,7 +75,7 @@ void VersionEdit::EncodeTo(std::string* dst) const {
PutVarint64(dst, iter->second); // file number PutVarint64(dst, iter->second); // file number
} }
for (size_t i = 0; i < new_files_.size(); i++) { for (int i = 0; i < new_files_.size(); i++) {
const FileMetaData& f = new_files_[i].second; const FileMetaData& f = new_files_[i].second;
PutVarint32(dst, kNewFile); PutVarint32(dst, kNewFile);
PutVarint32(dst, new_files_[i].first); // level PutVarint32(dst, new_files_[i].first); // level
@ -83,6 +84,15 @@ void VersionEdit::EncodeTo(std::string* dst) const {
PutLengthPrefixedSlice(dst, f.smallest.Encode()); PutLengthPrefixedSlice(dst, f.smallest.Encode());
PutLengthPrefixedSlice(dst, f.largest.Encode()); PutLengthPrefixedSlice(dst, f.largest.Encode());
} }
for (int i = 0; i < large_refs_added_.size(); i++) {
const VersionEdit::Large& l = large_refs_added_[i];
PutVarint32(dst, kLargeValueRef);
PutLengthPrefixedSlice(dst,
Slice(l.large_ref.data, LargeValueRef::ByteSize()));
PutVarint64(dst, l.fnum);
PutLengthPrefixedSlice(dst, l.internal_key.Encode());
}
} }
static bool GetInternalKey(Slice* input, InternalKey* dst) { static bool GetInternalKey(Slice* input, InternalKey* dst) {
@ -117,6 +127,7 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
uint64_t number; uint64_t number;
FileMetaData f; FileMetaData f;
Slice str; Slice str;
Large large;
InternalKey key; InternalKey key;
while (msg == NULL && GetVarint32(&input, &tag)) { while (msg == NULL && GetVarint32(&input, &tag)) {
@ -192,6 +203,18 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
} }
break; break;
case kLargeValueRef:
if (GetLengthPrefixedSlice(&input, &str) &&
(str.size() == LargeValueRef::ByteSize()) &&
GetVarint64(&input, &large.fnum) &&
GetInternalKey(&input, &large.internal_key)) {
large.large_ref = LargeValueRef::FromRef(str);
large_refs_added_.push_back(large);
} else {
msg = "large ref";
}
break;
default: default:
msg = "unknown tag"; msg = "unknown tag";
break; break;
@ -232,7 +255,7 @@ std::string VersionEdit::DebugString() const {
r.append("\n LastSeq: "); r.append("\n LastSeq: ");
AppendNumberTo(&r, last_sequence_); AppendNumberTo(&r, last_sequence_);
} }
for (size_t i = 0; i < compact_pointers_.size(); i++) { for (int i = 0; i < compact_pointers_.size(); i++) {
r.append("\n CompactPointer: "); r.append("\n CompactPointer: ");
AppendNumberTo(&r, compact_pointers_[i].first); AppendNumberTo(&r, compact_pointers_[i].first);
r.append(" '"); r.append(" '");
@ -247,7 +270,7 @@ std::string VersionEdit::DebugString() const {
r.append(" "); r.append(" ");
AppendNumberTo(&r, iter->second); AppendNumberTo(&r, iter->second);
} }
for (size_t i = 0; i < new_files_.size(); i++) { for (int i = 0; i < new_files_.size(); i++) {
const FileMetaData& f = new_files_[i].second; const FileMetaData& f = new_files_[i].second;
r.append("\n AddFile: "); r.append("\n AddFile: ");
AppendNumberTo(&r, new_files_[i].first); AppendNumberTo(&r, new_files_[i].first);
@ -261,6 +284,16 @@ std::string VersionEdit::DebugString() const {
AppendEscapedStringTo(&r, f.largest.Encode()); AppendEscapedStringTo(&r, f.largest.Encode());
r.append("'"); r.append("'");
} }
for (int i = 0; i < large_refs_added_.size(); i++) {
const VersionEdit::Large& l = large_refs_added_[i];
r.append("\n LargeRef: ");
AppendNumberTo(&r, l.fnum);
r.append(" ");
r.append(LargeValueRefToFilenameString(l.large_ref));
r.append(" '");
AppendEscapedStringTo(&r, l.internal_key.Encode());
r.append("'");
}
r.append("\n}\n"); r.append("\n}\n");
return r; return r;
} }

@ -75,6 +75,18 @@ class VersionEdit {
deleted_files_.insert(std::make_pair(level, file)); deleted_files_.insert(std::make_pair(level, file));
} }
// Record that a large value with the specified large_ref was
// written to the output file numbered "fnum"
void AddLargeValueRef(const LargeValueRef& large_ref,
uint64_t fnum,
const Slice& internal_key) {
large_refs_added_.resize(large_refs_added_.size() + 1);
Large* large = &(large_refs_added_.back());
large->large_ref = large_ref;
large->fnum = fnum;
large->internal_key.DecodeFrom(internal_key);
}
void EncodeTo(std::string* dst) const; void EncodeTo(std::string* dst) const;
Status DecodeFrom(const Slice& src); Status DecodeFrom(const Slice& src);
@ -99,6 +111,12 @@ class VersionEdit {
std::vector< std::pair<int, InternalKey> > compact_pointers_; std::vector< std::pair<int, InternalKey> > compact_pointers_;
DeletedFileSet deleted_files_; DeletedFileSet deleted_files_;
std::vector< std::pair<int, FileMetaData> > new_files_; std::vector< std::pair<int, FileMetaData> > new_files_;
// One recorded large value reference, as filled in by AddLargeValueRef()
// and by DecodeFrom() when it reads a kLargeValueRef record.
struct Large {
// Reference identifying the large value.
LargeValueRef large_ref;
// Number of the file the reference was recorded against.
uint64_t fnum;
// Internal key stored together with the reference.
InternalKey internal_key;
};
// All large value references held by this edit; emptied by Clear().
std::vector<Large> large_refs_added_;
}; };
} }

@ -26,9 +26,13 @@ TEST(VersionEditTest, EncodeDecode) {
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
TestEncodeDecode(edit); TestEncodeDecode(edit);
edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
InternalKey("foo", kBig + 500 + i, kTypeValue), InternalKey("foo", kBig + 500 + i, kTypeLargeValueRef),
InternalKey("zoo", kBig + 600 + i, kTypeDeletion)); InternalKey("zoo", kBig + 600 + i, kTypeDeletion));
edit.DeleteFile(4, kBig + 700 + i); edit.DeleteFile(4, kBig + 700 + i);
edit.AddLargeValueRef(LargeValueRef::Make("big", kNoCompression),
kBig + 800 + i, "foobar");
edit.AddLargeValueRef(LargeValueRef::Make("big2", kSnappyCompression),
kBig + 801 + i, "baz");
edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue)); edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue));
} }

@ -58,7 +58,7 @@ std::string IntSetToString(const std::set<uint64_t>& s) {
Version::~Version() { Version::~Version() {
assert(refs_ == 0); assert(refs_ == 0);
for (int level = 0; level < config::kNumLevels; level++) { for (int level = 0; level < config::kNumLevels; level++) {
for (size_t i = 0; i < files_[level].size(); i++) { for (int i = 0; i < files_[level].size(); i++) {
FileMetaData* f = files_[level][i]; FileMetaData* f = files_[level][i];
assert(f->refs >= 0); assert(f->refs >= 0);
f->refs--; f->refs--;
@ -134,7 +134,7 @@ class Version::LevelFileNumIterator : public Iterator {
private: private:
const InternalKeyComparator icmp_; const InternalKeyComparator icmp_;
const std::vector<FileMetaData*>* const flist_; const std::vector<FileMetaData*>* const flist_;
uint32_t index_; int index_;
// Backing store for value(). Holds the file number and size. // Backing store for value(). Holds the file number and size.
mutable char value_buf_[16]; mutable char value_buf_[16];
@ -164,7 +164,7 @@ Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
void Version::AddIterators(const ReadOptions& options, void Version::AddIterators(const ReadOptions& options,
std::vector<Iterator*>* iters) { std::vector<Iterator*>* iters) {
// Merge all level zero files together since they may overlap // Merge all level zero files together since they may overlap
for (size_t i = 0; i < files_[0].size(); i++) { for (int i = 0; i < files_[0].size(); i++) {
iters->push_back( iters->push_back(
vset_->table_cache_->NewIterator( vset_->table_cache_->NewIterator(
options, files_[0][i]->number, files_[0][i]->file_size)); options, files_[0][i]->number, files_[0][i]->file_size));
@ -201,7 +201,7 @@ std::string Version::DebugString() const {
AppendNumberTo(&r, level); AppendNumberTo(&r, level);
r.push_back(':'); r.push_back(':');
const std::vector<FileMetaData*>& files = files_[level]; const std::vector<FileMetaData*>& files = files_[level];
for (size_t i = 0; i < files.size(); i++) { for (int i = 0; i < files.size(); i++) {
r.push_back(' '); r.push_back(' ');
AppendNumberTo(&r, files[i]->number); AppendNumberTo(&r, files[i]->number);
r.push_back(':'); r.push_back(':');
@ -232,7 +232,7 @@ class VersionSet::Builder {
: vset_(vset) { : vset_(vset) {
for (int level = 0; level < config::kNumLevels; level++) { for (int level = 0; level < config::kNumLevels; level++) {
const std::vector<FileMetaData*>& files = base->files_[level]; const std::vector<FileMetaData*>& files = base->files_[level];
for (size_t i = 0; i < files.size(); i++) { for (int i = 0; i < files.size(); i++) {
FileMetaData* f = files[i]; FileMetaData* f = files[i];
f->refs++; f->refs++;
files_[level].insert(std::make_pair(f->number, f)); files_[level].insert(std::make_pair(f->number, f));
@ -258,7 +258,7 @@ class VersionSet::Builder {
// Apply all of the edits in *edit to the current state. // Apply all of the edits in *edit to the current state.
void Apply(VersionEdit* edit) { void Apply(VersionEdit* edit) {
// Update compaction pointers // Update compaction pointers
for (size_t i = 0; i < edit->compact_pointers_.size(); i++) { for (int i = 0; i < edit->compact_pointers_.size(); i++) {
const int level = edit->compact_pointers_[i].first; const int level = edit->compact_pointers_[i].first;
vset_->compact_pointer_[level] = vset_->compact_pointer_[level] =
edit->compact_pointers_[i].second.Encode().ToString(); edit->compact_pointers_[i].second.Encode().ToString();
@ -284,13 +284,19 @@ class VersionSet::Builder {
} }
// Add new files // Add new files
for (size_t i = 0; i < edit->new_files_.size(); i++) { for (int i = 0; i < edit->new_files_.size(); i++) {
const int level = edit->new_files_[i].first; const int level = edit->new_files_[i].first;
FileMetaData* f = new FileMetaData(edit->new_files_[i].second); FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
f->refs = 1; f->refs = 1;
assert(files_[level].count(f->number) == 0); assert(files_[level].count(f->number) == 0);
files_[level].insert(std::make_pair(f->number, f)); files_[level].insert(std::make_pair(f->number, f));
} }
// Add large value refs
for (int i = 0; i < edit->large_refs_added_.size(); i++) {
const VersionEdit::Large& l = edit->large_refs_added_[i];
vset_->RegisterLargeValueRef(l.large_ref, l.fnum, l.internal_key);
}
} }
// Save the current state in *v. // Save the current state in *v.
@ -539,7 +545,7 @@ Status VersionSet::Recover() {
static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) { static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
int64_t sum = 0; int64_t sum = 0;
for (size_t i = 0; i < files.size(); i++) { for (int i = 0; i < files.size(); i++) {
sum += files[i]->file_size; sum += files[i]->file_size;
} }
return sum; return sum;
@ -604,12 +610,25 @@ Status VersionSet::WriteSnapshot(log::Writer* log) {
// Save files // Save files
for (int level = 0; level < config::kNumLevels; level++) { for (int level = 0; level < config::kNumLevels; level++) {
const std::vector<FileMetaData*>& files = current_->files_[level]; const std::vector<FileMetaData*>& files = current_->files_[level];
for (size_t i = 0; i < files.size(); i++) { for (int i = 0; i < files.size(); i++) {
const FileMetaData* f = files[i]; const FileMetaData* f = files[i];
edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest); edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest);
} }
} }
// Save large value refs
for (LargeValueMap::const_iterator it = large_value_refs_.begin();
it != large_value_refs_.end();
++it) {
const LargeValueRef& ref = it->first;
const LargeReferencesSet& pointers = it->second;
for (LargeReferencesSet::const_iterator j = pointers.begin();
j != pointers.end();
++j) {
edit.AddLargeValueRef(ref, j->first, j->second);
}
}
std::string record; std::string record;
edit.EncodeTo(&record); edit.EncodeTo(&record);
return log->AddRecord(record); return log->AddRecord(record);
@ -632,7 +651,7 @@ Status VersionSet::SortLevel(Version* v, uint64_t level) {
if (result.ok() && level > 0) { if (result.ok() && level > 0) {
// There should be no overlap // There should be no overlap
for (size_t i = 1; i < v->files_[level].size(); i++) { for (int i = 1; i < v->files_[level].size(); i++) {
const InternalKey& prev_end = v->files_[level][i-1]->largest; const InternalKey& prev_end = v->files_[level][i-1]->largest;
const InternalKey& this_begin = v->files_[level][i]->smallest; const InternalKey& this_begin = v->files_[level][i]->smallest;
if (icmp_.Compare(prev_end, this_begin) >= 0) { if (icmp_.Compare(prev_end, this_begin) >= 0) {
@ -657,7 +676,7 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
uint64_t result = 0; uint64_t result = 0;
for (int level = 0; level < config::kNumLevels; level++) { for (int level = 0; level < config::kNumLevels; level++) {
const std::vector<FileMetaData*>& files = v->files_[level]; const std::vector<FileMetaData*>& files = v->files_[level];
for (size_t i = 0; i < files.size(); i++) { for (int i = 0; i < files.size(); i++) {
if (icmp_.Compare(files[i]->largest, ikey) <= 0) { if (icmp_.Compare(files[i]->largest, ikey) <= 0) {
// Entire file is before "ikey", so just add the file size // Entire file is before "ikey", so just add the file size
result += files[i]->file_size; result += files[i]->file_size;
@ -682,9 +701,83 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
} }
} }
} }
// Add in large value files which are references from internal keys
// stored in the table files
//
// TODO(opt): this is O(# large values in db). If this becomes too slow,
// we could store an auxiliary data structure indexed by internal key
for (LargeValueMap::const_iterator it = large_value_refs_.begin();
it != large_value_refs_.end();
++it) {
const LargeValueRef& lref = it->first;
for (LargeReferencesSet::const_iterator it2 = it->second.begin();
it2 != it->second.end();
++it2) {
if (icmp_.Compare(it2->second, ikey.Encode()) <= 0) {
// Internal key for large value is before our key of interest
result += lref.ValueSize();
}
}
}
return result; return result;
} }
bool VersionSet::RegisterLargeValueRef(const LargeValueRef& large_ref,
uint64_t fnum,
const InternalKey& internal_key) {
LargeReferencesSet* refs = &large_value_refs_[large_ref];
bool is_first = refs->empty();
refs->insert(make_pair(fnum, internal_key.Encode().ToString()));
return is_first;
}
void VersionSet::CleanupLargeValueRefs(const std::set<uint64_t>& live_tables) {
for (LargeValueMap::iterator it = large_value_refs_.begin();
it != large_value_refs_.end();
) {
LargeReferencesSet* refs = &it->second;
for (LargeReferencesSet::iterator ref_it = refs->begin();
ref_it != refs->end();
) {
if (ref_it->first != log_number_ && // Not in log file
ref_it->first != prev_log_number_ && // Not in prev log
live_tables.count(ref_it->first) == 0) { // Not in a live table
// No longer live: erase
LargeReferencesSet::iterator to_erase = ref_it;
++ref_it;
refs->erase(to_erase);
} else {
// Still live: leave this reference alone
++ref_it;
}
}
if (refs->empty()) {
// No longer any live references to this large value: remove from
// large_value_refs
Log(env_, options_->info_log, "large value is dead: '%s'",
LargeValueRefToFilenameString(it->first).c_str());
LargeValueMap::iterator to_erase = it;
++it;
large_value_refs_.erase(to_erase);
} else {
++it;
}
}
}
bool VersionSet::LargeValueIsLive(const LargeValueRef& large_ref) {
LargeValueMap::iterator it = large_value_refs_.find(large_ref);
if (it == large_value_refs_.end()) {
return false;
} else {
assert(!it->second.empty());
return true;
}
}
void VersionSet::MaybeDeleteOldVersions() { void VersionSet::MaybeDeleteOldVersions() {
// Note: it is important to delete versions in order since a newer // Note: it is important to delete versions in order since a newer
// version with zero refs may be holding a pointer to a memtable // version with zero refs may be holding a pointer to a memtable
@ -700,7 +793,7 @@ void VersionSet::AddLiveFiles(std::set<uint64_t>* live) {
for (Version* v = oldest_; v != NULL; v = v->next_) { for (Version* v = oldest_; v != NULL; v = v->next_) {
for (int level = 0; level < config::kNumLevels; level++) { for (int level = 0; level < config::kNumLevels; level++) {
const std::vector<FileMetaData*>& files = v->files_[level]; const std::vector<FileMetaData*>& files = v->files_[level];
for (size_t i = 0; i < files.size(); i++) { for (int i = 0; i < files.size(); i++) {
live->insert(files[i]->number); live->insert(files[i]->number);
} }
} }
@ -717,7 +810,7 @@ int64_t VersionSet::MaxNextLevelOverlappingBytes() {
int64_t result = 0; int64_t result = 0;
std::vector<FileMetaData*> overlaps; std::vector<FileMetaData*> overlaps;
for (int level = 0; level < config::kNumLevels - 1; level++) { for (int level = 0; level < config::kNumLevels - 1; level++) {
for (size_t i = 0; i < current_->files_[level].size(); i++) { for (int i = 0; i < current_->files_[level].size(); i++) {
const FileMetaData* f = current_->files_[level][i]; const FileMetaData* f = current_->files_[level][i];
GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps); GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps);
const int64_t sum = TotalFileSize(overlaps); const int64_t sum = TotalFileSize(overlaps);
@ -739,7 +832,7 @@ void VersionSet::GetOverlappingInputs(
Slice user_begin = begin.user_key(); Slice user_begin = begin.user_key();
Slice user_end = end.user_key(); Slice user_end = end.user_key();
const Comparator* user_cmp = icmp_.user_comparator(); const Comparator* user_cmp = icmp_.user_comparator();
for (size_t i = 0; i < current_->files_[level].size(); i++) { for (int i = 0; i < current_->files_[level].size(); i++) {
FileMetaData* f = current_->files_[level][i]; FileMetaData* f = current_->files_[level][i];
if (user_cmp->Compare(f->largest.user_key(), user_begin) < 0 || if (user_cmp->Compare(f->largest.user_key(), user_begin) < 0 ||
user_cmp->Compare(f->smallest.user_key(), user_end) > 0) { user_cmp->Compare(f->smallest.user_key(), user_end) > 0) {
@ -759,7 +852,7 @@ void VersionSet::GetRange(const std::vector<FileMetaData*>& inputs,
assert(!inputs.empty()); assert(!inputs.empty());
smallest->Clear(); smallest->Clear();
largest->Clear(); largest->Clear();
for (size_t i = 0; i < inputs.size(); i++) { for (int i = 0; i < inputs.size(); i++) {
FileMetaData* f = inputs[i]; FileMetaData* f = inputs[i];
if (i == 0) { if (i == 0) {
*smallest = f->smallest; *smallest = f->smallest;
@ -802,7 +895,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
if (!c->inputs_[which].empty()) { if (!c->inputs_[which].empty()) {
if (c->level() + which == 0) { if (c->level() + which == 0) {
const std::vector<FileMetaData*>& files = c->inputs_[which]; const std::vector<FileMetaData*>& files = c->inputs_[which];
for (size_t i = 0; i < files.size(); i++) { for (int i = 0; i < files.size(); i++) {
list[num++] = table_cache_->NewIterator( list[num++] = table_cache_->NewIterator(
options, files[i]->number, files[i]->file_size); options, files[i]->number, files[i]->file_size);
} }
@ -834,7 +927,7 @@ Compaction* VersionSet::PickCompaction() {
c->input_version_->Ref(); c->input_version_->Ref();
// Pick the first file that comes after compact_pointer_[level] // Pick the first file that comes after compact_pointer_[level]
for (size_t i = 0; i < current_->files_[level].size(); i++) { for (int i = 0; i < current_->files_[level].size(); i++) {
FileMetaData* f = current_->files_[level][i]; FileMetaData* f = current_->files_[level][i];
if (compact_pointer_[level].empty() || if (compact_pointer_[level].empty() ||
icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) { icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) {
@ -969,7 +1062,7 @@ bool Compaction::IsTrivialMove() const {
void Compaction::AddInputDeletions(VersionEdit* edit) { void Compaction::AddInputDeletions(VersionEdit* edit) {
for (int which = 0; which < 2; which++) { for (int which = 0; which < 2; which++) {
for (size_t i = 0; i < inputs_[which].size(); i++) { for (int i = 0; i < inputs_[which].size(); i++) {
edit->DeleteFile(level_ + which, inputs_[which][i]->number); edit->DeleteFile(level_ + which, inputs_[which][i]->number);
} }
} }

@ -171,6 +171,22 @@ class VersionSet {
// "key" as of version "v". // "key" as of version "v".
uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
// Register a reference to a large value with the specified
// large_ref from the specified file number. Returns "true" if this
// is the first recorded reference to the "large_ref" value in the
// database, and false otherwise.
bool RegisterLargeValueRef(const LargeValueRef& large_ref,
uint64_t filenum,
const InternalKey& internal_key);
// Cleanup the large value reference state by eliminating any
// references from files that are not included in either "live_tables"
// or the current log.
void CleanupLargeValueRefs(const std::set<uint64_t>& live_tables);
// Returns true if a large value with the given reference is live.
bool LargeValueIsLive(const LargeValueRef& large_ref);
private: private:
class Builder; class Builder;
@ -221,6 +237,14 @@ class VersionSet {
Version* current_; // Pointer to the last (newest) list entry Version* current_; // Pointer to the last (newest) list entry
Version* oldest_; // Pointer to the first (oldest) list entry Version* oldest_; // Pointer to the first (oldest) list entry
// Map from large value reference to the set of <file numbers,internal_key>
// values containing references to the value. We keep the
// internal key as a std::string rather than as an InternalKey because
// we want to be able to easily use a set.
typedef std::set<std::pair<uint64_t, std::string> > LargeReferencesSet;
typedef std::map<LargeValueRef, LargeReferencesSet> LargeValueMap;
LargeValueMap large_value_refs_;
// Per-level key at which the next compaction at that level should start. // Per-level key at which the next compaction at that level should start.
// Either an empty string, or a valid InternalKey. // Either an empty string, or a valid InternalKey.
std::string compact_pointer_[config::kNumLevels]; std::string compact_pointer_[config::kNumLevels];
@ -289,7 +313,7 @@ class Compaction {
// State used to check for number of overlapping grandparent files // State used to check for number of overlapping grandparent files
// (parent == level_ + 1, grandparent == level_ + 2) // (parent == level_ + 1, grandparent == level_ + 2)
std::vector<FileMetaData*> grandparents_; std::vector<FileMetaData*> grandparents_;
size_t grandparent_index_; // Index in grandparent_starts_ int grandparent_index_; // Index in grandparent_starts_
bool seen_key_; // Some output key has been seen bool seen_key_; // Some output key has been seen
int64_t overlapped_bytes_; // Bytes of overlap between current output int64_t overlapped_bytes_; // Bytes of overlap between current output
// and grandparent files // and grandparent files
@ -300,7 +324,7 @@ class Compaction {
// is that we are positioned at one of the file ranges for each // is that we are positioned at one of the file ranges for each
// higher level than the ones involved in this compaction (i.e. for // higher level than the ones involved in this compaction (i.e. for
// all L >= level_ + 2). // all L >= level_ + 2).
size_t level_ptrs_[config::kNumLevels]; int level_ptrs_[config::kNumLevels];
}; };
} }

@ -8,6 +8,7 @@
// data: record[count] // data: record[count]
// record := // record :=
// kTypeValue varstring varstring | // kTypeValue varstring varstring |
// kTypeLargeValueRef varstring varstring |
// kTypeDeletion varstring // kTypeDeletion varstring
// varstring := // varstring :=
// len: varint32 // len: varint32
@ -57,6 +58,16 @@ void WriteBatch::Put(const Slice& key, const Slice& value) {
PutLengthPrefixedSlice(&rep_, value); PutLengthPrefixedSlice(&rep_, value);
} }
// Appends a kTypeLargeValueRef record to batch "b": bumps the entry count,
// then writes the type tag, the length-prefixed key, and the length-prefixed
// raw bytes of "large_ref".
void WriteBatchInternal::PutLargeValueRef(WriteBatch* b,
                                          const Slice& key,
                                          const LargeValueRef& large_ref) {
  const int updated_count = WriteBatchInternal::Count(b) + 1;
  WriteBatchInternal::SetCount(b, updated_count);

  b->rep_.push_back(static_cast<char>(kTypeLargeValueRef));
  PutLengthPrefixedSlice(&b->rep_, key);

  const Slice ref_bytes(large_ref.data, sizeof(large_ref.data));
  PutLengthPrefixedSlice(&b->rep_, ref_bytes);
}
void WriteBatch::Delete(const Slice& key) { void WriteBatch::Delete(const Slice& key) {
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
rep_.push_back(static_cast<char>(kTypeDeletion)); rep_.push_back(static_cast<char>(kTypeDeletion));
@ -76,6 +87,10 @@ Status WriteBatchInternal::InsertInto(const WriteBatch* b,
case kTypeValue: case kTypeValue:
memtable->Add(it.sequence_number(), kTypeValue, it.key(), it.value()); memtable->Add(it.sequence_number(), kTypeValue, it.key(), it.value());
break; break;
case kTypeLargeValueRef:
memtable->Add(it.sequence_number(), kTypeLargeValueRef,
it.key(), it.value());
break;
} }
found++; found++;
} }
@ -119,6 +134,7 @@ void WriteBatchInternal::Iterator::GetNextEntry() {
input_.remove_prefix(1); input_.remove_prefix(1);
switch (tag) { switch (tag) {
case kTypeValue: case kTypeValue:
case kTypeLargeValueRef:
if (GetLengthPrefixedSlice(&input_, &key_) && if (GetLengthPrefixedSlice(&input_, &key_) &&
GetLengthPrefixedSlice(&input_, &value_)) { GetLengthPrefixedSlice(&input_, &value_)) {
op_ = static_cast<ValueType>(tag); op_ = static_cast<ValueType>(tag);

@ -13,6 +13,10 @@ namespace leveldb {
// WriteBatch that we don't want in the public WriteBatch interface. // WriteBatch that we don't want in the public WriteBatch interface.
class WriteBatchInternal { class WriteBatchInternal {
public: public:
static void PutLargeValueRef(WriteBatch* batch,
const Slice& key,
const LargeValueRef& large_ref);
// Return the number of entries in the batch. // Return the number of entries in the batch.
static int Count(const WriteBatch* batch); static int Count(const WriteBatch* batch);

@ -29,6 +29,13 @@ static std::string PrintContents(WriteBatch* b) {
state.append(iter->value().ToString()); state.append(iter->value().ToString());
state.append(")"); state.append(")");
break; break;
case kTypeLargeValueRef:
state.append("PutRef(");
state.append(ikey.user_key.ToString());
state.append(", ");
state.append(iter->value().ToString());
state.append(")");
break;
case kTypeDeletion: case kTypeDeletion:
state.append("Delete("); state.append("Delete(");
state.append(ikey.user_key.ToString()); state.append(ikey.user_key.ToString());
@ -67,6 +74,22 @@ TEST(WriteBatchTest, Multiple) {
PrintContents(&batch)); PrintContents(&batch));
} }
// A large value reference added through WriteBatchInternal must appear after
// the preceding Put, counted as a second entry with the next sequence number.
TEST(WriteBatchTest, PutIndirect) {
  // Recognizable reference payload: 'a' for the first 20 bytes, 'b' after.
  LargeValueRef ref;
  for (int pos = 0; pos < LargeValueRef::ByteSize(); pos++) {
    if (pos < 20) {
      ref.data[pos] = 'a';
    } else {
      ref.data[pos] = 'b';
    }
  }

  WriteBatch batch;
  batch.Put(Slice("baz"), Slice("boo"));
  WriteBatchInternal::PutLargeValueRef(&batch, Slice("foo"), ref);
  WriteBatchInternal::SetSequence(&batch, 100);

  ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch));
  ASSERT_EQ(2, WriteBatchInternal::Count(&batch));
  ASSERT_EQ("Put(baz, boo)@100"
            "PutRef(foo, aaaaaaaaaaaaaaaaaaaabbbbbbbbb)@101",
            PrintContents(&batch));
}
TEST(WriteBatchTest, Corruption) { TEST(WriteBatchTest, Corruption) {
WriteBatch batch; WriteBatch batch;
batch.Put(Slice("foo"), Slice("bar")); batch.Put(Slice("foo"), Slice("bar"));

@ -57,6 +57,15 @@ These merges have the effect of gradually migrating new updates from
the young level to the largest level using only bulk reads and writes the young level to the largest level using only bulk reads and writes
(i.e., minimizing expensive seeks). (i.e., minimizing expensive seeks).
<h2>Large value files</h2>
<p>
Each large value (greater than 64KB by default) is placed in a large
value file (*.val) of its own. An entry is maintained in the log
and/or sorted tables that maps from the corresponding key to the
name of this large value file. The name of the large value file
is derived from a SHA1 hash of the value and its length so that
identical values share the same file.
<p>
<h2>Manifest</h2> <h2>Manifest</h2>
<p> <p>
A MANIFEST file lists the set of sorted tables that make up each A MANIFEST file lists the set of sorted tables that make up each
@ -211,7 +220,9 @@ So maybe even the sharding is not necessary on modern filesystems?
compaction and at the end of recovery. It finds the names of all compaction and at the end of recovery. It finds the names of all
files in the database. It deletes all log files that are not the files in the database. It deletes all log files that are not the
current log file. It deletes all table files that are not referenced current log file. It deletes all table files that are not referenced
from some level and are not the output of an active compaction. from some level and are not the output of an active compaction. It
deletes all large value files that are not referenced from any live
table or log file.
</body> </body>
</html> </html>

@ -412,6 +412,17 @@ We might want to prefix <code>filename</code> keys with one letter (say '/') and
over just the metadata do not force us to fetch and cache bulky file over just the metadata do not force us to fetch and cache bulky file
contents. contents.
<p> <p>
<h2>Large Values</h2>
<p>
<code>leveldb</code> has special treatment of large values (by default, a value
of length greater than or equal to 64K is considered large, though a
field in Options can be used to adjust this threshold). Each such
large value is placed in a separate operating system file, and the
normal database blocks just contain pointers to such files.
<p>
Furthermore, if the same large value occurs multiple times in a single
database, it will be stored just once.
<p>
<h1>Checksums</h1> <h1>Checksums</h1>
<p> <p>
<code>leveldb</code> associates checksums with all data it stores in the file system. <code>leveldb</code> associates checksums with all data it stores in the file system.

@ -86,6 +86,16 @@ struct Options {
// Default: 1000 // Default: 1000
int max_open_files; int max_open_files;
// Handle values larger than "large_value_threshold" bytes
// specially, by writing them into their own files (to avoid
// compaction overhead) and doing content-based elimination of
// duplicate values to save space.
//
// We recommend against changing this value.
//
// Default: 64K
size_t large_value_threshold;
// Control over blocks (user data is stored in a set of blocks, and // Control over blocks (user data is stored in a set of blocks, and
// a block is the unit of reading from disk). // a block is the unit of reading from disk).
@ -100,7 +110,7 @@ struct Options {
// compression is enabled. This parameter can be changed dynamically. // compression is enabled. This parameter can be changed dynamically.
// //
// Default: 4K // Default: 4K
size_t block_size; int block_size;
// Number of keys between restart points for delta encoding of keys. // Number of keys between restart points for delta encoding of keys.
// This parameter can be changed dynamically. Most clients should // This parameter can be changed dynamically. Most clients should

@ -96,6 +96,8 @@
'port/port_example.h', 'port/port_example.h',
'port/port_posix.cc', 'port/port_posix.cc',
'port/port_posix.h', 'port/port_posix.h',
'port/sha1_portable.cc',
'port/sha1_portable.h',
'table/block.cc', 'table/block.cc',
'table/block.h', 'table/block.h',
'table/block_builder.cc', 'table/block_builder.cc',
@ -265,6 +267,16 @@
'db/log_test.cc', 'db/log_test.cc',
], ],
}, },
{
'target_name': 'leveldb_sha1_test',
'type': 'executable',
'dependencies': [
'leveldb_testutil',
],
'sources': [
'port/sha1_test.cc',
],
},
{ {
'target_name': 'leveldb_skiplist_test', 'target_name': 'leveldb_skiplist_test',
'type': 'executable', 'type': 'executable',

@ -1,122 +0,0 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/filename.h"
#include "db/dbformat.h"
#include "port/port.h"
#include "util/logging.h"
#include "util/testharness.h"
namespace leveldb {
// Empty type naming the test group used by the TEST(FileNameTest, ...) cases
// below; presumably required by the TEST() macro -- confirm in the harness.
class FileNameTest { };
// Checks ParseFileName() against names that must parse (yielding the expected
// number and file type) and against names that must be rejected.
TEST(FileNameTest, Parse) {
  FileType type;
  uint64_t number;

  // Successful parses
  static struct {
    const char* fname;
    uint64_t number;
    FileType type;
  } cases[] = {
    { "100.log",            100,   kLogFile },
    { "0.log",              0,     kLogFile },
    { "0.sst",              0,     kTableFile },
    { "CURRENT",            0,     kCurrentFile },
    { "LOCK",               0,     kDBLockFile },
    { "MANIFEST-2",         2,     kDescriptorFile },
    { "MANIFEST-7",         7,     kDescriptorFile },
    { "LOG",                0,     kInfoLogFile },
    { "LOG.old",            0,     kInfoLogFile },
    { "18446744073709551615.log", 18446744073709551615ull, kLogFile },
  };
  // The element count is an unsigned sizeof quotient, so index with size_t
  // to avoid a signed/unsigned comparison.
  for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
    std::string f = cases[i].fname;
    ASSERT_TRUE(ParseFileName(f, &number, &type)) << f;
    ASSERT_EQ(cases[i].type, type) << f;
    ASSERT_EQ(cases[i].number, number) << f;
  }

  // Errors
  static const char* errors[] = {
    "",
    "foo",
    "foo-dx-100.log",
    ".log",
    "",
    "manifest",
    "CURREN",
    "CURRENTX",
    "MANIFES",
    "MANIFEST",
    "MANIFEST-",
    "XMANIFEST-3",
    "MANIFEST-3x",
    "LOC",
    "LOCKx",
    "LO",
    "LOGx",
    "18446744073709551616.log",   // One past the largest uint64_t
    "184467440737095516150.log",  // Too many digits for uint64_t
    "100",
    "100.",
    "100.lop"
  };
  for (size_t i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
    std::string f = errors[i];
    ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
  }
}
// For each *FileName() helper: build a name under a three-character database
// name, check that the result starts with "<dbname>/", then hand the part
// after that four-character prefix to ParseFileName() and check the number
// and file type it reports.
TEST(FileNameTest, Construction) {
uint64_t number;
FileType type;
std::string fname;
// CurrentFileName(): expected to parse as number 0, kCurrentFile.
fname = CurrentFileName("foo");
ASSERT_EQ("foo/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(0, number);
ASSERT_EQ(kCurrentFile, type);
// LockFileName(): expected to parse as number 0, kDBLockFile.
fname = LockFileName("foo");
ASSERT_EQ("foo/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(0, number);
ASSERT_EQ(kDBLockFile, type);
// LogFileName(): the number passed in (192) must round-trip.
fname = LogFileName("foo", 192);
ASSERT_EQ("foo/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(192, number);
ASSERT_EQ(kLogFile, type);
// TableFileName(): the number passed in (200) must round-trip.
fname = TableFileName("bar", 200);
ASSERT_EQ("bar/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(200, number);
ASSERT_EQ(kTableFile, type);
// DescriptorFileName(): the number passed in (100) must round-trip.
fname = DescriptorFileName("bar", 100);
ASSERT_EQ("bar/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(100, number);
ASSERT_EQ(kDescriptorFile, type);
// TempFileName(): the number passed in (999) must round-trip.
fname = TempFileName("tmp", 999);
ASSERT_EQ("tmp/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(999, number);
ASSERT_EQ(kTempFile, type);
}
}
// Entry point: calls leveldb::test::RunAllTests() and returns its result as
// the process exit status. The command-line arguments are not used.
int main(int argc, char** argv) {
  const int exit_status = leveldb::test::RunAllTests();
  return exit_status;
}

@ -10,6 +10,7 @@
#include <endian.h> #include <endian.h>
#include <pthread.h> #include <pthread.h>
#include <stdint.h> #include <stdint.h>
#include <sha1.h>
#include <cstdatomic> #include <cstdatomic>
#include <string> #include <string>
#include <cctype> #include <cctype>
@ -133,6 +134,13 @@ inline bool Snappy_Uncompress(
return false; return false;
} }
inline void SHA1_Hash(const char* data, size_t len, char* hash_array) {
SHA1_CTX sha1_ctx;
SHA1Init(&sha1_ctx);
SHA1Update(&sha1_ctx, (const u_char*)data, len);
SHA1Final((u_char*)hash_array, &sha1_ctx);
}
inline uint64_t ThreadIdentifier() { inline uint64_t ThreadIdentifier() {
pthread_t tid = pthread_self(); pthread_t tid = pthread_self();
uint64_t r = 0; uint64_t r = 0;

@ -13,6 +13,7 @@
#include "base/atomicops.h" #include "base/atomicops.h"
#include "base/basictypes.h" #include "base/basictypes.h"
#include "base/logging.h" #include "base/logging.h"
#include "base/sha1.h"
#include "base/synchronization/condition_variable.h" #include "base/synchronization/condition_variable.h"
#include "base/synchronization/lock.h" #include "base/synchronization/lock.h"
@ -82,6 +83,12 @@ class AtomicPointer {
} }
}; };
// Hash "data[0..len-1]" by delegating to ::base::SHA1HashBytes, which writes
// the digest into "hash_array".
inline void SHA1_Hash(const char* data, size_t len, char* hash_array) {
  const unsigned char* input = reinterpret_cast<const unsigned char*>(data);
  unsigned char* digest = reinterpret_cast<unsigned char*>(hash_array);
  ::base::SHA1HashBytes(input, len, digest);
}
bool Snappy_Compress(const char* input, size_t input_length, bool Snappy_Compress(const char* input, size_t input_length,
std::string* output); std::string* output);
bool Snappy_Uncompress(const char* input_data, size_t input_length, bool Snappy_Uncompress(const char* input_data, size_t input_length,

@ -89,6 +89,11 @@ class AtomicPointer {
void NoBarrier_Store(void* v); void NoBarrier_Store(void* v);
}; };
// ------------------ Checksumming -------------------
// Store a 160-bit hash of "data[0..len-1]" in "hash_array[0]..hash_array[19]"
extern void SHA1_Hash(const char* data, size_t len, char* hash_array);
// ------------------ Compression ------------------- // ------------------ Compression -------------------
// Store the snappy compression of "input[0,input_length-1]" in *output. // Store the snappy compression of "input[0,input_length-1]" in *output.

@ -13,6 +13,7 @@
#include <string> #include <string>
#include <cstdatomic> #include <cstdatomic>
#include <cstring> #include <cstring>
#include "port/sha1_portable.h"
namespace leveldb { namespace leveldb {
namespace port { namespace port {
@ -72,6 +73,10 @@ class AtomicPointer {
} }
}; };
// Hash "data[0..len-1]" into "hash_array" by forwarding all three arguments
// unchanged to SHA1_Hash_Portable().
inline void SHA1_Hash(const char* data, size_t len, char* hash_array) {
SHA1_Hash_Portable(data, len, hash_array);
}
// TODO(gabor): Implement actual compress // TODO(gabor): Implement actual compress
inline bool Snappy_Compress(const char* input, size_t input_length, inline bool Snappy_Compress(const char* input, size_t input_length,
std::string* output) { std::string* output) {

@ -0,0 +1,298 @@
// Portions copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// This module provides a slow but portable implementation of
// the SHA1 hash function.
//
// It is adapted from free code written by Paul E. Jones
// <paulej@packetizer.com>. See http://www.packetizer.com/security/sha1/
//
// The license for the original code is:
/*
Copyright (C) 1998, 2009
Paul E. Jones <paulej@packetizer.com>
Freeware Public License (FPL)
This software is licensed as "freeware." Permission to distribute
this software in source and binary forms, including incorporation
into other products, is hereby granted without a fee. THIS SOFTWARE
IS PROVIDED 'AS IS' AND WITHOUT ANY EXPRESSED OR IMPLIED WARRANTIES,
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS FOR A PARTICULAR PURPOSE. THE AUTHOR SHALL NOT BE HELD
LIABLE FOR ANY DAMAGES RESULTING FROM THE USE OF THIS SOFTWARE, EITHER
DIRECTLY OR INDIRECTLY, INCLUDING, BUT NOT LIMITED TO, LOSS OF DATA
OR DATA BEING RENDERED INACCURATE.
*/
#include "port/sha1_portable.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
namespace leveldb {
namespace port {

/*
 *  Description:
 *      This code implements the Secure Hashing Standard as defined
 *      in FIPS PUB 180-1 published April 17, 1995.
 */

/*
 *  This structure will hold context information for the hashing
 *  operation
 */
typedef struct SHA1Context {
  unsigned Message_Digest[5];       /* Message Digest (output)            */

  unsigned Length_Low;              /* Message length in bits (low word)  */
  unsigned Length_High;             /* Message length in bits (high word) */

  unsigned char Message_Block[64];  /* 512-bit message blocks             */
  int Message_Block_Index;          /* Index into message block array     */

  bool Computed;                    /* Is the digest computed?            */
  bool Corrupted;                   /* Is the message digest corrupted?   */
} SHA1Context;

/*
 *  Portability Issues:
 *      SHA-1 is defined in terms of 32-bit "words".  This code was
 *      written with the expectation that the processor has at least
 *      a 32-bit machine word size.  If the machine word size is larger,
 *      the code should still function properly.  One caveat to that
 *      is that the input functions taking characters and character
 *      arrays assume that only 8 bits of information are stored in each
 *      character.
 */

/*
 *  Define the circular shift macro: rotate the 32-bit value "word" left
 *  by "bits" positions.
 */
#define SHA1CircularShift(bits,word) \
                ((((word) << (bits)) & 0xFFFFFFFF) | \
                ((word) >> (32-(bits))))

/* Function prototypes */
static void SHA1ProcessMessageBlock(SHA1Context *);
static void SHA1PadMessage(SHA1Context *);

// Initialize the SHA1Context in preparation for computing a new
// message digest.
static void SHA1Reset(SHA1Context* context) {
  context->Length_Low = 0;
  context->Length_High = 0;
  context->Message_Block_Index = 0;

  context->Message_Digest[0] = 0x67452301;
  context->Message_Digest[1] = 0xEFCDAB89;
  context->Message_Digest[2] = 0x98BADCFE;
  context->Message_Digest[3] = 0x10325476;
  context->Message_Digest[4] = 0xC3D2E1F0;

  context->Computed = false;
  context->Corrupted = false;
}

// Finalize the digest: pad and process the remaining data (once) so that
// Message_Digest within the SHA1Context holds the 160-bit result.
// Returns false, without touching the digest, if the context is corrupted.
static bool SHA1Result(SHA1Context *context) {
  if (context->Corrupted) {
    return false;
  }

  if (!context->Computed) {
    SHA1PadMessage(context);
    context->Computed = true;
  }
  return true;
}

// This function accepts an array of bytes as the next portion of
// the message.
//
// "length" is a size_t so that the size_t byte count handed in by
// SHA1_Hash_Portable() is not truncated where size_t is wider than unsigned.
// Feeding more data after the digest was computed, or overflowing the 64-bit
// bit counter, marks the context as corrupted.
static void SHA1Input(SHA1Context *context,
                      const unsigned char *message_array,
                      size_t length) {
  if (!length) return;

  if (context->Computed || context->Corrupted) {
    context->Corrupted = true;
    return;
  }

  while(length-- && !context->Corrupted) {
    context->Message_Block[context->Message_Block_Index++] =
        (*message_array & 0xFF);

    context->Length_Low += 8;
    /* Force it to 32 bits */
    context->Length_Low &= 0xFFFFFFFF;
    if (context->Length_Low == 0) {
      context->Length_High++;
      /* Force it to 32 bits */
      context->Length_High &= 0xFFFFFFFF;
      if (context->Length_High == 0)
      {
        /* Message is too long */
        context->Corrupted = true;
      }
    }

    if (context->Message_Block_Index == 64)
    {
      SHA1ProcessMessageBlock(context);
    }

    message_array++;
  }
}

// This function will process the next 512 bits of the message stored
// in the Message_Block array.
static void SHA1ProcessMessageBlock(SHA1Context *context) {
  const unsigned K[] =            // Constants defined in SHA-1
      {
        0x5A827999,
        0x6ED9EBA1,
        0x8F1BBCDC,
        0xCA62C1D6
      };
  int         t;                  // Loop counter
  unsigned    temp;               // Temporary word value
  unsigned    W[80];              // Word sequence
  unsigned    A, B, C, D, E;      // Word buffers

  // Initialize the first 16 words in the array W (big-endian byte order)
  for(t = 0; t < 16; t++) {
    W[t] = ((unsigned) context->Message_Block[t * 4]) << 24;
    W[t] |= ((unsigned) context->Message_Block[t * 4 + 1]) << 16;
    W[t] |= ((unsigned) context->Message_Block[t * 4 + 2]) << 8;
    W[t] |= ((unsigned) context->Message_Block[t * 4 + 3]);
  }

  // Expand the 16 message words into the full 80-word schedule
  for(t = 16; t < 80; t++) {
    W[t] = SHA1CircularShift(1,W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]);
  }

  A = context->Message_Digest[0];
  B = context->Message_Digest[1];
  C = context->Message_Digest[2];
  D = context->Message_Digest[3];
  E = context->Message_Digest[4];

  for(t = 0; t < 20; t++) {
    temp =  SHA1CircularShift(5,A) +
        ((B & C) | ((~B) & D)) + E + W[t] + K[0];
    temp &= 0xFFFFFFFF;
    E = D;
    D = C;
    C = SHA1CircularShift(30,B);
    B = A;
    A = temp;
  }

  for(t = 20; t < 40; t++) {
    temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[1];
    temp &= 0xFFFFFFFF;
    E = D;
    D = C;
    C = SHA1CircularShift(30,B);
    B = A;
    A = temp;
  }

  for(t = 40; t < 60; t++) {
    temp = SHA1CircularShift(5,A) +
        ((B & C) | (B & D) | (C & D)) + E + W[t] + K[2];
    temp &= 0xFFFFFFFF;
    E = D;
    D = C;
    C = SHA1CircularShift(30,B);
    B = A;
    A = temp;
  }

  for(t = 60; t < 80; t++) {
    temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[3];
    temp &= 0xFFFFFFFF;
    E = D;
    D = C;
    C = SHA1CircularShift(30,B);
    B = A;
    A = temp;
  }

  context->Message_Digest[0] = (context->Message_Digest[0] + A) & 0xFFFFFFFF;
  context->Message_Digest[1] = (context->Message_Digest[1] + B) & 0xFFFFFFFF;
  context->Message_Digest[2] = (context->Message_Digest[2] + C) & 0xFFFFFFFF;
  context->Message_Digest[3] = (context->Message_Digest[3] + D) & 0xFFFFFFFF;
  context->Message_Digest[4] = (context->Message_Digest[4] + E) & 0xFFFFFFFF;

  // The block has been consumed; start filling it again from the beginning.
  context->Message_Block_Index = 0;
}

// According to the standard, the message must be padded to an even
// 512 bits.  The first padding bit must be a '1'.  The last 64 bits
// represent the length of the original message.  All bits in between
// should be 0.  This function will pad the message according to those
// rules by filling the Message_Block array accordingly.  It will also
// call SHA1ProcessMessageBlock() appropriately.  When it returns, it
// can be assumed that the message digest has been computed.
static void SHA1PadMessage(SHA1Context *context) {
  // Check to see if the current message block is too small to hold
  // the initial padding bits and length.  If so, we will pad the
  // block, process it, and then continue padding into a second block.
  if (context->Message_Block_Index > 55) {
    context->Message_Block[context->Message_Block_Index++] = 0x80;
    while(context->Message_Block_Index < 64) {
      context->Message_Block[context->Message_Block_Index++] = 0;
    }

    SHA1ProcessMessageBlock(context);

    while(context->Message_Block_Index < 56) {
      context->Message_Block[context->Message_Block_Index++] = 0;
    }
  } else {
    context->Message_Block[context->Message_Block_Index++] = 0x80;
    while(context->Message_Block_Index < 56) {
      context->Message_Block[context->Message_Block_Index++] = 0;
    }
  }

  // Store the message length as the last 8 octets
  context->Message_Block[56] = (context->Length_High >> 24) & 0xFF;
  context->Message_Block[57] = (context->Length_High >> 16) & 0xFF;
  context->Message_Block[58] = (context->Length_High >> 8) & 0xFF;
  context->Message_Block[59] = (context->Length_High) & 0xFF;
  context->Message_Block[60] = (context->Length_Low >> 24) & 0xFF;
  context->Message_Block[61] = (context->Length_Low >> 16) & 0xFF;
  context->Message_Block[62] = (context->Length_Low >> 8) & 0xFF;
  context->Message_Block[63] = (context->Length_Low) & 0xFF;

  SHA1ProcessMessageBlock(context);
}

// Compute the SHA1 hash value of "data[0..len-1]" and store it in
// "hash_array[0..19]", most significant byte of each digest word first.
// Prints a message to stderr and exits the process with status 1 if the
// hashing code reports an error.
void SHA1_Hash_Portable(const char* data, size_t len, char* hash_array) {
  SHA1Context context;
  SHA1Reset(&context);
  SHA1Input(&context, reinterpret_cast<const unsigned char*>(data), len);
  bool ok = SHA1Result(&context);
  if (!ok) {
    fprintf(stderr, "Unexpected error in SHA1_Hash_Portable code\n");
    exit(1);
  }
  for (int i = 0; i < 5; i++) {
    uint32_t value = context.Message_Digest[i];
    hash_array[i*4 + 0] = (value >> 24) & 0xff;
    hash_array[i*4 + 1] = (value >> 16) & 0xff;
    hash_array[i*4 + 2] = (value >> 8) & 0xff;
    hash_array[i*4 + 3] = value & 0xff;
  }
}

}
}

@ -0,0 +1,25 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_
#define STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_
#include <stddef.h>
namespace leveldb {
namespace port {
// Compute the SHA1 hash value of "data[0..len-1]" and store it in
// "hash_array[0..19]". hash_array must have 20 bytes of space available.
//
// This function is portable but may not be as fast as a version
// optimized for your platform. It is provided as a default method
// that can be used when porting leveldb to a new platform if no
// better SHA1 hash implementation is available.
void SHA1_Hash_Portable(const char* data, size_t len, char* hash_array);
}
}
#endif // STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_

@ -0,0 +1,39 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "port/port.h"
#include "util/testharness.h"
namespace leveldb {
namespace port {
// Empty tag type; its name is the first argument of the TEST() case below.
class SHA1 { };
// Hash "data[0..len-1]" with SHA1_Hash() and return the 20 digest bytes as a
// 40-character lowercase hexadecimal string.
static std::string TestSHA1(const char* data, size_t len) {
  static const char kHexDigits[] = "0123456789abcdef";

  char digest[20];
  SHA1_Hash(data, len, digest);

  std::string hex;
  hex.reserve(40);
  for (int i = 0; i < 20; i++) {
    // Go through unsigned char so bytes >= 0x80 are not sign-extended.
    const unsigned char byte = static_cast<unsigned char>(digest[i]);
    hex.push_back(kHexDigits[byte >> 4]);
    hex.push_back(kHexDigits[byte & 0x0f]);
  }
  return hex;
}
// Compares the hex digest returned by TestSHA1() with fixed expected values
// for three inputs: the empty string, "hello", and 10000 copies of 'x'.
TEST(SHA1, Simple) {
ASSERT_EQ("da39a3ee5e6b4b0d3255bfef95601890afd80709", TestSHA1("", 0));
ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d", TestSHA1("hello", 5));
// A 10000-byte input is longer than one 64-byte SHA-1 message block.
std::string x(10000, 'x');
ASSERT_EQ("f8c5cde791c5056cf515881e701c8a9ecb439a75",
TestSHA1(x.data(), x.size()));
}
}
}
// Entry point: calls leveldb::test::RunAllTests() and returns its result as
// the process exit status. The command-line arguments are not used.
int main(int argc, char** argv) {
  const int exit_status = leveldb::test::RunAllTests();
  return exit_status;
}

@ -62,9 +62,7 @@ static inline const char* DecodeEntry(const char* p, const char* limit,
if ((p = GetVarint32Ptr(p, limit, value_length)) == NULL) return NULL; if ((p = GetVarint32Ptr(p, limit, value_length)) == NULL) return NULL;
} }
if (static_cast<uint32>(limit - p) < (*non_shared + *value_length)) { if (limit - p < (*non_shared + *value_length)) return NULL;
return NULL;
}
return p; return p;
} }

@ -62,7 +62,7 @@ size_t BlockBuilder::CurrentSizeEstimate() const {
Slice BlockBuilder::Finish() { Slice BlockBuilder::Finish() {
// Append restart array // Append restart array
for (size_t i = 0; i < restarts_.size(); i++) { for (int i = 0; i < restarts_.size(); i++) {
PutFixed32(&buffer_, restarts_[i]); PutFixed32(&buffer_, restarts_[i]);
} }
PutFixed32(&buffer_, restarts_.size()); PutFixed32(&buffer_, restarts_.size());

@ -36,7 +36,7 @@ void Footer::EncodeTo(std::string* dst) const {
metaindex_handle_.EncodeTo(dst); metaindex_handle_.EncodeTo(dst);
index_handle_.EncodeTo(dst); index_handle_.EncodeTo(dst);
dst->resize(2 * BlockHandle::kMaxEncodedLength); // Padding dst->resize(2 * BlockHandle::kMaxEncodedLength); // Padding
PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber & 0xffffffffu)); PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber));
PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber >> 32)); PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber >> 32));
assert(dst->size() == original_size + kEncodedLength); assert(dst->size() == original_size + kEncodedLength);
} }
@ -71,7 +71,7 @@ Status ReadBlock(RandomAccessFile* file,
// Read the block contents as well as the type/crc footer. // Read the block contents as well as the type/crc footer.
// See table_builder.cc for the code that built this structure. // See table_builder.cc for the code that built this structure.
size_t n = static_cast<size_t>(handle.size()); size_t n = handle.size();
char* buf = new char[n + kBlockTrailerSize]; char* buf = new char[n + kBlockTrailerSize];
Slice contents; Slice contents;
Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf); Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);

@ -16,7 +16,7 @@ Arena::Arena() {
} }
Arena::~Arena() { Arena::~Arena() {
for (size_t i = 0; i < blocks_.size(); i++) { for (int i = 0; i < blocks_.size(); i++) {
delete[] blocks_[i]; delete[] blocks_[i];
} }
} }

@ -85,7 +85,7 @@ char* EncodeVarint64(char* dst, uint64_t v) {
*(ptr++) = (v & (B-1)) | B; *(ptr++) = (v & (B-1)) | B;
v >>= 7; v >>= 7;
} }
*(ptr++) = static_cast<unsigned char>(v); *(ptr++) = v;
return reinterpret_cast<char*>(ptr); return reinterpret_cast<char*>(ptr);
} }

@ -51,7 +51,7 @@ class BytewiseComparatorImpl : public Comparator {
virtual void FindShortSuccessor(std::string* key) const { virtual void FindShortSuccessor(std::string* key) const {
// Find first character that can be incremented // Find first character that can be incremented
size_t n = key->size(); size_t n = key->size();
for (size_t i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
const uint8_t byte = (*key)[i]; const uint8_t byte = (*key)[i];
if (byte != static_cast<uint8_t>(0xff)) { if (byte != static_cast<uint8_t>(0xff)) {
(*key)[i] = byte + 1; (*key)[i] = byte + 1;

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save