git-svn-id: https://leveldb.googlecode.com/svn/trunk@2 62dab493-f737-651d-591e-8d6aee1b9529main
parent
54f1fd7eef
commit
f67e15e50f
@ -0,0 +1,8 @@ |
||||
# Names should be added to this file like so: |
||||
# Name or Organization <email address> |
||||
|
||||
Google Inc. |
||||
|
||||
# Initial version authors: |
||||
Jeffrey Dean <jeff@google.com> |
||||
Sanjay Ghemawat <sanjay@google.com> |
@ -0,0 +1,64 @@ |
||||
# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
# INSTRUCTIONS
|
||||
# After you've downloaded and installed the Android NDK from:
|
||||
# http://developer.android.com/sdk/ndk/index.html
|
||||
# 1. In the same directory as this file, Android.mk, type:
|
||||
# $ ln -s leveldb ../jni
|
||||
# (The Android NDK will only build native projects in
|
||||
# subdirectories named "jni".)
|
||||
# 2. $ cd ..
|
||||
# 3. Execute ndk-build:
|
||||
# $ $(ANDROID_NDK_DIR)/ndk-build
|
||||
|
||||
LOCAL_PATH := $(call my-dir)
|
||||
|
||||
include $(CLEAR_VARS) |
||||
LOCAL_MODULE := leveldb
|
||||
# Build flags:
|
||||
# - LEVELDB_PLATFORM_ANDROID to use the correct port header: port_android.h
|
||||
LOCAL_CFLAGS := -DLEVELDB_PLATFORM_ANDROID -std=gnu++0x
|
||||
LOCAL_C_INCLUDES := $(LOCAL_PATH)/../../
|
||||
LOCAL_CPP_EXTENSION := .cc
|
||||
|
||||
LOCAL_SRC_FILES := ./db/builder.cc \
|
||||
./db/db_bench.cc \ |
||||
./db/db_impl.cc \ |
||||
./db/db_iter.cc \ |
||||
./db/filename.cc \ |
||||
./db/dbformat.cc \ |
||||
./db/log_reader.cc \ |
||||
./db/log_writer.cc \ |
||||
./db/memtable.cc \ |
||||
./db/repair.cc \ |
||||
./db/table_cache.cc \ |
||||
./db/version_edit.cc \ |
||||
./db/version_set.cc \ |
||||
./db/write_batch.cc \ |
||||
./port/port_android.cc \ |
||||
./table/block.cc \ |
||||
./table/block_builder.cc \ |
||||
./table/format.cc \ |
||||
./table/iterator.cc \ |
||||
./table/merger.cc \ |
||||
./table/table.cc \ |
||||
./table/table_builder.cc \ |
||||
./table/two_level_iterator.cc \ |
||||
./util/arena.cc \ |
||||
./util/cache.cc \ |
||||
./util/coding.cc \ |
||||
./util/comparator.cc \ |
||||
./util/crc32c.cc \ |
||||
./util/env.cc \ |
||||
./util/env_posix.cc \ |
||||
./util/hash.cc \ |
||||
./util/histogram.cc \ |
||||
./util/logging.cc \ |
||||
./util/options.cc \ |
||||
./util/status.cc \ |
||||
./util/testharness.cc \ |
||||
./util/testutil.cc |
||||
|
||||
include $(BUILD_SHARED_LIBRARY) |
@ -0,0 +1,6 @@ |
||||
# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
APP_ABI := armeabi-v7a
|
||||
APP_STL := gnustl_static
|
@ -0,0 +1,27 @@ |
||||
Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
||||
|
||||
Redistribution and use in source and binary forms, with or without |
||||
modification, are permitted provided that the following conditions are |
||||
met: |
||||
|
||||
* Redistributions of source code must retain the above copyright |
||||
notice, this list of conditions and the following disclaimer. |
||||
* Redistributions in binary form must reproduce the above |
||||
copyright notice, this list of conditions and the following disclaimer |
||||
in the documentation and/or other materials provided with the |
||||
distribution. |
||||
* Neither the name of Google Inc. nor the names of its |
||||
contributors may be used to endorse or promote products derived from |
||||
this software without specific prior written permission. |
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
@ -0,0 +1,134 @@ |
||||
# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
CC = g++
|
||||
|
||||
# Uncomment one of the following to switch between debug and opt mode
|
||||
#OPT = -O2 -DNDEBUG
|
||||
OPT = -g2
|
||||
|
||||
CFLAGS = -c -DLEVELDB_PLATFORM_POSIX -I. -std=c++0x $(OPT)
|
||||
|
||||
LDFLAGS=-lpthread
|
||||
|
||||
LIBOBJECTS = \
|
||||
./db/builder.o \
|
||||
./db/db_impl.o \
|
||||
./db/db_iter.o \
|
||||
./db/filename.o \
|
||||
./db/format.o \
|
||||
./db/log_reader.o \
|
||||
./db/log_writer.o \
|
||||
./db/memtable.o \
|
||||
./db/repair.o \
|
||||
./db/table_cache.o \
|
||||
./db/version_edit.o \
|
||||
./db/version_set.o \
|
||||
./db/write_batch.o \
|
||||
./port/port_posix.o \
|
||||
./port/sha1_portable.o \
|
||||
./table/block.o \
|
||||
./table/block_builder.o \
|
||||
./table/format.o \
|
||||
./table/iterator.o \
|
||||
./table/merger.o \
|
||||
./table/table.o \
|
||||
./table/table_builder.o \
|
||||
./table/two_level_iterator.o \
|
||||
./util/arena.o \
|
||||
./util/cache.o \
|
||||
./util/coding.o \
|
||||
./util/comparator.o \
|
||||
./util/crc32c.o \
|
||||
./util/env.o \
|
||||
./util/env_posix.o \
|
||||
./util/hash.o \
|
||||
./util/histogram.o \
|
||||
./util/logging.o \
|
||||
./util/options.o \
|
||||
./util/status.o
|
||||
|
||||
TESTUTIL = ./util/testutil.o
|
||||
TESTHARNESS = ./util/testharness.o $(TESTUTIL)
|
||||
|
||||
TESTS = \
|
||||
arena_test \
|
||||
cache_test \
|
||||
coding_test \
|
||||
corruption_test \
|
||||
crc32c_test \
|
||||
db_test \
|
||||
dbformat_test \
|
||||
env_test \
|
||||
filename_test \
|
||||
log_test \
|
||||
sha1_test \
|
||||
skiplist_test \
|
||||
table_test \
|
||||
version_edit_test \
|
||||
write_batch_test
|
||||
|
||||
PROGRAMS = db_bench $(TESTS)
|
||||
|
||||
all: $(PROGRAMS) |
||||
|
||||
check: $(TESTS) |
||||
for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done
|
||||
|
||||
clean: |
||||
rm -f $(PROGRAMS) */*.o
|
||||
|
||||
db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) |
||||
$(CC) $(LDFLAGS) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@
|
||||
|
||||
arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) |
||||
$(CC) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
|
||||
|
||||
cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) |
||||
$(CC) $(LDFLAGS) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
|
||||
|
||||
coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) |
||||
$(CC) $(LDFLAGS) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
|
||||
|
||||
corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) |
||||
$(CC) $(LDFLAGS) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
|
||||
|
||||
crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) |
||||
$(CC) $(LDFLAGS) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
|
||||
|
||||
db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) |
||||
$(CC) $(LDFLAGS) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
|
||||
|
||||
dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) |
||||
$(CC) $(LDFLAGS) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
|
||||
|
||||
env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) |
||||
$(CC) $(LDFLAGS) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
|
||||
|
||||
filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) |
||||
$(CC) $(LDFLAGS) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
|
||||
|
||||
log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) |
||||
$(CC) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
|
||||
|
||||
table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) |
||||
$(CC) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
|
||||
|
||||
sha1_test: port/sha1_test.o $(LIBOBJECTS) $(TESTHARNESS) |
||||
$(CC) $(LDFLAGS) port/sha1_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
|
||||
|
||||
skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) |
||||
$(CC) $(LDFLAGS) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
|
||||
|
||||
version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) |
||||
$(CC) $(LDFLAGS) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
|
||||
|
||||
write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) |
||||
$(CC) $(LDFLAGS) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
|
||||
|
||||
.cc.o: |
||||
$(CC) $(CFLAGS) $< -o $@
|
||||
|
||||
# TODO(gabor): dependencies for .o files
|
||||
# TODO(gabor): Build library
|
@ -0,0 +1,51 @@ |
||||
leveldb: A key-value store |
||||
Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) |
||||
|
||||
The code under this directory implements a system for maintaining a |
||||
persistent key/value store. |
||||
|
||||
See doc/index.html for more explanation. |
||||
See doc/db_layout.txt for a brief overview of the implementation. |
||||
|
||||
The public interface is in include/*.h. Callers should not include or |
||||
rely on the details of any other header files in this package. Those |
||||
internal APIs may be changed without warning. |
||||
|
||||
Guide to header files: |
||||
|
||||
include/db.h |
||||
Main interface to the DB: Start here |
||||
|
||||
include/options.h |
||||
Control over the behavior of an entire database, and also |
||||
control over the behavior of individual reads and writes. |
||||
|
||||
include/comparator.h |
||||
Abstraction for user-specified comparison function. If you want |
||||
just bytewise comparison of keys, you can use the default comparator, |
||||
but clients can write their own comparator implementations if they |
||||
want custom ordering (e.g. to handle different character |
||||
encodings, etc.) |
||||
|
||||
include/iterator.h |
||||
Interface for iterating over data. You can get an iterator |
||||
from a DB object. |
||||
|
||||
include/write_batch.h |
||||
Interface for atomically applying multiple updates to a database. |
||||
|
||||
include/slice.h |
||||
A simple module for maintaining a pointer and a length into some |
||||
other byte array. |
||||
|
||||
include/status.h |
||||
Status is returned from many of the public interfaces and is used |
||||
to report success and various kinds of errors. |
||||
|
||||
include/env.h |
||||
Abstraction of the OS environment. A posix implementation of |
||||
this interface is in util/env_posix.cc |
||||
|
||||
include/table.h |
||||
include/table_builder.h |
||||
Lower-level modules that most clients probably won't use directly |
@ -0,0 +1,23 @@ |
||||
Before adding to chrome |
||||
----------------------- |
||||
- multi-threaded test/benchmark |
||||
- Allow missing crc32c in Table format? |
||||
|
||||
Maybe afterwards |
||||
---------------- |
||||
|
||||
ss |
||||
- Stats |
||||
- Speed up backwards scan (avoid three passes over data) |
||||
|
||||
db |
||||
- Maybe implement DB::BulkDeleteForRange(start_key, end_key) |
||||
that would blow away files whose ranges are entirely contained |
||||
within [start_key..end_key]? For Chrome, deletion of obsolete |
||||
object stores, etc. can be done in the background anyway, so |
||||
probably not that important. |
||||
|
||||
api changes? |
||||
- Efficient large value reading and writing |
||||
|
||||
Faster Get implementation |
@ -0,0 +1,97 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/builder.h" |
||||
|
||||
#include "db/filename.h" |
||||
#include "db/dbformat.h" |
||||
#include "db/table_cache.h" |
||||
#include "db/version_edit.h" |
||||
#include "include/db.h" |
||||
#include "include/env.h" |
||||
#include "include/iterator.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
// Builds a level-0 Table file from the contents of *iter.
// On success, *meta describes the generated file and the new file (plus
// any large-value references encountered) is registered in *edit.  If
// *iter yields no entries, meta->file_size stays 0 and no file is kept.
Status BuildTable(const std::string& dbname,
                  Env* env,
                  const Options& options,
                  TableCache* table_cache,
                  Iterator* iter,
                  FileMetaData* meta,
                  VersionEdit* edit) {
  Status status;
  meta->file_size = 0;
  iter->SeekToFirst();

  std::string fname = TableFileName(dbname, meta->number);
  if (iter->Valid()) {
    WritableFile* file;
    status = env->NewWritableFile(fname, &file);
    if (!status.ok()) {
      return status;
    }

    TableBuilder* builder = new TableBuilder(options, file);
    // First key seen is the smallest; the largest is updated every iteration.
    meta->smallest.DecodeFrom(iter->key());
    for (; iter->Valid(); iter->Next()) {
      Slice key = iter->key();
      meta->largest.DecodeFrom(key);
      if (ExtractValueType(key) == kTypeLargeValueRef) {
        // Large values are stored indirectly; validate the reference size
        // before recording it in the version edit.
        if (iter->value().size() != LargeValueRef::ByteSize()) {
          status = Status::Corruption("invalid indirect reference hash value (L0)");
          break;
        }
        edit->AddLargeValueRef(LargeValueRef::FromRef(iter->value()),
                               meta->number,
                               iter->key());
      }
      builder->Add(key, iter->value());
    }

    // Finish and check for builder errors
    if (status.ok()) {
      status = builder->Finish();
      if (status.ok()) {
        meta->file_size = builder->FileSize();
        assert(meta->file_size > 0);
      }
    } else {
      builder->Abandon();
    }
    delete builder;

    // Finish and check for file errors
    if (status.ok()) {
      status = file->Sync();
    }
    if (status.ok()) {
      status = file->Close();
    }
    delete file;
    file = NULL;

    if (status.ok()) {
      // Verify that the table is usable
      Iterator* verify_iter =
          table_cache->NewIterator(ReadOptions(), meta->number);
      status = verify_iter->status();
      delete verify_iter;
    }
  }

  // Check for input iterator errors
  if (!iter->status().ok()) {
    status = iter->status();
  }

  if (status.ok() && meta->file_size > 0) {
    edit->AddFile(0, meta->number, meta->file_size,
                  meta->smallest, meta->largest);
  } else {
    // Either nothing was written or something failed: drop the file.
    env->DeleteFile(fname);
  }
  return status;
}
||||
|
||||
} |
@ -0,0 +1,36 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_DB_BUILDER_H_
#define STORAGE_LEVELDB_DB_BUILDER_H_

// Needed for std::string in the BuildTable declaration below
// (include-what-you-use: do not rely on transitive includes).
#include <string>

#include "include/status.h"

namespace leveldb {

struct Options;
struct FileMetaData;

class Env;
class Iterator;
class TableCache;
class VersionEdit;

// Build a Table file from the contents of *iter.  The generated file
// will be named according to meta->number.  On success, the rest of
// *meta will be filled with metadata about the generated table, and
// large value refs and the added file information will be added to
// *edit.  If no data is present in *iter, meta->file_size will be set
// to zero, and no Table file will be produced.
extern Status BuildTable(const std::string& dbname,
                         Env* env,
                         const Options& options,
                         TableCache* table_cache,
                         Iterator* iter,
                         FileMetaData* meta,
                         VersionEdit* edit);

}  // namespace leveldb

#endif  // STORAGE_LEVELDB_DB_BUILDER_H_
|
@ -0,0 +1,366 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "include/db.h" |
||||
|
||||
#include <errno.h> |
||||
#include <fcntl.h> |
||||
#include <sys/stat.h> |
||||
#include <sys/types.h> |
||||
#include "include/env.h" |
||||
#include "include/table.h" |
||||
#include "include/write_batch.h" |
||||
#include "db/db_impl.h" |
||||
#include "db/filename.h" |
||||
#include "db/version_set.h" |
||||
#include "util/logging.h" |
||||
#include "util/testharness.h" |
||||
#include "util/testutil.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
static const int kValueSize = 1000; |
||||
|
||||
class CorruptionTest { |
||||
public: |
||||
test::ErrorEnv env_; |
||||
Random rnd_; |
||||
std::string dbname_; |
||||
Options options_; |
||||
DB* db_; |
||||
|
||||
CorruptionTest() : rnd_(test::RandomSeed()) { |
||||
options_.env = &env_; |
||||
dbname_ = test::TmpDir() + "/db_test"; |
||||
DestroyDB(dbname_, options_); |
||||
|
||||
db_ = NULL; |
||||
options_.create_if_missing = true; |
||||
Reopen(); |
||||
options_.create_if_missing = false; |
||||
} |
||||
|
||||
~CorruptionTest() { |
||||
delete db_; |
||||
DestroyDB(dbname_, Options()); |
||||
} |
||||
|
||||
Status TryReopen(Options* options = NULL) { |
||||
delete db_; |
||||
db_ = NULL; |
||||
Options opt = (options ? *options : options_); |
||||
opt.env = &env_; |
||||
return DB::Open(opt, dbname_, &db_); |
||||
} |
||||
|
||||
void Reopen(Options* options = NULL) { |
||||
ASSERT_OK(TryReopen(options)); |
||||
} |
||||
|
||||
void RepairDB() { |
||||
delete db_; |
||||
db_ = NULL; |
||||
ASSERT_OK(::leveldb::RepairDB(dbname_, options_)); |
||||
} |
||||
|
||||
void Build(int n) { |
||||
std::string key_space, value_space; |
||||
WriteBatch batch; |
||||
for (int i = 0; i < n; i++) { |
||||
//if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
|
||||
Slice key = Key(i, &key_space); |
||||
batch.Clear(); |
||||
batch.Put(key, Value(i, &value_space)); |
||||
ASSERT_OK(db_->Write(WriteOptions(), &batch)); |
||||
} |
||||
} |
||||
|
||||
void Check(int min_expected, int max_expected) { |
||||
int next_expected = 0; |
||||
int missed = 0; |
||||
int bad_keys = 0; |
||||
int bad_values = 0; |
||||
int correct = 0; |
||||
std::string value_space; |
||||
Iterator* iter = db_->NewIterator(ReadOptions()); |
||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { |
||||
uint64_t key; |
||||
Slice in(iter->key()); |
||||
if (!ConsumeDecimalNumber(&in, &key) || |
||||
!in.empty() || |
||||
key < next_expected) { |
||||
bad_keys++; |
||||
continue; |
||||
} |
||||
missed += (key - next_expected); |
||||
next_expected = key + 1; |
||||
if (iter->value() != Value(key, &value_space)) { |
||||
bad_values++; |
||||
} else { |
||||
correct++; |
||||
} |
||||
} |
||||
delete iter; |
||||
|
||||
fprintf(stderr, |
||||
"expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n", |
||||
min_expected, max_expected, correct, bad_keys, bad_values, missed); |
||||
ASSERT_LE(min_expected, correct); |
||||
ASSERT_GE(max_expected, correct); |
||||
} |
||||
|
||||
void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { |
||||
// Pick file to corrupt
|
||||
std::vector<std::string> filenames; |
||||
ASSERT_OK(env_.GetChildren(dbname_, &filenames)); |
||||
uint64_t number; |
||||
LargeValueRef large_ref; |
||||
FileType type; |
||||
std::vector<std::string> candidates; |
||||
for (int i = 0; i < filenames.size(); i++) { |
||||
if (ParseFileName(filenames[i], &number, &large_ref, &type) && |
||||
type == filetype) { |
||||
candidates.push_back(dbname_ + "/" + filenames[i]); |
||||
} |
||||
} |
||||
ASSERT_TRUE(!candidates.empty()) << filetype; |
||||
std::string fname = candidates[rnd_.Uniform(candidates.size())]; |
||||
|
||||
struct stat sbuf; |
||||
if (stat(fname.c_str(), &sbuf) != 0) {
|
||||
const char* msg = strerror(errno);
|
||||
ASSERT_TRUE(false) << fname << ": " << msg;
|
||||
}
|
||||
|
||||
if (offset < 0) {
|
||||
// Relative to end of file; make it absolute
|
||||
if (-offset > sbuf.st_size) {
|
||||
offset = 0;
|
||||
} else {
|
||||
offset = sbuf.st_size + offset;
|
||||
} |
||||
} |
||||
if (offset > sbuf.st_size) { |
||||
offset = sbuf.st_size; |
||||
} |
||||
if (offset + bytes_to_corrupt > sbuf.st_size) { |
||||
bytes_to_corrupt = sbuf.st_size - offset; |
||||
} |
||||
|
||||
// Do it
|
||||
std::string contents; |
||||
Status s = ReadFileToString(Env::Default(), fname, &contents); |
||||
ASSERT_TRUE(s.ok()) << s.ToString(); |
||||
for (int i = 0; i < bytes_to_corrupt; i++) { |
||||
contents[i + offset] ^= 0x80; |
||||
} |
||||
s = WriteStringToFile(Env::Default(), contents, fname); |
||||
ASSERT_TRUE(s.ok()) << s.ToString(); |
||||
} |
||||
|
||||
uint64_t Property(const std::string& name) { |
||||
uint64_t result; |
||||
if (!db_->GetProperty(name, &result)) { |
||||
result = ~static_cast<uint64_t>(0); |
||||
} |
||||
return result; |
||||
} |
||||
|
||||
// Return the ith key
|
||||
Slice Key(int i, std::string* storage) { |
||||
char buf[100]; |
||||
snprintf(buf, sizeof(buf), "%016d", i); |
||||
storage->assign(buf, strlen(buf)); |
||||
return Slice(*storage); |
||||
} |
||||
|
||||
// Return the value to associate with the specified key
|
||||
Slice Value(int k, std::string* storage) { |
||||
Random r(k); |
||||
return test::RandomString(&r, kValueSize, storage); |
||||
} |
||||
}; |
||||
|
||||
// Corrupting two log records should cost exactly those two entries
// after recovery.
TEST(CorruptionTest, Recovery) {
  Build(10);
  Check(10, 10);
  Corrupt(kLogFile, 19, 1);            // WriteBatch tag of the first record
  Corrupt(kLogFile, 2*kValueSize, 1);  // Somewhere in second log record?
  Reopen();
  Check(8, 8);
}
||||
|
||||
// Opening the DB must fail when the env cannot create writable files.
TEST(CorruptionTest, RecoverWriteError) {
  env_.writable_file_error_ = true;
  Status status = TryReopen();
  ASSERT_TRUE(!status.ok());
}
||||
|
||||
TEST(CorruptionTest, NewFileErrorDuringWrite) {
  // Do enough writing to force minor compaction
  env_.writable_file_error_ = true;
  const int writes = 3 + (Options().write_buffer_size / kValueSize);
  std::string value_storage;
  Status status;
  for (int i = 0; status.ok() && i < writes; i++) {
    WriteBatch batch;
    batch.Put("a", Value(100, &value_storage));
    status = db_->Write(WriteOptions(), &batch);
  }
  // The injected file-creation failure must surface through Write().
  ASSERT_TRUE(!status.ok());
  ASSERT_GE(env_.num_writable_file_errors_, 1);
  env_.writable_file_error_ = false;
  Reopen();
}
||||
|
||||
// One corrupted byte in a table file should lose at most one entry.
TEST(CorruptionTest, TableFile) {
  Build(100);
  DBImpl* impl = reinterpret_cast<DBImpl*>(db_);
  impl->TEST_CompactMemTable();
  impl->TEST_CompactRange(0, "", "~");
  impl->TEST_CompactRange(1, "", "~");

  Corrupt(kTableFile, 100, 1);
  Check(99, 99);
}
||||
|
||||
TEST(CorruptionTest, TableFileIndexData) {
  Build(10000);  // Enough to build multiple Tables
  DBImpl* impl = reinterpret_cast<DBImpl*>(db_);
  impl->TEST_CompactMemTable();
  impl->TEST_CompactRange(0, "", "~");
  impl->TEST_CompactRange(1, "", "~");

  // Damage the tail of a table (index/footer region) and reopen; a large
  // but not total fraction of entries should survive.
  Corrupt(kTableFile, -1000, 500);
  Reopen();
  Check(5000, 9999);
}
||||
|
||||
// RepairDB should be able to rebuild the descriptor from the data files
// without losing any entries.
TEST(CorruptionTest, MissingDescriptor) {
  Build(1000);
  RepairDB();
  Reopen();
  Check(1000, 1000);
}
||||
|
||||
TEST(CorruptionTest, SequenceNumberRecovery) {
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
  RepairDB();
  Reopen();
  std::string val;
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &val));
  ASSERT_EQ("v5", val);
  // Write something.  If sequence number was not recovered properly,
  // it will be hidden by an earlier write.
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &val));
  ASSERT_EQ("v6", val);
  Reopen();
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &val));
  ASSERT_EQ("v6", val);
}
||||
|
||||
// Values above the large-value threshold must survive a RepairDB and
// subsequent reopens.
TEST(CorruptionTest, LargeValueRecovery) {
  Options options;
  options.large_value_threshold = 10000;
  Reopen(&options);

  Random rnd(301);
  std::string large_value;
  ASSERT_OK(db_->Put(WriteOptions(),
                     "foo", test::RandomString(&rnd, 100000, &large_value)));
  std::string fetched;
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &fetched));
  ASSERT_EQ(large_value, fetched);

  RepairDB();
  Reopen();
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &fetched));
  ASSERT_EQ(large_value, fetched);

  Reopen();
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &fetched));
  ASSERT_EQ(large_value, fetched);
}
||||
|
||||
TEST(CorruptionTest, CorruptedDescriptor) {
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
  DBImpl* impl = reinterpret_cast<DBImpl*>(db_);
  impl->TEST_CompactMemTable();
  impl->TEST_CompactRange(0, "", "~");

  // A trashed descriptor makes the DB unopenable ...
  Corrupt(kDescriptorFile, 0, 1000);
  Status status = TryReopen();
  ASSERT_TRUE(!status.ok());

  // ... until RepairDB rebuilds it.
  RepairDB();
  Reopen();
  std::string val;
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &val));
  ASSERT_EQ("hello", val);
}
||||
|
||||
TEST(CorruptionTest, CompactionInputError) {
  Build(10);
  DBImpl* impl = reinterpret_cast<DBImpl*>(db_);
  impl->TEST_CompactMemTable();
  ASSERT_EQ(1, Property("leveldb.num-files-at-level0"));

  Corrupt(kTableFile, 100, 1);
  Check(9, 9);

  // Force compactions by writing lots of values
  Build(10000);
  Check(10000, 10000);
  impl->TEST_CompactRange(0, "", "~");
  ASSERT_EQ(0, Property("leveldb.num-files-at-level0"));
}
||||
|
||||
TEST(CorruptionTest, CompactionInputErrorParanoid) {
  Options options;
  options.paranoid_checks = true;
  Reopen(&options);

  Build(10);
  DBImpl* impl = reinterpret_cast<DBImpl*>(db_);
  impl->TEST_CompactMemTable();
  ASSERT_EQ(1, Property("leveldb.num-files-at-level0"));

  Corrupt(kTableFile, 100, 1);
  Check(9, 9);

  // Write must eventually fail because of corrupted table
  Status status;
  std::string key_storage, value_storage;
  for (int i = 0; i < 10000 && status.ok(); i++) {
    status = db_->Put(WriteOptions(), Key(i, &key_storage),
                      Value(i, &value_storage));
  }
  ASSERT_TRUE(!status.ok()) << "write did not fail in corrupted paranoid db";
}
||||
|
||||
// Corruption in one table must not affect reads/writes of keys that
// live outside the damaged entry.
TEST(CorruptionTest, UnrelatedKeys) {
  Build(10);
  DBImpl* impl = reinterpret_cast<DBImpl*>(db_);
  impl->TEST_CompactMemTable();
  Corrupt(kTableFile, 100, 1);

  std::string key_storage, value_storage;
  ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &key_storage),
                     Value(1000, &value_storage)));
  std::string fetched;
  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &key_storage), &fetched));
  ASSERT_EQ(Value(1000, &value_storage).ToString(), fetched);
  impl->TEST_CompactMemTable();
  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &key_storage), &fetched));
  ASSERT_EQ(Value(1000, &value_storage).ToString(), fetched);
}
||||
|
||||
} |
||||
|
||||
int main(int argc, char** argv) { |
||||
return leveldb::test::RunAllTests(); |
||||
} |
@ -0,0 +1,376 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include <sys/types.h> |
||||
#include <stdio.h> |
||||
#include <stdlib.h> |
||||
#include "db/db_impl.h" |
||||
#include "db/version_set.h" |
||||
#include "include/cache.h" |
||||
#include "include/db.h" |
||||
#include "include/env.h" |
||||
#include "include/write_batch.h" |
||||
#include "util/histogram.h" |
||||
#include "util/random.h" |
||||
#include "util/testutil.h" |
||||
|
||||
// Comma-separated list of operations to run in the specified order
|
||||
// Actual benchmarks:
|
||||
// writeseq -- write N values in sequential key order
|
||||
// writerandom -- write N values in random key order
|
||||
// writebig -- write N/1000 100K values in random order
|
||||
// readseq -- read N values sequentially
|
||||
// readrandom -- read N values in random order
|
||||
// Meta operations:
|
||||
// compact -- Compact the entire DB
|
||||
// heapprofile -- Dump a heap profile (if supported by this port)
|
||||
// sync -- switch to synchronous writes (not the default)
|
||||
// nosync -- switch to asynchronous writes (the default)
|
||||
// tenth -- divide N by 10 (i.e., following benchmarks are smaller)
|
||||
// normal -- reset N back to its normal value (1000000)
|
||||
static const char* FLAGS_benchmarks = |
||||
"writeseq," |
||||
"writeseq," |
||||
"writerandom," |
||||
"sync,tenth,tenth,writerandom,nosync,normal," |
||||
"readseq," |
||||
"readrandom," |
||||
"compact," |
||||
"readseq," |
||||
"readrandom," |
||||
"writebig"; |
||||
|
||||
// Number of key/values to place in database
|
||||
static int FLAGS_num = 1000000; |
||||
|
||||
// Size of each value
|
||||
static int FLAGS_value_size = 100; |
||||
|
||||
// Arrange to generate values that shrink to this fraction of
|
||||
// their original size after compression
|
||||
static double FLAGS_compression_ratio = 0.25; |
||||
|
||||
// Print histogram of operation timings
|
||||
static bool FLAGS_histogram = false; |
||||
|
||||
// Number of bytes to buffer in memtable before compacting
|
||||
static int FLAGS_write_buffer_size = 1 << 20; |
||||
|
||||
namespace leveldb { |
||||
|
||||
// Helper for quickly generating random data.
|
||||
namespace { |
||||
class RandomGenerator { |
||||
private: |
||||
std::string data_; |
||||
int pos_; |
||||
|
||||
public: |
||||
RandomGenerator() { |
||||
// We use a limited amount of data over and over again and ensure
|
||||
// that it is larger than the compression window (32KB), and also
|
||||
// large enough to serve all typical value sizes we want to write.
|
||||
Random rnd(301); |
||||
std::string piece; |
||||
while (data_.size() < 1048576) { |
||||
// Add a short fragment that is as compressible as specified
|
||||
// by FLAGS_compression_ratio.
|
||||
test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece); |
||||
data_.append(piece); |
||||
} |
||||
pos_ = 0; |
||||
} |
||||
|
||||
Slice Generate(int len) { |
||||
if (pos_ + len > data_.size()) { |
||||
pos_ = 0; |
||||
assert(len < data_.size()); |
||||
} |
||||
pos_ += len; |
||||
return Slice(data_.data() + pos_ - len, len); |
||||
} |
||||
}; |
||||
} |
||||
|
||||
class Benchmark { |
||||
private: |
||||
Cache* cache_; |
||||
DB* db_; |
||||
int num_; |
||||
bool sync_; |
||||
int heap_counter_; |
||||
double start_; |
||||
double last_op_finish_; |
||||
int64_t bytes_; |
||||
std::string message_; |
||||
Histogram hist_; |
||||
RandomGenerator gen_; |
||||
Random rand_; |
||||
|
||||
// State kept for progress messages
|
||||
int done_; |
||||
int next_report_; // When to report next
|
||||
|
||||
void Start() { |
||||
start_ = Env::Default()->NowMicros() * 1e-6; |
||||
bytes_ = 0; |
||||
message_.clear(); |
||||
last_op_finish_ = start_; |
||||
hist_.Clear(); |
||||
done_ = 0; |
||||
next_report_ = 100; |
||||
} |
||||
|
||||
void FinishedSingleOp() { |
||||
if (FLAGS_histogram) { |
||||
double now = Env::Default()->NowMicros() * 1e-6; |
||||
double micros = (now - last_op_finish_) * 1e6; |
||||
hist_.Add(micros); |
||||
if (micros > 20000) { |
||||
fprintf(stderr, "long op: %.1f micros%30s\r", micros, ""); |
||||
fflush(stderr); |
||||
} |
||||
last_op_finish_ = now; |
||||
} |
||||
|
||||
done_++; |
||||
if (done_ >= next_report_) { |
||||
if (next_report_ < 1000) { |
||||
next_report_ += 100; |
||||
} else if (next_report_ < 10000) { |
||||
next_report_ += 1000; |
||||
} else if (next_report_ < 100000) { |
||||
next_report_ += 10000; |
||||
} else { |
||||
next_report_ += 100000; |
||||
} |
||||
fprintf(stderr, "... finished %d ops%30s\r", done_, ""); |
||||
fflush(stderr); |
||||
} |
||||
} |
||||
|
||||
void Stop(const Slice& name) { |
||||
double finish = Env::Default()->NowMicros() * 1e-6; |
||||
|
||||
// Pretend at least one op was done in case we are running a benchmark
|
||||
// that does nto call FinishedSingleOp().
|
||||
if (done_ < 1) done_ = 1; |
||||
|
||||
if (bytes_ > 0) { |
||||
char rate[100]; |
||||
snprintf(rate, sizeof(rate), "%5.1f MB/s", |
||||
(bytes_ / 1048576.0) / (finish - start_)); |
||||
if (!message_.empty()) { |
||||
message_.push_back(' '); |
||||
} |
||||
message_.append(rate); |
||||
} |
||||
|
||||
fprintf(stdout, "%-12s : %10.3f micros/op;%s%s\n", |
||||
name.ToString().c_str(), |
||||
(finish - start_) * 1e6 / done_, |
||||
(message_.empty() ? "" : " "), |
||||
message_.c_str()); |
||||
if (FLAGS_histogram) { |
||||
fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str()); |
||||
} |
||||
fflush(stdout); |
||||
} |
||||
|
||||
public: |
||||
enum Order { SEQUENTIAL, RANDOM }; |
||||
|
||||
Benchmark() : cache_(NewLRUCache(200<<20)), |
||||
db_(NULL), |
||||
num_(FLAGS_num), |
||||
sync_(false), |
||||
heap_counter_(0), |
||||
bytes_(0), |
||||
rand_(301) { |
||||
std::vector<std::string> files; |
||||
Env::Default()->GetChildren("/tmp/dbbench", &files); |
||||
for (int i = 0; i < files.size(); i++) { |
||||
if (Slice(files[i]).starts_with("heap-")) { |
||||
Env::Default()->DeleteFile("/tmp/dbbench/" + files[i]); |
||||
} |
||||
} |
||||
DestroyDB("/tmp/dbbench", Options()); |
||||
} |
||||
|
||||
~Benchmark() { |
||||
delete db_; |
||||
delete cache_; |
||||
} |
||||
|
||||
void Run() { |
||||
Options options; |
||||
options.create_if_missing = true; |
||||
options.max_open_files = 10000; |
||||
options.block_cache = cache_; |
||||
options.write_buffer_size = FLAGS_write_buffer_size; |
||||
|
||||
Start(); |
||||
Status s = DB::Open(options, "/tmp/dbbench", &db_); |
||||
Stop("open"); |
||||
if (!s.ok()) { |
||||
fprintf(stderr, "open error: %s\n", s.ToString().c_str()); |
||||
exit(1); |
||||
} |
||||
|
||||
const char* benchmarks = FLAGS_benchmarks; |
||||
while (benchmarks != NULL) { |
||||
const char* sep = strchr(benchmarks, ','); |
||||
Slice name; |
||||
if (sep == NULL) { |
||||
name = benchmarks; |
||||
benchmarks = NULL; |
||||
} else { |
||||
name = Slice(benchmarks, sep - benchmarks); |
||||
benchmarks = sep + 1; |
||||
} |
||||
|
||||
Start(); |
||||
if (name == Slice("writeseq")) { |
||||
Write(SEQUENTIAL, num_, FLAGS_value_size); |
||||
} else if (name == Slice("writerandom")) { |
||||
Write(RANDOM, num_, FLAGS_value_size); |
||||
} else if (name == Slice("writebig")) { |
||||
Write(RANDOM, num_ / 1000, 100 * 1000); |
||||
} else if (name == Slice("readseq")) { |
||||
Read(SEQUENTIAL); |
||||
} else if (name == Slice("readrandom")) { |
||||
Read(RANDOM); |
||||
} else if (name == Slice("compact")) { |
||||
Compact(); |
||||
} else if (name == Slice("heapprofile")) { |
||||
HeapProfile(); |
||||
} else if (name == Slice("sync")) { |
||||
sync_ = true; |
||||
} else if (name == Slice("nosync")) { |
||||
sync_ = false; |
||||
} else if (name == Slice("tenth")) { |
||||
num_ = num_ / 10; |
||||
} else if (name == Slice("normal")) { |
||||
num_ = FLAGS_num; |
||||
} else { |
||||
fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); |
||||
} |
||||
Stop(name); |
||||
} |
||||
} |
||||
|
||||
void Write(Order order, int num_entries, int value_size) { |
||||
WriteBatch batch; |
||||
Status s; |
||||
std::string val; |
||||
WriteOptions options; |
||||
options.sync = sync_; |
||||
for (int i = 0; i < num_entries; i++) { |
||||
const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num); |
||||
char key[100]; |
||||
snprintf(key, sizeof(key), "%012d", k); |
||||
batch.Clear(); |
||||
batch.Put(key, gen_.Generate(value_size)); |
||||
s = db_->Write(options, &batch); |
||||
bytes_ += value_size + strlen(key); |
||||
if (!s.ok()) { |
||||
fprintf(stderr, "put error: %s\n", s.ToString().c_str()); |
||||
exit(1); |
||||
} |
||||
FinishedSingleOp(); |
||||
} |
||||
} |
||||
|
||||
void Read(Order order) { |
||||
ReadOptions options; |
||||
if (order == SEQUENTIAL) { |
||||
Iterator* iter = db_->NewIterator(options); |
||||
int i = 0; |
||||
for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) { |
||||
bytes_ += iter->key().size() + iter->value().size(); |
||||
FinishedSingleOp(); |
||||
++i; |
||||
} |
||||
delete iter; |
||||
} else { |
||||
std::string value; |
||||
for (int i = 0; i < num_; i++) { |
||||
char key[100]; |
||||
const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num); |
||||
snprintf(key, sizeof(key), "%012d", k); |
||||
db_->Get(options, key, &value); |
||||
FinishedSingleOp(); |
||||
} |
||||
} |
||||
} |
||||
|
||||
void Compact() { |
||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); |
||||
dbi->TEST_CompactMemTable(); |
||||
int max_level_with_files = 1; |
||||
for (int level = 1; level < config::kNumLevels; level++) { |
||||
uint64_t v; |
||||
char name[100]; |
||||
snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level); |
||||
if (db_->GetProperty(name, &v) && v > 0) { |
||||
max_level_with_files = level; |
||||
} |
||||
} |
||||
for (int level = 0; level < max_level_with_files; level++) { |
||||
dbi->TEST_CompactRange(level, "", "~"); |
||||
} |
||||
} |
||||
|
||||
static void WriteToFile(void* arg, const char* buf, int n) { |
||||
reinterpret_cast<WritableFile*>(arg)->Append(Slice(buf, n)); |
||||
} |
||||
|
||||
void HeapProfile() { |
||||
char fname[100]; |
||||
snprintf(fname, sizeof(fname), "/tmp/dbbench/heap-%04d", ++heap_counter_); |
||||
WritableFile* file; |
||||
Status s = Env::Default()->NewWritableFile(fname, &file); |
||||
if (!s.ok()) { |
||||
message_ = s.ToString(); |
||||
return; |
||||
} |
||||
bool ok = port::GetHeapProfile(WriteToFile, file); |
||||
delete file; |
||||
if (!ok) { |
||||
message_ = "not supported"; |
||||
Env::Default()->DeleteFile(fname); |
||||
} |
||||
} |
||||
}; |
||||
|
||||
} |
||||
|
||||
int main(int argc, char** argv) { |
||||
for (int i = 1; i < argc; i++) { |
||||
double d; |
||||
int n; |
||||
char junk; |
||||
if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) { |
||||
FLAGS_benchmarks = argv[i] + strlen("--benchmarks="); |
||||
} else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) { |
||||
FLAGS_compression_ratio = d; |
||||
} else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 && |
||||
(n == 0 || n == 1)) { |
||||
FLAGS_histogram = n; |
||||
} else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) { |
||||
FLAGS_num = n; |
||||
} else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) { |
||||
FLAGS_value_size = n; |
||||
} else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) { |
||||
FLAGS_write_buffer_size = n; |
||||
} else { |
||||
fprintf(stderr, "Invalid flag '%s'\n", argv[i]); |
||||
exit(1); |
||||
} |
||||
} |
||||
|
||||
leveldb::Benchmark benchmark; |
||||
benchmark.Run(); |
||||
return 0; |
||||
} |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,192 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_DB_DB_IMPL_H_ |
||||
#define STORAGE_LEVELDB_DB_DB_IMPL_H_ |
||||
|
||||
#include <set> |
||||
#include "db/dbformat.h" |
||||
#include "db/log_writer.h" |
||||
#include "db/snapshot.h" |
||||
#include "include/db.h" |
||||
#include "include/env.h" |
||||
#include "port/port.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
class MemTable; |
||||
class TableCache; |
||||
class Version; |
||||
class VersionEdit; |
||||
class VersionSet; |
||||
|
||||
class DBImpl : public DB { |
||||
public: |
||||
DBImpl(const Options& options, const std::string& dbname); |
||||
virtual ~DBImpl(); |
||||
|
||||
// Implementations of the DB interface
|
||||
virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value); |
||||
virtual Status Delete(const WriteOptions&, const Slice& key); |
||||
virtual Status Write(const WriteOptions& options, WriteBatch* updates); |
||||
virtual Status Get(const ReadOptions& options, |
||||
const Slice& key, |
||||
std::string* value); |
||||
virtual Iterator* NewIterator(const ReadOptions&); |
||||
virtual const Snapshot* GetSnapshot(); |
||||
virtual void ReleaseSnapshot(const Snapshot* snapshot); |
||||
virtual bool GetProperty(const Slice& property, uint64_t* value); |
||||
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes); |
||||
|
||||
// Extra methods (for testing) that are not in the public DB interface
|
||||
|
||||
// Compact any files in the named level that overlap [begin,end]
|
||||
void TEST_CompactRange( |
||||
int level, |
||||
const std::string& begin, |
||||
const std::string& end); |
||||
|
||||
// Force current memtable contents to be compacted.
|
||||
Status TEST_CompactMemTable(); |
||||
|
||||
// Return an internal iterator over the current state of the database.
|
||||
// The keys of this iterator are internal keys (see format.h).
|
||||
// The returned iterator should be deleted when no longer needed.
|
||||
Iterator* TEST_NewInternalIterator(); |
||||
|
||||
private: |
||||
friend class DB; |
||||
|
||||
Iterator* NewInternalIterator(const ReadOptions&, |
||||
SequenceNumber* latest_snapshot); |
||||
|
||||
Status NewDB(); |
||||
|
||||
// Recover the descriptor from persistent storage. May do a significant
|
||||
// amount of work to recover recently logged updates. Any changes to
|
||||
// be made to the descriptor are added to *edit.
|
||||
Status Recover(VersionEdit* edit); |
||||
|
||||
// Apply the specified updates and save the resulting descriptor to
|
||||
// persistent storage. If cleanup_mem is non-NULL, arrange to
|
||||
// delete it when all existing snapshots have gone away iff Install()
|
||||
// returns OK.
|
||||
Status Install(VersionEdit* edit, |
||||
uint64_t new_log_number, |
||||
MemTable* cleanup_mem); |
||||
|
||||
void MaybeIgnoreError(Status* s) const; |
||||
|
||||
// Delete any unneeded files and stale in-memory entries.
|
||||
void DeleteObsoleteFiles(); |
||||
|
||||
// Called when an iterator over a particular version of the
|
||||
// descriptor goes away.
|
||||
static void Unref(void* arg1, void* arg2); |
||||
|
||||
// Compact the in-memory write buffer to disk. Switches to a new
|
||||
// log-file/memtable and writes a new descriptor iff successful.
|
||||
Status CompactMemTable(); |
||||
|
||||
Status RecoverLogFile(uint64_t log_number, |
||||
VersionEdit* edit, |
||||
SequenceNumber* max_sequence); |
||||
|
||||
Status WriteLevel0Table(MemTable* mem, VersionEdit* edit); |
||||
|
||||
bool HasLargeValues(const WriteBatch& batch) const; |
||||
|
||||
// Process data in "*updates" and return a status. "assigned_seq"
|
||||
// is the sequence number assigned to the first mod in "*updates".
|
||||
// If no large values are encountered, "*final" is set to "updates".
|
||||
// If large values were encountered, registers the references of the
|
||||
// large values with the VersionSet, writes the large values to
|
||||
// files (if appropriate), and allocates a new WriteBatch with the
|
||||
// large values replaced with indirect references and stores a
|
||||
// pointer to the new WriteBatch in *final. If *final != updates on
|
||||
// return, then the client should delete *final when no longer
|
||||
// needed. Returns OK on success, and an appropriate error
|
||||
// otherwise.
|
||||
Status HandleLargeValues(SequenceNumber assigned_seq, |
||||
WriteBatch* updates, |
||||
WriteBatch** final); |
||||
|
||||
// Helper routine for HandleLargeValues
|
||||
void MaybeCompressLargeValue( |
||||
const Slice& raw_value, |
||||
Slice* file_bytes, |
||||
std::string* scratch, |
||||
LargeValueRef* ref); |
||||
|
||||
struct CompactionState; |
||||
|
||||
void MaybeScheduleCompaction(); |
||||
static void BGWork(void* db); |
||||
void BackgroundCall(); |
||||
void BackgroundCompaction(); |
||||
void CleanupCompaction(CompactionState* compact); |
||||
Status DoCompactionWork(CompactionState* compact); |
||||
|
||||
Status OpenCompactionOutputFile(CompactionState* compact); |
||||
Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input); |
||||
Status InstallCompactionResults(CompactionState* compact); |
||||
|
||||
// Constant after construction
|
||||
Env* const env_; |
||||
const InternalKeyComparator internal_comparator_; |
||||
const Options options_; // options_.comparator == &internal_comparator_
|
||||
bool owns_info_log_; |
||||
const std::string dbname_; |
||||
|
||||
// table_cache_ provides its own synchronization
|
||||
TableCache* table_cache_; |
||||
|
||||
// Lock over the persistent DB state. Non-NULL iff successfully acquired.
|
||||
FileLock* db_lock_; |
||||
|
||||
// State below is protected by mutex_
|
||||
port::Mutex mutex_; |
||||
port::AtomicPointer shutting_down_; |
||||
port::CondVar bg_cv_; // Signalled when !bg_compaction_scheduled_
|
||||
port::CondVar compacting_cv_; // Signalled when !compacting_
|
||||
SequenceNumber last_sequence_; |
||||
MemTable* mem_; |
||||
WritableFile* logfile_; |
||||
log::Writer* log_; |
||||
uint64_t log_number_; |
||||
SnapshotList snapshots_; |
||||
|
||||
// Set of table files to protect from deletion because they are
|
||||
// part of ongoing compactions.
|
||||
std::set<uint64_t> pending_outputs_; |
||||
|
||||
// Has a background compaction been scheduled or is running?
|
||||
bool bg_compaction_scheduled_; |
||||
|
||||
// Is there a compaction running?
|
||||
bool compacting_; |
||||
|
||||
VersionSet* versions_; |
||||
|
||||
// Have we encountered a background error in paranoid mode?
|
||||
Status bg_error_; |
||||
|
||||
// No copying allowed
|
||||
DBImpl(const DBImpl&); |
||||
void operator=(const DBImpl&); |
||||
|
||||
const Comparator* user_comparator() const { |
||||
return internal_comparator_.user_comparator(); |
||||
} |
||||
}; |
||||
|
||||
// Sanitize db options. The caller should delete result.info_log if
|
||||
// it is not equal to src.info_log.
|
||||
extern Options SanitizeOptions(const std::string& db, |
||||
const InternalKeyComparator* icmp, |
||||
const Options& src); |
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_DB_DB_IMPL_H_
|
@ -0,0 +1,412 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/db_iter.h" |
||||
|
||||
#include "db/filename.h" |
||||
#include "db/dbformat.h" |
||||
#include "include/env.h" |
||||
#include "include/iterator.h" |
||||
#include "port/port.h" |
||||
#include "util/logging.h" |
||||
#include "util/mutexlock.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
#if 0 |
||||
static void DumpInternalIter(Iterator* iter) { |
||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { |
||||
ParsedInternalKey k; |
||||
if (!ParseInternalKey(iter->key(), &k)) { |
||||
fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str()); |
||||
} else { |
||||
fprintf(stderr, "@ '%s'\n", k.DebugString().c_str()); |
||||
} |
||||
} |
||||
} |
||||
#endif |
||||
|
||||
namespace { |
||||
|
||||
// Memtables and sstables that make the DB representation contain
|
||||
// (userkey,seq,type) => uservalue entries. DBIter
|
||||
// combines multiple entries for the same userkey found in the DB
|
||||
// representation into a single entry while accounting for sequence
|
||||
// numbers, deletion markers, overwrites, etc.
|
||||
class DBIter: public Iterator { |
||||
public: |
||||
DBIter(const std::string* dbname, Env* env, |
||||
const Comparator* cmp, Iterator* iter, SequenceNumber s) |
||||
: dbname_(dbname), |
||||
env_(env), |
||||
user_comparator_(cmp), |
||||
iter_(iter), |
||||
sequence_(s), |
||||
large_(NULL), |
||||
valid_(false) { |
||||
} |
||||
virtual ~DBIter() { |
||||
delete iter_; |
||||
delete large_; |
||||
} |
||||
virtual bool Valid() const { return valid_; } |
||||
virtual Slice key() const { |
||||
assert(valid_); |
||||
return key_; |
||||
} |
||||
virtual Slice value() const { |
||||
assert(valid_); |
||||
if (large_ == NULL) { |
||||
return value_; |
||||
} else { |
||||
MutexLock l(&large_->mutex); |
||||
if (!large_->produced) { |
||||
ReadIndirectValue(); |
||||
} |
||||
return large_->value; |
||||
} |
||||
} |
||||
|
||||
virtual void Next() { |
||||
assert(valid_); |
||||
// iter_ is already positioned past DBIter::key()
|
||||
FindNextUserEntry(); |
||||
} |
||||
|
||||
virtual void Prev() { |
||||
assert(valid_); |
||||
bool ignored; |
||||
ScanUntilBeforeCurrentKey(&ignored); |
||||
FindPrevUserEntry(); |
||||
} |
||||
|
||||
virtual void Seek(const Slice& target) { |
||||
ParsedInternalKey ikey(target, sequence_, kValueTypeForSeek); |
||||
std::string tmp; |
||||
AppendInternalKey(&tmp, ikey); |
||||
iter_->Seek(tmp); |
||||
FindNextUserEntry(); |
||||
} |
||||
virtual void SeekToFirst() { |
||||
iter_->SeekToFirst(); |
||||
FindNextUserEntry(); |
||||
} |
||||
|
||||
virtual void SeekToLast(); |
||||
|
||||
virtual Status status() const { |
||||
if (status_.ok()) { |
||||
if (large_ != NULL && !large_->status.ok()) return large_->status; |
||||
return iter_->status(); |
||||
} else { |
||||
return status_; |
||||
} |
||||
} |
||||
|
||||
private: |
||||
void FindNextUserEntry(); |
||||
void FindPrevUserEntry(); |
||||
void SaveKey(const Slice& k) { key_.assign(k.data(), k.size()); } |
||||
void SaveValue(const Slice& v) { |
||||
if (value_.capacity() > v.size() + 1048576) { |
||||
std::string empty; |
||||
swap(empty, value_); |
||||
} |
||||
value_.assign(v.data(), v.size()); |
||||
} |
||||
bool ParseKey(ParsedInternalKey* key); |
||||
void SkipPast(const Slice& k); |
||||
void ScanUntilBeforeCurrentKey(bool* found_live); |
||||
|
||||
void ReadIndirectValue() const; |
||||
|
||||
struct Large { |
||||
port::Mutex mutex; |
||||
std::string value; |
||||
bool produced; |
||||
Status status; |
||||
}; |
||||
|
||||
const std::string* const dbname_; |
||||
Env* const env_; |
||||
|
||||
const Comparator* const user_comparator_; |
||||
|
||||
// iter_ is positioned just past current entry for DBIter if valid_
|
||||
Iterator* const iter_; |
||||
|
||||
SequenceNumber const sequence_; |
||||
Status status_; |
||||
std::string key_; // Always a user key
|
||||
std::string value_; |
||||
Large* large_; // Non-NULL if value is an indirect reference
|
||||
bool valid_; |
||||
|
||||
// No copying allowed
|
||||
DBIter(const DBIter&); |
||||
void operator=(const DBIter&); |
||||
}; |
||||
|
||||
inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { |
||||
if (!ParseInternalKey(iter_->key(), ikey)) { |
||||
status_ = Status::Corruption("corrupted internal key in DBIter"); |
||||
return false; |
||||
} else { |
||||
return true; |
||||
} |
||||
} |
||||
|
||||
void DBIter::FindNextUserEntry() { |
||||
if (large_ != NULL) { |
||||
if (status_.ok() && !large_->status.ok()) { |
||||
status_ = large_->status; |
||||
} |
||||
delete large_; |
||||
large_ = NULL; |
||||
} |
||||
while (iter_->Valid()) { |
||||
ParsedInternalKey ikey; |
||||
if (!ParseKey(&ikey)) { |
||||
// Skip past corrupted entry
|
||||
iter_->Next(); |
||||
continue; |
||||
} |
||||
if (ikey.sequence > sequence_) { |
||||
// Ignore entries newer than the snapshot
|
||||
iter_->Next(); |
||||
continue; |
||||
} |
||||
|
||||
switch (ikey.type) { |
||||
case kTypeDeletion: |
||||
SaveKey(ikey.user_key); // Make local copy for use by SkipPast()
|
||||
iter_->Next(); |
||||
SkipPast(key_); |
||||
// Do not return deleted entries. Instead keep looping.
|
||||
break; |
||||
|
||||
case kTypeValue: |
||||
SaveKey(ikey.user_key); |
||||
SaveValue(iter_->value()); |
||||
iter_->Next(); |
||||
SkipPast(key_); |
||||
// Yield the value we just found.
|
||||
valid_ = true; |
||||
return; |
||||
|
||||
case kTypeLargeValueRef: |
||||
SaveKey(ikey.user_key); |
||||
// Save the large value ref as value_, and read it lazily on a call
|
||||
// to value()
|
||||
SaveValue(iter_->value()); |
||||
large_ = new Large; |
||||
large_->produced = false; |
||||
iter_->Next(); |
||||
SkipPast(key_); |
||||
// Yield the value we just found.
|
||||
valid_ = true; |
||||
return; |
||||
} |
||||
} |
||||
valid_ = false; |
||||
key_.clear(); |
||||
value_.clear(); |
||||
assert(large_ == NULL); |
||||
} |
||||
|
||||
void DBIter::SkipPast(const Slice& k) { |
||||
while (iter_->Valid()) { |
||||
ParsedInternalKey ikey; |
||||
// Note that if we cannot parse an internal key, we keep looping
|
||||
// so that if we have a run like the following:
|
||||
// <x,100,v> => value100
|
||||
// <corrupted entry for user key x>
|
||||
// <x,50,v> => value50
|
||||
// we will skip over the corrupted entry as well as value50.
|
||||
if (ParseKey(&ikey) && user_comparator_->Compare(ikey.user_key, k) != 0) { |
||||
break; |
||||
} |
||||
iter_->Next(); |
||||
} |
||||
} |
||||
|
||||
void DBIter::SeekToLast() { |
||||
// Position iter_ at the last uncorrupted user key and then
|
||||
// let FindPrevUserEntry() do the heavy lifting to find
|
||||
// a user key that is live.
|
||||
iter_->SeekToLast(); |
||||
ParsedInternalKey current; |
||||
while (iter_->Valid() && !ParseKey(¤t)) { |
||||
iter_->Prev(); |
||||
} |
||||
if (iter_->Valid()) { |
||||
SaveKey(current.user_key); |
||||
} |
||||
FindPrevUserEntry(); |
||||
} |
||||
|
||||
// Let X be the user key at which iter_ is currently positioned.
|
||||
// Adjust DBIter to point at the last entry with a key <= X that
|
||||
// has a live value.
|
||||
void DBIter::FindPrevUserEntry() { |
||||
// Consider the following example:
|
||||
//
|
||||
// A@540
|
||||
// A@400
|
||||
//
|
||||
// B@300
|
||||
// B@200
|
||||
// B@100 <- iter_
|
||||
//
|
||||
// C@301
|
||||
// C@201
|
||||
//
|
||||
// The comments marked "(first iteration)" below relate what happens
|
||||
// for the preceding example in the first iteration of the while loop
|
||||
// below. There may be more than one iteration either if there are
|
||||
// no live values for B, or if there is a corruption.
|
||||
while (iter_->Valid()) { |
||||
std::string saved = key_; |
||||
bool found_live; |
||||
ScanUntilBeforeCurrentKey(&found_live); |
||||
// (first iteration) iter_ at A@400
|
||||
if (found_live) { |
||||
// Step forward into range of entries with user key >= saved
|
||||
if (!iter_->Valid()) { |
||||
iter_->SeekToFirst(); |
||||
} else { |
||||
iter_->Next(); |
||||
} |
||||
// (first iteration) iter_ at B@300
|
||||
|
||||
FindNextUserEntry(); // Sets key_ to the key of the next value it found
|
||||
if (valid_ && user_comparator_->Compare(key_, saved) == 0) { |
||||
// (first iteration) iter_ at C@301
|
||||
return; |
||||
} |
||||
|
||||
// FindNextUserEntry() could not find any entries under the
|
||||
// user key "saved". This is probably a corruption since
|
||||
// ScanUntilBefore(saved) found a live value. So we skip
|
||||
// backwards to an earlier key and ignore the corrupted
|
||||
// entries for "saved".
|
||||
//
|
||||
// (first iteration) iter_ at C@301 and saved == "B"
|
||||
key_ = saved; |
||||
bool ignored; |
||||
ScanUntilBeforeCurrentKey(&ignored); |
||||
// (first iteration) iter_ at A@400
|
||||
} |
||||
} |
||||
valid_ = false; |
||||
key_.clear(); |
||||
value_.clear(); |
||||
} |
||||
|
||||
void DBIter::ScanUntilBeforeCurrentKey(bool* found_live) { |
||||
*found_live = false; |
||||
if (!iter_->Valid()) { |
||||
iter_->SeekToLast(); |
||||
} |
||||
|
||||
while (iter_->Valid()) { |
||||
ParsedInternalKey current; |
||||
if (!ParseKey(¤t)) { |
||||
iter_->Prev(); |
||||
continue; |
||||
} |
||||
|
||||
if (current.sequence > sequence_) { |
||||
// Ignore entries that are serialized after this read
|
||||
iter_->Prev(); |
||||
continue; |
||||
} |
||||
|
||||
const int cmp = user_comparator_->Compare(current.user_key, key_); |
||||
if (cmp < 0) { |
||||
SaveKey(current.user_key); |
||||
return; |
||||
} else if (cmp == 0) { |
||||
switch (current.type) { |
||||
case kTypeDeletion: |
||||
*found_live = false; |
||||
break; |
||||
|
||||
case kTypeValue: |
||||
case kTypeLargeValueRef: |
||||
*found_live = true; |
||||
break; |
||||
} |
||||
} else { // cmp > 0
|
||||
*found_live = false; |
||||
} |
||||
|
||||
iter_->Prev(); |
||||
} |
||||
} |
||||
|
||||
void DBIter::ReadIndirectValue() const { |
||||
assert(!large_->produced); |
||||
large_->produced = true; |
||||
LargeValueRef large_ref; |
||||
if (value_.size() != LargeValueRef::ByteSize()) { |
||||
large_->status = Status::Corruption("malformed large value reference"); |
||||
return; |
||||
} |
||||
memcpy(large_ref.data, value_.data(), LargeValueRef::ByteSize()); |
||||
std::string fname = LargeValueFileName(*dbname_, large_ref); |
||||
RandomAccessFile* file; |
||||
Status s = env_->NewRandomAccessFile(fname, &file); |
||||
if (s.ok()) { |
||||
uint64_t file_size = file->Size(); |
||||
uint64_t value_size = large_ref.ValueSize(); |
||||
large_->value.resize(value_size); |
||||
Slice result; |
||||
s = file->Read(0, file_size, &result, |
||||
const_cast<char*>(large_->value.data())); |
||||
if (s.ok()) { |
||||
if (result.size() == file_size) { |
||||
switch (large_ref.compression_type()) { |
||||
case kNoCompression: { |
||||
if (result.data() != large_->value.data()) { |
||||
large_->value.assign(result.data(), result.size()); |
||||
} |
||||
break; |
||||
} |
||||
case kLightweightCompression: { |
||||
std::string uncompressed; |
||||
if (port::Lightweight_Uncompress(result.data(), result.size(), |
||||
&uncompressed) && |
||||
uncompressed.size() == large_ref.ValueSize()) { |
||||
swap(uncompressed, large_->value); |
||||
} else { |
||||
s = Status::Corruption( |
||||
"Unable to read entire compressed large value file"); |
||||
} |
||||
} |
||||
} |
||||
} else { |
||||
s = Status::Corruption("Unable to read entire large value file"); |
||||
} |
||||
} |
||||
delete file; // Ignore errors on closing
|
||||
} |
||||
if (!s.ok()) { |
||||
large_->value.clear(); |
||||
large_->status = s; |
||||
} |
||||
} |
||||
|
||||
} // anonymous namespace
|
||||
|
||||
Iterator* NewDBIterator( |
||||
const std::string* dbname, |
||||
Env* env, |
||||
const Comparator* user_key_comparator, |
||||
Iterator* internal_iter, |
||||
const SequenceNumber& sequence) { |
||||
return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence); |
||||
} |
||||
|
||||
} |
@ -0,0 +1,26 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_DB_DB_ITER_H_ |
||||
#define STORAGE_LEVELDB_DB_DB_ITER_H_ |
||||
|
||||
#include <stdint.h> |
||||
#include "include/db.h" |
||||
#include "db/dbformat.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
// Return a new iterator that converts internal keys (yielded by
|
||||
// "*internal_iter") that were live at the specified "sequence" number
|
||||
// into appropriate user keys.
|
||||
extern Iterator* NewDBIterator( |
||||
const std::string* dbname, |
||||
Env* env, |
||||
const Comparator* user_key_comparator, |
||||
Iterator* internal_iter, |
||||
const SequenceNumber& sequence); |
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_DB_DB_ITER_H_
|
@ -0,0 +1,963 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "include/db.h" |
||||
|
||||
#include "db/db_impl.h" |
||||
#include "db/filename.h" |
||||
#include "db/version_set.h" |
||||
#include "db/write_batch_internal.h" |
||||
#include "include/env.h" |
||||
#include "include/table.h" |
||||
#include "util/logging.h" |
||||
#include "util/testharness.h" |
||||
#include "util/testutil.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
static std::string RandomString(Random* rnd, int len) { |
||||
std::string r; |
||||
test::RandomString(rnd, len, &r); |
||||
return r; |
||||
} |
||||
|
||||
class DBTest { |
||||
public: |
||||
std::string dbname_; |
||||
Env* env_; |
||||
DB* db_; |
||||
|
||||
Options last_options_; |
||||
|
||||
DBTest() : env_(Env::Default()) { |
||||
dbname_ = test::TmpDir() + "/db_test"; |
||||
DestroyDB(dbname_, Options()); |
||||
db_ = NULL; |
||||
Reopen(); |
||||
} |
||||
|
||||
~DBTest() { |
||||
delete db_; |
||||
DestroyDB(dbname_, Options()); |
||||
} |
||||
|
||||
DBImpl* dbfull() { |
||||
return reinterpret_cast<DBImpl*>(db_); |
||||
} |
||||
|
||||
void Reopen(Options* options = NULL) { |
||||
ASSERT_OK(TryReopen(options)); |
||||
} |
||||
|
||||
void DestroyAndReopen(Options* options = NULL) { |
||||
delete db_; |
||||
db_ = NULL; |
||||
DestroyDB(dbname_, Options()); |
||||
ASSERT_OK(TryReopen(options)); |
||||
} |
||||
|
||||
Status TryReopen(Options* options) { |
||||
delete db_; |
||||
db_ = NULL; |
||||
Options opts; |
||||
if (options != NULL) { |
||||
opts = *options; |
||||
} else { |
||||
opts.create_if_missing = true; |
||||
} |
||||
last_options_ = opts; |
||||
|
||||
return DB::Open(opts, dbname_, &db_); |
||||
} |
||||
|
||||
Status Put(const std::string& k, const std::string& v) { |
||||
WriteBatch batch; |
||||
batch.Put(k, v); |
||||
return db_->Write(WriteOptions(), &batch); |
||||
} |
||||
|
||||
Status Delete(const std::string& k) { |
||||
WriteBatch batch; |
||||
batch.Delete(k); |
||||
return db_->Write(WriteOptions(), &batch); |
||||
} |
||||
|
||||
std::string Get(const std::string& k, const Snapshot* snapshot = NULL) { |
||||
ReadOptions options; |
||||
options.snapshot = snapshot; |
||||
std::string result; |
||||
Status s = db_->Get(options, k, &result); |
||||
if (s.IsNotFound()) { |
||||
result = "NOT_FOUND"; |
||||
} else if (!s.ok()) { |
||||
result = s.ToString(); |
||||
} |
||||
return result; |
||||
} |
||||
|
||||
std::string AllEntriesFor(const Slice& user_key) { |
||||
Iterator* iter = dbfull()->TEST_NewInternalIterator(); |
||||
InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); |
||||
iter->Seek(target.Encode()); |
||||
std::string result; |
||||
if (!iter->status().ok()) { |
||||
result = iter->status().ToString(); |
||||
} else { |
||||
result = "[ "; |
||||
bool first = true; |
||||
while (iter->Valid()) { |
||||
ParsedInternalKey ikey; |
||||
if (!ParseInternalKey(iter->key(), &ikey)) { |
||||
result += "CORRUPTED"; |
||||
} else { |
||||
if (last_options_.comparator->Compare( |
||||
ikey.user_key, user_key) != 0) { |
||||
break; |
||||
} |
||||
if (!first) { |
||||
result += ", "; |
||||
} |
||||
first = false; |
||||
switch (ikey.type) { |
||||
case kTypeValue: |
||||
result += iter->value().ToString(); |
||||
break; |
||||
case kTypeLargeValueRef: |
||||
result += "LARGEVALUE(" + EscapeString(iter->value()) + ")"; |
||||
break; |
||||
case kTypeDeletion: |
||||
result += "DEL"; |
||||
break; |
||||
} |
||||
} |
||||
iter->Next(); |
||||
} |
||||
if (!first) { |
||||
result += " "; |
||||
} |
||||
result += "]"; |
||||
} |
||||
delete iter; |
||||
return result; |
||||
} |
||||
|
||||
  // Returns the number of table files at the given level, as reported by
  // the "leveldb.num-files-at-level<N>" database property.
  int NumTableFilesAtLevel(int level) {
    uint64_t val;
    ASSERT_TRUE(
        db_->GetProperty("leveldb.num-files-at-level" + NumberToString(level),
                         &val));
    return val;
  }
||||
|
||||
  // Returns the approximate on-disk size of the key range [start, limit).
  uint64_t Size(const Slice& start, const Slice& limit) {
    Range r(start, limit);
    uint64_t size;
    db_->GetApproximateSizes(&r, 1, &size);
    return size;
  }
||||
|
||||
std::set<LargeValueRef> LargeValueFiles() const { |
||||
// Return the set of large value files that exist in the database
|
||||
std::vector<std::string> filenames; |
||||
env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose
|
||||
uint64_t number; |
||||
LargeValueRef large_ref; |
||||
FileType type; |
||||
std::set<LargeValueRef> live; |
||||
for (int i = 0; i < filenames.size(); i++) { |
||||
if (ParseFileName(filenames[i], &number, &large_ref, &type) && |
||||
type == kLargeValueFile) { |
||||
fprintf(stderr, " live: %s\n", |
||||
LargeValueRefToFilenameString(large_ref).c_str()); |
||||
live.insert(large_ref); |
||||
} |
||||
} |
||||
fprintf(stderr, "Found %d live large value files\n", (int)live.size()); |
||||
return live; |
||||
} |
||||
}; |
||||
|
||||
// A freshly created database opens successfully and contains no keys.
TEST(DBTest, Empty) {
  ASSERT_TRUE(db_ != NULL);
  ASSERT_EQ("NOT_FOUND", Get("foo"));
}
||||
|
||||
// Basic put/get: later writes win, and unrelated keys are unaffected.
TEST(DBTest, ReadWrite) {
  ASSERT_OK(Put("foo", "v1"));
  ASSERT_EQ("v1", Get("foo"));
  ASSERT_OK(Put("bar", "v2"));
  ASSERT_OK(Put("foo", "v3"));
  ASSERT_EQ("v3", Get("foo"));
  ASSERT_EQ("v2", Get("bar"));
}
||||
|
||||
// Put, overwrite, then delete through the raw DB interface; the key must
// read back as missing afterwards.
TEST(DBTest, PutDeleteGet) {
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
  ASSERT_EQ("v1", Get("foo"));
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
  ASSERT_EQ("v2", Get("foo"));
  ASSERT_OK(db_->Delete(WriteOptions(), "foo"));
  ASSERT_EQ("NOT_FOUND", Get("foo"));
}
||||
|
||||
TEST(DBTest, Recover) { |
||||
ASSERT_OK(Put("foo", "v1")); |
||||
ASSERT_OK(Put("baz", "v5")); |
||||
|
||||
Reopen(); |
||||
ASSERT_EQ("v1", Get("foo")); |
||||
|
||||
ASSERT_EQ("v1", Get("foo")); |
||||
ASSERT_EQ("v5", Get("baz")); |
||||
ASSERT_OK(Put("bar", "v2")); |
||||
ASSERT_OK(Put("foo", "v3")); |
||||
|
||||
Reopen(); |
||||
ASSERT_EQ("v3", Get("foo")); |
||||
ASSERT_OK(Put("foo", "v4")); |
||||
ASSERT_EQ("v4", Get("foo")); |
||||
ASSERT_EQ("v2", Get("bar")); |
||||
ASSERT_EQ("v5", Get("baz")); |
||||
} |
||||
|
||||
// Reopening twice in a row (so the second recovery sees an empty log)
// must not lose data or prevent further writes.
TEST(DBTest, RecoveryWithEmptyLog) {
  ASSERT_OK(Put("foo", "v1"));
  ASSERT_OK(Put("foo", "v2"));
  Reopen();
  Reopen();
  ASSERT_OK(Put("foo", "v3"));
  Reopen();
  ASSERT_EQ("v3", Get("foo"));
}
||||
|
||||
// Formats i as a fixed-width test key of the form "key%06d".
static std::string Key(int i) {
  char formatted[100];
  snprintf(formatted, sizeof(formatted), "key%06d", i);
  return std::string(formatted);
}
||||
|
||||
TEST(DBTest, MinorCompactionsHappen) { |
||||
Options options; |
||||
options.write_buffer_size = 10000; |
||||
Reopen(&options); |
||||
|
||||
const int N = 100; |
||||
|
||||
int starting_num_tables = NumTableFilesAtLevel(0); |
||||
for (int i = 0; i < N; i++) { |
||||
ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v'))); |
||||
} |
||||
int ending_num_tables = NumTableFilesAtLevel(0); |
||||
ASSERT_GT(ending_num_tables, starting_num_tables); |
||||
|
||||
for (int i = 0; i < N; i++) { |
||||
ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); |
||||
} |
||||
|
||||
Reopen(); |
||||
|
||||
for (int i = 0; i < N; i++) { |
||||
ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); |
||||
} |
||||
} |
||||
|
||||
TEST(DBTest, RecoverWithLargeLog) { |
||||
{ |
||||
Options options; |
||||
options.large_value_threshold = 1048576; |
||||
Reopen(&options); |
||||
ASSERT_OK(Put("big1", std::string(200000, '1'))); |
||||
ASSERT_OK(Put("big2", std::string(200000, '2'))); |
||||
ASSERT_OK(Put("small3", std::string(10, '3'))); |
||||
ASSERT_OK(Put("small4", std::string(10, '4'))); |
||||
ASSERT_EQ(NumTableFilesAtLevel(0), 0); |
||||
} |
||||
|
||||
// Make sure that if we re-open with a small write buffer size that
|
||||
// we flush table files in the middle of a large log file.
|
||||
Options options; |
||||
options.write_buffer_size = 100000; |
||||
options.large_value_threshold = 1048576; |
||||
Reopen(&options); |
||||
ASSERT_EQ(NumTableFilesAtLevel(0), 3); |
||||
ASSERT_EQ(std::string(200000, '1'), Get("big1")); |
||||
ASSERT_EQ(std::string(200000, '2'), Get("big2")); |
||||
ASSERT_EQ(std::string(10, '3'), Get("small3")); |
||||
ASSERT_EQ(std::string(10, '4'), Get("small4")); |
||||
ASSERT_GT(NumTableFilesAtLevel(0), 1); |
||||
} |
||||
|
||||
TEST(DBTest, CompactionsGenerateMultipleFiles) { |
||||
Options options; |
||||
options.write_buffer_size = 100000000; // Large write buffer
|
||||
options.large_value_threshold = 1048576; |
||||
Reopen(&options); |
||||
|
||||
Random rnd(301); |
||||
|
||||
// Write 8MB (80 values, each 100K)
|
||||
ASSERT_EQ(NumTableFilesAtLevel(0), 0); |
||||
std::vector<std::string> values; |
||||
for (int i = 0; i < 80; i++) { |
||||
values.push_back(RandomString(&rnd, 100000)); |
||||
ASSERT_OK(Put(Key(i), values[i])); |
||||
} |
||||
|
||||
// Reopening moves updates to level-0
|
||||
Reopen(&options); |
||||
dbfull()->TEST_CompactRange(0, "", Key(100000)); |
||||
|
||||
ASSERT_EQ(NumTableFilesAtLevel(0), 0); |
||||
ASSERT_GT(NumTableFilesAtLevel(1), 1); |
||||
for (int i = 0; i < 80; i++) { |
||||
ASSERT_EQ(Get(Key(i)), values[i]); |
||||
} |
||||
} |
||||
|
||||
// Returns true iff val lies in the inclusive range [low, high]; logs the
// offending value to stderr otherwise so the failing bound is visible.
static bool Between(uint64_t val, uint64_t low, uint64_t high) {
  if (val < low || val > high) {
    fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
            (unsigned long long)(val),
            (unsigned long long)(low),
            (unsigned long long)(high));
    return false;
  }
  return true;
}
||||
|
||||
TEST(DBTest, ApproximateSizes) { |
||||
for (int test = 0; test < 2; test++) { |
||||
// test==0: default large_value_threshold
|
||||
// test==1: 1 MB large_value_threshold
|
||||
Options options; |
||||
options.large_value_threshold = (test == 0) ? 65536 : 1048576; |
||||
options.write_buffer_size = 100000000; // Large write buffer
|
||||
options.compression = kNoCompression; |
||||
DestroyAndReopen(); |
||||
|
||||
ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); |
||||
Reopen(&options); |
||||
ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); |
||||
|
||||
// Write 8MB (80 values, each 100K)
|
||||
ASSERT_EQ(NumTableFilesAtLevel(0), 0); |
||||
const int N = 80; |
||||
Random rnd(301); |
||||
for (int i = 0; i < N; i++) { |
||||
ASSERT_OK(Put(Key(i), RandomString(&rnd, 100000))); |
||||
} |
||||
if (test == 1) { |
||||
// 0 because GetApproximateSizes() does not account for memtable space for
|
||||
// non-large values
|
||||
ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); |
||||
} else { |
||||
ASSERT_TRUE(Between(Size("", Key(50)), 100000*50, 100000*50 + 10000)); |
||||
ASSERT_TRUE(Between(Size(Key(20), Key(30)), |
||||
100000*10, 100000*10 + 10000)); |
||||
} |
||||
|
||||
// Check sizes across recovery by reopening a few times
|
||||
for (int run = 0; run < 3; run++) { |
||||
Reopen(&options); |
||||
|
||||
for (int compact_start = 0; compact_start < N; compact_start += 10) { |
||||
for (int i = 0; i < N; i += 10) { |
||||
ASSERT_TRUE(Between(Size("", Key(i)), 100000*i, 100000*i + 10000)); |
||||
ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), |
||||
100000 * (i+1), 100000 * (i+1) + 10000)); |
||||
ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), |
||||
100000 * 10, 100000 * 10 + 10000)); |
||||
} |
||||
ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 5010000)); |
||||
ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000)); |
||||
|
||||
dbfull()->TEST_CompactRange(0, |
||||
Key(compact_start), |
||||
Key(compact_start + 9)); |
||||
} |
||||
|
||||
ASSERT_EQ(NumTableFilesAtLevel(0), 0); |
||||
ASSERT_GT(NumTableFilesAtLevel(1), 0); |
||||
} |
||||
} |
||||
} |
||||
|
||||
TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { |
||||
Options options; |
||||
options.large_value_threshold = 65536; |
||||
options.compression = kNoCompression; |
||||
Reopen(); |
||||
|
||||
Random rnd(301); |
||||
std::string big1 = RandomString(&rnd, 100000); |
||||
ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000))); |
||||
ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000))); |
||||
ASSERT_OK(Put(Key(2), big1)); |
||||
ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000))); |
||||
ASSERT_OK(Put(Key(4), big1)); |
||||
ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000))); |
||||
ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000))); |
||||
ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000))); |
||||
|
||||
// Check sizes across recovery by reopening a few times
|
||||
for (int run = 0; run < 3; run++) { |
||||
Reopen(&options); |
||||
|
||||
ASSERT_TRUE(Between(Size("", Key(0)), 0, 0)); |
||||
ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000)); |
||||
ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000)); |
||||
ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000)); |
||||
ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000)); |
||||
ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000)); |
||||
ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000)); |
||||
ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000)); |
||||
ASSERT_TRUE(Between(Size("", Key(8)), 550000, 551000)); |
||||
|
||||
ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000)); |
||||
|
||||
dbfull()->TEST_CompactRange(0, Key(0), Key(100)); |
||||
} |
||||
} |
||||
|
||||
TEST(DBTest, IteratorPinsRef) { |
||||
Put("foo", "hello"); |
||||
|
||||
// Get iterator that will yield the current contents of the DB.
|
||||
Iterator* iter = db_->NewIterator(ReadOptions()); |
||||
|
||||
// Write to force compactions
|
||||
Put("foo", "newvalue1"); |
||||
for (int i = 0; i < 100; i++) { |
||||
ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values
|
||||
} |
||||
Put("foo", "newvalue2"); |
||||
|
||||
iter->SeekToFirst(); |
||||
ASSERT_TRUE(iter->Valid()); |
||||
ASSERT_EQ("foo", iter->key().ToString()); |
||||
ASSERT_EQ("hello", iter->value().ToString()); |
||||
iter->Next(); |
||||
ASSERT_TRUE(!iter->Valid()); |
||||
delete iter; |
||||
} |
||||
|
||||
TEST(DBTest, Snapshot) { |
||||
Put("foo", "v1"); |
||||
const Snapshot* s1 = db_->GetSnapshot(); |
||||
Put("foo", "v2"); |
||||
const Snapshot* s2 = db_->GetSnapshot(); |
||||
Put("foo", "v3"); |
||||
const Snapshot* s3 = db_->GetSnapshot(); |
||||
|
||||
Put("foo", "v4"); |
||||
ASSERT_EQ("v1", Get("foo", s1)); |
||||
ASSERT_EQ("v2", Get("foo", s2)); |
||||
ASSERT_EQ("v3", Get("foo", s3)); |
||||
ASSERT_EQ("v4", Get("foo")); |
||||
|
||||
db_->ReleaseSnapshot(s3); |
||||
ASSERT_EQ("v1", Get("foo", s1)); |
||||
ASSERT_EQ("v2", Get("foo", s2)); |
||||
ASSERT_EQ("v4", Get("foo")); |
||||
|
||||
db_->ReleaseSnapshot(s1); |
||||
ASSERT_EQ("v2", Get("foo", s2)); |
||||
ASSERT_EQ("v4", Get("foo")); |
||||
|
||||
db_->ReleaseSnapshot(s2); |
||||
ASSERT_EQ("v4", Get("foo")); |
||||
} |
||||
|
||||
TEST(DBTest, HiddenValuesAreRemoved) { |
||||
Random rnd(301); |
||||
std::string big = RandomString(&rnd, 50000); |
||||
Put("foo", big); |
||||
Put("pastfoo", "v"); |
||||
const Snapshot* snapshot = db_->GetSnapshot(); |
||||
Put("foo", "tiny"); |
||||
Put("pastfoo2", "v2"); // Advance sequence number one more
|
||||
|
||||
ASSERT_OK(dbfull()->TEST_CompactMemTable()); |
||||
ASSERT_GT(NumTableFilesAtLevel(0), 0); |
||||
|
||||
ASSERT_EQ(big, Get("foo", snapshot)); |
||||
ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000)); |
||||
db_->ReleaseSnapshot(snapshot); |
||||
ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]"); |
||||
dbfull()->TEST_CompactRange(0, "", "x"); |
||||
ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); |
||||
ASSERT_EQ(NumTableFilesAtLevel(0), 0); |
||||
ASSERT_GE(NumTableFilesAtLevel(1), 1); |
||||
dbfull()->TEST_CompactRange(1, "", "x"); |
||||
ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); |
||||
|
||||
ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); |
||||
} |
||||
|
||||
TEST(DBTest, DeletionMarkers1) { |
||||
Put("foo", "v1"); |
||||
ASSERT_OK(dbfull()->TEST_CompactMemTable()); |
||||
dbfull()->TEST_CompactRange(0, "", "z"); |
||||
dbfull()->TEST_CompactRange(1, "", "z"); |
||||
ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file
|
||||
Delete("foo"); |
||||
Put("foo", "v2"); |
||||
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); |
||||
ASSERT_OK(dbfull()->TEST_CompactMemTable()); |
||||
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); |
||||
dbfull()->TEST_CompactRange(0, "", "z"); |
||||
// DEL eliminated, but v1 remains because we aren't compacting that level
|
||||
// (DEL can be eliminated because v2 hides v1).
|
||||
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); |
||||
dbfull()->TEST_CompactRange(1, "", "z"); |
||||
// Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed.
|
||||
// (as is v1).
|
||||
ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]"); |
||||
} |
||||
|
||||
TEST(DBTest, DeletionMarkers2) { |
||||
Put("foo", "v1"); |
||||
ASSERT_OK(dbfull()->TEST_CompactMemTable()); |
||||
dbfull()->TEST_CompactRange(0, "", "z"); |
||||
dbfull()->TEST_CompactRange(1, "", "z"); |
||||
ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file
|
||||
Delete("foo"); |
||||
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); |
||||
ASSERT_OK(dbfull()->TEST_CompactMemTable()); |
||||
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); |
||||
dbfull()->TEST_CompactRange(0, "", "z"); |
||||
// DEL kept: L2 file overlaps
|
||||
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); |
||||
dbfull()->TEST_CompactRange(1, "", "z"); |
||||
// Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed.
|
||||
// (as is v1).
|
||||
ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); |
||||
} |
||||
|
||||
TEST(DBTest, ComparatorCheck) { |
||||
class NewComparator : public Comparator { |
||||
public: |
||||
virtual const char* Name() const { return "leveldb.NewComparator"; } |
||||
virtual int Compare(const Slice& a, const Slice& b) const { |
||||
return BytewiseComparator()->Compare(a, b); |
||||
} |
||||
virtual void FindShortestSeparator(std::string* s, const Slice& l) const { |
||||
BytewiseComparator()->FindShortestSeparator(s, l); |
||||
} |
||||
virtual void FindShortSuccessor(std::string* key) const { |
||||
BytewiseComparator()->FindShortSuccessor(key); |
||||
} |
||||
}; |
||||
NewComparator cmp; |
||||
Options new_options; |
||||
new_options.comparator = &cmp; |
||||
Status s = TryReopen(&new_options); |
||||
ASSERT_TRUE(!s.ok()); |
||||
ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos) |
||||
<< s.ToString(); |
||||
} |
||||
|
||||
// Returns true iff the set of large value files on disk for `db` exactly
// matches `expected`; the first discrepancy is logged to stderr.
static bool LargeValuesOK(DBTest* db,
                          const std::set<LargeValueRef>& expected) {
  std::set<LargeValueRef> actual = db->LargeValueFiles();
  if (actual.size() != expected.size()) {
    fprintf(stderr, "Sets differ in size: %d vs %d\n",
            (int)actual.size(), (int)expected.size());
    return false;
  }
  for (std::set<LargeValueRef>::const_iterator it = expected.begin();
       it != expected.end();
       ++it) {
    if (actual.count(*it) != 1) {
      fprintf(stderr, " key '%s' not found in actual set\n",
              LargeValueRefToFilenameString(*it).c_str());
      return false;
    }
  }
  return true;
}
||||
|
||||
TEST(DBTest, LargeValues1) { |
||||
Options options; |
||||
options.large_value_threshold = 10000; |
||||
Reopen(&options); |
||||
|
||||
Random rnd(301); |
||||
|
||||
std::string big1; |
||||
test::CompressibleString(&rnd, 1.0, 100000, &big1); // Not compressible
|
||||
std::set<LargeValueRef> expected; |
||||
|
||||
ASSERT_OK(Put("big1", big1)); |
||||
expected.insert(LargeValueRef::Make(big1, kNoCompression)); |
||||
ASSERT_TRUE(LargeValuesOK(this, expected)); |
||||
|
||||
ASSERT_OK(Delete("big1")); |
||||
ASSERT_TRUE(LargeValuesOK(this, expected)); |
||||
ASSERT_OK(dbfull()->TEST_CompactMemTable()); |
||||
// No handling of deletion markers on memtable compactions, so big1 remains
|
||||
ASSERT_TRUE(LargeValuesOK(this, expected)); |
||||
|
||||
dbfull()->TEST_CompactRange(0, "", "z"); |
||||
expected.erase(LargeValueRef::Make(big1, kNoCompression)); |
||||
ASSERT_TRUE(LargeValuesOK(this, expected)); |
||||
} |
||||
|
||||
TEST(DBTest, LargeValues2) { |
||||
Options options; |
||||
options.large_value_threshold = 10000; |
||||
Reopen(&options); |
||||
|
||||
Random rnd(301); |
||||
|
||||
std::string big1, big2; |
||||
test::CompressibleString(&rnd, 1.0, 20000, &big1); // Not compressible
|
||||
test::CompressibleString(&rnd, 0.6, 40000, &big2); // Compressible
|
||||
std::set<LargeValueRef> expected; |
||||
ASSERT_TRUE(LargeValuesOK(this, expected)); |
||||
|
||||
ASSERT_OK(Put("big1", big1)); |
||||
expected.insert(LargeValueRef::Make(big1, kNoCompression)); |
||||
ASSERT_EQ(big1, Get("big1")); |
||||
ASSERT_TRUE(LargeValuesOK(this, expected)); |
||||
|
||||
ASSERT_OK(Put("big2", big2)); |
||||
ASSERT_EQ(big2, Get("big2")); |
||||
#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_CHROMIUM) |
||||
// TODO(sanjay) Reenable after compression support is added
|
||||
expected.insert(LargeValueRef::Make(big2, kNoCompression)); |
||||
#else |
||||
expected.insert(LargeValueRef::Make(big2, kLightweightCompression)); |
||||
#endif |
||||
ASSERT_TRUE(LargeValuesOK(this, expected)); |
||||
|
||||
ASSERT_OK(dbfull()->TEST_CompactMemTable()); |
||||
ASSERT_TRUE(LargeValuesOK(this, expected)); |
||||
|
||||
dbfull()->TEST_CompactRange(0, "", "z"); |
||||
ASSERT_TRUE(LargeValuesOK(this, expected)); |
||||
|
||||
ASSERT_OK(Put("big2", big2)); |
||||
ASSERT_OK(Put("big2_b", big2)); |
||||
ASSERT_EQ(big1, Get("big1")); |
||||
ASSERT_EQ(big2, Get("big2")); |
||||
ASSERT_EQ(big2, Get("big2_b")); |
||||
ASSERT_TRUE(LargeValuesOK(this, expected)); |
||||
|
||||
ASSERT_OK(Delete("big1")); |
||||
ASSERT_EQ("NOT_FOUND", Get("big1")); |
||||
ASSERT_TRUE(LargeValuesOK(this, expected)); |
||||
|
||||
ASSERT_OK(dbfull()->TEST_CompactMemTable()); |
||||
ASSERT_TRUE(LargeValuesOK(this, expected)); |
||||
dbfull()->TEST_CompactRange(0, "", "z"); |
||||
expected.erase(LargeValueRef::Make(big1, kNoCompression)); |
||||
ASSERT_TRUE(LargeValuesOK(this, expected)); |
||||
dbfull()->TEST_CompactRange(1, "", "z"); |
||||
|
||||
ASSERT_OK(Delete("big2")); |
||||
ASSERT_EQ("NOT_FOUND", Get("big2")); |
||||
ASSERT_EQ(big2, Get("big2_b")); |
||||
ASSERT_OK(dbfull()->TEST_CompactMemTable()); |
||||
ASSERT_TRUE(LargeValuesOK(this, expected)); |
||||
dbfull()->TEST_CompactRange(0, "", "z"); |
||||
ASSERT_TRUE(LargeValuesOK(this, expected)); |
||||
|
||||
// Make sure the large value refs survive a reload and compactions after
|
||||
// the reload.
|
||||
Reopen(); |
||||
ASSERT_TRUE(LargeValuesOK(this, expected)); |
||||
ASSERT_OK(Put("foo", "bar")); |
||||
ASSERT_OK(dbfull()->TEST_CompactMemTable()); |
||||
dbfull()->TEST_CompactRange(0, "", "z"); |
||||
ASSERT_TRUE(LargeValuesOK(this, expected)); |
||||
} |
||||
|
||||
TEST(DBTest, LargeValues3) {
  // Make sure we don't compress large values when compression is disabled
  // via Options (compression == kNoCompression): the stored ref must
  // record kNoCompression even for a very compressible value.
  Options options;
  options.large_value_threshold = 10000;
  options.compression = kNoCompression;
  Reopen(&options);

  Random rnd(301);

  std::string big1 = std::string(100000, 'x');  // Very compressible
  std::set<LargeValueRef> expected;

  ASSERT_OK(Put("big1", big1));
  ASSERT_EQ(big1, Get("big1"));
  expected.insert(LargeValueRef::Make(big1, kNoCompression));
  ASSERT_TRUE(LargeValuesOK(this, expected));
}
||||
|
||||
|
||||
TEST(DBTest, DBOpen_Options) { |
||||
std::string dbname = test::TmpDir() + "/db_options_test"; |
||||
DestroyDB(dbname, Options()); |
||||
|
||||
// Does not exist, and create_if_missing == false: error
|
||||
DB* db = NULL; |
||||
Options opts; |
||||
opts.create_if_missing = false; |
||||
Status s = DB::Open(opts, dbname, &db); |
||||
ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != NULL); |
||||
ASSERT_TRUE(db == NULL); |
||||
|
||||
// Does not exist, and create_if_missing == true: OK
|
||||
opts.create_if_missing = true; |
||||
s = DB::Open(opts, dbname, &db); |
||||
ASSERT_OK(s); |
||||
ASSERT_TRUE(db != NULL); |
||||
|
||||
delete db; |
||||
db = NULL; |
||||
|
||||
// Does exist, and error_if_exists == true: error
|
||||
opts.create_if_missing = false; |
||||
opts.error_if_exists = true; |
||||
s = DB::Open(opts, dbname, &db); |
||||
ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != NULL); |
||||
ASSERT_TRUE(db == NULL); |
||||
|
||||
// Does exist, and error_if_exists == false: OK
|
||||
opts.create_if_missing = true; |
||||
opts.error_if_exists = false; |
||||
s = DB::Open(opts, dbname, &db); |
||||
ASSERT_OK(s); |
||||
ASSERT_TRUE(db != NULL); |
||||
|
||||
delete db; |
||||
db = NULL; |
||||
} |
||||
|
||||
// An in-memory reference implementation of the DB interface backed by a
// std::map.  The Randomized test applies the same operations to a ModelDB
// and a real leveldb instance and cross-checks the results.
class ModelDB: public DB {
 public:
  explicit ModelDB(const Options& options): options_(options) { }
  ~ModelDB() { }
  virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) {
    return DB::Put(o, k, v);
  }
  virtual Status Delete(const WriteOptions& o, const Slice& key) {
    return DB::Delete(o, key);
  }
  virtual Status Get(const ReadOptions& options,
                     const Slice& key, std::string* value) {
    assert(false);      // Not implemented
    return Status::NotFound(key);
  }
  virtual Iterator* NewIterator(const ReadOptions& options) {
    if (options.snapshot == NULL) {
      // Iterate over a private copy of the current map so that later
      // writes do not disturb the iterator; the iterator owns the copy.
      KVMap* saved = new KVMap;
      *saved = map_;
      return new ModelIter(saved, true);
    } else {
      // Snapshots smuggle a heap-allocated KVMap* through the snapshot's
      // sequence-number field (see GetSnapshot); do not take ownership.
      const KVMap* snapshot_state =
          reinterpret_cast<const KVMap*>(options.snapshot->number_);
      return new ModelIter(snapshot_state, false);
    }
  }
  virtual const Snapshot* GetSnapshot() {
    // A snapshot is a full copy of the map whose pointer is stored,
    // cast to a SequenceNumber, inside the Snapshot object.
    KVMap* saved = new KVMap;
    *saved = map_;
    return snapshots_.New(
        reinterpret_cast<SequenceNumber>(saved));
  }

  virtual void ReleaseSnapshot(const Snapshot* snapshot) {
    // Recover and free the map copy stashed in the snapshot.
    const KVMap* saved = reinterpret_cast<const KVMap*>(snapshot->number_);
    delete saved;
    snapshots_.Delete(snapshot);
  }
  virtual Status Write(const WriteOptions& options, WriteBatch* batch) {
    assert(options.post_write_snapshot == NULL);   // Not supported
    // Apply each batch entry to the in-memory map in order.
    for (WriteBatchInternal::Iterator it(*batch); !it.Done(); it.Next()) {
      switch (it.op()) {
        case kTypeValue:
          map_[it.key().ToString()] = it.value().ToString();
          break;
        case kTypeLargeValueRef:
          assert(false);   // Should not occur
          break;
        case kTypeDeletion:
          map_.erase(it.key().ToString());
          break;
      }
    }
    return Status::OK();
  }

  virtual bool GetProperty(const Slice& property, uint64_t* value) {
    return false;
  }
  virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) {
    for (int i = 0; i < n; i++) {
      sizes[i] = 0;
    }
  }
 private:
  typedef std::map<std::string, std::string> KVMap;
  // Iterator over a KVMap; optionally owns (and deletes) the map.
  class ModelIter: public Iterator {
   public:
    ModelIter(const KVMap* map, bool owned)
        : map_(map), owned_(owned), iter_(map_->end()) {
    }
    ~ModelIter() {
      if (owned_) delete map_;
    }
    virtual bool Valid() const { return iter_ != map_->end(); }
    virtual void SeekToFirst() { iter_ = map_->begin(); }
    virtual void SeekToLast() {
      if (map_->empty()) {
        iter_ = map_->end();
      } else {
        // Position on the last element (a map has no end()-1 helper).
        iter_ = map_->find(map_->rbegin()->first);
      }
    }
    virtual void Seek(const Slice& k) {
      iter_ = map_->lower_bound(k.ToString());
    }
    virtual void Next() { ++iter_; }
    virtual void Prev() { --iter_; }
    virtual Slice key() const { return iter_->first; }
    virtual Slice value() const { return iter_->second; }
    virtual Status status() const { return Status::OK(); }
   private:
    const KVMap* const map_;
    const bool owned_;  // Do we own map_
    KVMap::const_iterator iter_;
  };
  const Options options_;
  KVMap map_;
  SnapshotList snapshots_;
};
||||
|
||||
// Returns a random key whose length is biased towards very short keys
// (to encourage collisions) with an occasional long, skewed tail.
static std::string RandomKey(Random* rnd) {
  int len = (rnd->OneIn(3)
             ? 1                // Short sometimes to encourage collisions
             : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10)));
  return test::RandomKey(rnd, len);
}
||||
|
||||
static bool CompareIterators(int step, |
||||
DB* model, |
||||
DB* db, |
||||
const Snapshot* model_snap, |
||||
const Snapshot* db_snap) { |
||||
ReadOptions options; |
||||
options.snapshot = model_snap; |
||||
Iterator* miter = model->NewIterator(options); |
||||
options.snapshot = db_snap; |
||||
Iterator* dbiter = db->NewIterator(options); |
||||
bool ok = true; |
||||
int count = 0; |
||||
for (miter->SeekToFirst(), dbiter->SeekToFirst(); |
||||
ok && miter->Valid() && dbiter->Valid(); |
||||
miter->Next(), dbiter->Next()) { |
||||
count++; |
||||
if (miter->key().compare(dbiter->key()) != 0) { |
||||
fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n", |
||||
step, |
||||
EscapeString(miter->key()).c_str(), |
||||
EscapeString(dbiter->key()).c_str()); |
||||
ok = false; |
||||
break; |
||||
} |
||||
|
||||
if (miter->value().compare(dbiter->value()) != 0) { |
||||
fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n", |
||||
step, |
||||
EscapeString(miter->key()).c_str(), |
||||
EscapeString(miter->value()).c_str(), |
||||
EscapeString(miter->value()).c_str()); |
||||
ok = false; |
||||
} |
||||
} |
||||
|
||||
if (ok) { |
||||
if (miter->Valid() != dbiter->Valid()) { |
||||
fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n", |
||||
step, miter->Valid(), dbiter->Valid()); |
||||
ok = false; |
||||
} |
||||
} |
||||
fprintf(stderr, "%d entries compared: ok=%d\n", count, ok); |
||||
delete miter; |
||||
delete dbiter; |
||||
return ok; |
||||
} |
||||
|
||||
TEST(DBTest, Randomized) { |
||||
Random rnd(test::RandomSeed()); |
||||
ModelDB model(last_options_); |
||||
const int N = 10000; |
||||
const Snapshot* model_snap = NULL; |
||||
const Snapshot* db_snap = NULL; |
||||
std::string k, v; |
||||
for (int step = 0; step < N; step++) { |
||||
if (step % 100 == 0) { |
||||
fprintf(stderr, "Step %d of %d\n", step, N); |
||||
} |
||||
int p = rnd.Uniform(100); |
||||
if (p < 45) { // Put
|
||||
k = RandomKey(&rnd); |
||||
v = RandomString(&rnd, |
||||
rnd.OneIn(20) |
||||
? 100 + rnd.Uniform(100) |
||||
: rnd.Uniform(8)); |
||||
ASSERT_OK(model.Put(WriteOptions(), k, v)); |
||||
ASSERT_OK(db_->Put(WriteOptions(), k, v)); |
||||
|
||||
} else if (p < 90) { // Delete
|
||||
k = RandomKey(&rnd); |
||||
ASSERT_OK(model.Delete(WriteOptions(), k)); |
||||
ASSERT_OK(db_->Delete(WriteOptions(), k)); |
||||
|
||||
|
||||
} else { // Multi-element batch
|
||||
WriteBatch b; |
||||
const int num = rnd.Uniform(8); |
||||
for (int i = 0; i < num; i++) { |
||||
if (i == 0 || !rnd.OneIn(10)) { |
||||
k = RandomKey(&rnd); |
||||
} else { |
||||
// Periodically re-use the same key from the previous iter, so
|
||||
// we have multiple entries in the write batch for the same key
|
||||
} |
||||
if (rnd.OneIn(2)) { |
||||
v = RandomString(&rnd, rnd.Uniform(10)); |
||||
b.Put(k, v); |
||||
} else { |
||||
b.Delete(k); |
||||
} |
||||
} |
||||
ASSERT_OK(model.Write(WriteOptions(), &b)); |
||||
ASSERT_OK(db_->Write(WriteOptions(), &b)); |
||||
} |
||||
|
||||
if ((step % 100) == 0) { |
||||
ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); |
||||
ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap)); |
||||
// Save a snapshot from each DB this time that we'll use next
|
||||
// time we compare things, to make sure the current state is
|
||||
// preserved with the snapshot
|
||||
if (model_snap != NULL) model.ReleaseSnapshot(model_snap); |
||||
if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); |
||||
|
||||
Reopen(); |
||||
ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); |
||||
|
||||
model_snap = model.GetSnapshot(); |
||||
db_snap = db_->GetSnapshot(); |
||||
} |
||||
} |
||||
if (model_snap != NULL) model.ReleaseSnapshot(model_snap); |
||||
if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); |
||||
} |
||||
|
||||
} |
||||
|
||||
int main(int argc, char** argv) {
  // Runs every TEST(...) registered above; returns nonzero on failure.
  return leveldb::test::RunAllTests();
}
@ -0,0 +1,152 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include <stdio.h> |
||||
#include "db/dbformat.h" |
||||
#include "port/port.h" |
||||
#include "util/coding.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
// Packs a sequence number and a value type into the 8-byte trailer that
// is appended to a user key to form an internal key: (seq << 8) | type.
// The type occupies the low byte so that, for equal user keys, ordering
// by this packed value orders by sequence number first.
static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
  assert(seq <= kMaxSequenceNumber);
  assert(t <= kValueTypeForSeek);
  return (seq << 8) | t;
}
||||
|
||||
// Appends the serialization of "key" to *result: the raw user key bytes
// followed by the fixed64 packed sequence/type trailer.
void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
  result->append(key.user_key.data(), key.user_key.size());
  PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
}
||||
|
||||
// Returns a human-readable form: 'user_key' @ <sequence> : <type>.
std::string ParsedInternalKey::DebugString() const {
  char buf[50];
  snprintf(buf, sizeof(buf), "' @ %llu : %d",
           (unsigned long long) sequence,
           int(type));
  std::string result = "'";
  result += user_key.ToString();
  result += buf;
  return result;
}
||||
|
||||
// Comparator name recorded in the database; must never change, or existing
// databases would be rejected on open.
const char* InternalKeyComparator::Name() const {
  return "leveldb.InternalKeyComparator";
}
||||
|
||||
int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
  // Order by:
  //    increasing user key (according to user-supplied comparator)
  //    decreasing sequence number
  //    decreasing type (though sequence# should be enough to disambiguate)
  int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
  if (r == 0) {
    // Equal user keys: compare the packed (seq << 8 | type) trailers.
    // Larger trailer == newer entry, which must sort FIRST, hence the
    // inverted comparison below.
    const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
    const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
    if (anum > bnum) {
      r = -1;
    } else if (anum < bnum) {
      r = +1;
    }
  }
  return r;
}
||||
|
||||
void InternalKeyComparator::FindShortestSeparator( |
||||
std::string* start, |
||||
const Slice& limit) const { |
||||
// Attempt to shorten the user portion of the key
|
||||
Slice user_start = ExtractUserKey(*start); |
||||
Slice user_limit = ExtractUserKey(limit); |
||||
std::string tmp(user_start.data(), user_start.size()); |
||||
user_comparator_->FindShortestSeparator(&tmp, user_limit); |
||||
if (user_comparator_->Compare(*start, tmp) < 0) { |
||||
// User key has become larger. Tack on the earliest possible
|
||||
// number to the shortened user key.
|
||||
PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); |
||||
assert(this->Compare(*start, tmp) < 0); |
||||
assert(this->Compare(tmp, limit) < 0); |
||||
start->swap(tmp); |
||||
} |
||||
} |
||||
|
||||
// Replaces *key with a short internal key >= *key.  The user comparator
// shortens the user portion; if it produced a larger user key, the
// earliest possible sequence/type trailer is appended to keep the result
// as small as possible among keys with that user key.
void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
  Slice user_key = ExtractUserKey(*key);
  std::string tmp(user_key.data(), user_key.size());
  user_comparator_->FindShortSuccessor(&tmp);
  if (user_comparator_->Compare(user_key, tmp) < 0) {
    // User key has become larger.  Tack on the earliest possible
    // number to the shortened user key.
    PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
    assert(this->Compare(*key, tmp) < 0);
    key->swap(tmp);
  }
}
||||
|
||||
// Build a reference for "value": bytes [0,20) are its SHA1 hash,
// [20,28) its little-endian size, and byte 28 the compression type.
LargeValueRef LargeValueRef::Make(const Slice& value, CompressionType ctype) {
  LargeValueRef ref;
  port::SHA1_Hash(value.data(), value.size(), &ref.data[0]);
  EncodeFixed64(&ref.data[20], value.size());
  ref.data[28] = static_cast<unsigned char>(ctype);
  return ref;
}
||||
|
||||
// Encode a LargeValueRef as "<40 hex chars>-<value size>-<compression type>"
// for use in large-value filenames.
std::string LargeValueRefToFilenameString(const LargeValueRef& h) {
  assert(sizeof(h.data) == LargeValueRef::ByteSize());
  assert(sizeof(h.data) == 29);  // 20-byte hash + 8-byte size + 1-byte type
  static const char tohex[] = "0123456789abcdef";
  std::string result;
  for (int i = 0; i < 20; i++) {
    const unsigned char byte = h.data[i];
    result.push_back(tohex[(byte >> 4) & 0xf]);
    result.push_back(tohex[byte & 0xf]);
  }
  result += "-";
  result += NumberToString(h.ValueSize());
  result += "-";
  result += NumberToString(static_cast<uint64_t>(h.compression_type()));
  return result;
}
||||
|
||||
// Decode one hex digit ('0'-'9', 'a'-'f', 'A'-'F') to its value in [0,16).
// Precondition (asserted): c is a hex digit.
static uint32_t hexvalue(char c) {
  if (c >= '0' && c <= '9') {
    return c - '0';
  }
  if (c >= 'A' && c <= 'F') {
    return c - 'A' + 10;
  }
  assert(c >= 'a' && c <= 'f');
  return c - 'a' + 10;
}
||||
|
||||
bool FilenameStringToLargeValueRef(const Slice& s, LargeValueRef* h) { |
||||
Slice in = s; |
||||
if (in.size() < 40) { |
||||
return false; |
||||
} |
||||
for (int i = 0; i < 20; i++) { |
||||
if (!isxdigit(in[i*2]) || !isxdigit(in[i*2+1])) { |
||||
return false; |
||||
} |
||||
unsigned char c = (hexvalue(in[i*2])<<4) | hexvalue(in[i*2+1]); |
||||
h->data[i] = c; |
||||
} |
||||
in.remove_prefix(40); |
||||
uint64_t value_size, ctype; |
||||
|
||||
if (ConsumeChar(&in, '-') && |
||||
ConsumeDecimalNumber(&in, &value_size) && |
||||
ConsumeChar(&in, '-') && |
||||
ConsumeDecimalNumber(&in, &ctype) && |
||||
in.empty() && |
||||
(ctype <= kLightweightCompression)) { |
||||
EncodeFixed64(&h->data[20], value_size); |
||||
h->data[28] = static_cast<unsigned char>(ctype); |
||||
return true; |
||||
} else { |
||||
return false; |
||||
} |
||||
} |
||||
|
||||
} |
@ -0,0 +1,198 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_DB_FORMAT_H_ |
||||
#define STORAGE_LEVELDB_DB_FORMAT_H_ |
||||
|
||||
#include <stdio.h> |
||||
#include "include/comparator.h" |
||||
#include "include/db.h" |
||||
#include "include/slice.h" |
||||
#include "include/table_builder.h" |
||||
#include "util/coding.h" |
||||
#include "util/logging.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
class InternalKey; |
||||
|
||||
// Value types encoded as the last component of internal keys.
|
||||
// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
|
||||
// data structures.
|
||||
enum ValueType {
  kTypeDeletion = 0x0,
  kTypeValue = 0x1,
  kTypeLargeValueRef = 0x2,
};
// kValueTypeForSeek defines the ValueType to use when constructing a
// ParsedInternalKey for seeking to a particular sequence number.  Since
// sequence numbers sort in decreasing order and the type occupies the low
// 8 bits of the packed trailer, seeks must use the highest-numbered
// ValueType, not the lowest.
static const ValueType kValueTypeForSeek = kTypeLargeValueRef;

typedef uint64_t SequenceNumber;

// The bottom eight bits are left empty so a type and sequence# can be
// packed together into 64 bits.
static const SequenceNumber kMaxSequenceNumber =
    ((0x1ull << 56) - 1);
||||
|
||||
struct ParsedInternalKey { |
||||
Slice user_key; |
||||
SequenceNumber sequence; |
||||
ValueType type; |
||||
|
||||
ParsedInternalKey() { } // Intentionally left uninitialized (for speed)
|
||||
ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) |
||||
: user_key(u), sequence(seq), type(t) { } |
||||
std::string DebugString() const; |
||||
}; |
||||
|
||||
// Return the length of the encoding of "key".
|
||||
inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) { |
||||
return key.user_key.size() + 8; |
||||
} |
||||
|
||||
// Append the serialization of "key" to *result.
|
||||
extern void AppendInternalKey(std::string* result, |
||||
const ParsedInternalKey& key); |
||||
|
||||
// Attempt to parse an internal key from "internal_key". On success,
|
||||
// stores the parsed data in "*result", and returns true.
|
||||
//
|
||||
// On error, returns false, leaves "*result" in an undefined state.
|
||||
extern bool ParseInternalKey(const Slice& internal_key, |
||||
ParsedInternalKey* result); |
||||
|
||||
// Returns the user key portion of an internal key.
|
||||
// Strip the 8-byte sequence/type trailer and return the user-key portion
// of an internal key.
inline Slice ExtractUserKey(const Slice& internal_key) {
  assert(internal_key.size() >= 8);
  return Slice(internal_key.data(), internal_key.size() - 8);
}
||||
|
||||
// Return the ValueType stored in the low byte of the internal key's
// 8-byte trailer.
inline ValueType ExtractValueType(const Slice& internal_key) {
  assert(internal_key.size() >= 8);
  const uint64_t trailer =
      DecodeFixed64(internal_key.data() + internal_key.size() - 8);
  return static_cast<ValueType>(trailer & 0xff);
}
||||
|
||||
// A comparator for internal keys that uses a specified comparator for
|
||||
// the user key portion and breaks ties by decreasing sequence number.
|
||||
class InternalKeyComparator : public Comparator { |
||||
private: |
||||
const Comparator* user_comparator_; |
||||
public: |
||||
explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) { } |
||||
virtual const char* Name() const; |
||||
virtual int Compare(const Slice& a, const Slice& b) const; |
||||
virtual void FindShortestSeparator( |
||||
std::string* start, |
||||
const Slice& limit) const; |
||||
virtual void FindShortSuccessor(std::string* key) const; |
||||
|
||||
const Comparator* user_comparator() const { return user_comparator_; } |
||||
|
||||
int Compare(const InternalKey& a, const InternalKey& b) const; |
||||
}; |
||||
|
||||
// Modules in this directory should keep internal keys wrapped inside
|
||||
// the following class instead of plain strings so that we do not
|
||||
// incorrectly use string comparisons instead of an InternalKeyComparator.
|
||||
class InternalKey { |
||||
private: |
||||
std::string rep_; |
||||
public: |
||||
InternalKey() { } // Leave rep_ as empty to indicate it is invalid
|
||||
InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) { |
||||
AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t)); |
||||
} |
||||
|
||||
void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); } |
||||
Slice Encode() const { |
||||
assert(!rep_.empty()); |
||||
return rep_; |
||||
} |
||||
|
||||
Slice user_key() const { return ExtractUserKey(rep_); } |
||||
|
||||
void SetFrom(const ParsedInternalKey& p) { |
||||
rep_.clear(); |
||||
AppendInternalKey(&rep_, p); |
||||
} |
||||
|
||||
void Clear() { rep_.clear(); } |
||||
}; |
||||
|
||||
inline int InternalKeyComparator::Compare( |
||||
const InternalKey& a, const InternalKey& b) const { |
||||
return Compare(a.Encode(), b.Encode()); |
||||
} |
||||
|
||||
// LargeValueRef is a 160-bit hash value (20 bytes), plus an 8 byte
|
||||
// uncompressed size, and a 1 byte CompressionType code. An
|
||||
// encoded form of it is embedded in the filenames of large value
|
||||
// files stored in the database, and the raw binary form is stored as
|
||||
// the iter->value() result for values of type kTypeLargeValueRef in
|
||||
// the table and log files that make up the database.
|
||||
struct LargeValueRef { |
||||
char data[29]; |
||||
|
||||
// Initialize a large value ref for the given data
|
||||
static LargeValueRef Make(const Slice& data, |
||||
CompressionType compression_type); |
||||
|
||||
// Initialize a large value ref from a serialized, 29-byte reference value
|
||||
static LargeValueRef FromRef(const Slice& ref) { |
||||
LargeValueRef result; |
||||
assert(ref.size() == sizeof(result.data)); |
||||
memcpy(result.data, ref.data(), sizeof(result.data)); |
||||
return result; |
||||
} |
||||
|
||||
// Return the number of bytes in a LargeValueRef (not the
|
||||
// number of bytes in the value referenced).
|
||||
static size_t ByteSize() { return sizeof(LargeValueRef().data); } |
||||
|
||||
// Return the number of bytes in the value referenced by "*this".
|
||||
uint64_t ValueSize() const { return DecodeFixed64(&data[20]); } |
||||
|
||||
CompressionType compression_type() const { |
||||
return static_cast<CompressionType>(data[28]); |
||||
} |
||||
|
||||
bool operator==(const LargeValueRef& b) const { |
||||
return memcmp(data, b.data, sizeof(data)) == 0; |
||||
} |
||||
bool operator<(const LargeValueRef& b) const { |
||||
return memcmp(data, b.data, sizeof(data)) < 0; |
||||
} |
||||
}; |
||||
|
||||
// Convert the large value ref to a human-readable string suitable
|
||||
// for embedding in a large value filename.
|
||||
extern std::string LargeValueRefToFilenameString(const LargeValueRef& h); |
||||
|
||||
// Parse the large value filename string in "input" and store it in
|
||||
// "*h". If successful, returns true. Otherwise returns false.
|
||||
extern bool FilenameStringToLargeValueRef(const Slice& in, LargeValueRef* ref); |
||||
|
||||
inline bool ParseInternalKey(const Slice& internal_key, |
||||
ParsedInternalKey* result) { |
||||
const size_t n = internal_key.size(); |
||||
if (n < 8) return false; |
||||
uint64_t num = DecodeFixed64(internal_key.data() + n - 8); |
||||
unsigned char c = num & 0xff; |
||||
result->sequence = num >> 8; |
||||
result->type = static_cast<ValueType>(c); |
||||
result->user_key = Slice(internal_key.data(), n - 8); |
||||
return (c <= static_cast<unsigned char>(kTypeLargeValueRef)); |
||||
} |
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_DB_FORMAT_H_
|
@ -0,0 +1,127 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/dbformat.h" |
||||
#include "util/logging.h" |
||||
#include "util/testharness.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
static std::string IKey(const std::string& user_key, |
||||
uint64_t seq, |
||||
ValueType vt) { |
||||
std::string encoded; |
||||
AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt)); |
||||
return encoded; |
||||
} |
||||
|
||||
// Test helper: run FindShortestSeparator(s, l) under a bytewise
// InternalKeyComparator and return the (possibly shortened) key.
static std::string Shorten(const std::string& s, const std::string& l) {
  std::string result = s;
  InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l);
  return result;
}
||||
|
||||
// Test helper: run FindShortSuccessor(s) under a bytewise
// InternalKeyComparator and return the (possibly shortened) key.
static std::string ShortSuccessor(const std::string& s) {
  std::string result = s;
  InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result);
  return result;
}
||||
|
||||
static void TestKey(const std::string& key, |
||||
uint64_t seq, |
||||
ValueType vt) { |
||||
std::string encoded = IKey(key, seq, vt); |
||||
|
||||
Slice in(encoded); |
||||
ParsedInternalKey decoded("", 0, kTypeValue); |
||||
|
||||
ASSERT_TRUE(ParseInternalKey(in, &decoded)); |
||||
ASSERT_EQ(key, decoded.user_key.ToString()); |
||||
ASSERT_EQ(seq, decoded.sequence); |
||||
ASSERT_EQ(vt, decoded.type); |
||||
|
||||
ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded)); |
||||
} |
||||
|
||||
class FormatTest { };  // Empty fixture tag consumed by the TEST macro.
||||
|
||||
TEST(FormatTest, InternalKey_EncodeDecode) {
  const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" };
  const uint64_t seq[] = {
    1, 2, 3,
    (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1,
    (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1,
    (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1
  };
  // FIX: use size_t indices — the original compared a signed int against
  // the unsigned result of sizeof()/sizeof() (signed/unsigned mismatch).
  for (size_t k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
    for (size_t s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
      TestKey(keys[k], seq[s], kTypeValue);
      TestKey("hello", 1, kTypeDeletion);
    }
  }
}
||||
|
||||
TEST(FormatTest, InternalKeyShortSeparator) {
  // Identical user keys: the key must come back unchanged regardless of
  // the limit's sequence number or type.
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foo", 99, kTypeValue)));
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foo", 101, kTypeValue)));
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foo", 100, kTypeValue)));
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foo", 100, kTypeDeletion)));
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foo", 100, kTypeLargeValueRef)));

  // Misordered user keys: no shortening happens.
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("bar", 99, kTypeValue)));

  // Different, correctly ordered user keys: shortened to a single byte
  // with the maximal sequence/type trailer.
  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("hello", 200, kTypeValue)));

  // Start user key is a prefix of the limit user key: unchanged.
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foobar", 200, kTypeValue)));

  // Limit user key is a prefix of the start user key: unchanged.
  ASSERT_EQ(IKey("foobar", 100, kTypeValue),
            Shorten(IKey("foobar", 100, kTypeValue),
                    IKey("foo", 200, kTypeValue)));
}
||||
|
||||
TEST(FormatTest, InternalKeyShortestSuccessor) {
  // A shortenable key collapses to its first byte's successor.
  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
            ShortSuccessor(IKey("foo", 100, kTypeValue)));
  // An all-0xff key has no shorter successor and is left unchanged.
  ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
            ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
}
||||
|
||||
TEST(FormatTest, SHA1) {
  // Check that we compute the same value as the sha1 tool.  The last two
  // components of the expected strings are the input length and the
  // compression type.
  ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-0",  // SHA1, uncompr
            LargeValueRefToFilenameString(
                LargeValueRef::Make("hello", kNoCompression)));
  ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-1",  // SHA1, lwcompr
            LargeValueRefToFilenameString(
                LargeValueRef::Make("hello", kLightweightCompression)));
}
||||
|
||||
} |
||||
|
||||
int main(int argc, char** argv) { |
||||
return leveldb::test::RunAllTests(); |
||||
} |
@ -0,0 +1,154 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include <ctype.h> |
||||
#include <stdio.h> |
||||
#include "db/filename.h" |
||||
#include "db/dbformat.h" |
||||
#include "include/env.h" |
||||
#include "util/logging.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
// Build "<name>/NNNNNN.<suffix>" where NNNNNN is the zero-padded
// (minimum six digits) decimal file number.
static std::string MakeFileName(const std::string& name, uint64_t number,
                                const char* suffix) {
  char buf[100];
  snprintf(buf, sizeof(buf), "/%06llu.%s",
           static_cast<unsigned long long>(number), suffix);
  std::string result = name;
  result.append(buf);
  return result;
}
||||
|
||||
std::string LogFileName(const std::string& name, uint64_t number) { |
||||
assert(number > 0); |
||||
return MakeFileName(name, number, "log"); |
||||
} |
||||
|
||||
std::string TableFileName(const std::string& name, uint64_t number) { |
||||
assert(number > 0); |
||||
return MakeFileName(name, number, "sst"); |
||||
} |
||||
|
||||
std::string LargeValueFileName(const std::string& name, |
||||
const LargeValueRef& large_ref) { |
||||
std::string result = name + "/"; |
||||
result += LargeValueRefToFilenameString(large_ref); |
||||
result += ".val"; |
||||
return result; |
||||
} |
||||
|
||||
// Name of the manifest (descriptor) file for the given incarnation number.
std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
  assert(number > 0);
  char buf[100];
  snprintf(buf, sizeof(buf), "/MANIFEST-%06llu",
           static_cast<unsigned long long>(number));
  return dbname + buf;
}
||||
|
||||
// The CURRENT file records the name of the active manifest.
std::string CurrentFileName(const std::string& dbname) {
  return dbname + "/CURRENT";
}
||||
|
||||
// The LOCK file guards the database directory against concurrent opens.
std::string LockFileName(const std::string& dbname) {
  return dbname + "/LOCK";
}
||||
|
||||
std::string TempFileName(const std::string& dbname, uint64_t number) { |
||||
assert(number > 0); |
||||
return MakeFileName(dbname, number, "dbtmp"); |
||||
} |
||||
|
||||
// Name of the current info log file.
std::string InfoLogFileName(const std::string& dbname) {
  return dbname + "/LOG";
}
||||
|
||||
// Return the name of the old info log file for "dbname".
|
||||
// Name the previous info log is rotated to.
std::string OldInfoLogFileName(const std::string& dbname) {
  return dbname + "/LOG.old";
}
||||
|
||||
|
||||
// Owned filenames have the form:
|
||||
// dbname/CURRENT
|
||||
// dbname/LOCK
|
||||
// dbname/LOG
|
||||
// dbname/LOG.old
|
||||
// dbname/MANIFEST-[0-9]+
|
||||
// dbname/[0-9a-f]{20}-[0-9]+-[0-9]+.val
|
||||
// dbname/[0-9]+.(log|sst)
|
||||
bool ParseFileName(const std::string& fname, |
||||
uint64_t* number, |
||||
LargeValueRef* large_ref, |
||||
FileType* type) { |
||||
Slice rest(fname); |
||||
if (rest == "CURRENT") { |
||||
*number = 0; |
||||
*type = kCurrentFile; |
||||
} else if (rest == "LOCK") { |
||||
*number = 0; |
||||
*type = kDBLockFile; |
||||
} else if (rest == "LOG" || rest == "LOG.old") { |
||||
*number = 0; |
||||
*type = kInfoLogFile; |
||||
} else if (rest.size() >= 4 && |
||||
Slice(rest.data() + rest.size() - 4, 4) == ".val") { |
||||
LargeValueRef h; |
||||
if (!FilenameStringToLargeValueRef(Slice(rest.data(), rest.size() - 4), |
||||
&h)) { |
||||
return false; |
||||
} |
||||
*large_ref = h; |
||||
*type = kLargeValueFile; |
||||
} else if (rest.starts_with("MANIFEST-")) { |
||||
rest.remove_prefix(strlen("MANIFEST-")); |
||||
uint64_t num; |
||||
if (!ConsumeDecimalNumber(&rest, &num)) { |
||||
return false; |
||||
} |
||||
if (!rest.empty()) { |
||||
return false; |
||||
} |
||||
*type = kDescriptorFile; |
||||
*number = num; |
||||
} else { |
||||
// Avoid strtoull() to keep filename format independent of the
|
||||
// current locale
|
||||
uint64_t num; |
||||
if (!ConsumeDecimalNumber(&rest, &num)) { |
||||
return false; |
||||
} |
||||
Slice suffix = rest; |
||||
if (suffix == Slice(".log")) { |
||||
*type = kLogFile; |
||||
} else if (suffix == Slice(".sst")) { |
||||
*type = kTableFile; |
||||
} else if (suffix == Slice(".dbtmp")) { |
||||
*type = kTempFile; |
||||
} else { |
||||
return false; |
||||
} |
||||
*number = num; |
||||
} |
||||
return true; |
||||
} |
||||
|
||||
// Atomically point CURRENT at the manifest with the given number:
// write the (relative) manifest name plus newline to a temp file, then
// rename the temp file over CURRENT.  The temp file is removed on failure.
Status SetCurrentFile(Env* env, const std::string& dbname,
                      uint64_t descriptor_number) {
  // Strip the leading "dbname/" so CURRENT holds a relative name.
  std::string manifest = DescriptorFileName(dbname, descriptor_number);
  Slice contents = manifest;
  assert(contents.starts_with(dbname + "/"));
  contents.remove_prefix(dbname.size() + 1);
  std::string tmp = TempFileName(dbname, descriptor_number);
  Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp);
  if (s.ok()) {
    s = env->RenameFile(tmp, CurrentFileName(dbname));
  }
  if (!s.ok()) {
    env->DeleteFile(tmp);
  }
  return s;
}
||||
|
||||
} |
@ -0,0 +1,92 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// File names used by DB code
|
||||
|
||||
#ifndef STORAGE_LEVELDB_DB_FILENAME_H_ |
||||
#define STORAGE_LEVELDB_DB_FILENAME_H_ |
||||
|
||||
#include <stdint.h> |
||||
#include <string> |
||||
#include "include/slice.h" |
||||
#include "include/status.h" |
||||
#include "port/port.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
class Env; |
||||
struct LargeValueRef; |
||||
|
||||
enum FileType {
  kLogFile,
  kDBLockFile,
  kTableFile,
  kLargeValueFile,
  kDescriptorFile,
  kCurrentFile,
  kTempFile,
  kInfoLogFile,  // Either the current one, or an old one
};
||||
|
||||
// Return the name of the log file with the specified number
|
||||
// in the db named by "dbname". The result will be prefixed with
|
||||
// "dbname".
|
||||
extern std::string LogFileName(const std::string& dbname, uint64_t number); |
||||
|
||||
// Return the name of the sstable with the specified number
|
||||
// in the db named by "dbname". The result will be prefixed with
|
||||
// "dbname".
|
||||
extern std::string TableFileName(const std::string& dbname, uint64_t number); |
||||
|
||||
// Return the name of the large value file with the specified large
|
||||
// value reference in the db named by "dbname". The result will be
|
||||
// prefixed with "dbname".
|
||||
extern std::string LargeValueFileName(const std::string& dbname, |
||||
const LargeValueRef& large_ref); |
||||
|
||||
// Return the name of the descriptor file for the db named by
|
||||
// "dbname" and the specified incarnation number. The result will be
|
||||
// prefixed with "dbname".
|
||||
extern std::string DescriptorFileName(const std::string& dbname, |
||||
uint64_t number); |
||||
|
||||
// Return the name of the current file. This file contains the name
|
||||
// of the current manifest file. The result will be prefixed with
|
||||
// "dbname".
|
||||
extern std::string CurrentFileName(const std::string& dbname); |
||||
|
||||
// Return the name of the lock file for the db named by
|
||||
// "dbname". The result will be prefixed with "dbname".
|
||||
extern std::string LockFileName(const std::string& dbname); |
||||
|
||||
// Return the name of a temporary file owned by the db named "dbname".
|
||||
// The result will be prefixed with "dbname".
|
||||
extern std::string TempFileName(const std::string& dbname, uint64_t number); |
||||
|
||||
// Return the name of the info log file for "dbname".
|
||||
extern std::string InfoLogFileName(const std::string& dbname); |
||||
|
||||
// Return the name of the old info log file for "dbname".
|
||||
extern std::string OldInfoLogFileName(const std::string& dbname); |
||||
|
||||
// If filename is a leveldb file, store the type of the file in *type.
|
||||
// If *type is kLargeValueFile, then the large value reference data
|
||||
// from the filename is stored in "*large_ref. For all other types of
|
||||
// files, the number encoded in the filename is stored in *number. If
|
||||
// the filename was successfully parsed, returns true. Else return
|
||||
// false.
|
||||
extern bool ParseFileName(const std::string& filename, |
||||
uint64_t* number, |
||||
LargeValueRef* large_ref, |
||||
FileType* type); |
||||
|
||||
// Make the CURRENT file point to the descriptor file with the
|
||||
// specified number.
|
||||
extern Status SetCurrentFile(Env* env, const std::string& dbname, |
||||
uint64_t descriptor_number); |
||||
|
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_DB_FILENAME_H_
|
@ -0,0 +1,156 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/filename.h" |
||||
|
||||
#include "db/dbformat.h" |
||||
#include "port/port.h" |
||||
#include "util/logging.h" |
||||
#include "util/testharness.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
class FileNameTest { };  // Empty fixture tag consumed by the TEST macro.
||||
|
||||
TEST(FileNameTest, Parse) {
  // FIX: removed the unused local `Slice db;` the original declared.
  FileType type;
  uint64_t number;
  LargeValueRef large_ref;

  // Successful parses
  static struct {
    const char* fname;
    uint64_t number;
    const char* large_ref;
    FileType type;
  } cases[] = {
    { "100.log", 100, "", kLogFile },
    { "0.log", 0, "", kLogFile },
    { "0.sst", 0, "", kTableFile },
    { "CURRENT", 0, "", kCurrentFile },
    { "LOCK", 0, "", kDBLockFile },
    { "MANIFEST-2", 2, "", kDescriptorFile },
    { "MANIFEST-7", 7, "", kDescriptorFile },
    { "LOG", 0, "", kInfoLogFile },
    { "LOG.old", 0, "", kInfoLogFile },
    { "18446744073709551615.log", 18446744073709551615ull, "",
      kLogFile },
    { "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0.val", 0,
      "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0", kLargeValueFile },
    { "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0.val", 0,
      "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0",
      kLargeValueFile },
  };
  // FIX: size_t loop index — the original compared int against the
  // unsigned sizeof() quotient.
  for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
    std::string f = cases[i].fname;
    ASSERT_TRUE(ParseFileName(f, &number, &large_ref, &type)) << f;
    ASSERT_EQ(cases[i].type, type) << f;
    if (type == kLargeValueFile) {
      ASSERT_EQ(cases[i].large_ref, LargeValueRefToFilenameString(large_ref))
          << f;
    } else {
      ASSERT_EQ(cases[i].number, number) << f;
    }
  }

  // Errors
  static const char* errors[] = {
    "",
    "foo",
    "foo-dx-100.log",
    ".log",
    "",
    "manifest",
    "CURREN",
    "CURRENTX",
    "MANIFES",
    "MANIFEST",
    "MANIFEST-",
    "XMANIFEST-3",
    "MANIFEST-3x",
    "LOC",
    "LOCKx",
    "LO",
    "LOGx",
    "18446744073709551616.log",
    "184467440737095516150.log",
    "100",
    "100.",
    "100.lop",
    "100.val",
    ".val",
    "123456789012345678901234567890123456789-12340.val",
    "1234567890123456789012345678901234567-123-0.val",
    "12345678901234567890123456789012345678902-100-1-.val",
    // Overflow on value size
    "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000000000000000000-1.val",
    // '3' is a bad compression type
    "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000-3.val" };
  for (size_t i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
    std::string f = errors[i];
    ASSERT_TRUE(!ParseFileName(f, &number, &large_ref, &type)) << f;
  }  // FIX: dropped the stray ';' that followed this block.
}
||||
|
||||
TEST(FileNameTest, Construction) {
  uint64_t number;
  FileType type;
  LargeValueRef large_ref;
  std::string fname;

  // Each constructor must produce a "dir/"-prefixed name whose remainder
  // parses back to the same number and type.
  fname = CurrentFileName("foo");
  ASSERT_EQ("foo/", std::string(fname.data(), 4));
  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
  ASSERT_EQ(0, number);
  ASSERT_EQ(kCurrentFile, type);

  fname = LockFileName("foo");
  ASSERT_EQ("foo/", std::string(fname.data(), 4));
  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
  ASSERT_EQ(0, number);
  ASSERT_EQ(kDBLockFile, type);

  fname = LogFileName("foo", 192);
  ASSERT_EQ("foo/", std::string(fname.data(), 4));
  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
  ASSERT_EQ(192, number);
  ASSERT_EQ(kLogFile, type);

  fname = TableFileName("bar", 200);
  ASSERT_EQ("bar/", std::string(fname.data(), 4));
  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
  ASSERT_EQ(200, number);
  ASSERT_EQ(kTableFile, type);

  fname = DescriptorFileName("bar", 100);
  ASSERT_EQ("bar/", std::string(fname.data(), 4));
  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
  ASSERT_EQ(100, number);
  ASSERT_EQ(kDescriptorFile, type);

  fname = TempFileName("tmp", 999);
  ASSERT_EQ("tmp/", std::string(fname.data(), 4));
  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
  ASSERT_EQ(999, number);
  ASSERT_EQ(kTempFile, type);

  // Large value filenames round-trip for every compression type.
  for (int i = 0; i <= kLightweightCompression; i++) {
    CompressionType ctype = static_cast<CompressionType>(i);
    std::string value = "abcdef";
    LargeValueRef real_large_ref = LargeValueRef::Make(Slice(value), ctype);
    fname = LargeValueFileName("tmp", real_large_ref);
    ASSERT_EQ("tmp/", std::string(fname.data(), 4));
    ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
    ASSERT_TRUE(real_large_ref == large_ref);
    ASSERT_EQ(kLargeValueFile, type);
    ASSERT_EQ(large_ref.compression_type(), ctype);
  }
}
||||
|
||||
} |
||||
|
||||
int main(int argc, char** argv) { |
||||
return leveldb::test::RunAllTests(); |
||||
} |
@ -0,0 +1,35 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// Log format information shared by reader and writer.
|
||||
// See ../doc/log_format.txt for more detail.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_ |
||||
#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_ |
||||
|
||||
namespace leveldb { |
||||
namespace log { |
||||
|
||||
enum RecordType {
  // Zero is reserved for preallocated files
  kZeroType = 0,

  // A logical record fully contained in one physical record.
  kFullType = 1,

  // Fragments of a logical record that spans physical records:
  // first piece, interior pieces, final piece.
  kFirstType = 2,
  kMiddleType = 3,
  kLastType = 4,
};
static const int kMaxRecordType = kLastType;

static const int kBlockSize = 32768;

// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
static const int kHeaderSize = 4 + 1 + 2;
||||
|
||||
} |
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_DB_LOG_FORMAT_H_
|
@ -0,0 +1,172 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/log_reader.h" |
||||
|
||||
#include <stdint.h> |
||||
#include "include/env.h" |
||||
#include "util/coding.h" |
||||
#include "util/crc32c.h" |
||||
|
||||
namespace leveldb { |
||||
namespace log { |
||||
|
||||
// Out-of-line destructor anchors the Reporter interface's vtable.
Reader::Reporter::~Reporter() {
}
||||
|
||||
// "file" and "reporter" are borrowed and must outlive this Reader.
// When "checksum" is true, record checksums are verified on read.
Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum)
    : file_(file),
      reporter_(reporter),
      checksum_(checksum),
      backing_store_(new char[kBlockSize]),  // scratch block, freed in dtor
      buffer_(),
      eof_(false) {
}
||||
|
||||
Reader::~Reader() {
  delete[] backing_store_;  // allocated in the constructor
}
||||
|
||||
// Read the next complete logical record into *record, reassembling
// fragmented records via *scratch.  Returns false at end of input.
// Malformed data is reported through the Reporter and skipped.
bool Reader::ReadRecord(Slice* record, std::string* scratch) {
  scratch->clear();
  record->clear();
  // True while we hold the first (and possibly middle) pieces of a
  // fragmented record in *scratch and are waiting for its last piece.
  bool in_fragmented_record = false;

  Slice fragment;
  while (true) {
    switch (ReadPhysicalRecord(&fragment)) {
      case kFullType:
        if (in_fragmented_record) {
          ReportDrop(scratch->size(), "partial record without end");
        }
        scratch->clear();
        *record = fragment;
        return true;

      case kFirstType:
        if (in_fragmented_record) {
          ReportDrop(scratch->size(), "partial record without end");
        }
        scratch->assign(fragment.data(), fragment.size());
        in_fragmented_record = true;
        break;

      case kMiddleType:
        if (!in_fragmented_record) {
          ReportDrop(fragment.size(), "missing start of fragmented record");
        } else {
          scratch->append(fragment.data(), fragment.size());
        }
        break;

      case kLastType:
        if (!in_fragmented_record) {
          ReportDrop(fragment.size(), "missing start of fragmented record");
        } else {
          scratch->append(fragment.data(), fragment.size());
          *record = Slice(*scratch);
          return true;
        }
        break;

      case kEof:
        if (in_fragmented_record) {
          ReportDrop(scratch->size(), "partial record without end");
          scratch->clear();
        }
        return false;

      case kBadRecord:
        if (in_fragmented_record) {
          ReportDrop(scratch->size(), "error in middle of record");
          in_fragmented_record = false;
          scratch->clear();
        }
        break;

      default:
        ReportDrop(
            (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
            "unknown record type");
        in_fragmented_record = false;
        scratch->clear();
        break;
    }
  }
  return false;  // not reached; keeps some compilers quiet
}
||||
|
||||
// Notify the registered reporter, if any, that "bytes" bytes are being
// dropped for the given reason.
void Reader::ReportDrop(size_t bytes, const char* reason) {
  if (reporter_ == NULL) {
    return;
  }
  reporter_->Corruption(bytes, Status::Corruption(reason));
}
||||
|
||||
// Read the next physical record from the current block, refilling the
// block buffer from the file as needed.  Returns the record's type on
// success, or one of the special values kEof / kBadRecord.  On success
// *result points into backing_store_ and is valid until the next call.
unsigned int Reader::ReadPhysicalRecord(Slice* result) {
  while (true) {
    if (buffer_.size() <= kHeaderSize) {
      if (!eof_) {
        // Last read was a full read, so this is a trailer to skip
        buffer_.clear();
        Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
        if (!status.ok()) {
          if (reporter_ != NULL) {
            reporter_->Corruption(kBlockSize, status);
          }
          buffer_.clear();
          eof_ = true;
          return kEof;
        } else if (buffer_.size() < kBlockSize) {
          // Short read: we have reached the end of the file.
          eof_ = true;
        }
        continue;
      } else if (buffer_.size() == 0) {
        // End of file
        return kEof;
      } else if (buffer_.size() < kHeaderSize) {
        ReportDrop(buffer_.size(), "truncated record at end of file");
        buffer_.clear();
        return kEof;
      } else {
        // We have a trailing zero-length record.  Fall through and check it.
      }
    }

    // Parse the header: crc (bytes 0-3), length (4-5, little-endian), type (6)
    const char* header = buffer_.data();
    const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
    const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
    const unsigned int type = header[6];
    const uint32_t length = a | (b << 8);
    if (kHeaderSize + length > buffer_.size()) {
      ReportDrop(buffer_.size(), "bad record length");
      buffer_.clear();
      return kBadRecord;
    }

    if (type == kZeroType && length == 0) {
      // Skip zero-length records from preallocated file regions.
      // BUG FIX: this skip used to live inside the "if (checksum_)"
      // branch below, so a reader created with checksum==false fell
      // through to ReadRecord's default case and reported a spurious
      // "unknown record type" drop for every preallocated record.
      buffer_.remove_prefix(kHeaderSize + length);
      return kBadRecord;
    }

    // Check crc
    if (checksum_) {
      uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
      // The stored crc covers the type byte plus the payload.
      uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
      if (actual_crc != expected_crc) {
        ReportDrop(length, "checksum mismatch");
        buffer_.remove_prefix(kHeaderSize + length);
        return kBadRecord;
      }
    }

    buffer_.remove_prefix(kHeaderSize + length);
    *result = Slice(header + kHeaderSize, length);
    return type;
  }
}
||||
|
||||
} |
||||
} |
@ -0,0 +1,75 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_DB_LOG_READER_H_
#define STORAGE_LEVELDB_DB_LOG_READER_H_

#include "db/log_format.h"
#include "include/slice.h"
#include "include/status.h"

namespace leveldb {

class SequentialFile;

namespace log {

// Sequentially decodes log records written by log::Writer, reporting
// (and skipping past) corrupted regions via a Reporter.
class Reader {
 public:
  // Interface for reporting errors.
  class Reporter {
   public:
    virtual ~Reporter();

    // Some corruption was detected.  "size" is the approximate number
    // of bytes dropped due to the corruption.
    virtual void Corruption(size_t bytes, const Status& status) = 0;
  };

  // Create a reader that will return log records from "*file".
  // "*file" must remain live while this Reader is in use.
  //
  // If "reporter" is non-NULL, it is notified whenever some data is
  // dropped due to a detected corruption.  "*reporter" must remain
  // live while this Reader is in use.
  //
  // If "checksum" is true, verify checksums if available.
  Reader(SequentialFile* file, Reporter* reporter, bool checksum);

  ~Reader();

  // Read the next record into *record.  Returns true if read
  // successfully, false if we hit end of the input.  May use
  // "*scratch" as temporary storage.  The contents filled in *record
  // will only be valid until the next mutating operation on this
  // reader or the next mutation to *scratch.
  bool ReadRecord(Slice* record, std::string* scratch);

 private:
  SequentialFile* const file_;
  Reporter* const reporter_;    // May be NULL (errors silently dropped)
  bool const checksum_;
  char* const backing_store_;   // kBlockSize scratch buffer owned by Reader
  Slice buffer_;                // Unconsumed portion of the current block
  bool eof_;   // Last Read() indicated EOF by returning < kBlockSize

  // Extend record types with the following special values
  enum {
    kEof = kMaxRecordType + 1,
    kBadRecord = kMaxRecordType + 2
  };

  // Return type, or one of the preceding special values
  unsigned int ReadPhysicalRecord(Slice* result);
  // Forward a drop notification to reporter_ (no-op if it is NULL).
  void ReportDrop(size_t bytes, const char* reason);

  // No copying allowed
  Reader(const Reader&);
  void operator=(const Reader&);
};

}
}

#endif  // STORAGE_LEVELDB_DB_LOG_READER_H_
|
@ -0,0 +1,361 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/log_reader.h" |
||||
#include "db/log_writer.h" |
||||
#include "include/env.h" |
||||
#include "util/coding.h" |
||||
#include "util/crc32c.h" |
||||
#include "util/random.h" |
||||
#include "util/testharness.h" |
||||
|
||||
namespace leveldb { |
||||
namespace log { |
||||
|
||||
// Construct a string of the specified length made out of the supplied
|
||||
// partial string.
|
||||
// Build a string of exactly "n" bytes by repeating "partial_string"
// and truncating the final repetition.
static std::string BigString(const std::string& partial_string, size_t n) {
  std::string out;
  do {
    out.append(partial_string);
  } while (out.size() < n);
  out.resize(n);
  return out;
}

// Render "n" as decimal digits followed by a trailing period.
static std::string NumberString(int n) {
  char digits[50];
  snprintf(digits, sizeof(digits), "%d.", n);
  return std::string(digits);
}
||||
|
||||
// Return a skewed potentially long string
// Content is derived deterministically from "i"; the length comes from
// rnd->Skewed(17) — presumably a skewed distribution over [0, 2^17);
// confirm against util/random.h.
static std::string RandomSkewedString(int i, Random* rnd) {
  return BigString(NumberString(i), rnd->Skewed(17));
}
||||
|
||||
// Test fixture: records are written through log::Writer into an
// in-memory byte string (StringDest) and read back through log::Reader
// over the same bytes (StringSource); corruption reports are collected
// by ReportCollector.  Helpers corrupt the bytes in targeted ways.
class LogTest {
 private:
  // WritableFile that appends everything to contents_.
  class StringDest : public WritableFile {
   public:
    std::string contents_;

    virtual Status Close() { return Status::OK(); }
    virtual Status Flush() { return Status::OK(); }
    virtual Status Sync() { return Status::OK(); }
    virtual Status Append(const Slice& slice) {
      contents_.append(slice.data(), slice.size());
      return Status::OK();
    }
  };

  // SequentialFile over a Slice; can simulate a one-shot read error.
  class StringSource : public SequentialFile {
   public:
    Slice contents_;
    bool force_error_;        // Next Read() fails with Corruption
    bool returned_partial_;   // Set once EOF/error was returned

    StringSource() : force_error_(false), returned_partial_(false) { }

    virtual Status Read(size_t n, Slice* result, char* scratch) {
      ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
      // The Reader is expected to always read whole blocks.
      ASSERT_EQ(kBlockSize, n);

      if (force_error_) {
        force_error_ = false;
        returned_partial_ = true;
        return Status::Corruption("read error");
      }

      if (contents_.size() < n) {
        n = contents_.size();
        returned_partial_ = true;
      }
      *result = Slice(contents_.data(), n);
      contents_.remove_prefix(n);
      return Status::OK();
    }
  };

  // Reporter that accumulates dropped-byte counts and messages.
  class ReportCollector : public Reader::Reporter {
   public:
    size_t dropped_bytes_;
    std::string message_;

    ReportCollector() : dropped_bytes_(0) { }
    virtual void Corruption(size_t bytes, const Status& status) {
      dropped_bytes_ += bytes;
      message_.append(status.ToString());
    }
  };

  StringDest dest_;
  StringSource source_;
  ReportCollector report_;
  bool reading_;    // Set on first Read(); Write() is then forbidden
  Writer writer_;
  Reader reader_;

 public:
  LogTest() : reading_(false),
              writer_(&dest_),
              reader_(&source_, &report_, true/*checksum*/) {
  }

  void Write(const std::string& msg) {
    ASSERT_TRUE(!reading_) << "Write() after starting to read";
    writer_.AddRecord(Slice(msg));
  }

  size_t WrittenBytes() const {
    return dest_.contents_.size();
  }

  // Returns the next record's contents, or "EOF" when exhausted.
  std::string Read() {
    if (!reading_) {
      // First read: snapshot the written bytes into the source.
      reading_ = true;
      source_.contents_ = Slice(dest_.contents_);
    }
    std::string scratch;
    Slice record;
    if (reader_.ReadRecord(&record, &scratch)) {
      return record.ToString();
    } else {
      return "EOF";
    }
  }

  // --- byte-level corruption helpers (operate on the written bytes) ---

  void IncrementByte(int offset, int delta) {
    dest_.contents_[offset] += delta;
  }

  void SetByte(int offset, char new_byte) {
    dest_.contents_[offset] = new_byte;
  }

  void ShrinkSize(int bytes) {
    dest_.contents_.resize(dest_.contents_.size() - bytes);
  }

  // Recompute a record's crc after its bytes were tampered with, so a
  // test can exercise paths other than "checksum mismatch".
  void FixChecksum(int header_offset, int len) {
    // Compute crc of type/len/data
    uint32_t crc = crc32c::Value(&dest_.contents_[header_offset+6], 1 + len);
    crc = crc32c::Mask(crc);
    EncodeFixed32(&dest_.contents_[header_offset], crc);
  }

  void ForceError() {
    source_.force_error_ = true;
  }

  size_t DroppedBytes() const {
    return report_.dropped_bytes_;
  }

  // Returns OK iff recorded error message contains "msg"
  std::string MatchError(const std::string& msg) const {
    if (report_.message_.find(msg) == std::string::npos) {
      return report_.message_;
    } else {
      return "OK";
    }
  }
};
||||
|
||||
TEST(LogTest, Empty) {
  ASSERT_EQ("EOF", Read());
}

TEST(LogTest, ReadWrite) {
  Write("foo");
  Write("bar");
  Write("");
  Write("xxxx");
  ASSERT_EQ("foo", Read());
  ASSERT_EQ("bar", Read());
  ASSERT_EQ("", Read());
  ASSERT_EQ("xxxx", Read());
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ("EOF", Read());  // Make sure reads at eof work
}

// Many small records, forcing the log across many blocks.
TEST(LogTest, ManyBlocks) {
  for (int i = 0; i < 100000; i++) {
    Write(NumberString(i));
  }
  for (int i = 0; i < 100000; i++) {
    ASSERT_EQ(NumberString(i), Read());
  }
  ASSERT_EQ("EOF", Read());
}

// Records larger than a block must be fragmented and reassembled.
TEST(LogTest, Fragmentation) {
  Write("small");
  Write(BigString("medium", 50000));
  Write(BigString("large", 100000));
  ASSERT_EQ("small", Read());
  ASSERT_EQ(BigString("medium", 50000), Read());
  ASSERT_EQ(BigString("large", 100000), Read());
  ASSERT_EQ("EOF", Read());
}

TEST(LogTest, MarginalTrailer) {
  // Make a trailer that is exactly the same length as an empty record.
  const int n = kBlockSize - 2*kHeaderSize;
  Write(BigString("foo", n));
  ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes());
  Write("");
  Write("bar");
  ASSERT_EQ(BigString("foo", n), Read());
  ASSERT_EQ("", Read());
  ASSERT_EQ("bar", Read());
  ASSERT_EQ("EOF", Read());
}

// Trailer shorter than a header must be zero-filled and skipped.
TEST(LogTest, ShortTrailer) {
  const int n = kBlockSize - 2*kHeaderSize + 4;
  Write(BigString("foo", n));
  ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());
  Write("");
  Write("bar");
  ASSERT_EQ(BigString("foo", n), Read());
  ASSERT_EQ("", Read());
  ASSERT_EQ("bar", Read());
  ASSERT_EQ("EOF", Read());
}

// EOF landing inside what would be a trailer region.
TEST(LogTest, AlignedEof) {
  const int n = kBlockSize - 2*kHeaderSize + 4;
  Write(BigString("foo", n));
  ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());
  ASSERT_EQ(BigString("foo", n), Read());
  ASSERT_EQ("EOF", Read());
}

// Round-trip many records of pseudo-random (skewed) lengths.
TEST(LogTest, RandomRead) {
  const int N = 500;
  Random write_rnd(301);
  for (int i = 0; i < N; i++) {
    Write(RandomSkewedString(i, &write_rnd));
  }
  Random read_rnd(301);
  for (int i = 0; i < N; i++) {
    ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
  }
  ASSERT_EQ("EOF", Read());
}

// Tests of all the error paths in log_reader.cc follow:

TEST(LogTest, ReadError) {
  Write("foo");
  ForceError();
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(kBlockSize, DroppedBytes());
  ASSERT_EQ("OK", MatchError("read error"));
}

TEST(LogTest, BadRecordType) {
  Write("foo");
  // Type is stored in header[6]
  IncrementByte(6, 100);
  FixChecksum(0, 3);
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(3, DroppedBytes());
  ASSERT_EQ("OK", MatchError("unknown record type"));
}

TEST(LogTest, TruncatedTrailingRecord) {
  Write("foo");
  ShrinkSize(4);   // Drop all payload as well as a header byte
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(kHeaderSize - 1, DroppedBytes());
  ASSERT_EQ("OK", MatchError("truncated record at end of file"));
}

TEST(LogTest, BadLength) {
  Write("foo");
  ShrinkSize(1);   // Payload shorter than the length field claims
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(kHeaderSize + 2, DroppedBytes());
  ASSERT_EQ("OK", MatchError("bad record length"));
}

TEST(LogTest, ChecksumMismatch) {
  Write("foo");
  IncrementByte(0, 10);  // Corrupt the stored crc
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(3, DroppedBytes());
  ASSERT_EQ("OK", MatchError("checksum mismatch"));
}

TEST(LogTest, UnexpectedMiddleType) {
  Write("foo");
  SetByte(6, kMiddleType);
  FixChecksum(0, 3);
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(3, DroppedBytes());
  ASSERT_EQ("OK", MatchError("missing start"));
}

TEST(LogTest, UnexpectedLastType) {
  Write("foo");
  SetByte(6, kLastType);
  FixChecksum(0, 3);
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(3, DroppedBytes());
  ASSERT_EQ("OK", MatchError("missing start"));
}

TEST(LogTest, UnexpectedFullType) {
  Write("foo");
  Write("bar");
  SetByte(6, kFirstType);   // First record becomes an unterminated fragment
  FixChecksum(0, 3);
  ASSERT_EQ("bar", Read());
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(3, DroppedBytes());
  ASSERT_EQ("OK", MatchError("partial record without end"));
}

TEST(LogTest, UnexpectedFirstType) {
  Write("foo");
  Write(BigString("bar", 100000));
  SetByte(6, kFirstType);
  FixChecksum(0, 3);
  ASSERT_EQ(BigString("bar", 100000), Read());
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(3, DroppedBytes());
  ASSERT_EQ("OK", MatchError("partial record without end"));
}

TEST(LogTest, ErrorJoinsRecords) {
  // Consider two fragmented records:
  //    first(R1) last(R1) first(R2) last(R2)
  // where the middle two fragments disappear.  We do not want
  // first(R1),last(R2) to get joined and returned as a valid record.

  // Write records that span two blocks
  Write(BigString("foo", kBlockSize));
  Write(BigString("bar", kBlockSize));
  Write("correct");

  // Wipe the middle block
  for (int offset = kBlockSize; offset < 2*kBlockSize; offset++) {
    SetByte(offset, 'x');
  }

  ASSERT_EQ("correct", Read());
  ASSERT_EQ("EOF", Read());
  const int dropped = DroppedBytes();
  ASSERT_LE(dropped, 2*kBlockSize + 100);
  ASSERT_GE(dropped, 2*kBlockSize);
}

}
}

int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
}
@ -0,0 +1,101 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/log_writer.h" |
||||
|
||||
#include <stdint.h> |
||||
#include "include/env.h" |
||||
#include "util/coding.h" |
||||
#include "util/crc32c.h" |
||||
|
||||
namespace leveldb { |
||||
namespace log { |
||||
|
||||
// The Writer does not own "dest"; it must outlive the Writer.
Writer::Writer(WritableFile* dest)
    : dest_(dest),
      block_offset_(0) {
  // Pre-compute the crc32c of each record-type byte; EmitPhysicalRecord
  // extends these instead of re-hashing the type byte on every record.
  for (int i = 0; i <= kMaxRecordType; i++) {
    char t = static_cast<char>(i);
    type_crc_[i] = crc32c::Value(&t, 1);
  }
}

Writer::~Writer() {
}
||||
|
||||
// Append "slice" as one logical record, splitting it into physical
// records (kFirst/kMiddle/kLast) wherever it crosses a block boundary.
// Returns the first non-OK write status, if any.
Status Writer::AddRecord(const Slice& slice) {
  const char* ptr = slice.data();
  size_t left = slice.size();

  // Fragment the record if necessary and emit it.  Note that if slice
  // is empty, we still want to iterate once to emit a single
  // zero-length record
  Status s;
  do {
    const int leftover = kBlockSize - block_offset_;
    assert(leftover >= 0);
    if (leftover <= kHeaderSize) {
      // Switch to a new block
      if (leftover > 0) {
        // Fill the trailer with zero bytes (readers skip such trailers).
        // NOTE(review): the Append status is ignored here — presumably a
        // failure would also surface on the next header write; confirm.
        dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00", leftover));
      }
      block_offset_ = 0;
    }

    // Invariant: we never leave <= kHeaderSize bytes in a block.
    const int avail = kBlockSize - block_offset_ - kHeaderSize;
    assert(avail > 0);

    const size_t fragment_length = (left < avail) ? left : avail;

    // Classify this fragment by its position within the logical record.
    RecordType type;
    const bool begin = (ptr == slice.data());
    const bool end = (left == fragment_length);
    if (begin && end) {
      type = kFullType;
    } else if (begin) {
      type = kFirstType;
    } else if (end) {
      type = kLastType;
    } else {
      type = kMiddleType;
    }

    s = EmitPhysicalRecord(type, ptr, fragment_length);
    ptr += fragment_length;
    left -= fragment_length;
  } while (s.ok() && left > 0);
  return s;
}
||||
|
||||
// Write one physical record (header + "n"-byte payload at "ptr") of
// type "t" and advance block_offset_.
Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
  assert(n <= 0xffff);  // Must fit in two bytes
  assert(block_offset_ + kHeaderSize + n <= kBlockSize);

  // Format the header: crc (bytes 0-3), length (4-5, little-endian), type (6)
  char buf[kHeaderSize];
  buf[4] = static_cast<char>(n & 0xff);
  buf[5] = static_cast<char>(n >> 8);
  buf[6] = static_cast<char>(t);

  // Compute the crc of the record type and the payload.
  uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);
  crc = crc32c::Mask(crc);    // Adjust for storage
  EncodeFixed32(buf, crc);

  // Write the header and the payload
  Status s = dest_->Append(Slice(buf, kHeaderSize));
  if (s.ok()) {
    s = dest_->Append(Slice(ptr, n));
    if (s.ok()) {
      s = dest_->Flush();
    }
  }
  // The offset advances even if a write failed.
  block_offset_ += kHeaderSize + n;
  return s;
}
||||
|
||||
} |
||||
} |
@ -0,0 +1,48 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_DB_LOG_WRITER_H_
#define STORAGE_LEVELDB_DB_LOG_WRITER_H_

#include <stdint.h>
#include "db/log_format.h"
#include "include/slice.h"
#include "include/status.h"

namespace leveldb {

class WritableFile;

namespace log {

// Appends log records to a WritableFile using the physical-record
// format declared in log_format.h.
class Writer {
 public:
  // Create a writer that will append data to "*dest".
  // "*dest" must be initially empty.
  // "*dest" must remain live while this Writer is in use.
  explicit Writer(WritableFile* dest);
  ~Writer();

  // Append "slice" as one logical record, fragmenting across block
  // boundaries as needed.
  Status AddRecord(const Slice& slice);

 private:
  WritableFile* dest_;
  int block_offset_;       // Current offset in block

  // crc32c values for all supported record types.  These are
  // pre-computed to reduce the overhead of computing the crc of the
  // record type stored in the header.
  uint32_t type_crc_[kMaxRecordType + 1];

  Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);

  // No copying allowed
  Writer(const Writer&);
  void operator=(const Writer&);
};

}
}

#endif  // STORAGE_LEVELDB_DB_LOG_WRITER_H_
|
@ -0,0 +1,109 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/memtable.h" |
||||
#include "db/dbformat.h" |
||||
#include "include/comparator.h" |
||||
#include "include/env.h" |
||||
#include "include/iterator.h" |
||||
#include "util/coding.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
// Decode the varint32 length prefix at "data" and return a Slice over
// the bytes that follow it.
static Slice GetLengthPrefixedSlice(const char* data) {
  uint32_t len;
  // +5: a varint32 occupies at most five bytes; we assume "data" is not
  // corrupted, so the decode cannot run past the entry.
  const char* payload = GetVarint32Ptr(data, data + 5, &len);
  return Slice(payload, len);
}
||||
|
||||
MemTable::MemTable(const InternalKeyComparator& cmp)
    : comparator_(cmp),
      table_(comparator_, &arena_) {
}

MemTable::~MemTable() {
  // Entries were allocated from arena_, which releases all of them
  // when it is destroyed; nothing to do here.
}

// Entry storage all comes from arena_, so its usage is the table's usage.
size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); }
||||
|
||||
// Order two raw skiplist entries by their embedded internal keys.
int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr)
    const {
  // Internal keys are encoded as length-prefixed strings.
  Slice a = GetLengthPrefixedSlice(aptr);
  Slice b = GetLengthPrefixedSlice(bptr);
  return comparator.Compare(a, b);
}
||||
|
||||
// Encode a suitable internal key target for "target" and return it.
|
||||
// Uses *scratch as scratch space, and the returned pointer will point
|
||||
// into this scratch space.
|
||||
static const char* EncodeKey(std::string* scratch, const Slice& target) { |
||||
scratch->clear(); |
||||
PutVarint32(scratch, target.size()); |
||||
scratch->append(target.data(), target.size()); |
||||
return scratch->data(); |
||||
} |
||||
|
||||
class MemTableIterator: public Iterator { |
||||
public: |
||||
explicit MemTableIterator(MemTable::Table* table) { |
||||
iter_ = new MemTable::Table::Iterator(table); |
||||
} |
||||
virtual ~MemTableIterator() { delete iter_; } |
||||
|
||||
virtual bool Valid() const { return iter_->Valid(); } |
||||
virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); } |
||||
virtual void SeekToFirst() { iter_->SeekToFirst(); } |
||||
virtual void SeekToLast() { iter_->SeekToLast(); } |
||||
virtual void Next() { iter_->Next(); } |
||||
virtual void Prev() { iter_->Prev(); } |
||||
virtual Slice key() const { return GetLengthPrefixedSlice(iter_->key()); } |
||||
virtual Slice value() const { |
||||
Slice key_slice = GetLengthPrefixedSlice(iter_->key()); |
||||
return GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); |
||||
} |
||||
|
||||
virtual Status status() const { return Status::OK(); } |
||||
|
||||
private: |
||||
MemTable::Table::Iterator* iter_; |
||||
std::string tmp_; // For passing to EncodeKey
|
||||
|
||||
// No copying allowed
|
||||
MemTableIterator(const MemTableIterator&); |
||||
void operator=(const MemTableIterator&); |
||||
}; |
||||
|
||||
// Caller owns the returned iterator; this MemTable must outlive it.
Iterator* MemTable::NewIterator() {
  return new MemTableIterator(&table_);
}
||||
|
||||
// Append a new entry to the skiplist mapping "key" at sequence "s"
// with the given type (value or deletion marker) to "value".
void MemTable::Add(SequenceNumber s, ValueType type,
                   const Slice& key,
                   const Slice& value) {
  // Format of an entry is concatenation of:
  //  key_size     : varint32 of internal_key.size()
  //  key bytes    : char[internal_key.size()]
  //  value_size   : varint32 of value.size()
  //  value bytes  : char[value.size()]
  size_t key_size = key.size();
  size_t val_size = value.size();
  // Internal key = user key + 8-byte trailer (sequence number | type).
  size_t internal_key_size = key_size + 8;
  const size_t encoded_len =
      VarintLength(internal_key_size) + internal_key_size +
      VarintLength(val_size) + val_size;
  char* buf = arena_.Allocate(encoded_len);
  char* p = EncodeVarint32(buf, internal_key_size);
  memcpy(p, key.data(), key_size);
  p += key_size;
  // Trailer packs the sequence number into the high 56 bits and the
  // value type into the low 8 bits.
  EncodeFixed64(p, (s << 8) | type);
  p += 8;
  p = EncodeVarint32(p, val_size);
  memcpy(p, value.data(), val_size);
  assert((p + val_size) - buf == encoded_len);
  table_.Insert(buf);
}
||||
|
||||
} |
@ -0,0 +1,69 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_DB_MEMTABLE_H_
#define STORAGE_LEVELDB_DB_MEMTABLE_H_

#include <string>
#include "include/db.h"
#include "db/dbformat.h"
#include "db/skiplist.h"
#include "util/arena.h"

namespace leveldb {

class InternalKeyComparator;
class Mutex;
class MemTableIterator;

// In-memory write buffer: an arena-backed skiplist of encoded
// (internal key, value) entries, ordered by an InternalKeyComparator.
// Not internally synchronized; see the per-method requirements below.
class MemTable {
 public:
  explicit MemTable(const InternalKeyComparator& comparator);
  ~MemTable();

  // Returns an estimate of the number of bytes of data in use by this
  // data structure.
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  size_t ApproximateMemoryUsage();

  // Return an iterator that yields the contents of the memtable.
  //
  // The caller must ensure that the underlying MemTable remains live
  // while the returned iterator is live.  The keys returned by this
  // iterator are internal keys encoded by AppendInternalKey in the
  // db/format.{h,cc} module.
  Iterator* NewIterator();

  // Add an entry into memtable that maps key to value at the
  // specified sequence number and with the specified type.
  // Typically value will be empty if type==kTypeDeletion.
  void Add(SequenceNumber seq, ValueType type,
           const Slice& key,
           const Slice& value);

 private:
  // Adapts InternalKeyComparator to compare the raw length-prefixed
  // entries the skiplist stores.
  struct KeyComparator {
    const InternalKeyComparator comparator;
    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
    int operator()(const char* a, const char* b) const;
  };
  friend class MemTableIterator;
  friend class MemTableBackwardIterator;

  typedef SkipList<const char*, KeyComparator> Table;

  KeyComparator comparator_;
  Arena arena_;     // Backing storage for all entries (freed together)
  Table table_;

  // No copying allowed
  MemTable(const MemTable&);
  void operator=(const MemTable&);
};

}

#endif  // STORAGE_LEVELDB_DB_MEMTABLE_H_
|
@ -0,0 +1,396 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// We recover the contents of the descriptor from the other files we find.
|
||||
// (1) Any log files are first converted to tables
|
||||
// (2) We scan every table to compute
|
||||
// (a) smallest/largest for the table
|
||||
// (b) large value refs from the table
|
||||
// (c) largest sequence number in the table
|
||||
// (3) We generate descriptor contents:
|
||||
// - log number is set to zero
|
||||
// - next-file-number is set to 1 + largest file number we found
|
||||
// - last-sequence-number is set to largest sequence# found across
|
||||
// all tables (see 2c)
|
||||
// - compaction pointers are cleared
|
||||
// - every table file is added at level 0
|
||||
//
|
||||
// Possible optimization 1:
|
||||
// (a) Compute total size and use to pick appropriate max-level M
|
||||
// (b) Sort tables by largest sequence# in the table
|
||||
// (c) For each table: if it overlaps earlier table, place in level-0,
|
||||
// else place in level-M.
|
||||
// Possible optimization 2:
|
||||
// Store per-table metadata (smallest, largest, largest-seq#,
|
||||
// large-value-refs, ...) in the table's meta section to speed up
|
||||
// ScanTable.
|
||||
|
||||
#include "db/builder.h" |
||||
#include "db/db_impl.h" |
||||
#include "db/dbformat.h" |
||||
#include "db/filename.h" |
||||
#include "db/log_reader.h" |
||||
#include "db/log_writer.h" |
||||
#include "db/memtable.h" |
||||
#include "db/table_cache.h" |
||||
#include "db/version_edit.h" |
||||
#include "db/write_batch_internal.h" |
||||
#include "include/comparator.h" |
||||
#include "include/db.h" |
||||
#include "include/env.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
namespace { |
||||
|
||||
// Repairs a damaged database by scraping whatever data can be salvaged
// from the files on disk and writing a brand-new manifest describing it.
// See the recovery-strategy comment at the top of this file.
class Repairer {
 public:
  Repairer(const std::string& dbname, const Options& options)
      : dbname_(dbname),
        env_(options.env),
        icmp_(options.comparator),
        options_(SanitizeOptions(dbname, &icmp_, options)),
        // SanitizeOptions may substitute its own info log; remember
        // whether we own it so the destructor can free it.
        owns_info_log_(options_.info_log != options.info_log),
        next_file_number_(1) {
    // TableCache can be small since we expect each table to be opened once.
    table_cache_ = new TableCache(dbname_, &options_, 10);
  }

  ~Repairer() {
    delete table_cache_;
    if (owns_info_log_) {
      delete options_.info_log;
    }
  }

  // Top-level driver: enumerate files, convert logs to tables, scan the
  // tables, then write a fresh descriptor naming the surviving tables.
  Status Run() {
    Status status = FindFiles();
    if (status.ok()) {
      ConvertLogFilesToTables();
      ExtractMetaData();
      status = WriteDescriptor();
    }
    if (status.ok()) {
      // Summarize what was recovered.
      unsigned long long bytes = 0;
      for (int i = 0; i < tables_.size(); i++) {
        bytes += tables_[i].meta.file_size;
      }
      Log(env_, options_.info_log,
          "**** Repaired leveldb %s; "
          "recovered %d files; %llu bytes. "
          "Some data may have been lost. "
          "****",
          dbname_.c_str(),
          static_cast<int>(tables_.size()),
          bytes);
    }
    return status;
  }

 private:
  // Per-table bookkeeping gathered by ScanTable().
  struct TableInfo {
    FileMetaData meta;            // file number, size, smallest/largest keys
    SequenceNumber max_sequence;  // largest sequence# found in the table
  };

  std::string const dbname_;
  Env* const env_;
  InternalKeyComparator const icmp_;
  Options const options_;
  bool owns_info_log_;
  TableCache* table_cache_;
  VersionEdit edit_;   // accumulates large-value refs found while scanning

  std::vector<std::string> manifests_;   // old descriptor files (archived later)
  std::vector<uint64_t> table_numbers_;  // candidate table file numbers
  std::vector<uint64_t> logs_;           // log file numbers to convert
  std::vector<TableInfo> tables_;        // tables that scanned cleanly
  uint64_t next_file_number_;            // 1 + largest file number seen

  // Classify every file in the db directory into manifests_/logs_/
  // table_numbers_ and advance next_file_number_ past all of them.
  Status FindFiles() {
    std::vector<std::string> filenames;
    Status status = env_->GetChildren(dbname_, &filenames);
    if (!status.ok()) {
      return status;
    }
    if (filenames.empty()) {
      return Status::IOError(dbname_, "repair found no files");
    }

    uint64_t number;
    LargeValueRef large_ref;
    FileType type;
    for (int i = 0; i < filenames.size(); i++) {
      if (ParseFileName(filenames[i], &number, &large_ref, &type)) {
        if (type == kLargeValueFile) {
          // Will be picked up when we process a Table that points to it
        } else if (type == kDescriptorFile) {
          manifests_.push_back(filenames[i]);
        } else {
          if (number + 1 > next_file_number_) {
            next_file_number_ = number + 1;
          }
          if (type == kLogFile) {
            logs_.push_back(number);
          } else if (type == kTableFile) {
            table_numbers_.push_back(number);
          } else {
            // Ignore other files
          }
        }
      }
    }
    return status;
  }

  // Convert each log file into a table.  The original log is archived
  // (moved to the "lost" directory) whether or not conversion succeeded.
  void ConvertLogFilesToTables() {
    for (int i = 0; i < logs_.size(); i++) {
      std::string logname = LogFileName(dbname_, logs_[i]);
      Status status = ConvertLogToTable(logs_[i]);
      if (!status.ok()) {
        Log(env_, options_.info_log, "Log #%llu: ignoring conversion error: %s",
            (unsigned long long) logs_[i],
            status.ToString().c_str());
      }
      ArchiveFile(logname);
    }
  }

  // Replay one log file into an in-memory MemTable, then dump that
  // memtable out as a new table file.  Corrupt records are logged and
  // skipped rather than aborting the repair.
  Status ConvertLogToTable(uint64_t log) {
    struct LogReporter : public log::Reader::Reporter {
      Env* env;
      WritableFile* info_log;
      uint64_t lognum;
      virtual void Corruption(size_t bytes, const Status& s) {
        // We print error messages for corruption, but continue repairing.
        Log(env, info_log, "Log #%llu: dropping %d bytes; %s",
            (unsigned long long) lognum,
            static_cast<int>(bytes),
            s.ToString().c_str());
      }
    };

    // Open the log file
    std::string logname = LogFileName(dbname_, log);
    SequentialFile* lfile;
    Status status = env_->NewSequentialFile(logname, &lfile);
    if (!status.ok()) {
      return status;
    }

    // Create the log reader.
    LogReporter reporter;
    reporter.env = env_;
    reporter.info_log = options_.info_log;
    reporter.lognum = log;
    // We intentionally make log::Reader do checksumming so that
    // corruptions cause entire commits to be skipped instead of
    // propagating bad information (like overly large sequence
    // numbers).
    // NOTE(review): the comment above says checksumming is intended,
    // but the flag passed below is false (checksum verification
    // disabled) -- confirm which behavior is actually wanted.
    log::Reader reader(lfile, &reporter, false/*do not checksum*/);

    // Read all the records and add to a memtable
    std::string scratch;
    Slice record;
    WriteBatch batch;
    MemTable mem(icmp_);
    int counter = 0;  // number of operations successfully applied
    while (reader.ReadRecord(&record, &scratch)) {
      if (record.size() < 12) {
        // Presumably the 12-byte WriteBatch header (sequence + count)
        // -- confirm against write_batch_internal.h.
        reporter.Corruption(
            record.size(), Status::Corruption("log record too small"));
        continue;
      }
      WriteBatchInternal::SetContents(&batch, record);
      status = WriteBatchInternal::InsertInto(&batch, &mem);
      if (status.ok()) {
        counter += WriteBatchInternal::Count(&batch);
      } else {
        Log(env_, options_.info_log, "Log #%llu: ignoring %s",
            (unsigned long long) log,
            status.ToString().c_str());
        status = Status::OK();  // Keep going with rest of file
      }
    }
    delete lfile;

    // We ignore any version edits generated by the conversion to a Table
    // since ExtractMetaData() will also generate edits.
    VersionEdit skipped;
    FileMetaData meta;
    meta.number = next_file_number_++;
    Iterator* iter = mem.NewIterator();
    status = BuildTable(dbname_, env_, options_, table_cache_, iter,
                        &meta, &skipped);
    delete iter;
    if (status.ok()) {
      // Empty logs produce an empty (size 0) table; drop those.
      if (meta.file_size > 0) {
        table_numbers_.push_back(meta.number);
      }
    }
    Log(env_, options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
        (unsigned long long) log,
        counter,
        (unsigned long long) meta.number,
        status.ToString().c_str());
    return status;
  }

  // Scan every candidate table; tables that fail to scan are archived,
  // the rest are recorded in tables_.
  void ExtractMetaData() {
    // NOTE(review): 'kept' is never used -- candidate for removal.
    std::vector<TableInfo> kept;
    for (int i = 0; i < table_numbers_.size(); i++) {
      TableInfo t;
      t.meta.number = table_numbers_[i];
      Status status = ScanTable(&t);
      if (!status.ok()) {
        std::string fname = TableFileName(dbname_, table_numbers_[i]);
        Log(env_, options_.info_log, "Table #%llu: ignoring %s",
            (unsigned long long) table_numbers_[i],
            status.ToString().c_str());
        ArchiveFile(fname);
      } else {
        tables_.push_back(t);
      }
    }
  }

  // Walk every entry of table *t, recording smallest/largest keys, the
  // maximum sequence number, and any large-value references into edit_.
  Status ScanTable(TableInfo* t) {
    std::string fname = TableFileName(dbname_, t->meta.number);
    int counter = 0;
    Status status = env_->GetFileSize(fname, &t->meta.file_size);
    if (status.ok()) {
      Iterator* iter = table_cache_->NewIterator(
          ReadOptions(), t->meta.number);
      bool empty = true;
      ParsedInternalKey parsed;
      t->max_sequence = 0;
      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
        Slice key = iter->key();
        if (!ParseInternalKey(key, &parsed)) {
          // Unparsable keys are logged and skipped, not fatal.
          Log(env_, options_.info_log, "Table #%llu: unparsable key %s",
              (unsigned long long) t->meta.number,
              EscapeString(key).c_str());
          continue;
        }

        counter++;
        if (empty) {
          empty = false;
          t->meta.smallest.DecodeFrom(key);  // first parsable key is smallest
        }
        t->meta.largest.DecodeFrom(key);     // keep overwriting; last one wins
        if (parsed.sequence > t->max_sequence) {
          t->max_sequence = parsed.sequence;
        }

        if (ExtractValueType(key) == kTypeLargeValueRef) {
          if (iter->value().size() != LargeValueRef::ByteSize()) {
            Log(env_, options_.info_log, "Table #%llu: bad large value ref",
                (unsigned long long) t->meta.number);
          } else {
            edit_.AddLargeValueRef(LargeValueRef::FromRef(iter->value()),
                                   t->meta.number,
                                   key);
          }
        }
      }
      if (!iter->status().ok()) {
        status = iter->status();
      }
      delete iter;
    }
    Log(env_, options_.info_log, "Table #%llu: %d entries %s",
        (unsigned long long) t->meta.number,
        counter,
        status.ToString().c_str());
    return status;
  }

  // Write a fresh manifest (descriptor #1) that places every recovered
  // table at level 0, archive the old manifests, and point CURRENT at
  // the new descriptor.
  Status WriteDescriptor() {
    std::string tmp = TempFileName(dbname_, 1);
    WritableFile* file;
    Status status = env_->NewWritableFile(tmp, &file);
    if (!status.ok()) {
      return status;
    }

    // Recovered last-sequence-number is the max across all tables.
    SequenceNumber max_sequence = 0;
    for (int i = 0; i < tables_.size(); i++) {
      if (max_sequence < tables_[i].max_sequence) {
        max_sequence = tables_[i].max_sequence;
      }
    }

    edit_.SetComparatorName(icmp_.user_comparator()->Name());
    edit_.SetLogNumber(0);
    edit_.SetNextFile(next_file_number_);
    edit_.SetLastSequence(max_sequence);

    for (int i = 0; i < tables_.size(); i++) {
      // TODO(opt): separate out into multiple levels
      const TableInfo& t = tables_[i];
      edit_.AddFile(0, t.meta.number, t.meta.file_size,
                    t.meta.smallest, t.meta.largest);
    }

    //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
    {
      // Scope so the log::Writer is destroyed before the file is closed.
      log::Writer log(file);
      std::string record;
      edit_.EncodeTo(&record);
      status = log.AddRecord(record);
    }
    if (status.ok()) {
      status = file->Close();
    }
    delete file;
    file = NULL;

    if (!status.ok()) {
      env_->DeleteFile(tmp);
    } else {
      // Discard older manifests
      for (int i = 0; i < manifests_.size(); i++) {
        ArchiveFile(dbname_ + "/" + manifests_[i]);
      }

      // Install new manifest
      status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
      if (status.ok()) {
        status = SetCurrentFile(env_, dbname_, 1);
      } else {
        env_->DeleteFile(tmp);
      }
    }
    return status;
  }

  // Move "fname" into a sibling "lost" directory.  E.g., for
  //    dir/foo
  // rename to
  //    dir/lost/foo
  void ArchiveFile(const std::string& fname) {
    const char* slash = strrchr(fname.c_str(), '/');
    std::string new_dir;
    if (slash != NULL) {
      new_dir.assign(fname.data(), slash - fname.data());
    }
    new_dir.append("/lost");
    env_->CreateDir(new_dir);  // Ignore error
    std::string new_file = new_dir;
    new_file.append("/");
    new_file.append((slash == NULL) ? fname.c_str() : slash + 1);
    Status s = env_->RenameFile(fname, new_file);
    Log(env_, options_.info_log, "Archiving %s: %s\n",
        fname.c_str(), s.ToString().c_str());
  }
};
||||
} |
||||
|
||||
// Repair a possibly-damaged database named "dbname".  See the comment
// at the top of this file for the recovery strategy that is applied.
Status RepairDB(const std::string& dbname, const Options& options) {
  Repairer r(dbname, options);
  return r.Run();
}
||||
|
||||
} |
@ -0,0 +1,378 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// Thread safety
|
||||
// -------------
|
||||
//
|
||||
// Writes require external synchronization, most likely a mutex.
|
||||
// Reads require a guarantee that the SkipList will not be destroyed
|
||||
// while the read is in progress. Apart from that, reads progress
|
||||
// without any internal locking or synchronization.
|
||||
//
|
||||
// Invariants:
|
||||
//
|
||||
// (1) Allocated nodes are never deleted until the SkipList is
|
||||
// destroyed. This is trivially guaranteed by the code since we
|
||||
// never delete any skip list nodes.
|
||||
//
|
||||
// (2) The contents of a Node except for the next/prev pointers are
|
||||
// immutable after the Node has been linked into the SkipList.
|
||||
// Only Insert() modifies the list, and it is careful to initialize
|
||||
// a node and use release-stores to publish the nodes in one or
|
||||
// more lists.
|
||||
//
|
||||
// ... prev vs. next pointer ordering ...
|
||||
|
||||
#include <assert.h> |
||||
#include <stdlib.h> |
||||
#include "port/port.h" |
||||
#include "util/arena.h" |
||||
#include "util/random.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
class Arena; |
||||
|
||||
// Ordered set of Keys supporting concurrent lock-free reads with a
// single externally-synchronized writer.  See the thread-safety notes
// at the top of this file.
template<typename Key, class Comparator>
class SkipList {
 private:
  struct Node;

 public:
  // Create a new SkipList object that will use "cmp" for comparing keys,
  // and will allocate memory using "*arena".  Objects allocated in the arena
  // must remain allocated for the lifetime of the skiplist object.
  explicit SkipList(Comparator cmp, Arena* arena);

  // Insert key into the list.
  // REQUIRES: nothing that compares equal to key is currently in the list.
  void Insert(const Key& key);

  // Returns true iff an entry that compares equal to key is in the list.
  bool Contains(const Key& key) const;

  // Iteration over the contents of a skip list
  class Iterator {
   public:
    // Initialize an iterator over the specified list.
    // The returned iterator is not valid.
    explicit Iterator(const SkipList* list);

    // Returns true iff the iterator is positioned at a valid node.
    bool Valid() const;

    // Returns the key at the current position.
    // REQUIRES: Valid()
    const Key& key() const;

    // Advances to the next position.
    // REQUIRES: Valid()
    void Next();

    // Advances to the previous position.
    // REQUIRES: Valid()
    void Prev();

    // Advance to the first entry with a key >= target
    void Seek(const Key& target);

    // Position at the first entry in list.
    // Final state of iterator is Valid() iff list is not empty.
    void SeekToFirst();

    // Position at the last entry in list.
    // Final state of iterator is Valid() iff list is not empty.
    void SeekToLast();

   private:
    const SkipList* list_;
    Node* node_;
    // Intentionally copyable
  };

 private:
  // Upper bound on a node's tower height.
  enum { kMaxHeight = 12 };

  // Immutable after construction
  Comparator const compare_;
  Arena* const arena_;    // Arena used for allocations of nodes

  Node* const head_;      // sentinel; its key is never inspected

  // Modified only by Insert().  Read racily by readers, but stale
  // values are ok.
  port::AtomicPointer max_height_;   // Height of the entire list

  // Current list height, stored as a pointer-sized integer inside
  // max_height_.
  inline int GetMaxHeight() const {
    return reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load());
  }

  // Read/written only by Insert().
  Random rnd_;

  Node* NewNode(const Key& key, int height);
  int RandomHeight();
  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }

  // Return true if key is greater than the data stored in "n"
  bool KeyIsAfterNode(const Key& key, Node* n) const;

  // Return the earliest node that comes at or after key.
  // Return NULL if there is no such node.
  //
  // If prev is non-NULL, fills prev[level] with pointer to previous
  // node at "level" for every level in [0..max_height_-1].
  Node* FindGreaterOrEqual(const Key& key, Node** prev) const;

  // Return the latest node with a key < key.
  // Return head_ if there is no such node.
  Node* FindLessThan(const Key& key) const;

  // Return the last node in the list.
  // Return head_ if list is empty.
  Node* FindLast() const;

  // No copying allowed
  SkipList(const SkipList&);
  void operator=(const SkipList&);
};
||||
|
||||
// Implementation details follow
|
||||
// One skiplist node: an immutable key plus a tower of forward links.
template<typename Key, class Comparator>
struct SkipList<Key,Comparator>::Node {
  explicit Node(const Key& k) : key(k) { }

  Key const key;

  // Accessors/mutators for links.  Wrapped in methods so we can
  // add the appropriate barriers as necessary.
  Node* Next(int n) {
    assert(n >= 0);
    // Use an 'acquire load' so that we observe a fully initialized
    // version of the returned Node.
    return reinterpret_cast<Node*>(next_[n].Acquire_Load());
  }
  void SetNext(int n, Node* x) {
    assert(n >= 0);
    // Use a 'release store' so that anybody who reads through this
    // pointer observes a fully initialized version of the inserted node.
    next_[n].Release_Store(x);
  }

  // No-barrier variants that can be safely used in a few locations.
  Node* NoBarrier_Next(int n) {
    assert(n >= 0);
    return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
  }
  void NoBarrier_SetNext(int n, Node* x) {
    assert(n >= 0);
    next_[n].NoBarrier_Store(x);
  }

 private:
  // Array of length equal to the node height.  next_[0] is lowest level link.
  // Declared with length 1, but NewNode() over-allocates so the array
  // actually extends to the node's full height.
  port::AtomicPointer next_[1];
};
||||
|
||||
// Allocate a node of the given tower height from the arena.
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node*
SkipList<Key,Comparator>::NewNode(const Key& key, int height) {
  // Over-allocate so Node::next_ has room for "height" links
  // (sizeof(Node) already includes one), then placement-new the node
  // into the arena memory.
  char* mem = arena_->AllocateAligned(
      sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
  return new (mem) Node(key);
}
||||
|
||||
template<typename Key, class Comparator> |
||||
inline SkipList<Key,Comparator>::Iterator::Iterator(const SkipList* list) { |
||||
list_ = list; |
||||
node_ = NULL; |
||||
} |
||||
|
||||
// NULL marks both the never-positioned and the past-the-end states.
template<typename Key, class Comparator>
inline bool SkipList<Key,Comparator>::Iterator::Valid() const {
  return node_ != NULL;
}
||||
|
||||
// Key at the current position.  REQUIRES: Valid().
template<typename Key, class Comparator>
inline const Key& SkipList<Key,Comparator>::Iterator::key() const {
  assert(Valid());
  return node_->key;
}
||||
|
||||
// Step forward along the level-0 (fully linked) list.  REQUIRES: Valid().
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::Next() {
  assert(Valid());
  node_ = node_->Next(0);
}
||||
|
||||
// Step backward.  REQUIRES: Valid().
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::Prev() {
  // Instead of using explicit "prev" links, we just search for the
  // last node that falls before key.
  assert(Valid());
  node_ = list_->FindLessThan(node_->key);
  if (node_ == list_->head_) {
    // Nothing before the first real node: iterator becomes invalid.
    node_ = NULL;
  }
}
||||
|
||||
// Position at the first node with key >= target (invalid if none).
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::Seek(const Key& target) {
  node_ = list_->FindGreaterOrEqual(target, NULL);
}
||||
|
||||
// The first entry is the level-0 successor of the head sentinel.
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::SeekToFirst() {
  node_ = list_->head_->Next(0);
}
||||
|
||||
// Position at the last entry; FindLast() returns head_ for an empty
// list, which maps to the invalid state.
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::SeekToLast() {
  node_ = list_->FindLast();
  if (node_ == list_->head_) {
    node_ = NULL;
  }
}
||||
|
||||
// Pick a tower height for a new node: geometric distribution with
// parameter 1/kBranching, capped at kMaxHeight.
template<typename Key, class Comparator>
int SkipList<Key,Comparator>::RandomHeight() {
  // Increase height with probability 1 in kBranching
  static const unsigned int kBranching = 4;
  int height = 1;
  while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) {
    height++;
  }
  assert(height > 0);
  assert(height <= kMaxHeight);
  return height;
}
||||
|
||||
// True when the search for "key" must continue past node "n",
// i.e. n's key is strictly less than key.
template<typename Key, class Comparator>
bool SkipList<Key,Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
  // NULL n is considered infinite
  return (n != NULL) && (compare_(n->key, key) < 0);
}
||||
|
||||
// Standard skip-list descent: walk right while the next node is still
// < key, otherwise drop one level; at level 0 the next node is the
// answer (possibly NULL).
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOrEqual(const Key& key, Node** prev)
    const {
  Node* x = head_;
  int level = GetMaxHeight() - 1;
  while (true) {
    Node* next = x->Next(level);
    if (KeyIsAfterNode(key, next)) {
      // Keep searching in this list
      x = next;
    } else {
      if (prev != NULL) prev[level] = x;  // record splice point for Insert()
      if (level == 0) {
        return next;
      } else {
        // Switch to next list
        level--;
      }
    }
  }
}
||||
|
||||
// Return the latest node whose key is strictly < key, or head_ if none.
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node*
SkipList<Key,Comparator>::FindLessThan(const Key& key) const {
  Node* x = head_;
  int level = GetMaxHeight() - 1;
  while (true) {
    // Loop invariant: x is head_ or sorts strictly before key.
    assert(x == head_ || compare_(x->key, key) < 0);
    Node* next = x->Next(level);
    if (next == NULL || compare_(next->key, key) >= 0) {
      if (level == 0) {
        return x;
      } else {
        // Switch to next list
        level--;
      }
    } else {
      x = next;
    }
  }
}
||||
|
||||
// Return the last node in the list, or head_ if the list is empty.
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindLast()
    const {
  Node* x = head_;
  int level = GetMaxHeight() - 1;
  while (true) {
    Node* next = x->Next(level);
    if (next == NULL) {
      if (level == 0) {
        return x;
      } else {
        // Switch to next list
        level--;
      }
    } else {
      x = next;
    }
  }
}
||||
|
||||
// Build an empty list: a full-height head sentinel whose links are all
// NULL, with the effective list height starting at 1.
template<typename Key, class Comparator>
SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
    : compare_(cmp),
      arena_(arena),
      head_(NewNode(0 /* any key will do */, kMaxHeight)),
      // Height is stored as a pointer-sized integer inside the
      // AtomicPointer.
      max_height_(reinterpret_cast<void*>(1)),
      rnd_(0xdeadbeef) {
  for (int i = 0; i < kMaxHeight; i++) {
    head_->SetNext(i, NULL);
  }
}
||||
|
||||
// Insert "key" into the list.  Caller must synchronize writers
// externally; readers may run concurrently (see notes at top of file).
// REQUIRES: nothing equal to key is currently in the list.
template<typename Key, class Comparator>
void SkipList<Key,Comparator>::Insert(const Key& key) {
  // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
  // here since Insert() is externally synchronized.
  Node* prev[kMaxHeight];
  Node* x = FindGreaterOrEqual(key, prev);

  // Our data structure does not allow duplicate insertion
  assert(x == NULL || !Equal(key, x->key));

  int height = RandomHeight();
  if (height > GetMaxHeight()) {
    // New levels splice directly after the head sentinel.
    for (int i = GetMaxHeight(); i < height; i++) {
      prev[i] = head_;
    }
    //fprintf(stderr, "Change height from %d to %d\n", max_height_, height);

    // It is ok to mutate max_height_ without any synchronization
    // with concurrent readers.  A concurrent reader that observes
    // the new value of max_height_ will see either the old value of
    // new level pointers from head_ (NULL), or a new value set in
    // the loop below.  In the former case the reader will
    // immediately drop to the next level since NULL sorts after all
    // keys.  In the latter case the reader will use the new node.
    max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
  }

  x = NewNode(key, height);
  for (int i = 0; i < height; i++) {
    // NoBarrier_SetNext() suffices since we will add a barrier when
    // we publish a pointer to "x" in prev[i].
    x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
    prev[i]->SetNext(i, x);
  }
}
||||
|
||||
template<typename Key, class Comparator> |
||||
bool SkipList<Key,Comparator>::Contains(const Key& key) const { |
||||
Node* x = FindGreaterOrEqual(key, NULL); |
||||
if (x != NULL && Equal(key, x->key)) { |
||||
return true; |
||||
} else { |
||||
return false; |
||||
} |
||||
} |
||||
|
||||
} |
@ -0,0 +1,378 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/skiplist.h" |
||||
#include <set> |
||||
#include "include/env.h" |
||||
#include "util/arena.h" |
||||
#include "util/hash.h" |
||||
#include "util/random.h" |
||||
#include "util/testharness.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
// The skiplist under test stores plain 64-bit integers.
typedef uint64_t Key;

// Three-way comparison functor: negative, zero, or positive depending
// on whether a sorts before, equal to, or after b.
struct Comparator {
  int operator()(const Key& a, const Key& b) const {
    if (a == b) {
      return 0;
    }
    return (a < b) ? -1 : +1;
  }
};
||||
|
||||
// Empty fixture class required by the TEST() harness macro.
class SkipTest { };
||||
|
||||
// An empty list: Contains() is false and every way of positioning an
// iterator leaves it invalid.
TEST(SkipTest, Empty) {
  Arena arena;
  Comparator cmp;
  SkipList<Key, Comparator> list(cmp, &arena);
  ASSERT_TRUE(!list.Contains(10));

  SkipList<Key, Comparator>::Iterator iter(&list);
  ASSERT_TRUE(!iter.Valid());
  iter.SeekToFirst();
  ASSERT_TRUE(!iter.Valid());
  iter.Seek(100);
  ASSERT_TRUE(!iter.Valid());
  iter.SeekToLast();
  ASSERT_TRUE(!iter.Valid());
}
||||
|
||||
// Exercise Insert/Contains and all iterator movements against a
// std::set<Key> used as the reference model.
TEST(SkipTest, InsertAndLookup) {
  const int N = 2000;  // number of insertion attempts
  const int R = 5000;  // key space
  Random rnd(1000);
  std::set<Key> keys;
  Arena arena;
  Comparator cmp;
  SkipList<Key, Comparator> list(cmp, &arena);
  for (int i = 0; i < N; i++) {
    Key key = rnd.Next() % R;
    // Only insert keys not already present (duplicates are disallowed).
    if (keys.insert(key).second) {
      list.Insert(key);
    }
  }

  // Membership must agree with the model for every key in the space.
  for (int i = 0; i < R; i++) {
    if (list.Contains(i)) {
      ASSERT_EQ(keys.count(i), 1);
    } else {
      ASSERT_EQ(keys.count(i), 0);
    }
  }

  // Simple iterator tests
  {
    SkipList<Key, Comparator>::Iterator iter(&list);
    ASSERT_TRUE(!iter.Valid());

    iter.Seek(0);
    ASSERT_TRUE(iter.Valid());
    ASSERT_EQ(*(keys.begin()), iter.key());

    iter.SeekToFirst();
    ASSERT_TRUE(iter.Valid());
    ASSERT_EQ(*(keys.begin()), iter.key());

    iter.SeekToLast();
    ASSERT_TRUE(iter.Valid());
    ASSERT_EQ(*(keys.rbegin()), iter.key());
  }

  // Forward iteration test
  for (int i = 0; i < R; i++) {
    SkipList<Key, Comparator>::Iterator iter(&list);
    iter.Seek(i);

    // Compare against model iterator
    std::set<Key>::iterator model_iter = keys.lower_bound(i);
    for (int j = 0; j < 3; j++) {  // only check the next three entries
      if (model_iter == keys.end()) {
        ASSERT_TRUE(!iter.Valid());
        break;
      } else {
        ASSERT_TRUE(iter.Valid());
        ASSERT_EQ(*model_iter, iter.key());
        ++model_iter;
        iter.Next();
      }
    }
  }

  // Backward iteration test
  {
    SkipList<Key, Comparator>::Iterator iter(&list);
    iter.SeekToLast();

    // Compare against model iterator
    for (std::set<Key>::reverse_iterator model_iter = keys.rbegin();
         model_iter != keys.rend();
         ++model_iter) {
      ASSERT_TRUE(iter.Valid());
      ASSERT_EQ(*model_iter, iter.key());
      iter.Prev();
    }
    ASSERT_TRUE(!iter.Valid());
  }
}
||||
|
||||
// We want to make sure that with a single writer and multiple
|
||||
// concurrent readers (with no synchronization other than when a
|
||||
// reader's iterator is created), the reader always observes all the
|
||||
// data that was present in the skip list when the iterator was
|
||||
// constructed. Because insertions are happening concurrently, we may
|
||||
// also observe new values that were inserted since the iterator was
|
||||
// constructed, but we should never miss any values that were present
|
||||
// at iterator construction time.
|
||||
//
|
||||
// We generate multi-part keys:
|
||||
// <key,gen,hash>
|
||||
// where:
|
||||
// key is in range [0..K-1]
|
||||
// gen is a generation number for key
|
||||
// hash is hash(key,gen)
|
||||
//
|
||||
// The insertion code picks a random key, sets gen to be 1 + the last
|
||||
// generation number inserted for that key, and sets hash to Hash(key,gen).
|
||||
//
|
||||
// At the beginning of a read, we snapshot the last inserted
|
||||
// generation number for each key. We then iterate, including random
|
||||
// calls to Next() and Seek(). For every key we encounter, we
|
||||
// check that it is either expected given the initial snapshot or has
|
||||
// been concurrently added since the iterator started.
|
||||
// Scaffolding for the single-writer/multi-reader test described in the
// comment block above.
class ConcurrentTest {
 private:
  static const uint32_t K = 4;  // number of distinct key slots

  // A test Key packs three fields into one uint64_t:
  //   <key:24 bits 63..40> <gen:32 bits 39..8> <hash:8 bits 7..0>
  static uint64_t key(Key key) { return (key >> 40); }
  static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
  static uint64_t hash(Key key) { return key & 0xff; }

  static uint64_t HashNumbers(uint64_t k, uint64_t g) {
    uint64_t data[2] = { k, g };
    return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
  }

  static Key MakeKey(uint64_t k, uint64_t g) {
    assert(sizeof(Key) == sizeof(uint64_t));
    assert(k <= K);  // We sometimes pass K to seek to the end of the skiplist
    assert(g <= 0xffffffffu);
    return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
  }

  // A key is self-validating: its low byte must equal the hash of its
  // other two fields.
  static bool IsValidKey(Key k) {
    return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
  }

  // Pick a seek target: beginning (10%), end (10%), or a random slot.
  static Key RandomTarget(Random* rnd) {
    switch (rnd->Next() % 10) {
      case 0:
        // Seek to beginning
        return MakeKey(0, 0);
      case 1:
        // Seek to end
        return MakeKey(K, 0);
      default:
        // Seek to middle
        return MakeKey(rnd->Next() % K, 0);
    }
  }

  // Per-key generation
  struct State {
    port::AtomicPointer generation[K];
    void Set(int k, intptr_t v) {
      generation[k].Release_Store(reinterpret_cast<void*>(v));
    }
    intptr_t Get(int k) {
      return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
    }

    State() {
      for (int k = 0; k < K; k++) {
        Set(k, 0);
      }
    }
  };

  // Current state of the test
  State current_;

  Arena arena_;

  // SkipList is not protected by mu_.  We just use a single writer
  // thread to modify it.
  SkipList<Key, Comparator> list_;

 public:
  ConcurrentTest() : list_(Comparator(), &arena_) { }

  // Insert the next generation of a randomly chosen key slot, then
  // publish the new generation number.
  // REQUIRES: External synchronization
  void WriteStep(Random* rnd) {
    const uint32_t k = rnd->Next() % K;
    const intptr_t g = current_.Get(k) + 1;
    const Key key = MakeKey(k, g);
    list_.Insert(key);
    current_.Set(k, g);
  }

  // One reader pass: snapshot the per-slot generations, then wander the
  // list via Next()/Seek(), checking that every key skipped over in
  // [pos,current) was absent from the snapshot.
  void ReadStep(Random* rnd) {
    // Remember the initial committed state of the skiplist.
    State initial_state;
    for (int k = 0; k < K; k++) {
      initial_state.Set(k, current_.Get(k));
    }

    Key pos = RandomTarget(rnd);
    SkipList<Key, Comparator>::Iterator iter(&list_);
    iter.Seek(pos);
    while (true) {
      Key current;
      if (!iter.Valid()) {
        current = MakeKey(K, 0);  // sentinel past the last valid slot
      } else {
        current = iter.key();
        ASSERT_TRUE(IsValidKey(current)) << std::hex << current;
      }
      ASSERT_LE(pos, current) << "should not go backwards";

      // Verify that everything in [pos,current) was not present in
      // initial_state.
      while (pos < current) {
        ASSERT_LT(key(pos), K) << std::hex << pos;

        // Note that generation 0 is never inserted, so it is ok if
        // <*,0,*> is missing.
        ASSERT_TRUE((gen(pos) == 0) ||
                    (gen(pos) > initial_state.Get(key(pos)))
                    ) << "key: " << key(pos)
                      << "; gen: " << gen(pos)
                      << "; initgen: "
                      << initial_state.Get(key(pos));

        // Advance to next key in the valid key space
        if (key(pos) < key(current)) {
          pos = MakeKey(key(pos) + 1, 0);
        } else {
          pos = MakeKey(key(pos), gen(pos) + 1);
        }
      }

      if (!iter.Valid()) {
        break;
      }

      // Randomly alternate between stepping and re-seeking (never
      // moving backwards).
      if (rnd->Next() % 2) {
        iter.Next();
        pos = MakeKey(key(pos), gen(pos) + 1);
      } else {
        Key new_target = RandomTarget(rnd);
        if (new_target > pos) {
          pos = new_target;
          iter.Seek(new_target);
        }
      }
    }
  }
};
||||
// Out-of-class definition for the odr-used static member.
const uint32_t ConcurrentTest::K;
||||
|
||||
// Simple test that does single-threaded testing of the ConcurrentTest
// scaffolding: interleaves ReadStep/WriteStep on one thread to validate
// the checking machinery before any real concurrency is involved.
TEST(SkipTest, ConcurrentWithoutThreads) {
  ConcurrentTest test;
  Random rnd(test::RandomSeed());
  for (int i = 0; i < 10000; i++) {
    test.ReadStep(&rnd);
    test.WriteStep(&rnd);
  }
}
||||
|
||||
class TestState { |
||||
public: |
||||
ConcurrentTest t_; |
||||
int seed_; |
||||
port::AtomicPointer quit_flag_; |
||||
|
||||
enum ReaderState { |
||||
STARTING, |
||||
RUNNING, |
||||
DONE |
||||
}; |
||||
|
||||
explicit TestState(int s) |
||||
: seed_(s), |
||||
quit_flag_(NULL), |
||||
state_(STARTING), |
||||
state_cv_(&mu_) {} |
||||
|
||||
void Wait(ReaderState s) { |
||||
mu_.Lock(); |
||||
while (state_ != s) { |
||||
state_cv_.Wait(); |
||||
} |
||||
mu_.Unlock(); |
||||
} |
||||
|
||||
void Change(ReaderState s) { |
||||
mu_.Lock(); |
||||
state_ = s; |
||||
state_cv_.Signal(); |
||||
mu_.Unlock(); |
||||
} |
||||
|
||||
private: |
||||
port::Mutex mu_; |
||||
ReaderState state_; |
||||
port::CondVar state_cv_; |
||||
}; |
||||
|
||||
static void ConcurrentReader(void* arg) { |
||||
TestState* state = reinterpret_cast<TestState*>(arg); |
||||
Random rnd(state->seed_); |
||||
int64_t reads = 0; |
||||
state->Change(TestState::RUNNING); |
||||
while (!state->quit_flag_.Acquire_Load()) { |
||||
state->t_.ReadStep(&rnd); |
||||
++reads; |
||||
} |
||||
state->Change(TestState::DONE); |
||||
} |
||||
|
||||
static void RunConcurrent(int run) { |
||||
const int seed = test::RandomSeed() + (run * 100); |
||||
Random rnd(seed); |
||||
const int N = 1000; |
||||
const int kSize = 1000; |
||||
for (int i = 0; i < N; i++) { |
||||
if ((i % 100) == 0) { |
||||
fprintf(stderr, "Run %d of %d\n", i, N); |
||||
} |
||||
TestState state(seed + 1); |
||||
Env::Default()->Schedule(ConcurrentReader, &state); |
||||
state.Wait(TestState::RUNNING); |
||||
for (int i = 0; i < kSize; i++) { |
||||
state.t_.WriteStep(&rnd); |
||||
} |
||||
state.quit_flag_.Release_Store(&state); // Any non-NULL arg will do
|
||||
state.Wait(TestState::DONE); |
||||
} |
||||
} |
||||
|
||||
// Five runs with different seeds to widen schedule coverage.
TEST(SkipTest, Concurrent1) { RunConcurrent(1); }
TEST(SkipTest, Concurrent2) { RunConcurrent(2); }
TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
||||
|
||||
} |
||||
|
||||
int main(int argc, char** argv) { |
||||
return leveldb::test::RunAllTests(); |
||||
} |
@ -0,0 +1,66 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_ |
||||
#define STORAGE_LEVELDB_DB_SNAPSHOT_H_ |
||||
|
||||
#include "include/db.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
class SnapshotList; |
||||
|
||||
// Snapshots are kept in a doubly-linked list in the DB.
|
||||
// Each Snapshot corresponds to a particular sequence number.
|
||||
class Snapshot { |
||||
public: |
||||
SequenceNumber number_; // const after creation
|
||||
|
||||
private: |
||||
friend class SnapshotList; |
||||
|
||||
// Snapshot is kept in a doubly-linked circular list
|
||||
Snapshot* prev_; |
||||
Snapshot* next_; |
||||
|
||||
SnapshotList* list_; // just for sanity checks
|
||||
}; |
||||
|
||||
class SnapshotList { |
||||
public: |
||||
SnapshotList() { |
||||
list_.prev_ = &list_; |
||||
list_.next_ = &list_; |
||||
} |
||||
|
||||
bool empty() const { return list_.next_ == &list_; } |
||||
Snapshot* oldest() const { assert(!empty()); return list_.next_; } |
||||
Snapshot* newest() const { assert(!empty()); return list_.prev_; } |
||||
|
||||
const Snapshot* New(SequenceNumber seq) { |
||||
Snapshot* s = new Snapshot; |
||||
s->number_ = seq; |
||||
s->list_ = this; |
||||
s->next_ = &list_; |
||||
s->prev_ = list_.prev_; |
||||
s->prev_->next_ = s; |
||||
s->next_->prev_ = s; |
||||
return s; |
||||
} |
||||
|
||||
void Delete(const Snapshot* s) { |
||||
assert(s->list_ == this); |
||||
s->prev_->next_ = s->next_; |
||||
s->next_->prev_ = s->prev_; |
||||
delete s; |
||||
} |
||||
|
||||
private: |
||||
// Dummy head of doubly-linked list of snapshots
|
||||
Snapshot list_; |
||||
}; |
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_DB_SNAPSHOT_H_
|
@ -0,0 +1,94 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/table_cache.h" |
||||
|
||||
#include "db/filename.h" |
||||
#include "include/env.h" |
||||
#include "include/table.h" |
||||
#include "util/coding.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
struct TableAndFile { |
||||
RandomAccessFile* file; |
||||
Table* table; |
||||
}; |
||||
|
||||
// Cache deleter: invoked when a TableAndFile entry is evicted; frees the
// table, the underlying file, and the holder itself.
static void DeleteEntry(const Slice& key, void* value) {
  TableAndFile* tf = reinterpret_cast<TableAndFile*>(value);
  delete tf->table;
  delete tf->file;
  delete tf;
}
||||
|
||||
// Iterator cleanup hook: drops the cache reference (arg2) held on behalf
// of an iterator created over the cached table (cache passed as arg1).
static void UnrefEntry(void* arg1, void* arg2) {
  Cache* cache = reinterpret_cast<Cache*>(arg1);
  Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
  cache->Release(h);
}
||||
|
||||
// Construct a cache that keeps at most "entries" tables open for the
// database named "dbname".
TableCache::TableCache(const std::string& dbname,
                       const Options* options,
                       int entries)
    : env_(options->env),
      dbname_(dbname),
      options_(options),
      cache_(NewLRUCache(entries)) {
}
||||
|
||||
// Destroying the cache evicts every entry, closing all cached tables/files
// via DeleteEntry.
TableCache::~TableCache() {
  delete cache_;
}
||||
|
||||
// Return an iterator over the table stored in file "file_number", opening
// and caching the table on a miss.  On failure an error iterator is
// returned and nothing is cached.
Iterator* TableCache::NewIterator(const ReadOptions& options,
                                  uint64_t file_number,
                                  Table** tableptr) {
  if (tableptr != NULL) {
    *tableptr = NULL;
  }

  // The cache key is the fixed-width little-endian file number.
  char buf[sizeof(file_number)];
  EncodeFixed64(buf, file_number);
  Slice key(buf, sizeof(buf));
  Cache::Handle* handle = cache_->Lookup(key);
  if (handle == NULL) {
    // Miss: open the table file and install it in the cache.
    std::string fname = TableFileName(dbname_, file_number);
    RandomAccessFile* file = NULL;
    Table* table = NULL;
    Status s = env_->NewRandomAccessFile(fname, &file);
    if (s.ok()) {
      s = Table::Open(*options_, file, &table);
    }

    if (!s.ok()) {
      assert(table == NULL);
      delete file;
      // We do not cache error results so that if the error is transient,
      // or somebody repairs the file, we recover automatically.
      return NewErrorIterator(s);
    }

    TableAndFile* tf = new TableAndFile;
    tf->file = file;
    tf->table = table;
    handle = cache_->Insert(key, tf, 1, &DeleteEntry);
  }

  Table* table = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
  Iterator* result = table->NewIterator(options);
  // The iterator pins the cache entry; the reference is dropped when the
  // iterator is destroyed.
  result->RegisterCleanup(&UnrefEntry, cache_, handle);
  if (tableptr != NULL) {
    *tableptr = table;
  }
  return result;
}
||||
|
||||
// Drop any cached entry for "file_number" (e.g. after the file is deleted).
void TableCache::Evict(uint64_t file_number) {
  char buf[sizeof(file_number)];
  EncodeFixed64(buf, file_number);
  cache_->Erase(Slice(buf, sizeof(buf)));
}
||||
|
||||
} |
@ -0,0 +1,49 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// Thread-safe (provides internal synchronization)
|
||||
|
||||
#ifndef STORAGE_LEVELDB_DB_TABLE_CACHE_H_ |
||||
#define STORAGE_LEVELDB_DB_TABLE_CACHE_H_ |
||||
|
||||
#include <string> |
||||
#include <stdint.h> |
||||
#include "db/dbformat.h" |
||||
#include "include/cache.h" |
||||
#include "include/table.h" |
||||
#include "port/port.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
class Env; |
||||
|
||||
class TableCache { |
||||
public: |
||||
TableCache(const std::string& dbname, const Options* options, int entries); |
||||
~TableCache(); |
||||
|
||||
// Get an iterator for the specified file number and return it. If
|
||||
// "tableptr" is non-NULL, also sets "*tableptr" to point to the
|
||||
// Table object underlying the returned iterator, or NULL if no
|
||||
// Table object underlies the returned iterator. The returned
|
||||
// "*tableptr" object is owned by the cache and should not be
|
||||
// deleted, and is valid for as long as the returned iterator is
|
||||
// live.
|
||||
Iterator* NewIterator(const ReadOptions& options, |
||||
uint64_t file_number, |
||||
Table** tableptr = NULL); |
||||
|
||||
// Evict any entry for the specified file number
|
||||
void Evict(uint64_t file_number); |
||||
|
||||
private: |
||||
Env* const env_; |
||||
const std::string dbname_; |
||||
const Options* options_; |
||||
Cache* cache_; |
||||
}; |
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_DB_TABLE_CACHE_H_
|
@ -0,0 +1,282 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/version_edit.h" |
||||
|
||||
#include "db/version_set.h" |
||||
#include "util/coding.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
// Tag numbers for serialized VersionEdit.  These numbers are written to
// disk and should not be changed.
enum Tag {
  kComparator     = 1,
  kLogNumber      = 2,
  kNextFileNumber = 3,
  kLastSequence   = 4,
  kCompactPointer = 5,
  kDeletedFile    = 6,
  kNewFile        = 7,
  kLargeValueRef  = 8,
};
||||
|
||||
// Reset the edit to an empty state: no fields set, no file or large-value
// changes recorded.
void VersionEdit::Clear() {
  comparator_.clear();
  log_number_ = 0;
  last_sequence_ = 0;
  next_file_number_ = 0;
  has_comparator_ = false;
  has_log_number_ = false;
  has_next_file_number_ = false;
  has_last_sequence_ = false;
  deleted_files_.clear();
  new_files_.clear();
  large_refs_added_.clear();
}
||||
|
||||
void VersionEdit::EncodeTo(std::string* dst) const { |
||||
if (has_comparator_) { |
||||
PutVarint32(dst, kComparator); |
||||
PutLengthPrefixedSlice(dst, comparator_); |
||||
} |
||||
if (has_log_number_) { |
||||
PutVarint32(dst, kLogNumber); |
||||
PutVarint64(dst, log_number_); |
||||
} |
||||
if (has_next_file_number_) { |
||||
PutVarint32(dst, kNextFileNumber); |
||||
PutVarint64(dst, next_file_number_); |
||||
} |
||||
if (has_last_sequence_) { |
||||
PutVarint32(dst, kLastSequence); |
||||
PutVarint64(dst, last_sequence_); |
||||
} |
||||
|
||||
for (int i = 0; i < compact_pointers_.size(); i++) { |
||||
PutVarint32(dst, kCompactPointer); |
||||
PutVarint32(dst, compact_pointers_[i].first); // level
|
||||
PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode()); |
||||
} |
||||
|
||||
for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); |
||||
iter != deleted_files_.end(); |
||||
++iter) { |
||||
PutVarint32(dst, kDeletedFile); |
||||
PutVarint32(dst, iter->first); // level
|
||||
PutVarint64(dst, iter->second); // file number
|
||||
} |
||||
|
||||
for (int i = 0; i < new_files_.size(); i++) { |
||||
const FileMetaData& f = new_files_[i].second; |
||||
PutVarint32(dst, kNewFile); |
||||
PutVarint32(dst, new_files_[i].first); // level
|
||||
PutVarint64(dst, f.number); |
||||
PutVarint64(dst, f.file_size); |
||||
PutLengthPrefixedSlice(dst, f.smallest.Encode()); |
||||
PutLengthPrefixedSlice(dst, f.largest.Encode()); |
||||
} |
||||
|
||||
for (int i = 0; i < large_refs_added_.size(); i++) { |
||||
const VersionEdit::Large& l = large_refs_added_[i]; |
||||
PutVarint32(dst, kLargeValueRef); |
||||
PutLengthPrefixedSlice(dst, |
||||
Slice(l.large_ref.data, LargeValueRef::ByteSize())); |
||||
PutVarint64(dst, l.fnum); |
||||
PutLengthPrefixedSlice(dst, l.internal_key.Encode()); |
||||
} |
||||
} |
||||
|
||||
// Parse a length-prefixed internal key from *input into *dst.
// Returns false if the input is malformed; *input is advanced on success.
static bool GetInternalKey(Slice* input, InternalKey* dst) {
  Slice str;
  if (!GetLengthPrefixedSlice(input, &str)) {
    return false;
  }
  dst->DecodeFrom(str);
  return true;
}
||||
|
||||
// Parse a varint32 level from *input into *level, rejecting values that
// are out of range for the configured number of levels.
static bool GetLevel(Slice* input, int* level) {
  uint32_t v;
  if (!GetVarint32(input, &v) || v >= config::kNumLevels) {
    return false;
  }
  *level = v;
  return true;
}
||||
|
||||
// Parse the tagged-record encoding produced by EncodeTo() from "src" into
// this edit (after Clear()ing it).  Returns a Corruption status naming the
// first malformed record, or OK if the whole input parsed cleanly.
Status VersionEdit::DecodeFrom(const Slice& src) {
  Clear();
  Slice input = src;
  const char* msg = NULL;  // non-NULL once a parse error is seen
  uint32_t tag;

  // Temporary storage for parsing
  int level;
  uint64_t number;
  FileMetaData f;
  Slice str;
  Large large;
  InternalKey key;

  while (msg == NULL && GetVarint32(&input, &tag)) {
    switch (tag) {
      case kComparator:
        if (GetLengthPrefixedSlice(&input, &str)) {
          comparator_ = str.ToString();
          has_comparator_ = true;
        } else {
          msg = "comparator name";
        }
        break;

      case kLogNumber:
        if (GetVarint64(&input, &log_number_)) {
          has_log_number_ = true;
        } else {
          msg = "log number";
        }
        break;

      case kNextFileNumber:
        if (GetVarint64(&input, &next_file_number_)) {
          has_next_file_number_ = true;
        } else {
          msg = "next file number";
        }
        break;

      case kLastSequence:
        if (GetVarint64(&input, &last_sequence_)) {
          has_last_sequence_ = true;
        } else {
          msg = "last sequence number";
        }
        break;

      case kCompactPointer:
        if (GetLevel(&input, &level) &&
            GetInternalKey(&input, &key)) {
          compact_pointers_.push_back(std::make_pair(level, key));
        } else {
          msg = "compaction pointer";
        }
        break;

      case kDeletedFile:
        if (GetLevel(&input, &level) &&
            GetVarint64(&input, &number)) {
          deleted_files_.insert(std::make_pair(level, number));
        } else {
          msg = "deleted file";
        }
        break;

      case kNewFile:
        if (GetLevel(&input, &level) &&
            GetVarint64(&input, &f.number) &&
            GetVarint64(&input, &f.file_size) &&
            GetInternalKey(&input, &f.smallest) &&
            GetInternalKey(&input, &f.largest)) {
          new_files_.push_back(std::make_pair(level, f));
        } else {
          msg = "new-file entry";
        }
        break;

      case kLargeValueRef:
        if (GetLengthPrefixedSlice(&input, &str) &&
            (str.size() == LargeValueRef::ByteSize()) &&
            GetVarint64(&input, &large.fnum) &&
            GetInternalKey(&input, &large.internal_key)) {
          large.large_ref = LargeValueRef::FromRef(str);
          large_refs_added_.push_back(large);
        } else {
          msg = "large ref";
        }
        break;

      default:
        msg = "unknown tag";
        break;
    }
  }

  // Trailing bytes after the records is also corruption.
  if (msg == NULL && !input.empty()) {
    msg = "invalid tag";
  }

  Status result;
  if (msg != NULL) {
    result = Status::Corruption("VersionEdit", msg);
  }
  return result;
}
||||
|
||||
std::string VersionEdit::DebugString() const { |
||||
std::string r; |
||||
r.append("VersionEdit {"); |
||||
if (has_comparator_) { |
||||
r.append("\n Comparator: "); |
||||
r.append(comparator_); |
||||
} |
||||
if (has_log_number_) { |
||||
r.append("\n LogNumber: "); |
||||
AppendNumberTo(&r, log_number_); |
||||
} |
||||
if (has_next_file_number_) { |
||||
r.append("\n NextFile: "); |
||||
AppendNumberTo(&r, next_file_number_); |
||||
} |
||||
if (has_last_sequence_) { |
||||
r.append("\n LastSeq: "); |
||||
AppendNumberTo(&r, last_sequence_); |
||||
} |
||||
for (int i = 0; i < compact_pointers_.size(); i++) { |
||||
r.append("\n CompactPointer: "); |
||||
AppendNumberTo(&r, compact_pointers_[i].first); |
||||
r.append(" '"); |
||||
AppendEscapedStringTo(&r, compact_pointers_[i].second.Encode()); |
||||
r.append("'"); |
||||
} |
||||
for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); |
||||
iter != deleted_files_.end(); |
||||
++iter) { |
||||
r.append("\n DeleteFile: "); |
||||
AppendNumberTo(&r, iter->first); |
||||
r.append(" "); |
||||
AppendNumberTo(&r, iter->second); |
||||
} |
||||
for (int i = 0; i < new_files_.size(); i++) { |
||||
const FileMetaData& f = new_files_[i].second; |
||||
r.append("\n AddFile: "); |
||||
AppendNumberTo(&r, new_files_[i].first); |
||||
r.append(" "); |
||||
AppendNumberTo(&r, f.number); |
||||
r.append(" "); |
||||
AppendNumberTo(&r, f.file_size); |
||||
r.append(" '"); |
||||
AppendEscapedStringTo(&r, f.smallest.Encode()); |
||||
r.append("' .. '"); |
||||
AppendEscapedStringTo(&r, f.largest.Encode()); |
||||
r.append("'"); |
||||
} |
||||
for (int i = 0; i < large_refs_added_.size(); i++) { |
||||
const VersionEdit::Large& l = large_refs_added_[i]; |
||||
r.append("\n LargeRef: "); |
||||
AppendNumberTo(&r, l.fnum); |
||||
r.append(" "); |
||||
r.append(LargeValueRefToFilenameString(l.large_ref)); |
||||
r.append(" '"); |
||||
AppendEscapedStringTo(&r, l.internal_key.Encode()); |
||||
r.append("'"); |
||||
} |
||||
r.append("\n}\n"); |
||||
return r; |
||||
} |
||||
|
||||
} |
@ -0,0 +1,118 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_DB_VERSION_EDIT_H_ |
||||
#define STORAGE_LEVELDB_DB_VERSION_EDIT_H_ |
||||
|
||||
#include <set> |
||||
#include <utility> |
||||
#include <vector> |
||||
#include "db/dbformat.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
class VersionSet; |
||||
|
||||
struct FileMetaData { |
||||
int refs; |
||||
uint64_t number; |
||||
uint64_t file_size; // File size in bytes
|
||||
InternalKey smallest; // Smallest internal key served by table
|
||||
InternalKey largest; // Largest internal key served by table
|
||||
|
||||
FileMetaData() : refs(0), file_size(0) { } |
||||
}; |
||||
|
||||
class VersionEdit { |
||||
public: |
||||
VersionEdit() { Clear(); } |
||||
~VersionEdit() { } |
||||
|
||||
void Clear(); |
||||
|
||||
void SetComparatorName(const Slice& name) { |
||||
has_comparator_ = true; |
||||
comparator_ = name.ToString(); |
||||
} |
||||
void SetLogNumber(uint64_t num) { |
||||
has_log_number_ = true; |
||||
log_number_ = num; |
||||
} |
||||
void SetNextFile(uint64_t num) { |
||||
has_next_file_number_ = true; |
||||
next_file_number_ = num; |
||||
} |
||||
void SetLastSequence(SequenceNumber seq) { |
||||
has_last_sequence_ = true; |
||||
last_sequence_ = seq; |
||||
} |
||||
void SetCompactPointer(int level, const InternalKey& key) { |
||||
compact_pointers_.push_back(std::make_pair(level, key)); |
||||
} |
||||
|
||||
// Add the specified file at the specified number.
|
||||
// REQUIRES: This version has not been saved (see VersionSet::SaveTo)
|
||||
// REQUIRES: "smallest" and "largest" are smallest and largest keys in file
|
||||
void AddFile(int level, uint64_t file, |
||||
uint64_t file_size, |
||||
const InternalKey& smallest, |
||||
const InternalKey& largest) { |
||||
FileMetaData f; |
||||
f.number = file; |
||||
f.file_size = file_size; |
||||
f.smallest = smallest; |
||||
f.largest = largest; |
||||
new_files_.push_back(std::make_pair(level, f)); |
||||
} |
||||
|
||||
// Delete the specified "file" from the specified "level".
|
||||
void DeleteFile(int level, uint64_t file) { |
||||
deleted_files_.insert(std::make_pair(level, file)); |
||||
} |
||||
|
||||
// Record that a large value with the specified large_ref was
|
||||
// written to the output file numbered "fnum"
|
||||
void AddLargeValueRef(const LargeValueRef& large_ref, |
||||
uint64_t fnum, |
||||
const Slice& internal_key) { |
||||
large_refs_added_.resize(large_refs_added_.size() + 1); |
||||
Large* large = &(large_refs_added_.back()); |
||||
large->large_ref = large_ref; |
||||
large->fnum = fnum; |
||||
large->internal_key.DecodeFrom(internal_key); |
||||
} |
||||
|
||||
void EncodeTo(std::string* dst) const; |
||||
Status DecodeFrom(const Slice& src); |
||||
|
||||
std::string DebugString() const; |
||||
|
||||
private: |
||||
friend class VersionSet; |
||||
|
||||
typedef std::set< std::pair<int, uint64_t> > DeletedFileSet; |
||||
|
||||
std::string comparator_; |
||||
uint64_t log_number_; |
||||
uint64_t next_file_number_; |
||||
SequenceNumber last_sequence_; |
||||
bool has_comparator_; |
||||
bool has_log_number_; |
||||
bool has_next_file_number_; |
||||
bool has_last_sequence_; |
||||
|
||||
std::vector< std::pair<int, InternalKey> > compact_pointers_; |
||||
DeletedFileSet deleted_files_; |
||||
std::vector< std::pair<int, FileMetaData> > new_files_; |
||||
struct Large { |
||||
LargeValueRef large_ref; |
||||
uint64_t fnum; |
||||
InternalKey internal_key; |
||||
}; |
||||
std::vector<Large> large_refs_added_; |
||||
}; |
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_DB_VERSION_EDIT_H_
|
@ -0,0 +1,50 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/version_edit.h" |
||||
#include "util/testharness.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
// Round-trip check: encoding an edit, decoding it, and re-encoding must
// reproduce the identical byte string.
static void TestEncodeDecode(const VersionEdit& edit) {
  std::string encoded, encoded2;
  edit.EncodeTo(&encoded);
  VersionEdit parsed;
  Status s = parsed.DecodeFrom(encoded);
  ASSERT_TRUE(s.ok()) << s.ToString();
  parsed.EncodeTo(&encoded2);
  ASSERT_EQ(encoded, encoded2);
}
||||
|
||||
class VersionEditTest { }; |
||||
|
||||
TEST(VersionEditTest, EncodeDecode) { |
||||
static const uint64_t kBig = 1ull << 50; |
||||
|
||||
VersionEdit edit; |
||||
for (int i = 0; i < 4; i++) { |
||||
TestEncodeDecode(edit); |
||||
edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, |
||||
InternalKey("foo", kBig + 500 + i, kTypeLargeValueRef), |
||||
InternalKey("zoo", kBig + 600 + i, kTypeDeletion)); |
||||
edit.DeleteFile(4, kBig + 700 + i); |
||||
edit.AddLargeValueRef(LargeValueRef::Make("big", kNoCompression), |
||||
kBig + 800 + i, "foobar"); |
||||
edit.AddLargeValueRef(LargeValueRef::Make("big2", kLightweightCompression), |
||||
kBig + 801 + i, "baz"); |
||||
edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue)); |
||||
} |
||||
|
||||
edit.SetComparatorName("foo"); |
||||
edit.SetLogNumber(kBig + 100); |
||||
edit.SetNextFile(kBig + 200); |
||||
edit.SetLastSequence(kBig + 1000); |
||||
TestEncodeDecode(edit); |
||||
} |
||||
|
||||
} |
||||
|
||||
int main(int argc, char** argv) { |
||||
return leveldb::test::RunAllTests(); |
||||
} |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,290 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// The representation of a DBImpl consists of a set of Versions. The
|
||||
// newest version is called "current". Older versions may be kept
|
||||
// around to provide a consistent view to live iterators.
|
||||
//
|
||||
// Each Version keeps track of a set of Table files per level. The
|
||||
// entire set of versions is maintained in a VersionSet.
|
||||
//
|
||||
// Version,VersionSet are thread-compatible, but require external
|
||||
// synchronization on all accesses.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_ |
||||
#define STORAGE_LEVELDB_DB_VERSION_SET_H_ |
||||
|
||||
#include <map> |
||||
#include <set> |
||||
#include <vector> |
||||
#include "db/dbformat.h" |
||||
#include "db/version_edit.h" |
||||
#include "port/port.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
// Grouping of constants.  We may want to make some of these
// parameters set via options.
namespace config {
static const int kNumLevels = 7;
}
||||
|
||||
namespace log { class Writer; } |
||||
|
||||
class Compaction; |
||||
class Iterator; |
||||
class MemTable; |
||||
class TableBuilder; |
||||
class TableCache; |
||||
class Version; |
||||
class VersionSet; |
||||
class WritableFile; |
||||
|
||||
class Version { |
||||
public: |
||||
// Append to *iters a sequence of iterators that will
|
||||
// yield the contents of this Version when merged together.
|
||||
// REQUIRES: This version has been saved (see VersionSet::SaveTo)
|
||||
void AddIterators(const ReadOptions&, std::vector<Iterator*>* iters); |
||||
|
||||
// Reference count management (so Versions do not disappear out from
|
||||
// under live iterators)
|
||||
void Ref(); |
||||
void Unref(); |
||||
|
||||
// Return a human readable string that describes this version's contents.
|
||||
std::string DebugString() const; |
||||
|
||||
private: |
||||
friend class Compaction; |
||||
friend class VersionSet; |
||||
|
||||
class LevelFileNumIterator; |
||||
Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const; |
||||
|
||||
VersionSet* vset_; // VersionSet to which this Version belongs
|
||||
Version* next_; // Next version in linked list
|
||||
int refs_; // Number of live refs to this version
|
||||
MemTable* cleanup_mem_; // NULL, or table to delete when version dropped
|
||||
|
||||
// List of files per level
|
||||
std::vector<FileMetaData*> files_[config::kNumLevels]; |
||||
|
||||
// Level that should be compacted next and its compaction score.
|
||||
// Score < 1 means compaction is not strictly needed. These fields
|
||||
// are initialized by Finalize().
|
||||
double compaction_score_; |
||||
int compaction_level_; |
||||
|
||||
explicit Version(VersionSet* vset) |
||||
: vset_(vset), next_(NULL), refs_(0), |
||||
cleanup_mem_(NULL), |
||||
compaction_score_(-1), |
||||
compaction_level_(-1) { |
||||
} |
||||
|
||||
~Version(); |
||||
|
||||
// No copying allowed
|
||||
Version(const Version&); |
||||
void operator=(const Version&); |
||||
}; |
||||
|
||||
class VersionSet { |
||||
public: |
||||
VersionSet(const std::string& dbname, |
||||
const Options* options, |
||||
TableCache* table_cache, |
||||
const InternalKeyComparator*); |
||||
~VersionSet(); |
||||
|
||||
// Apply *edit to the current version to form a new descriptor that
|
||||
// is both saved to persistent state and installed as the new
|
||||
// current version. Iff Apply() returns OK, arrange to delete
|
||||
// cleanup_mem (if cleanup_mem != NULL) when it is no longer needed
|
||||
// by older versions.
|
||||
Status LogAndApply(VersionEdit* edit, MemTable* cleanup_mem); |
||||
|
||||
// Recover the last saved descriptor from persistent storage.
|
||||
Status Recover(uint64_t* log_number, SequenceNumber* last_sequence); |
||||
|
||||
// Save current contents to *log
|
||||
Status WriteSnapshot(log::Writer* log); |
||||
|
||||
// Return the current version.
|
||||
Version* current() const { return current_; } |
||||
|
||||
// Return the current manifest file number
|
||||
uint64_t ManifestFileNumber() const { return manifest_file_number_; } |
||||
|
||||
// Allocate and return a new file number
|
||||
uint64_t NewFileNumber() { return next_file_number_++; } |
||||
|
||||
// Return the number of Table files at the specified level.
|
||||
int NumLevelFiles(int level) const; |
||||
|
||||
// Pick level and inputs for a new compaction.
|
||||
// Returns NULL if there is no compaction to be done.
|
||||
// Otherwise returns a pointer to a heap-allocated object that
|
||||
// describes the compaction. Caller should delete the result.
|
||||
Compaction* PickCompaction(); |
||||
|
||||
// Return a compaction object for compacting the range [begin,end] in
|
||||
// the specified level. Returns NULL if there is nothing in that
|
||||
// level that overlaps the specified range. Caller should delete
|
||||
// the result.
|
||||
Compaction* CompactRange( |
||||
int level, |
||||
const InternalKey& begin, |
||||
const InternalKey& end); |
||||
|
||||
// Create an iterator that reads over the compaction inputs for "*c".
|
||||
// The caller should delete the iterator when no longer needed.
|
||||
Iterator* MakeInputIterator(Compaction* c); |
||||
|
||||
// Returns true iff some level needs a compaction.
|
||||
bool NeedsCompaction() const { return current_->compaction_score_ >= 1; } |
||||
|
||||
// Add all files listed in any live version to *live.
|
||||
// May also mutate some internal state.
|
||||
void AddLiveFiles(std::set<uint64_t>* live); |
||||
|
||||
// Return the approximate offset in the database of the data for
|
||||
// "key" as of version "v".
|
||||
uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); |
||||
|
||||
// Register a reference to a large value with the specified
|
||||
// large_ref from the specified file number. Returns "true" if this
|
||||
// is the first recorded reference to the "large_ref" value in the
|
||||
// database, and false otherwise.
|
||||
bool RegisterLargeValueRef(const LargeValueRef& large_ref, |
||||
uint64_t filenum, |
||||
const InternalKey& internal_key); |
||||
|
||||
// Cleanup the large value reference state by eliminating any
|
||||
// references from files that are not includes in either "live_tables"
|
||||
// or "log_file".
|
||||
void CleanupLargeValueRefs(const std::set<uint64_t>& live_tables, |
||||
uint64_t log_file_num); |
||||
|
||||
// Returns true if a large value with the given reference is live.
|
||||
bool LargeValueIsLive(const LargeValueRef& large_ref); |
||||
|
||||
private: |
||||
class Builder; |
||||
|
||||
friend class Compaction; |
||||
friend class Version; |
||||
|
||||
Status Finalize(Version* v); |
||||
|
||||
// Delete any old versions that are no longer needed.
|
||||
void MaybeDeleteOldVersions(); |
||||
|
||||
struct BySmallestKey; |
||||
Status SortLevel(Version* v, uint64_t level); |
||||
|
||||
void GetOverlappingInputs( |
||||
int level, |
||||
const InternalKey& begin, |
||||
const InternalKey& end, |
||||
std::vector<FileMetaData*>* inputs); |
||||
|
||||
void GetRange(const std::vector<FileMetaData*>& inputs, |
||||
InternalKey* smallest, |
||||
InternalKey* largest); |
||||
|
||||
Env* const env_; |
||||
const std::string dbname_; |
||||
const Options* const options_; |
||||
TableCache* const table_cache_; |
||||
const InternalKeyComparator icmp_; |
||||
uint64_t next_file_number_; |
||||
uint64_t manifest_file_number_; |
||||
|
||||
// Opened lazily
|
||||
WritableFile* descriptor_file_; |
||||
log::Writer* descriptor_log_; |
||||
|
||||
// Versions are kept in a singly linked list that is never empty
|
||||
Version* current_; // Pointer to the last (newest) list entry
|
||||
Version* oldest_; // Pointer to the first (oldest) list entry
|
||||
|
||||
// Map from large value reference to the set of <file numbers,internal_key>
|
||||
// values containing references to the value. We keep the
|
||||
// internal key as a std::string rather than as an InternalKey because
|
||||
// we want to be able to easily use a set.
|
||||
typedef std::set<std::pair<uint64_t, std::string> > LargeReferencesSet; |
||||
typedef std::map<LargeValueRef, LargeReferencesSet> LargeValueMap; |
||||
LargeValueMap large_value_refs_; |
||||
|
||||
// Per-level key at which the next compaction at that level should start.
|
||||
// Either an empty string, or a valid InternalKey.
|
||||
std::string compact_pointer_[config::kNumLevels]; |
||||
|
||||
// No copying allowed
|
||||
VersionSet(const VersionSet&); |
||||
void operator=(const VersionSet&); |
||||
}; |
||||
|
||||
// A Compaction encapsulates information about a compaction.
|
||||
class Compaction { |
||||
public: |
||||
~Compaction(); |
||||
|
||||
// Return the level that is being compacted. Inputs from "level"
|
||||
// and "level+1" will be merged to produce a set of "level+1" files.
|
||||
int level() const { return level_; } |
||||
|
||||
// Return the object that holds the edits to the descriptor done
|
||||
// by this compaction.
|
||||
VersionEdit* edit() { return &edit_; } |
||||
|
||||
// "which" must be either 0 or 1
|
||||
int num_input_files(int which) const { return inputs_[which].size(); } |
||||
|
||||
// Return the ith input file at "level()+which" ("which" must be 0 or 1).
|
||||
FileMetaData* input(int which, int i) const { return inputs_[which][i]; } |
||||
|
||||
// Maximum size of files to build during this compaction.
|
||||
uint64_t MaxOutputFileSize() const { return max_output_file_size_; } |
||||
|
||||
// Add all inputs to this compaction as delete operations to *edit.
|
||||
void AddInputDeletions(VersionEdit* edit); |
||||
|
||||
// Returns true if the information we have available guarantees that
|
||||
// the compaction is producing data in "level+1" for which no data exists
|
||||
// in levels greater than "level+1".
|
||||
bool IsBaseLevelForKey(const Slice& user_key); |
||||
|
||||
// Release the input version for the compaction, once the compaction
|
||||
// is successful.
|
||||
void ReleaseInputs(); |
||||
|
||||
private: |
||||
friend class Version; |
||||
friend class VersionSet; |
||||
|
||||
explicit Compaction(int level); |
||||
|
||||
int level_; |
||||
uint64_t max_output_file_size_; |
||||
Version* input_version_; |
||||
VersionEdit edit_; |
||||
|
||||
// Each compaction reads inputs from "level_" and "level_+1"
|
||||
std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs
|
||||
|
||||
// State for implementing IsBaseLevelForKey
|
||||
|
||||
// level_ptrs_ holds indices into input_version_->levels_: our state
|
||||
// is that we are positioned at one of the file ranges for each
|
||||
// higher level than the ones involved in this compaction (i.e. for
|
||||
// all L >= level_ + 2).
|
||||
int level_ptrs_[config::kNumLevels]; |
||||
}; |
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_DB_VERSION_SET_H_
|
@ -0,0 +1,164 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// WriteBatch::rep_ :=
|
||||
// sequence: fixed64
|
||||
// count: fixed32
|
||||
// data: record[count]
|
||||
// record :=
|
||||
// kTypeValue varstring varstring |
|
||||
// kTypeLargeValueRef varstring varstring |
|
||||
// kTypeDeletion varstring
|
||||
// varstring :=
|
||||
// len: varint32
|
||||
// data: uint8[len]
|
||||
|
||||
#include "include/write_batch.h" |
||||
|
||||
#include "include/db.h" |
||||
#include "db/dbformat.h" |
||||
#include "db/memtable.h" |
||||
#include "db/write_batch_internal.h" |
||||
#include "util/coding.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
WriteBatch::WriteBatch() { |
||||
Clear(); |
||||
} |
||||
|
||||
WriteBatch::~WriteBatch() { } |
||||
|
||||
void WriteBatch::Clear() { |
||||
rep_.clear(); |
||||
rep_.resize(12); |
||||
} |
||||
|
||||
int WriteBatchInternal::Count(const WriteBatch* b) { |
||||
return DecodeFixed32(b->rep_.data() + 8); |
||||
} |
||||
|
||||
void WriteBatchInternal::SetCount(WriteBatch* b, int n) { |
||||
EncodeFixed32(&b->rep_[8], n); |
||||
} |
||||
|
||||
SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) { |
||||
return SequenceNumber(DecodeFixed64(b->rep_.data())); |
||||
} |
||||
|
||||
void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) { |
||||
EncodeFixed64(&b->rep_[0], seq); |
||||
} |
||||
|
||||
void WriteBatch::Put(const Slice& key, const Slice& value) { |
||||
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); |
||||
rep_.push_back(static_cast<char>(kTypeValue)); |
||||
PutLengthPrefixedSlice(&rep_, key); |
||||
PutLengthPrefixedSlice(&rep_, value); |
||||
} |
||||
|
||||
void WriteBatchInternal::PutLargeValueRef(WriteBatch* b, |
||||
const Slice& key, |
||||
const LargeValueRef& large_ref) { |
||||
WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); |
||||
b->rep_.push_back(static_cast<char>(kTypeLargeValueRef)); |
||||
PutLengthPrefixedSlice(&b->rep_, key); |
||||
PutLengthPrefixedSlice(&b->rep_, |
||||
Slice(large_ref.data, sizeof(large_ref.data))); |
||||
} |
||||
|
||||
void WriteBatch::Delete(const Slice& key) { |
||||
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); |
||||
rep_.push_back(static_cast<char>(kTypeDeletion)); |
||||
PutLengthPrefixedSlice(&rep_, key); |
||||
} |
||||
|
||||
Status WriteBatchInternal::InsertInto(const WriteBatch* b, |
||||
MemTable* memtable) { |
||||
const int count = WriteBatchInternal::Count(b); |
||||
int found = 0; |
||||
Iterator it(*b); |
||||
for (; !it.Done(); it.Next()) { |
||||
switch (it.op()) { |
||||
case kTypeDeletion: |
||||
memtable->Add(it.sequence_number(), kTypeDeletion, it.key(), Slice()); |
||||
break; |
||||
case kTypeValue: |
||||
memtable->Add(it.sequence_number(), kTypeValue, it.key(), it.value()); |
||||
break; |
||||
case kTypeLargeValueRef: |
||||
memtable->Add(it.sequence_number(), kTypeLargeValueRef, |
||||
it.key(), it.value()); |
||||
break; |
||||
} |
||||
found++; |
||||
} |
||||
if (!it.status().ok()) { |
||||
return it.status(); |
||||
} else if (found != count) { |
||||
return Status::Corruption("wrong count in WriteBatch"); |
||||
} |
||||
return Status::OK(); |
||||
} |
||||
|
||||
void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { |
||||
assert(contents.size() >= 12); |
||||
b->rep_.assign(contents.data(), contents.size()); |
||||
} |
||||
|
||||
WriteBatchInternal::Iterator::Iterator(const WriteBatch& batch) |
||||
: input_(WriteBatchInternal::Contents(&batch)), |
||||
done_(false) { |
||||
if (input_.size() < 12) { |
||||
done_ = true; |
||||
} else { |
||||
seq_ = WriteBatchInternal::Sequence(&batch), |
||||
input_.remove_prefix(12); |
||||
GetNextEntry(); |
||||
} |
||||
} |
||||
|
||||
void WriteBatchInternal::Iterator::Next() { |
||||
assert(!done_); |
||||
seq_++; |
||||
GetNextEntry(); |
||||
} |
||||
|
||||
void WriteBatchInternal::Iterator::GetNextEntry() { |
||||
if (input_.empty()) { |
||||
done_ = true; |
||||
return; |
||||
} |
||||
char tag = input_[0]; |
||||
input_.remove_prefix(1); |
||||
switch (tag) { |
||||
case kTypeValue: |
||||
case kTypeLargeValueRef: |
||||
if (GetLengthPrefixedSlice(&input_, &key_) && |
||||
GetLengthPrefixedSlice(&input_, &value_)) { |
||||
op_ = static_cast<ValueType>(tag); |
||||
} else { |
||||
status_ = Status::Corruption("bad WriteBatch Put"); |
||||
done_ = true; |
||||
input_.clear(); |
||||
} |
||||
break; |
||||
case kTypeDeletion: |
||||
if (GetLengthPrefixedSlice(&input_, &key_)) { |
||||
op_ = kTypeDeletion; |
||||
} else { |
||||
status_ = Status::Corruption("bad WriteBatch Delete"); |
||||
done_ = true; |
||||
input_.clear(); |
||||
} |
||||
break; |
||||
default: |
||||
status_ = Status::Corruption("unknown WriteBatch tag"); |
||||
done_ = true; |
||||
input_.clear(); |
||||
break; |
||||
} |
||||
} |
||||
|
||||
} |
@ -0,0 +1,73 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ |
||||
#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ |
||||
|
||||
#include "include/write_batch.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
// WriteBatchInternal provides static methods for manipulating a
|
||||
// WriteBatch that we don't want in the public WriteBatch interface.
|
||||
class WriteBatchInternal { |
||||
public: |
||||
static void PutLargeValueRef(WriteBatch* batch, |
||||
const Slice& key, |
||||
const LargeValueRef& large_ref); |
||||
|
||||
// Return the number of entries in the batch.
|
||||
static int Count(const WriteBatch* batch); |
||||
|
||||
// Set the count for the number of entries in the batch.
|
||||
static void SetCount(WriteBatch* batch, int n); |
||||
|
||||
// Return the seqeunce number for the start of this batch.
|
||||
static SequenceNumber Sequence(const WriteBatch* batch); |
||||
|
||||
// Store the specified number as the seqeunce number for the start of
|
||||
// this batch.
|
||||
static void SetSequence(WriteBatch* batch, SequenceNumber seq); |
||||
|
||||
static Slice Contents(const WriteBatch* batch) { |
||||
return Slice(batch->rep_); |
||||
} |
||||
|
||||
static size_t ByteSize(const WriteBatch* batch) { |
||||
return batch->rep_.size(); |
||||
} |
||||
|
||||
static void SetContents(WriteBatch* batch, const Slice& contents); |
||||
|
||||
static Status InsertInto(const WriteBatch* batch, MemTable* memtable); |
||||
|
||||
// Iterate over the contents of a write batch.
|
||||
class Iterator { |
||||
public: |
||||
explicit Iterator(const WriteBatch& batch); |
||||
bool Done() const { return done_; } |
||||
void Next(); |
||||
ValueType op() const { return op_; } |
||||
const Slice& key() const { return key_; } |
||||
const Slice& value() const { return value_; } |
||||
SequenceNumber sequence_number() const { return seq_; } |
||||
Status status() const { return status_; } |
||||
|
||||
private: |
||||
void GetNextEntry(); |
||||
|
||||
Slice input_; |
||||
bool done_; |
||||
ValueType op_; |
||||
Slice key_; |
||||
Slice value_; |
||||
SequenceNumber seq_; |
||||
Status status_; |
||||
}; |
||||
}; |
||||
|
||||
} |
||||
|
||||
|
||||
#endif // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
|
@ -0,0 +1,110 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "include/db.h" |
||||
|
||||
#include "db/memtable.h" |
||||
#include "db/write_batch_internal.h" |
||||
#include "include/env.h" |
||||
#include "util/logging.h" |
||||
#include "util/testharness.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
static std::string PrintContents(WriteBatch* b) { |
||||
InternalKeyComparator cmp(BytewiseComparator()); |
||||
MemTable mem(cmp); |
||||
std::string state; |
||||
Status s = WriteBatchInternal::InsertInto(b, &mem); |
||||
Iterator* iter = mem.NewIterator(); |
||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { |
||||
ParsedInternalKey ikey; |
||||
ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); |
||||
switch (ikey.type) { |
||||
case kTypeValue: |
||||
state.append("Put("); |
||||
state.append(ikey.user_key.ToString()); |
||||
state.append(", "); |
||||
state.append(iter->value().ToString()); |
||||
state.append(")"); |
||||
break; |
||||
case kTypeLargeValueRef: |
||||
state.append("PutRef("); |
||||
state.append(ikey.user_key.ToString()); |
||||
state.append(", "); |
||||
state.append(iter->value().ToString()); |
||||
state.append(")"); |
||||
break; |
||||
case kTypeDeletion: |
||||
state.append("Delete("); |
||||
state.append(ikey.user_key.ToString()); |
||||
state.append(")"); |
||||
break; |
||||
} |
||||
state.append("@"); |
||||
state.append(NumberToString(ikey.sequence)); |
||||
} |
||||
delete iter; |
||||
if (!s.ok()) { |
||||
state.append("ParseError()"); |
||||
} |
||||
return state; |
||||
} |
||||
|
||||
class WriteBatchTest { }; |
||||
|
||||
TEST(WriteBatchTest, Empty) { |
||||
WriteBatch batch; |
||||
ASSERT_EQ("", PrintContents(&batch)); |
||||
ASSERT_EQ(0, WriteBatchInternal::Count(&batch)); |
||||
} |
||||
|
||||
TEST(WriteBatchTest, Multiple) { |
||||
WriteBatch batch; |
||||
batch.Put(Slice("foo"), Slice("bar")); |
||||
batch.Delete(Slice("box")); |
||||
batch.Put(Slice("baz"), Slice("boo")); |
||||
WriteBatchInternal::SetSequence(&batch, 100); |
||||
ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch)); |
||||
ASSERT_EQ(3, WriteBatchInternal::Count(&batch)); |
||||
ASSERT_EQ("Put(baz, boo)@102" |
||||
"Delete(box)@101" |
||||
"Put(foo, bar)@100", |
||||
PrintContents(&batch)); |
||||
} |
||||
|
||||
TEST(WriteBatchTest, PutIndirect) { |
||||
WriteBatch batch; |
||||
batch.Put(Slice("baz"), Slice("boo")); |
||||
LargeValueRef h; |
||||
for (int i = 0; i < LargeValueRef::ByteSize(); i++) { |
||||
h.data[i] = (i < 20) ? 'a' : 'b'; |
||||
} |
||||
WriteBatchInternal::PutLargeValueRef(&batch, Slice("foo"), h); |
||||
WriteBatchInternal::SetSequence(&batch, 100); |
||||
ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch)); |
||||
ASSERT_EQ(2, WriteBatchInternal::Count(&batch)); |
||||
ASSERT_EQ("Put(baz, boo)@100" |
||||
"PutRef(foo, aaaaaaaaaaaaaaaaaaaabbbbbbbbb)@101", |
||||
PrintContents(&batch)); |
||||
} |
||||
|
||||
TEST(WriteBatchTest, Corruption) { |
||||
WriteBatch batch; |
||||
batch.Put(Slice("foo"), Slice("bar")); |
||||
batch.Delete(Slice("box")); |
||||
WriteBatchInternal::SetSequence(&batch, 200); |
||||
Slice contents = WriteBatchInternal::Contents(&batch); |
||||
WriteBatchInternal::SetContents(&batch, |
||||
Slice(contents.data(),contents.size()-1)); |
||||
ASSERT_EQ("Put(foo, bar)@200" |
||||
"ParseError()", |
||||
PrintContents(&batch)); |
||||
} |
||||
|
||||
} |
||||
|
||||
int main(int argc, char** argv) { |
||||
return leveldb::test::RunAllTests(); |
||||
} |
@ -0,0 +1,89 @@ |
||||
body { |
||||
margin-left: 0.5in; |
||||
margin-right: 0.5in; |
||||
background: white; |
||||
color: black; |
||||
} |
||||
|
||||
h1 { |
||||
margin-left: -0.2in; |
||||
font-size: 14pt; |
||||
} |
||||
h2 { |
||||
margin-left: -0in; |
||||
font-size: 12pt; |
||||
} |
||||
h3 { |
||||
margin-left: -0in; |
||||
} |
||||
h4 { |
||||
margin-left: -0in; |
||||
} |
||||
hr { |
||||
margin-left: -0in; |
||||
} |
||||
|
||||
/* Definition lists: definition term bold */ |
||||
dt { |
||||
font-weight: bold; |
||||
} |
||||
|
||||
address { |
||||
text-align: center; |
||||
} |
||||
code,samp,var { |
||||
color: blue; |
||||
} |
||||
kbd { |
||||
color: #600000; |
||||
} |
||||
div.note p { |
||||
float: right; |
||||
width: 3in; |
||||
margin-right: 0%; |
||||
padding: 1px; |
||||
border: 2px solid #6060a0; |
||||
background-color: #fffff0; |
||||
} |
||||
|
||||
ul { |
||||
margin-top: -0em; |
||||
margin-bottom: -0em; |
||||
} |
||||
|
||||
ol { |
||||
margin-top: -0em; |
||||
margin-bottom: -0em; |
||||
} |
||||
|
||||
UL.nobullets { |
||||
list-style-type: none; |
||||
list-style-image: none; |
||||
margin-left: -1em; |
||||
} |
||||
|
||||
p { |
||||
margin: 1em 0 1em 0; |
||||
padding: 0 0 0 0; |
||||
} |
||||
|
||||
pre { |
||||
line-height: 1.3em; |
||||
padding: 0.4em 0 0.8em 0; |
||||
margin: 0 0 0 0; |
||||
border: 0 0 0 0; |
||||
color: blue; |
||||
} |
||||
|
||||
.datatable { |
||||
margin-left: auto; |
||||
margin-right: auto; |
||||
margin-top: 2em; |
||||
margin-bottom: 2em; |
||||
border: 1px solid; |
||||
} |
||||
|
||||
.datatable td,th { |
||||
padding: 0 0.5em 0 0.5em; |
||||
text-align: right; |
||||
} |
@ -0,0 +1,222 @@ |
||||
<!DOCTYPE html> |
||||
<html> |
||||
<head> |
||||
<link rel="stylesheet" type="text/css" href="doc.css" /> |
||||
<title>Leveldb file layout and compactions</title> |
||||
</head> |
||||
|
||||
<body> |
||||
|
||||
<h1>Files</h1> |
||||
|
||||
The implementation of leveldb is similar in spirit to the |
||||
representation of a single |
||||
<a href="http://labs.google.com/papers/bigtable.html"> |
||||
Bigtable tablet (section 5.3)</a>. |
||||
However the organization of the files that make up the representation |
||||
is somewhat different and is explained below. |
||||
|
||||
<p> |
||||
Each database is represented by a set of file stored in a directory. |
||||
There are several different types of files as documented below: |
||||
<p> |
||||
<h2>Log files</h2> |
||||
<p> |
||||
A log file (*.log) stores a sequence of recent updates. Each update |
||||
is appended to the current log file. When the log file reaches a |
||||
pre-determined size (approximately 1MB by default), it is converted |
||||
to a sorted table (see below) and a new log file is created for future |
||||
updates. |
||||
<p> |
||||
A copy of the current log file is kept in an in-memory structure (the |
||||
<code>memtable</code>). This copy is consulted on every read so that read |
||||
operations reflect all logged updates. |
||||
<p> |
||||
<h2>Sorted tables</h2> |
||||
<p> |
||||
A sorted table (*.sst) stores a sequence of entries sorted by key. |
||||
Each entry is either a value for the key, or a deletion marker for the |
||||
key. (Deletion markers are kept around to hide obsolete values |
||||
present in older sorted tables). |
||||
<p> |
||||
The set of sorted tables are organized into a sequence of levels. The |
||||
sorted table generated from a log file is placed in a special <code>young</code> |
||||
level (also called level-0). When the number of young files exceeds a |
||||
certain threshold (currently four), all of the young files are merged |
||||
together with all of the overlapping level-1 files to produce a |
||||
sequence of new level-1 files (we create a new level-1 file for every |
||||
2MB of data.) |
||||
<p> |
||||
Files in the young level may contain overlapping keys. However files |
||||
in other levels have distinct non-overlapping key ranges. Consider |
||||
level number L where L >= 1. When the combined size of files in |
||||
level-L exceeds (10^L) MB (i.e., 10MB for level-1, 100MB for level-2, |
||||
...), one file in level-L, and all of the overlapping files in |
||||
level-(L+1) are merged to form a set of new files for level-(L+1). |
||||
These merges have the effect of gradually migrating new updates from |
||||
the young level to the largest level using only bulk reads and writes |
||||
(i.e., minimizing expensive seeks). |
||||
|
||||
<h2>Large value files</h2> |
||||
<p> |
||||
Each large value (greater than 64KB by default) is placed in a large |
||||
value file (*.val) of its own. An entry is maintained in the log |
||||
and/or sorted tables that maps from the corresponding key to the |
||||
name of this large value file. The name of the large value file |
||||
is derived from a SHA1 hash of the value and its length so that |
||||
identical values share the same file. |
||||
<p> |
||||
<h2>Manifest</h2> |
||||
<p> |
||||
A MANIFEST file lists the set of sorted tables that make up each |
||||
level, the corresponding key ranges, and other important metadata. |
||||
A new MANIFEST file (with a new number embedded in the file name) |
||||
is created whenever the database is reopened. The MANIFEST file is |
||||
formatted as a log, and changes made to the serving state (as files |
||||
are added or removed) are appended to this log. |
||||
<p> |
||||
<h2>Current</h2> |
||||
<p> |
||||
CURRENT is a simple text file that contains the name of the latest |
||||
MANIFEST file. |
||||
<p> |
||||
<h2>Info logs</h2> |
||||
<p> |
||||
Informational messages are printed to files named LOG and LOG.old. |
||||
<p> |
||||
<h2>Others</h2> |
||||
<p> |
||||
Other files used for miscellaneous purposes may also be present |
||||
(LOCK, *.dbtmp). |
||||
|
||||
<h1>Level 0</h1> |
||||
When the log file grows above a certain size (1MB by default): |
||||
<ul> |
||||
<li>Write the contents of the current memtable to an sstable |
||||
<li>Replace the current memtable by a brand new empty memtable |
||||
<li>Switch to a new log file |
||||
<li>Delete the old log file and the old memtable |
||||
</ul> |
||||
Experimental measurements show that generating an sstable from a 1MB |
||||
log file takes ~12ms, which seems like an acceptable latency hiccup to |
||||
add infrequently to a log write. |
||||
|
||||
<p> |
||||
The new sstable is added to a special level-0 level. level-0 contains |
||||
a set of files (up to 4 by default). However unlike other levels, |
||||
these files do not cover disjoint ranges, but may overlap each other. |
||||
|
||||
<h1>Compactions</h1> |
||||
|
||||
<p> |
||||
When the size of level L exceeds its limit, we compact it in a |
||||
background thread. The compaction picks a file from level L and all |
||||
overlapping files from the next level L+1. Note that if a level-L |
||||
file overlaps only part of a level-(L+1) file, the entire file at |
||||
level-(L+1) is used as an input to the compaction and will be |
||||
discarded after the compaction. Aside: because level-0 is special |
||||
(files in it may overlap each other), we treat compactions from |
||||
level-0 to level-1 specially: a level-0 compaction may pick more than |
||||
one level-0 file in case some of these files overlap each other. |
||||
|
||||
<p> |
||||
A compaction merges the contents of the picked files to produce a |
||||
sequence of level-(L+1) files. We switch to producing a new |
||||
level-(L+1) file after the current output file has reached the target |
||||
file size (2MB). The old files are discarded and the new files are |
||||
added to the serving state. |
||||
|
||||
<p> |
||||
Compactions for a particular level rotate through the key space. In |
||||
more detail, for each level L, we remember the ending key of the last |
||||
compaction at level L. The next compaction for level L will pick the |
||||
first file that starts after this key (wrapping around to the |
||||
beginning of the key space if there is no such file). |
||||
|
||||
<p> |
||||
Compactions drop overwritten values. They also drop deletion markers |
||||
if there are no higher numbered levels that contain a file whose range |
||||
overlaps the current key. |
||||
|
||||
<h2>Timing</h2> |
||||
|
||||
Level-0 compactions will read up to four 1MB files from level-0, and |
||||
at worst all the level-1 files (10MB). I.e., we will read 14MB and |
||||
write 14MB. |
||||
|
||||
<p> |
||||
Other than the special level-0 compactions, we will pick one 2MB file |
||||
from level L. In the worst case, this will overlap ~ 12 files from |
||||
level L+1 (10 because level-(L+1) is ten times the size of level-L, |
||||
and another two at the boundaries since the file ranges at level-L |
||||
will usually not be aligned with the file ranges at level-L+1). The |
||||
compaction will therefore read 26MB and write 26MB. Assuming a disk |
||||
IO rate of 100MB/s (ballpark range for modern drives), the worst |
||||
compaction cost will be approximately 0.5 second. |
||||
|
||||
<p> |
||||
If we throttle the background writing to something small, say 10% of |
||||
the full 100MB/s speed, a compaction may take up to 5 seconds. If the |
||||
user is writing at 10MB/s, we might build up lots of level-0 files |
||||
(~50 to hold the 5*10MB). This may signficantly increase the cost of |
||||
reads due to the overhead of merging more files together on every |
||||
read. |
||||
|
||||
<p> |
||||
Solution 1: To reduce this problem, we might want to increase the log |
||||
switching threshold when the number of level-0 files is large. Though |
||||
the downside is that the larger this threshold, the larger the delay |
||||
that we will add to write latency when a write triggers a log switch. |
||||
|
||||
<p> |
||||
Solution 2: We might want to decrease write rate artificially when the |
||||
number of level-0 files goes up. |
||||
|
||||
<p> |
||||
Solution 3: We work on reducing the cost of very wide merges. |
||||
Perhaps most of the level-0 files will have their blocks sitting |
||||
uncompressed in the cache and we will only need to worry about the |
||||
O(N) complexity in the merging iterator. |
||||
|
||||
<h2>Number of files</h2> |
||||
|
||||
Instead of always making 2MB files, we could make larger files for |
||||
larger levels to reduce the total file count, though at the expense of |
||||
more bursty compactions. Alternatively, we could shard the set of |
||||
files into multiple directories. |
||||
|
||||
<p> |
||||
An experiment on an <code>ext3</code> filesystem on Feb 04, 2011 shows |
||||
the following timings to do 100K file opens in directories with |
||||
varying number of files: |
||||
<table class="datatable"> |
||||
<tr><th>Files in directory</th><th>Microseconds to open a file</th></tr> |
||||
<tr><td>1000</td><td>9</td> |
||||
<tr><td>10000</td><td>10</td> |
||||
<tr><td>100000</td><td>16</td> |
||||
</table> |
||||
So maybe even the sharding is not necessary on modern filesystems? |
||||
|
||||
<h1>Recovery</h1> |
||||
|
||||
<ul> |
||||
<li> Read CURRENT to find name of the latest committed MANIFEST |
||||
<li> Read the named MANIFEST file |
||||
<li> Clean up stale files |
||||
<li> We could open all sstables here, but it is probably better to be lazy... |
||||
<li> Convert log chunk to a new level-0 sstable |
||||
<li> Start directing new writes to a new log file with recovered sequence# |
||||
</ul> |
||||
|
||||
<h1>Garbage collection of files</h1> |
||||
|
||||
<code>DeleteObsoleteFiles()</code> is called at the end of every |
||||
compaction and at the end of recovery. It finds the names of all |
||||
files in the database. It deletes all log files that are not the |
||||
current log file. It deletes all table files that are not referenced |
||||
from some level and are not the output of an active compaction. It |
||||
deletes all large value files that are not referenced from any live |
||||
table or log file. |
||||
|
||||
</body> |
||||
</html> |
@ -0,0 +1,508 @@ |
||||
<!DOCTYPE html> |
||||
<html> |
||||
<head> |
||||
<link rel="stylesheet" type="text/css" href="doc.css" /> |
||||
<title>Leveldb</title> |
||||
</head> |
||||
|
||||
<body> |
||||
<h1>Leveldb</h1> |
||||
<address>Jeff Dean, Sanjay Ghemawat</address> |
||||
<p> |
||||
The <code>leveldb</code> library provides a persistent key value store. Keys and |
||||
values are arbitrary byte arrays. The keys are ordered within the key |
||||
value store according to a user-specified comparator function. |
||||
|
||||
<p> |
||||
<h1>Opening A Database</h1> |
||||
<p> |
||||
A <code>leveldb</code> database has a name which corresponds to a file system |
||||
directory. All of the contents of database are stored in this |
||||
directory. The following example shows how to open a database, |
||||
creating it if necessary: |
||||
<p> |
||||
<pre> |
||||
#include <assert> |
||||
#include "leveldb/include/db.h" |
||||
|
||||
leveldb::DB* db; |
||||
leveldb::Options options; |
||||
options.create_if_missing = true; |
||||
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db); |
||||
assert(status.ok()); |
||||
... |
||||
</pre> |
||||
If you want to raise an error if the database already exists, add |
||||
the following line before the <code>leveldb::DB::Open</code> call: |
||||
<pre> |
||||
options.error_if_exists = true; |
||||
</pre> |
||||
<h1>Status</h1> |
||||
<p> |
||||
You may have noticed the <code>leveldb::Status</code> type above. Values of this |
||||
type are returned by most functions in <code>leveldb</code> that may encounter an |
||||
error. You can check if such a result is ok, and also print an |
||||
associated error message: |
||||
<p> |
||||
<pre> |
||||
leveldb::Status s = ...; |
||||
if (!s.ok()) cerr << s.ToString() << endl; |
||||
</pre> |
||||
<h1>Closing A Database</h1> |
||||
<p> |
||||
When you are done with a database, just delete the database object. |
||||
Example: |
||||
<p> |
||||
<pre> |
||||
... open the db as described above ... |
||||
... do something with db ... |
||||
delete db; |
||||
</pre> |
||||
<h1>Reads And Writes</h1> |
||||
<p> |
||||
The database provides <code>Put</code>, <code>Delete</code>, and <code>Get</code> methods to |
||||
modify/query the database. For example, the following code |
||||
moves the value stored under key1 to key2. |
||||
<p> |
||||
<pre> |
||||
std::string value; |
||||
leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value); |
||||
if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value); |
||||
if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1); |
||||
</pre> |
||||
See <a href="#async">important performance note</a> below for how to |
||||
speed up writes significantly. |
||||
|
||||
<h1>Atomic Updates</h1> |
||||
<p> |
||||
Note that if the process dies after the Put of key2 but before the |
||||
delete of key1, the same value may be left stored under multiple keys. |
||||
Such problems can be avoided by using the <code>WriteBatch</code> class to |
||||
atomically apply a set of updates: |
||||
<p> |
||||
<pre> |
||||
#include "leveldb/include/write_batch.h" |
||||
... |
||||
std::string value; |
||||
leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value); |
||||
if (s.ok()) { |
||||
leveldb::WriteBatch batch; |
||||
batch.Delete(key1); |
||||
batch.Put(key2, value); |
||||
s = db->Write(leveldb::WriteOptions(), &batch); |
||||
} |
||||
</pre> |
||||
The <code>WriteBatch</code> holds a sequence of edits to be made to the database, |
||||
and these edits within the batch are applied in order. Note that we |
||||
called <code>Delete</code> before <code>Put</code> so that if <code>key1</code> is identical to <code>key2</code>, |
||||
we do not end up erroneously dropping the value entirely. |
||||
<p> |
||||
Apart from its atomicity benefits, <code>WriteBatch</code> may also be used to |
||||
speed up bulk updates by placing lots of individual mutations into the |
||||
same batch. |
||||
<p> |
||||
<h1>Concurrency</h1> |
||||
<p> |
||||
A database may only be opened by one process at a time. The <code>leveldb</code> |
||||
implementation acquires a lock from the operating system to prevent |
||||
misuse. Within a single process, the same <code>leveldb::DB</code> object may |
||||
be safely used by multiple concurrent threads. |
||||
<p> |
||||
<h1>Iteration</h1> |
||||
<p> |
||||
The following example demonstrates how to print all key,value pairs |
||||
in a database. |
||||
<p> |
||||
<pre> |
||||
leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions()); |
||||
for (it->SeekToFirst(); it->Valid(); it->Next()) { |
||||
cout << it->key().ToString() << ": " << it->value().ToString() << endl; |
||||
} |
||||
assert(it->status().ok()); // Check for any errors found during the scan |
||||
delete it; |
||||
</pre> |
||||
The following variation shows how to process just the keys in the |
||||
range <code>[start,limit)</code>: |
||||
<p> |
||||
<pre> |
||||
for (it->Seek(start); |
||||
it->Valid() && it->key().ToString() < limit; |
||||
it->Next()) { |
||||
... |
||||
} |
||||
</pre> |
||||
You can also process entries in reverse order. (Caveat: reverse |
||||
iteration is currently a factor of two or three slower than forward |
||||
iteration.) |
||||
<p> |
||||
<pre> |
||||
for (it->SeekToLast(); it->Valid(); it->Prev()) { |
||||
... |
||||
} |
||||
</pre> |
||||
<h1>Snapshots</h1> |
||||
<p> |
||||
Snapshots provide consistent read-only views over the entire state of |
||||
the key-value store. <code>ReadOptions::snapshot</code> may be non-NULL to indicate |
||||
that a read should operate on a particular version of the DB state. |
||||
If <code>ReadOptions::snapshot</code> is NULL, the read will operate on an |
||||
implicit snapshot of the current state. |
||||
<p> |
||||
Snapshots typically are created by the DB::GetSnapshot() method: |
||||
<p> |
||||
<pre> |
||||
leveldb::ReadOptions options; |
||||
options.snapshot = db->GetSnapshot(); |
||||
... apply some updates to db ... |
||||
leveldb::Iterator* iter = db->NewIterator(options); |
||||
... read using iter to view the state when the snapshot was created ... |
||||
delete iter; |
||||
db->ReleaseSnapshot(options.snapshot); |
||||
</pre> |
||||
Note that when a snapshot is no longer needed, it should be released |
||||
using the DB::ReleaseSnapshot interface. This allows the |
||||
implementation to get rid of state that was being maintained just to |
||||
support reading as of that snapshot. |
||||
<p> |
||||
A Write operation can also return a snapshot that |
||||
represents the state of the database just after applying a particular |
||||
set of updates: |
||||
<p> |
||||
<pre> |
||||
leveldb::Snapshot* snapshot; |
||||
leveldb::WriteOptions write_options; |
||||
write_options.post_write_snapshot = &snapshot; |
||||
leveldb::Status status = db->Write(write_options, ...); |
||||
... perform other mutations to db ... |
||||
|
||||
leveldb::ReadOptions read_options; |
||||
read_options.snapshot = snapshot; |
||||
leveldb::Iterator* iter = db->NewIterator(read_options); |
||||
... read as of the state just after the Write call returned ... |
||||
delete iter; |
||||
|
||||
db->ReleaseSnapshot(snapshot); |
||||
</pre> |
||||
<h1>Slice</h1> |
||||
<p> |
||||
The return values of the <code>it->key()</code> and <code>it->value()</code> calls above |
||||
are instances of the <code>leveldb::Slice</code> type. <code>Slice</code> is a simple |
||||
structure that contains a length and a pointer to an external byte |
||||
array. Returning a <code>Slice</code> is a cheaper alternative to returning a |
||||
<code>std::string</code> since we do not need to copy potentially large keys and |
||||
values. In addition, <code>leveldb</code> methods do not return null-terminated |
||||
C-style strings since <code>leveldb</code> keys and values are allowed to |
||||
contain '\0' bytes. |
||||
<p> |
||||
C++ strings and null-terminated C-style strings can be easily converted |
||||
to a Slice: |
||||
<p> |
||||
<pre> |
||||
leveldb::Slice s1 = "hello"; |
||||
|
||||
std::string str("world"); |
||||
leveldb::Slice s2 = str; |
||||
</pre> |
||||
A Slice can be easily converted back to a C++ string: |
||||
<pre> |
||||
std::string str = s1.ToString(); |
||||
assert(str == std::string("hello")); |
||||
</pre> |
||||
Be careful when using Slices since it is up to the caller to ensure that |
||||
the external byte array into which the Slice points remains live while |
||||
the Slice is in use. For example, the following is buggy: |
||||
<p> |
||||
<pre> |
||||
leveldb::Slice slice; |
||||
if (...) { |
||||
std::string str = ...; |
||||
slice = str; |
||||
} |
||||
Use(slice); |
||||
</pre> |
||||
When the <code>if</code> statement goes out of scope, <code>str</code> will be destroyed and the |
||||
backing storage for <code>slice</code> will disappear. |
||||
<p> |
||||
<h1>Comparators</h1> |
||||
<p> |
||||
The preceding examples used the default ordering function for key, |
||||
which orders bytes lexicographically. You can however supply a custom |
||||
comparator when opening a database. For example, suppose each |
||||
database key consists of two numbers and we should sort by the first |
||||
number, breaking ties by the second number. First, define a proper |
||||
subclass of <code>leveldb::Comparator</code> that expresses these rules: |
||||
<p> |
||||
<pre> |
||||
class TwoPartComparator : public leveldb::Comparator { |
||||
public: |
||||
// Three-way comparison function: |
||||
// if a < b: negative result |
||||
// if a > b: positive result |
||||
// else: zero result |
||||
int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const { |
||||
int a1, a2, b1, b2; |
||||
ParseKey(a, &a1, &a2); |
||||
ParseKey(b, &b1, &b2); |
||||
if (a1 < b1) return -1; |
||||
if (a1 > b1) return +1; |
||||
if (a2 < b2) return -1; |
||||
if (a2 > b2) return +1; |
||||
return 0; |
||||
} |
||||
|
||||
// Ignore the following methods for now: |
||||
const char* Name() const { return "TwoPartComparator"; } |
||||
void FindShortestSeparator(std::string*, const leveldb::Slice&) const { } |
||||
void FindShortSuccessor(std::string*) const { } |
||||
}; |
||||
</pre> |
||||
Now create a database using this custom comparator: |
||||
<p> |
||||
<pre> |
||||
TwoPartComparator cmp; |
||||
leveldb::DB* db; |
||||
leveldb::Options options; |
||||
options.create_if_missing = true; |
||||
options.comparator = &cmp; |
||||
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db); |
||||
... |
||||
</pre> |
||||
<h2>Backwards compatibility</h2> |
||||
<p> |
||||
The result of the comparator's <code>Name</code> method is attached to the |
||||
database when it is created, and is checked on every subsequent |
||||
database open. If the name changes, the <code>leveldb::DB::Open</code> call will |
||||
fail. Therefore, change the name if and only if the new key format |
||||
and comparison function are incompatible with existing databases, and |
||||
it is ok to discard the contents of all existing databases. |
||||
<p> |
||||
You can however still gradually evolve your key format over time with |
||||
a little bit of pre-planning. For example, you could store a version |
||||
number at the end of each key (one byte should suffice for most uses). |
||||
When you wish to switch to a new key format (e.g., adding an optional |
||||
third part to the keys processed by <code>TwoPartComparator</code>), |
||||
(a) keep the same comparator name (b) increment the version number |
||||
for new keys (c) change the comparator function so it uses the |
||||
version numbers found in the keys to decide how to interpret them. |
||||
<p> |
||||
<h1>Performance</h1> |
||||
<p> |
||||
Performance can be tuned by changing the default values of the |
||||
types defined in <code>leveldb/include/options.h</code>. |
||||
|
||||
<p> |
||||
<h2><a name="async">Asynchronous Writes</a></h2> |
||||
|
||||
By default, each write to <code>leveldb</code> is synchronous: it does |
||||
not return until the write has been pushed from memory to persistent |
||||
storage. (On Posix systems, this is implemented by calling either |
||||
<code>fdatasync(...)</code> or <code>msync(..., MS_SYNC)</code>.) |
||||
<strong>Synchronous writes may be very slow and the synchrony can be |
||||
optionally disabled</strong>: |
||||
<pre> |
||||
leveldb::WriteOptions write_options; |
||||
write_options.sync = false; |
||||
db->Put(write_options, ...); |
||||
</pre> |
||||
Asynchronous writes are often more than a hundred times as fast as |
||||
synchronous writes. The downside of asynchronous writes is that a |
||||
crash of the machine may cause the last few updates to be lost. Note |
||||
that a crash of just the writing process (i.e., not a reboot) will not |
||||
cause any loss since even when <code>sync</code> is false, an update |
||||
is pushed from the process memory into the operating system before it |
||||
is considered done. |
||||
|
||||
<p> |
||||
Asynchronous writes can be particularly beneficial when loading a |
||||
large amount of data into the database since you can mitigate the |
||||
problem of lost updates by restarting the bulk load. A hybrid scheme |
||||
is also possible where every Nth write is synchronous, and in the |
||||
event of a crash, the bulk load is restarted just after the last |
||||
synchronous write finished by the previous run. |
||||
|
||||
<p> |
||||
<code>WriteBatch</code> provides an alternative to asynchronous writes. |
||||
Multiple updates may be placed in the same <code>WriteBatch</code> and |
||||
applied together using a synchronous write. The extra cost of the |
||||
synchronous write will be amortized across all of the writes in the batch. |
||||
|
||||
<p> |
||||
<h2>Block size</h2> |
||||
<p> |
||||
<code>leveldb</code> groups adjacent keys together into the same block and such a |
||||
block is the unit of transfer to and from persistent storage. The |
||||
default block size is approximately 8192 uncompressed bytes. |
||||
Applications that mostly do bulk scans over the contents of the |
||||
database may wish to increase this size. Applications that do a lot |
||||
of point reads of small values may wish to switch to a smaller block |
||||
size if performance measurements indicate an improvement. There isn't |
||||
much benefit in using blocks smaller than one kilobyte, or larger than |
||||
a few megabytes. Also note that compression will be more effective |
||||
with larger block sizes. |
||||
<p> |
||||
<h2>Compression</h2> |
||||
<p> |
||||
Each block is individually compressed before being written to |
||||
persistent storage. Compression is on by default since the default |
||||
compression method is very fast, and is automatically disabled for |
||||
uncompressible data. In rare cases, applications may want to disable |
||||
compression entirely, but should only do so if benchmarks show a |
||||
performance improvement: |
||||
<p> |
||||
<pre> |
||||
leveldb::Options options; |
||||
options.compression = leveldb::kNoCompression; |
||||
... leveldb::DB::Open(options, name, ...) .... |
||||
</pre> |
||||
<h2>Cache</h2> |
||||
<p> |
||||
The contents of the database are stored in a set of files in the |
||||
filesystem and each file stores a sequence of compressed blocks. If |
||||
<code>options.cache</code> is non-NULL, it is used to cache frequently used |
||||
uncompressed block contents. |
||||
<p> |
||||
<pre> |
||||
#include "leveldb/include/cache.h" |
||||
|
||||
leveldb::Options options; |
||||
options.cache = leveldb::NewLRUCache(100 * 1048576); // 100MB cache |
||||
leveldb::DB* db; |
||||
leveldb::DB::Open(options, name, &db); |
||||
... use the db ... |
||||
delete db; |
||||
delete options.cache; |
||||
</pre> |
||||
Note that the cache holds uncompressed data, and therefore it should |
||||
be sized according to application level data sizes, without any |
||||
reduction from compression. (Caching of compressed blocks is left to |
||||
the operating system buffer cache, or any custom <code>Env</code> |
||||
implementation provided by the client.) |
||||
<p> |
||||
When performing a bulk read, the application may wish to disable |
||||
caching so that the data processed by the bulk read does not end up |
||||
displacing most of the cached contents. A per-iterator option can be |
||||
used to achieve this: |
||||
<p> |
||||
<pre> |
||||
leveldb::ReadOptions options; |
||||
options.fill_cache = false; |
||||
leveldb::Iterator* it = db->NewIterator(options); |
||||
for (it->SeekToFirst(); it->Valid(); it->Next()) { |
||||
... |
||||
} |
||||
</pre> |
||||
<h2>Key Layout</h2> |
||||
<p> |
||||
Note that the unit of disk transfer and caching is a block. Adjacent |
||||
keys (according to the database sort order) will usually be placed in |
||||
the same block. Therefore the application can improve its performance |
||||
by placing keys that are accessed together near each other and placing |
||||
infrequently used keys in a separate region of the key space. |
||||
<p> |
||||
For example, suppose we are implementing a simple file system on top |
||||
of <code>leveldb</code>. The types of entries we might wish to store are: |
||||
<p> |
||||
<pre> |
||||
filename -> permission-bits, length, list of file_block_ids |
||||
file_block_id -> data |
||||
</pre> |
||||
We might want to prefix <code>filename</code> keys with one letter (say '/') and the |
||||
<code>file_block_id</code> keys with a different letter (say '0') so that scans |
||||
over just the metadata do not force us to fetch and cache bulky file |
||||
contents. |
||||
<p> |
||||
<h2>Large Values</h2> |
||||
<p> |
||||
<code>leveldb</code> has special treatment of large values (by default, a value |
||||
of length greater than or equal to 64K is considered large, though a |
||||
field in Options can be used to adjust this threshold). Each such |
||||
large value is placed in a separate operating system file, and the |
||||
normal database blocks just contain pointers to such files. |
||||
<p> |
||||
Furthermore, if the same large value occurs multiple times in a single |
||||
database, it will be stored just once. |
||||
<p> |
||||
<h1>Checksums</h1> |
||||
<p> |
||||
<code>leveldb</code> associates checksums with all data it stores in the file system. |
||||
There are two separate controls provided over how aggressively these |
||||
checksums are verified: |
||||
<p> |
||||
<ul> |
||||
<li> <code>ReadOptions::verify_checksums</code> may be set to true to force |
||||
checksum verification of all data that is read from the file system on |
||||
behalf of a particular read. By default, no such verification is |
||||
done. |
||||
<p> |
||||
<li> <code>Options::paranoid_checks</code> may be set to true before opening a |
||||
database to make the database implementation raise an error as soon as |
||||
it detects an internal corruption. Depending on which portion of the |
||||
database has been corrupted, the error may be raised when the database |
||||
is opened, or later by another database operation. By default, |
||||
paranoid checking is off so that the database can be used even if |
||||
parts of its persistent storage have been corrupted. |
||||
<p> |
||||
If a database is corrupted (perhaps it cannot be opened when |
||||
paranoid checking is turned on), the <code>leveldb::RepairDB</code> function |
||||
may be used to recover as much of the data as possible. |
||||
<p> |
||||
</ul> |
||||
<h1>Approximate Sizes</h1> |
||||
<p> |
||||
The <code>GetApproximateSizes</code> method can be used to get the approximate |
||||
number of bytes of file system space used by one or more key ranges. |
||||
<p> |
||||
<pre> |
||||
leveldb::Range ranges[2]; |
||||
ranges[0] = leveldb::Range("a", "c"); |
||||
ranges[1] = leveldb::Range("x", "z"); |
||||
uint64_t sizes[2]; |
||||
leveldb::Status s = db->GetApproximateSizes(ranges, 2, sizes); |
||||
</pre> |
||||
The preceding call will set <code>sizes[0]</code> to the approximate number of |
||||
bytes of file system space used by the key range <code>[a..c)</code> and |
||||
<code>sizes[1]</code> to the approximate number of bytes used by the key range |
||||
<code>[x..z)</code>. |
||||
<p> |
||||
<h1>Environment</h1> |
||||
<p> |
||||
All file operations (and other operating system calls) issued by the |
||||
<code>leveldb</code> implementation are routed through a <code>leveldb::Env</code> object. |
||||
Sophisticated clients may wish to provide their own <code>Env</code> |
||||
implementation to get better control. For example, an application may |
||||
introduce artificial delays in the file IO paths to limit the impact |
||||
of <code>leveldb</code> on other activities in the system. |
||||
<p> |
||||
<pre> |
||||
class SlowEnv : public leveldb::Env { |
||||
.. implementation of the Env interface ... |
||||
}; |
||||
|
||||
SlowEnv env; |
||||
leveldb::Options options; |
||||
options.env = &env; |
||||
Status s = leveldb::DB::Open(options, ...); |
||||
</pre> |
||||
<h1>Porting</h1> |
||||
<p> |
||||
<code>leveldb</code> may be ported to a new platform by providing platform |
||||
specific implementations of the types/methods/functions exported by |
||||
<code>leveldb/port/port.h</code>. See <code>leveldb/port/port_example.h</code> for more |
||||
details. |
||||
<p> |
||||
In addition, the new platform may need a new default <code>leveldb::Env</code> |
||||
implementation. See <code>leveldb/util/env_posix.h</code> for an example. |
||||
|
||||
<h1>Other Information</h1> |
||||
|
||||
<p> |
||||
Details about the <code>leveldb</code> implementation may be found in |
||||
the following documents: |
||||
<ul> |
||||
<li> <a href="impl.html">Implementation notes</a> |
||||
<li> <a href="table_format.txt">Format of an immutable Table file</a> |
||||
<li> <a href="log_format.txt">Format of a log file</a> |
||||
</ul> |
||||
|
||||
</body> |
||||
</html> |
@ -0,0 +1,72 @@ |
||||
The log file contents are a sequence of 32KB blocks. The only |
||||
exception is that the tail of the file may contain a partial block. |
||||
|
||||
Each block consists of a sequence of records: |
||||
block := record* trailer? |
||||
record := |
||||
checksum: uint32 // crc32c of type and data[] |
||||
length: uint16 |
||||
type: uint8 // One of FULL, FIRST, MIDDLE, LAST |
||||
data: uint8[length] |
||||
|
||||
A record never starts within the last seven bytes of a block. Any |
||||
leftover bytes here form the trailer, which must consist entirely of |
||||
zero bytes and must be skipped by readers. In particular, even if |
||||
there are exactly seven bytes left in the block, and a zero-length |
||||
user record is added (which will fit in these seven bytes), the writer |
||||
must skip these trailer bytes and add the record to the next block. |
||||
|
||||
More types may be added in the future. Some Readers may skip record |
||||
types they do not understand, others may report that some data was |
||||
skipped. |
||||
|
||||
FULL == 1 |
||||
FIRST == 2 |
||||
MIDDLE == 3 |
||||
LAST == 4 |
||||
|
||||
The FULL record contains the contents of an entire user record. |
||||
|
||||
FIRST, MIDDLE, LAST are types used for user records that have been |
||||
split into multiple fragments (typically because of block boundaries). |
||||
FIRST is the type of the first fragment of a user record, LAST is the |
||||
type of the last fragment of a user record, and MIDDLE is the type of all |
||||
interior fragments of a user record. |
||||
|
||||
Example: consider a sequence of user records: |
||||
A: length 1000 |
||||
B: length 97270 |
||||
C: length 8000 |
||||
A will be stored as a FULL record in the first block. |
||||
|
||||
B will be split into three fragments: first fragment occupies the rest |
||||
of the first block, second fragment occupies the entirety of the |
||||
second block, and the third fragment occupies a prefix of the third |
||||
block. This will leave six bytes free in the third block, which will |
||||
be left empty as the trailer. |
||||
|
||||
C will be stored as a FULL record in the fourth block. |
||||
|
||||
=================== |
||||
|
||||
Some benefits over the recordio format: |
||||
|
||||
(1) We do not need any heuristics for resyncing - just go to next |
||||
block boundary and scan. If there is a corruption, skip to the next |
||||
block. As a side-benefit, we do not get confused when part of the |
||||
contents of one log file are embedded as a record inside another log |
||||
file. |
||||
|
||||
(2) Splitting at approximate boundaries (e.g., for mapreduce) is |
||||
simple: find the next block boundary and skip records until we |
||||
hit a FULL or FIRST record. |
||||
|
||||
(3) We do not need extra buffering for large records. |
||||
|
||||
Some downsides compared to recordio format: |
||||
|
||||
(1) No packing of tiny records. This could be fixed by adding a new |
||||
record type, so it is a shortcoming of the current implementation, |
||||
not necessarily the format. |
||||
|
||||
(2) No compression. Again, this could be fixed by adding new record types. |
@ -0,0 +1,61 @@ |
||||
File format |
||||
=========== |
||||
|
||||
<beginning_of_file> |
||||
[data block 1] |
||||
[data block 2] |
||||
... |
||||
[data block N] |
||||
[meta block 1] |
||||
... |
||||
[meta block K] |
||||
[metaindex block] |
||||
[index block] |
||||
[Footer] (fixed size; starts at file_size - sizeof(Footer)) |
||||
<end_of_file> |
||||
|
||||
The file contains internal pointers. Each such pointer is called |
||||
a BlockHandle and contains the following information: |
||||
offset: varint64 |
||||
size: varint64 |
||||
|
||||
(1) The sequence of key/value pairs in the file are stored in sorted |
||||
order and partitioned into a sequence of data blocks. These blocks |
||||
come one after another at the beginning of the file. Each data block |
||||
is formatted according to the code in block_builder.cc, and then |
||||
optionally compressed. |
||||
|
||||
(2) After the data blocks we store a bunch of meta blocks. The |
||||
supported meta block types are described below. More meta block types |
||||
may be added in the future. Each meta block is again formatted using |
||||
block_builder.cc and then optionally compressed. |
||||
|
||||
(3) A "metaindex" block. It contains one entry for every other meta |
||||
block where the key is the name of the meta block and the value is a |
||||
BlockHandle pointing to that meta block. |
||||
|
||||
(4) An "index" block. This block contains one entry per data block, |
||||
where the key is a string >= last key in that data block and before |
||||
the first key in the successive data block. The value is the |
||||
BlockHandle for the data block. |
||||
|
||||
(5) At the very end of the file is a fixed length footer that contains |
||||
the BlockHandle of the metaindex and index blocks as well as a magic number. |
||||
metaindex_handle: char[p]; // Block handle for metaindex |
||||
index_handle: char[q]; // Block handle for index |
||||
padding: char[40-p-q]; // 0 bytes to make fixed length |
||||
// (40==2*BlockHandle::kMaxEncodedLength) |
||||
magic: fixed64; // == 0xdb4775248b80fb57 |
||||
|
||||
"stats" Meta Block |
||||
------------------ |
||||
|
||||
This meta block contains a bunch of stats. The key is the name |
||||
of the statistic. The value contains the statistic. |
||||
TODO(postrelease): record following stats. |
||||
data size |
||||
index size |
||||
key size (uncompressed) |
||||
value size (uncompressed) |
||||
number of entries |
||||
number of data blocks |
@ -0,0 +1,99 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// A Cache is an interface that maps keys to values. It has internal
|
||||
// synchronization and may be safely accessed concurrently from
|
||||
// multiple threads. It may automatically evict entries to make room
|
||||
// for new entries. Values have a specified charge against the cache
|
||||
// capacity. For example, a cache where the values are variable
|
||||
// length strings, may use the length of the string as the charge for
|
||||
// the string.
|
||||
//
|
||||
// A builtin cache implementation with a least-recently-used eviction
|
||||
// policy is provided. Clients may use their own implementations if
|
||||
// they want something more sophisticated (like scan-resistance, a
|
||||
// custom eviction policy, variable cache sizing, etc.)
|
||||
|
||||
#ifndef STORAGE_LEVELDB_INCLUDE_CACHE_H_ |
||||
#define STORAGE_LEVELDB_INCLUDE_CACHE_H_ |
||||
|
||||
#include <stdint.h> |
||||
#include "include/slice.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
class Cache; |
||||
|
||||
// Create a new cache with a fixed size capacity. This implementation
|
||||
// of Cache uses a least-recently-used eviction policy.
|
||||
extern Cache* NewLRUCache(size_t capacity); |
||||
|
||||
class Cache {
 public:
  Cache() { }

  // Destroys all existing entries by calling the "deleter"
  // function that was passed to the constructor.
  virtual ~Cache();

  // Opaque handle to an entry stored in the cache.
  struct Handle { };

  // Insert a mapping from key->value into the cache and assign it
  // the specified charge against the total cache capacity.
  //
  // Returns a handle that corresponds to the mapping.  The caller
  // must call this->Release(handle) when the returned mapping is no
  // longer needed.
  //
  // When the inserted entry is no longer needed, the key and
  // value will be passed to "deleter".
  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
                         void (*deleter)(const Slice& key, void* value)) = 0;

  // If the cache has no mapping for "key", returns NULL.
  //
  // Else return a handle that corresponds to the mapping.  The caller
  // must call this->Release(handle) when the returned mapping is no
  // longer needed.
  virtual Handle* Lookup(const Slice& key) = 0;

  // Release a mapping returned by a previous Lookup().
  // REQUIRES: handle must not have been released yet.
  // REQUIRES: handle must have been returned by a method on *this.
  virtual void Release(Handle* handle) = 0;

  // Return the value encapsulated in a handle returned by a
  // successful Lookup().
  // REQUIRES: handle must not have been released yet.
  // REQUIRES: handle must have been returned by a method on *this.
  virtual void* Value(Handle* handle) = 0;

  // If the cache contains an entry for key, erase it.  Note that the
  // underlying entry will be kept around until all existing handles
  // to it have been released.
  virtual void Erase(const Slice& key) = 0;

  // Return a new numeric id.  May be used by multiple clients who are
  // sharing the same cache to partition the key space.  Typically the
  // client will allocate a new id at startup and prepend the id to
  // its cache keys.
  virtual uint64_t NewId() = 0;

 private:
  // NOTE(review): the declarations below are never referenced in this
  // header and look like implementation details of the builtin LRU cache
  // rather than part of this abstract interface — confirm against the
  // implementation file whether they can be removed.
  void LRU_Remove(Handle* e);
  void LRU_Append(Handle* e);
  void Unref(Handle* e);

  struct Rep;
  Rep* rep_;

  // No copying allowed
  Cache(const Cache&);
  void operator=(const Cache&);
};
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_UTIL_CACHE_H_
|
@ -0,0 +1,61 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ |
||||
#define STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ |
||||
|
||||
#include <string> |
||||
|
||||
namespace leveldb { |
||||
|
||||
class Slice; |
||||
|
||||
// A Comparator object provides a total order across slices that are
|
||||
// used as keys in an sstable or a database.
|
||||
class Comparator {
 public:
  virtual ~Comparator();

  // Three-way comparison.  Returns value:
  //   < 0 iff "a" < "b",
  //   == 0 iff "a" == "b",
  //   > 0 iff "a" > "b"
  virtual int Compare(const Slice& a, const Slice& b) const = 0;

  // The name of the comparator.  Used to check for comparator
  // mismatches (i.e., a DB created with one comparator is
  // accessed using a different comparator).
  //
  // The client of this package should switch to a new name whenever
  // the comparator implementation changes in a way that will cause
  // the relative ordering of any two keys to change.
  //
  // Names starting with "leveldb." are reserved and should not be used
  // by any clients of this package.
  virtual const char* Name() const = 0;

  // Advanced functions: these are used to reduce the space requirements
  // for internal data structures like index blocks.

  // If *start < limit, changes *start to a short string in [start,limit).
  // Simple comparator implementations may return with *start unchanged,
  // i.e., an implementation of this method that does nothing is correct.
  virtual void FindShortestSeparator(
      std::string* start,
      const Slice& limit) const = 0;

  // Changes *key to a short string >= *key.
  // Simple comparator implementations may return with *key unchanged,
  // i.e., an implementation of this method that does nothing is correct.
  virtual void FindShortSuccessor(std::string* key) const = 0;
};
||||
|
||||
// Return a builtin comparator that uses lexicographic byte-wise
|
||||
// ordering. The result remains the property of this module and
|
||||
// must not be deleted.
|
||||
extern const Comparator* BytewiseComparator(); |
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
|
@ -0,0 +1,137 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_INCLUDE_DB_H_ |
||||
#define STORAGE_LEVELDB_INCLUDE_DB_H_ |
||||
|
||||
#include <stdint.h> |
||||
#include <stdio.h> |
||||
#include "include/iterator.h" |
||||
#include "include/options.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
struct Options; |
||||
struct ReadOptions; |
||||
struct WriteOptions; |
||||
|
||||
class Snapshot; |
||||
class WriteBatch; |
||||
|
||||
// Some internal types. Clients should ignore.
|
||||
class WriteBatchInternal; |
||||
|
||||
// A key range [start..limit): "start" is included, "limit" is excluded
// (see DB::GetApproximateSizes, which consumes an array of Ranges).
struct Range {
  Slice start;          // Included in the range
  Slice limit;          // Not included in the range

  Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
};
||||
|
||||
// A DB is a persistent ordered map from keys to values.
|
||||
// A DB is a persistent ordered map from keys to values.
class DB {
 public:
  // Open the database with the specified "name".
  // Stores a pointer to a heap-allocated database in *dbptr and returns
  // OK on success.
  // Stores NULL in *dbptr and returns a non-OK status on error.
  // Caller should delete *dbptr when it is no longer needed.
  static Status Open(const Options& options,
                     const std::string& name,
                     DB** dbptr);

  DB() { }
  virtual ~DB();

  // Set the database entry for "key" to "value".  Returns OK on success,
  // and a non-OK status on error.
  // Note: consider setting options.sync = false.
  virtual Status Put(const WriteOptions& options,
                     const Slice& key,
                     const Slice& value) = 0;

  // Remove the database entry (if any) for "key".  Returns OK on
  // success, and a non-OK status on error.  It is not an error if "key"
  // did not exist in the database.
  // Note: consider setting options.sync = false.
  virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;

  // Apply the specified updates to the database.
  // Returns OK on success, non-OK on failure.
  // Note: consider setting options.sync = false.
  virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;

  // If the database contains an entry for "key" store the
  // corresponding value in *value and return OK.
  //
  // If there is no entry for "key" leave *value unchanged and return
  // a status for which Status::IsNotFound() returns true.
  //
  // May return some other Status on an error.
  virtual Status Get(const ReadOptions& options,
                     const Slice& key, std::string* value) = 0;

  // Return a heap-allocated iterator over the contents of the database.
  // The result of NewIterator() is initially invalid (caller must
  // call one of the Seek methods on the iterator before using it).
  //
  // Caller should delete the iterator when it is no longer needed.
  // The returned iterator should be deleted before this db is deleted.
  virtual Iterator* NewIterator(const ReadOptions& options) = 0;

  // Return a handle to the current DB state.  Iterators created with
  // this handle will all observe a stable snapshot of the current DB
  // state.  The caller must call ReleaseSnapshot(result) when the
  // snapshot is no longer needed.
  virtual const Snapshot* GetSnapshot() = 0;

  // Release a previously acquired snapshot.  The caller must not
  // use "snapshot" after this call.
  virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;

  // DB implementations can export properties about their state
  // via this method.  If "property" is a valid property understood by this
  // DB implementation, fills "*value" with its current value and returns
  // true.  Otherwise returns false.
  //
  // Valid property names include:
  //
  //  "leveldb.num-files-at-level<N>" - return the number of files at level <N>,
  //     where <N> is an ASCII representation of a level number (e.g. "0").
  virtual bool GetProperty(const Slice& property, uint64_t* value) = 0;

  // For each i in [0,n-1], store in "sizes[i]", the approximate
  // file system space used by keys in "[range[i].start .. range[i].limit)".
  //
  // Note that the returned sizes measure file system space usage, so
  // if the user data compresses by a factor of ten, the returned
  // sizes will be one-tenth the size of the corresponding user data size.
  //
  // The results may not include the sizes of recently written data.
  virtual void GetApproximateSizes(const Range* range, int n,
                                   uint64_t* sizes) = 0;

  // Possible extensions:
  // (1) Add a method to compact a range of keys

 private:
  // No copying allowed
  DB(const DB&);
  void operator=(const DB&);
};
||||
|
||||
// Destroy the contents of the specified database.
|
||||
// Be very careful using this method.
|
||||
Status DestroyDB(const std::string& name, const Options& options); |
||||
|
||||
// If a DB cannot be opened, you may attempt to call this method to
|
||||
// resurrect as much of the contents of the database as possible.
|
||||
// Some data may be lost, so be careful when calling this function
|
||||
// on a database that contains important information.
|
||||
Status RepairDB(const std::string& dbname, const Options& options); |
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_INCLUDE_DB_H_
|
@ -0,0 +1,293 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// An Env is an interface used by the leveldb implementation to access
|
||||
// operating system functionality like the filesystem etc. Callers
|
||||
// may wish to provide a custom Env object when opening a database to
|
||||
// get fine gain control; e.g., to rate limit file system operations.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_ |
||||
#define STORAGE_LEVELDB_INCLUDE_ENV_H_ |
||||
|
||||
#include <cstdarg> |
||||
#include <string> |
||||
#include <vector> |
||||
#include <stdint.h> |
||||
#include "include/status.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
class FileLock; |
||||
class RandomAccessFile; |
||||
class SequentialFile; |
||||
class Slice; |
||||
class WritableFile; |
||||
|
||||
class Env {
 public:
  Env() { }
  // Virtual destructor: Env is deleted through base-class pointers.
  virtual ~Env();

  // Return a default environment suitable for the current operating
  // system.  Sophisticated users may wish to provide their own Env
  // implementation instead of relying on this default environment.
  //
  // The result of Default() belongs to leveldb and must never be deleted.
  static Env* Default();

  // Create a brand new sequentially-readable file with the specified name.
  // On success, stores a pointer to the new file in *result and returns OK.
  // On failure stores NULL in *result and returns non-OK.  If the file does
  // not exist, returns a non-OK status.
  //
  // The returned file will only be accessed by one thread at a time.
  virtual Status NewSequentialFile(const std::string& fname,
                                   SequentialFile** result) = 0;

  // Create a brand new random access read-only file with the
  // specified name.  On success, stores a pointer to the new file in
  // *result and returns OK.  On failure stores NULL in *result and
  // returns non-OK.  If the file does not exist, returns a non-OK
  // status.
  //
  // The returned file may be concurrently accessed by multiple threads.
  virtual Status NewRandomAccessFile(const std::string& fname,
                                     RandomAccessFile** result) = 0;

  // Create an object that writes to a new file with the specified
  // name.  Deletes any existing file with the same name and creates a
  // new file.  On success, stores a pointer to the new file in
  // *result and returns OK.  On failure stores NULL in *result and
  // returns non-OK.
  //
  // The returned file will only be accessed by one thread at a time.
  virtual Status NewWritableFile(const std::string& fname,
                                 WritableFile** result) = 0;

  // Returns true iff the named file exists.
  virtual bool FileExists(const std::string& fname) = 0;

  // Store in *result the names of the children of the specified directory.
  // The names are relative to "dir".
  // Original contents of *result are dropped.
  virtual Status GetChildren(const std::string& dir,
                             std::vector<std::string>* result) = 0;

  // Delete the named file.
  virtual Status DeleteFile(const std::string& fname) = 0;

  // Create the specified directory.
  virtual Status CreateDir(const std::string& dirname) = 0;

  // Delete the specified directory.
  virtual Status DeleteDir(const std::string& dirname) = 0;

  // Store the size of fname in *file_size.
  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;

  // Rename file src to target.
  virtual Status RenameFile(const std::string& src,
                            const std::string& target) = 0;

  // Lock the specified file.  Used to prevent concurrent access to
  // the same db by multiple processes.  On failure, stores NULL in
  // *lock and returns non-OK.
  //
  // On success, stores a pointer to the object that represents the
  // acquired lock in *lock and returns OK.  The caller should call
  // UnlockFile(*lock) to release the lock.  If the process exits,
  // the lock will be automatically released.
  //
  // If somebody else already holds the lock, finishes immediately
  // with a failure.  I.e., this call does not wait for existing locks
  // to go away.
  //
  // May create the named file if it does not already exist.
  virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;

  // Release the lock acquired by a previous successful call to LockFile.
  // REQUIRES: lock was returned by a successful LockFile() call
  // REQUIRES: lock has not already been unlocked.
  virtual Status UnlockFile(FileLock* lock) = 0;

  // Arrange to run "(*function)(arg)" once in a background thread.
  //
  // "function" may run in an unspecified thread.  Multiple functions
  // added to the same Env may run concurrently in different threads.
  // I.e., the caller may not assume that background work items are
  // serialized.
  virtual void Schedule(
      void (*function)(void* arg),
      void* arg) = 0;

  // Start a new thread, invoking "function(arg)" within the new thread.
  // When "function(arg)" returns, the thread will be destroyed.
  virtual void StartThread(void (*function)(void* arg), void* arg) = 0;

  // *path is set to a temporary directory that can be used for testing. It may
  // or may not have just been created.  The directory may or may not differ
  // between runs of the same process, but subsequent calls will return the
  // same directory.
  virtual Status GetTestDirectory(std::string* path) = 0;

  // Write an entry to the log file with the specified format.
  virtual void Logv(WritableFile* log, const char* format, va_list ap) = 0;

  // Returns the number of micro-seconds since some fixed point in time. Only
  // useful for computing deltas of time.
  virtual uint64_t NowMicros() = 0;

  // Sleep/delay the thread for the prescribed number of micro-seconds.
  virtual void SleepForMicroseconds(int micros) = 0;

 private:
  // No copying allowed
  Env(const Env&);
  void operator=(const Env&);
};
||||
|
||||
// A file abstraction for reading sequentially through a file
class SequentialFile {
 public:
  SequentialFile() { }
  virtual ~SequentialFile();

  // Read up to "n" bytes from the file.  "scratch[0..n-1]" may be
  // written by this routine.  Sets "*result" to the data that was
  // read (including if fewer than "n" bytes were successfully read).
  // If an error was encountered, returns a non-OK status.
  //
  // REQUIRES: External synchronization
  virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
};
||||
|
||||
// A file abstraction for randomly reading the contents of a file.
class RandomAccessFile {
 public:
  RandomAccessFile() { }
  virtual ~RandomAccessFile();

  // Return the length of this file in bytes.
  virtual uint64_t Size() const = 0;

  // Read up to "n" bytes from the file starting at "offset".
  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
  // to the data that was read (including if fewer than "n" bytes were
  // successfully read).  If an error was encountered, returns a
  // non-OK status.
  //
  // Safe for concurrent use by multiple threads.
  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const = 0;
};
||||
|
||||
// A file abstraction for sequential writing.  The implementation
// must provide buffering since callers may append small fragments
// at a time to the file.
class WritableFile {
 public:
  WritableFile() { }
  virtual ~WritableFile();

  // Append "data" to the end of the file.
  virtual Status Append(const Slice& data) = 0;
  // Close the file; no further operations are valid afterwards.
  virtual Status Close() = 0;
  // Push buffered data to the operating system (not necessarily to disk).
  virtual Status Flush() = 0;
  // Force buffered data to stable storage.
  virtual Status Sync() = 0;

 private:
  // No copying allowed
  WritableFile(const WritableFile&);
  void operator=(const WritableFile&);
};
||||
|
||||
// Identifies a locked file.  Instances are created by Env::LockFile()
// and released via Env::UnlockFile().
class FileLock {
 public:
  FileLock() { }
  virtual ~FileLock();
 private:
  // No copying allowed
  FileLock(const FileLock&);
  void operator=(const FileLock&);
};
||||
|
||||
// Log the specified data to *info_log if info_log is non-NULL.
|
||||
extern void Log(Env* env, WritableFile* info_log, const char* format, ...) |
||||
# if defined(__GNUC__) || defined(__clang__) |
||||
__attribute__((__format__ (__printf__, 3, 4))) |
||||
# endif |
||||
; |
||||
|
||||
// A utility routine: write "data" to the named file.
|
||||
extern Status WriteStringToFile(Env* env, const Slice& data, |
||||
const std::string& fname); |
||||
|
||||
// A utility routine: read contents of named file into *data
|
||||
extern Status ReadFileToString(Env* env, const std::string& fname, |
||||
std::string* data); |
||||
|
||||
// An implementation of Env that forwards all calls to another Env.
|
||||
// May be useful to clients who wish to override just part of the
|
||||
// functionality of another Env.
|
||||
class EnvWrapper : public Env { |
||||
public: |
||||
// Initialize an EnvWrapper that delegates all calls to *target
|
||||
explicit EnvWrapper(Env* target) : target_(target) { } |
||||
virtual ~EnvWrapper(); |
||||
|
||||
// Return the target to which this Env forwards all calls
|
||||
Env* target() const { return target_; } |
||||
|
||||
// The following text is boilerplate that forwards all methods to target()
|
||||
Status NewSequentialFile(const std::string& f, SequentialFile** r) { |
||||
return target_->NewSequentialFile(f, r); |
||||
} |
||||
Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) { |
||||
return target_->NewRandomAccessFile(f, r); |
||||
} |
||||
Status NewWritableFile(const std::string& f, WritableFile** r) { |
||||
return target_->NewWritableFile(f, r); |
||||
} |
||||
bool FileExists(const std::string& f) { return target_->FileExists(f); } |
||||
Status GetChildren(const std::string& dir, std::vector<std::string>* r) { |
||||
return target_->GetChildren(dir, r); |
||||
} |
||||
Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); } |
||||
Status CreateDir(const std::string& d) { return target_->CreateDir(d); } |
||||
Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); } |
||||
Status GetFileSize(const std::string& f, uint64_t* s) { |
||||
return target_->GetFileSize(f, s); |
||||
} |
||||
Status RenameFile(const std::string& s, const std::string& t) { |
||||
return target_->RenameFile(s, t); |
||||
} |
||||
Status LockFile(const std::string& f, FileLock** l) { |
||||
return target_->LockFile(f, l); |
||||
} |
||||
Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); } |
||||
void Schedule(void (*f)(void*), void* a) { |
||||
return target_->Schedule(f, a); |
||||
} |
||||
void StartThread(void (*f)(void*), void* a) { |
||||
return target_->StartThread(f, a); |
||||
} |
||||
virtual Status GetTestDirectory(std::string* path) { |
||||
return target_->GetTestDirectory(path); |
||||
} |
||||
virtual void Logv(WritableFile* log, const char* format, va_list ap) { |
||||
return target_->Logv(log, format, ap); |
||||
} |
||||
uint64_t NowMicros() { |
||||
return target_->NowMicros(); |
||||
} |
||||
void SleepForMicroseconds(int micros) { |
||||
target_->SleepForMicroseconds(micros); |
||||
} |
||||
private: |
||||
Env* target_; |
||||
}; |
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_INCLUDE_ENV_H_
|
@ -0,0 +1,95 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// An iterator yields a sequence of key/value pairs from a source.
|
||||
// The following class defines the interface. Multiple implementations
|
||||
// are provided by this library. In particular, iterators are provided
|
||||
// to access the contents of a Table or a DB.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ |
||||
#define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ |
||||
|
||||
#include "include/slice.h" |
||||
#include "include/status.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
class Iterator {
 public:
  Iterator();
  virtual ~Iterator();

  // An iterator is either positioned at a key/value pair, or
  // not valid.  This method returns true iff the iterator is valid.
  virtual bool Valid() const = 0;

  // Position at the first key in the source.  The iterator is Valid()
  // after this call iff the source is not empty.
  virtual void SeekToFirst() = 0;

  // Position at the last key in the source.  The iterator is
  // Valid() after this call iff the source is not empty.
  virtual void SeekToLast() = 0;

  // Position at the first key in the source that is at or past target.
  // The iterator is Valid() after this call iff the source contains
  // an entry that comes at or past target.
  virtual void Seek(const Slice& target) = 0;

  // Moves to the next entry in the source.  After this call, Valid() is
  // true iff the iterator was not positioned at the last entry in the source.
  // REQUIRES: Valid()
  virtual void Next() = 0;

  // Moves to the previous entry in the source.  After this call, Valid() is
  // true iff the iterator was not positioned at the first entry in source.
  // REQUIRES: Valid()
  virtual void Prev() = 0;

  // Return the key for the current entry.  The underlying storage for
  // the returned slice is valid only until the next modification of
  // the iterator.
  // REQUIRES: Valid()
  virtual Slice key() const = 0;

  // Return the value for the current entry.  The underlying storage for
  // the returned slice is valid only until the next modification of
  // the iterator.
  // REQUIRES: Valid()
  // (Previous comment said "!AtEnd() && !AtStart()" — no such methods
  // exist on this interface.)
  virtual Slice value() const = 0;

  // If an error has occurred, return it.  Else return an ok status.
  virtual Status status() const = 0;

  // Clients are allowed to register function/arg1/arg2 triples that
  // will be invoked when this iterator is destroyed.
  //
  // Note that unlike all of the preceding methods, this method is
  // not abstract and therefore clients should not override it.
  typedef void (*CleanupFunction)(void* arg1, void* arg2);
  void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);

 private:
  // Node in an intrusive singly-linked list of registered cleanups;
  // "cleanup_" below is the (inline) head of that list.
  struct Cleanup {
    CleanupFunction function;
    void* arg1;
    void* arg2;
    Cleanup* next;
  };
  Cleanup cleanup_;

  // No copying allowed
  Iterator(const Iterator&);
  void operator=(const Iterator&);
};
||||
|
||||
// Return an empty iterator (yields nothing).
|
||||
extern Iterator* NewEmptyIterator(); |
||||
|
||||
// Return an empty iterator with the specified status.
|
||||
extern Iterator* NewErrorIterator(const Status& status); |
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
|
@ -0,0 +1,203 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ |
||||
#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ |
||||
|
||||
#include <stddef.h> |
||||
|
||||
namespace leveldb { |
||||
|
||||
class Cache; |
||||
class Comparator; |
||||
class Env; |
||||
class Snapshot; |
||||
class WritableFile; |
||||
|
||||
// DB contents are stored in a set of blocks, each of which holds a
// sequence of key,value pairs.  Each block may be compressed before
// being stored in a file.  The following enum describes which
// compression method (if any) is used to compress a block.
enum CompressionType {
  // NOTE: do not change the values of existing entries, as these are
  // part of the persistent format on disk.
  kNoCompression = 0x0,
  kLightweightCompression = 0x1,
};
||||
|
||||
// Options to control the behavior of a database (passed to DB::Open)
struct Options {
  // -------------------
  // Parameters that affect behavior

  // Comparator used to define the order of keys in the table.
  // Default: a comparator that uses lexicographic byte-wise ordering
  //
  // REQUIRES: The client must ensure that the comparator supplied
  // here has the same name and orders keys *exactly* the same as the
  // comparator provided to previous open calls on the same DB.
  const Comparator* comparator;

  // If true, the database will be created if it is missing.
  // Default: false
  bool create_if_missing;

  // If true, an error is raised if the database already exists.
  // Default: false
  bool error_if_exists;

  // If true, the implementation will do aggressive checking of the
  // data it is processing and will stop early if it detects any
  // errors.  This may have unforeseen ramifications: for example, a
  // corruption of one DB entry may cause a large number of entries to
  // become unreadable or for the entire DB to become unopenable.
  // Default: false
  bool paranoid_checks;

  // Use the specified object to interact with the environment,
  // e.g. to read/write files, schedule background work, etc.
  // Default: Env::Default()
  Env* env;

  // Any internal progress/error information generated by the db will
  // be written to info_log if it is non-NULL, or to a file stored
  // in the same directory as the DB contents if info_log is NULL.
  // Default: NULL
  WritableFile* info_log;

  // -------------------
  // Parameters that affect performance

  // Amount of data to build up in memory before converting to an
  // on-disk file.
  //
  // Some DB operations may encounter a delay proportional to the size
  // of this parameter.  Therefore we recommend against increasing
  // this parameter unless you are willing to live with an occasional
  // slow operation in exchange for faster bulk loading throughput.
  //
  // Default: 1MB
  size_t write_buffer_size;

  // Number of open files that can be used by the DB.  You may need to
  // increase this if your database has a large working set (budget
  // one open file per 2MB of working set).
  //
  // Default: 1000
  int max_open_files;

  // Handle values larger than "large_value_threshold" bytes
  // specially, by writing them into their own files (to avoid
  // compaction overhead) and doing content-based elimination of
  // duplicate values to save space.
  //
  // We recommend against changing this value.
  //
  // Default: 64K
  size_t large_value_threshold;

  // Control over blocks (user data is stored in a set of blocks, and
  // a block is the unit of reading from disk).

  // Use the specified cache for blocks (if non-NULL).
  // Default: NULL
  Cache* block_cache;

  // Approximate size of user data packed per block.  Note that the
  // block size specified here corresponds to uncompressed data.  The
  // actual size of the unit read from disk may be smaller if
  // compression is enabled.  This parameter can be changed dynamically.
  //
  // Default: 8K
  int block_size;

  // Number of keys between restart points for delta encoding of keys.
  // This parameter can be changed dynamically.  Most clients should
  // leave this parameter alone.
  //
  // Default: 16
  int block_restart_interval;

  // Compress blocks using the specified compression algorithm.  This
  // parameter can be changed dynamically.
  //
  // Default: kLightweightCompression, which gives lightweight but fast
  // compression.
  //
  // Typical speeds of kLightweightCompression on an Intel(R) Core(TM)2 2.4GHz:
  //    ~200-500MB/s compression
  //    ~400-800MB/s decompression
  // Note that these speeds are significantly faster than most
  // persistent storage speeds, and therefore it is typically never
  // worth switching to kNoCompression.  Even if the input data is
  // incompressible, the kLightweightCompression implementation will
  // efficiently detect that and will switch to uncompressed mode.
  CompressionType compression;

  // Create an Options object with default values for all fields.
  Options();
};
||||
|
||||
// Options that control read operations
struct ReadOptions {
  // If true, all data read from underlying storage will be
  // verified against corresponding checksums.
  // Default: false
  bool verify_checksums;

  // Should the data read for this iteration be cached in memory?
  // Callers may wish to set this field to false for bulk scans.
  // Default: true
  bool fill_cache;

  // If "snapshot" is non-NULL, read as of the supplied snapshot
  // (which must belong to the DB that is being read and which must
  // not have been released).  If "snapshot" is NULL, use an implicit
  // snapshot of the state at the beginning of this read operation.
  // Default: NULL
  const Snapshot* snapshot;

  // Construct with the default values documented above.
  ReadOptions()
      : verify_checksums(false),
        fill_cache(true),
        snapshot(NULL) {
  }
};
||||
|
||||
// Options that control write operations
struct WriteOptions {
  // If true, the write will be flushed from the operating system
  // buffer cache (by calling WritableFile::Sync()) before the write
  // is considered complete.  If this flag is true, writes will be
  // slower.
  //
  // If this flag is false, and the machine crashes, some recent
  // writes may be lost.  Note that if it is just the process that
  // crashes (i.e., the machine does not reboot), no writes will be
  // lost even if sync==false.
  //
  // Default: true
  bool sync;

  // If "post_write_snapshot" is non-NULL, and the write succeeds,
  // *post_write_snapshot will be modified to point to a snapshot of
  // the DB state immediately after this write.  The caller must call
  // DB::ReleaseSnapshot(*post_write_snapshot) when the
  // snapshot is no longer needed.
  //
  // If "post_write_snapshot" is non-NULL, and the write fails,
  // *post_write_snapshot will be set to NULL.
  //
  // Default: NULL
  const Snapshot** post_write_snapshot;

  // Construct with the default values documented above.
  WriteOptions()
      : sync(true),
        post_write_snapshot(NULL) {
  }
};
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
|
@ -0,0 +1,104 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// Slice is a simple structure containing a pointer into some external
|
||||
// storage and a size. The user of a Slice must ensure that the slice
|
||||
// is not used after the corresponding external storage has been
|
||||
// deallocated.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_ |
||||
#define STORAGE_LEVELDB_INCLUDE_SLICE_H_ |
||||
|
||||
#include <assert.h> |
||||
#include <stddef.h> |
||||
#include <string.h> |
||||
#include <string> |
||||
|
||||
namespace leveldb { |
||||
|
||||
// Slice: a non-owning (pointer, length) view of externally owned bytes.
// The viewed storage must outlive the Slice.  Intentionally cheap to copy.
class Slice {
 public:
  // An empty slice (points at "", length zero).
  Slice() : data_(""), size_(0) { }

  // A view of data[0,n-1].
  Slice(const char* data, size_t n) : data_(data), size_(n) { }

  // A view of the contents of "s" (no copy is made).
  Slice(const std::string& s) : data_(s.data()), size_(s.size()) { }

  // A view of the NUL-terminated string s (terminator excluded).
  Slice(const char* s) : data_(s), size_(strlen(s)) { }

  // Pointer to the first viewed byte.
  const char* data() const { return data_; }

  // Number of viewed bytes.
  size_t size() const { return size_; }

  // True iff the view has length zero.
  bool empty() const { return 0 == size_; }

  // Byte at position n.
  // REQUIRES: n < size()
  char operator[](size_t n) const {
    assert(n < size_);
    return data_[n];
  }

  // Reset this slice to the empty view.
  void clear() {
    data_ = "";
    size_ = 0;
  }

  // Advance the view past its first "n" bytes.
  // REQUIRES: n <= size()
  void remove_prefix(size_t n) {
    assert(n <= size_);
    size_ -= n;
    data_ += n;
  }

  // Copy the viewed bytes into an owning std::string.
  std::string ToString() const { return std::string(data_, size_); }

  // Three-way comparison.  Returns value:
  //   < 0 iff "*this" < "b",
  //   == 0 iff "*this" == "b",
  //   > 0 iff "*this" > "b"
  int compare(const Slice& b) const;

  // Return true iff "x" is a prefix of "*this"
  bool starts_with(const Slice& x) const {
    if (x.size_ > size_) return false;
    return memcmp(data_, x.data_, x.size_) == 0;
  }

 private:
  const char* data_;
  size_t size_;

  // Intentionally copyable
};
||||
|
||||
inline bool operator==(const Slice& x, const Slice& y) { |
||||
return ((x.size() == y.size()) && |
||||
(memcmp(x.data(), y.data(), x.size()) == 0)); |
||||
} |
||||
|
||||
inline bool operator!=(const Slice& x, const Slice& y) { |
||||
return !(x == y); |
||||
} |
||||
|
||||
inline int Slice::compare(const Slice& b) const { |
||||
const int min_len = (size_ < b.size_) ? size_ : b.size_; |
||||
int r = memcmp(data_, b.data_, min_len); |
||||
if (r == 0) { |
||||
if (size_ < b.size_) r = -1; |
||||
else if (size_ > b.size_) r = +1; |
||||
} |
||||
return r; |
||||
} |
||||
|
||||
} |
||||
|
||||
|
||||
#endif // STORAGE_LEVELDB_INCLUDE_SLICE_H_
|
@ -0,0 +1,86 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// A Status encapsulates the result of an operation. It may indicate success,
|
||||
// or it may indicate an error with an associated error message.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_ |
||||
#define STORAGE_LEVELDB_INCLUDE_STATUS_H_ |
||||
|
||||
#include <string> |
||||
#include <utility> |
||||
#include "include/slice.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
class Status { |
||||
public: |
||||
// Create a success status.
|
||||
Status() : state_(NULL) { } |
||||
~Status() { delete state_; } |
||||
|
||||
// Copy the specified status.
|
||||
Status(const Status& s); |
||||
void operator=(const Status& s); |
||||
|
||||
// Return a success status.
|
||||
static Status OK() { return Status(); } |
||||
|
||||
// Return error status of an appropriate type.
|
||||
static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { |
||||
return Status(kNotFound, msg, Slice()); |
||||
} |
||||
static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { |
||||
return Status(kCorruption, msg, msg2); |
||||
} |
||||
static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) { |
||||
return Status(kNotSupported, msg, msg2); |
||||
} |
||||
static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) { |
||||
return Status(kInvalidArgument, msg, msg2); |
||||
} |
||||
static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) { |
||||
return Status(kIOError, msg, msg2); |
||||
} |
||||
|
||||
// Returns true iff the status indicates success.
|
||||
bool ok() const { return (state_ == NULL); } |
||||
|
||||
// Returns true iff the status indicates a NotFound error.
|
||||
bool IsNotFound() const { return code() == kNotFound; } |
||||
|
||||
// Return a string representation of this status suitable for printing.
|
||||
// Returns the string "OK" for success.
|
||||
std::string ToString() const; |
||||
|
||||
private: |
||||
enum Code { |
||||
kOk = 0, |
||||
kNotFound = 1, |
||||
kCorruption = 2, |
||||
kNotSupported = 3, |
||||
kInvalidArgument = 4, |
||||
kIOError = 5, |
||||
}; |
||||
Code code() const { return (state_ == NULL) ? kOk : state_->first; } |
||||
|
||||
Status(Code code, const Slice& msg, const Slice& msg2); |
||||
|
||||
typedef std::pair<Code, std::string> State; |
||||
State* state_; |
||||
}; |
||||
|
||||
// Copy constructor: deep-copies the (code, message) state, if any;
// OK statuses (NULL state_) stay allocation-free.
inline Status::Status(const Status& s) {
  state_ = (s.state_ == NULL) ? NULL : new State(*s.state_);
}
// Copy assignment: guards against self-assignment, releases the old
// state, then deep-copies the source state (NULL for OK).
inline void Status::operator=(const Status& s) {
  if (this != &s) {
    delete state_;
    state_ = (s.state_ == NULL) ? NULL : new State(*s.state_);
  }
}
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_INCLUDE_STATUS_H_
|
@ -0,0 +1,67 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_H_ |
||||
#define STORAGE_LEVELDB_INCLUDE_TABLE_H_ |
||||
|
||||
#include <stdint.h> |
||||
#include "include/iterator.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
class Block; |
||||
class BlockHandle; |
||||
struct Options; |
||||
class RandomAccessFile; |
||||
struct ReadOptions; |
||||
|
||||
// A Table is a sorted map from strings to strings. Tables are
|
||||
// immutable and persistent.
|
||||
class Table {
 public:
  // Attempt to open the table that is stored in "file", and read the
  // metadata entries necessary to allow retrieving data from the table.
  //
  // If successful, returns ok and sets "*table" to the newly opened
  // table.  The client should delete "*table" when no longer needed.
  // If there was an error while initializing the table, sets "*table"
  // to NULL and returns a non-ok status.  Does not take ownership of
  // "*file", but the client must ensure that "file" remains live
  // for the duration of the returned table's lifetime.
  //
  // *file must remain live while this Table is in use.
  static Status Open(const Options& options,
                     RandomAccessFile* file,
                     Table** table);

  ~Table();

  // Returns a new iterator over the table contents.
  // The result of NewIterator() is initially invalid (caller must
  // call one of the Seek methods on the iterator before using it).
  Iterator* NewIterator(const ReadOptions&) const;

  // Given a key, return an approximate byte offset in the file where
  // the data for that key begins (or would begin if the key were
  // present in the file).  The returned value is in terms of file
  // bytes, and so includes effects like compression of the underlying data.
  // E.g., the approximate offset of the last key in the table will
  // be close to the file length.
  uint64_t ApproximateOffsetOf(const Slice& key) const;

 private:
  // Opaque implementation state (defined in table/table.cc).
  struct Rep;
  Rep* rep_;

  // Takes the freshly-built Rep; only reachable through Open().
  explicit Table(Rep* rep) { rep_ = rep; }

  // Callback that converts an index entry into an iterator over the
  // corresponding data block — presumably used by NewIterator();
  // see table/table.cc for the actual contract.
  static Iterator* BlockReader(void*, const ReadOptions&, const Slice&);

  // No copying allowed
  Table(const Table&);
  void operator=(const Table&);
};
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_INCLUDE_TABLE_H_
|
@ -0,0 +1,86 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// TableBuilder provides the interface used to build a Table
|
||||
// (an immutable and sorted map from keys to values).
|
||||
|
||||
#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ |
||||
#define STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ |
||||
|
||||
#include <stdint.h> |
||||
#include "include/options.h" |
||||
#include "include/status.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
class BlockBuilder; |
||||
class BlockHandle; |
||||
class WritableFile; |
||||
|
||||
class TableBuilder {
 public:
  // Create a builder that will store the contents of the table it is
  // building in *file.  Does not close the file.  It is up to the
  // caller to close the file after calling Finish().
  TableBuilder(const Options& options, WritableFile* file);

  // REQUIRES: Either Finish() or Abandon() has been called.
  ~TableBuilder();

  // Change the options used by this builder.  Note: only some of the
  // option fields can be changed after construction.  If a field is
  // not allowed to change dynamically and its value in the structure
  // passed to the constructor is different from its value in the
  // structure passed to this method, this method will return an error
  // without changing any fields.
  Status ChangeOptions(const Options& options);

  // Add key,value to the table being constructed.
  // REQUIRES: key is after any previously added key according to comparator.
  // REQUIRES: Finish(), Abandon() have not been called
  void Add(const Slice& key, const Slice& value);

  // Advanced operation: flush any buffered key/value pairs to file.
  // Can be used to ensure that two adjacent entries never live in
  // the same data block.  Most clients should not need to use this method.
  // REQUIRES: Finish(), Abandon() have not been called
  void Flush();

  // Return non-ok iff some error has been detected.
  Status status() const;

  // Finish building the table.  Stops using the file passed to the
  // constructor after this function returns.
  // REQUIRES: Finish(), Abandon() have not been called
  Status Finish();

  // Indicate that the contents of this builder should be abandoned.  Stops
  // using the file passed to the constructor after this function returns.
  // If the caller is not going to call Finish(), it must call Abandon()
  // before destroying this builder.
  // REQUIRES: Finish(), Abandon() have not been called
  void Abandon();

  // Number of calls to Add() so far.
  uint64_t NumEntries() const;

  // Size of the file generated so far.  If invoked after a successful
  // Finish() call, returns the size of the final generated file.
  uint64_t FileSize() const;

 private:
  // True as long as no error has been detected (shorthand for
  // status().ok()).
  bool ok() const { return status().ok(); }

  // Presumably serializes *block into the file and records its location
  // in *handle — see table/table_builder.cc for the actual contract.
  void WriteBlock(BlockBuilder* block, BlockHandle* handle);

  // Opaque implementation state (defined in table/table_builder.cc).
  struct Rep;
  Rep* rep_;

  // No copying allowed
  TableBuilder(const TableBuilder&);
  void operator=(const TableBuilder&);
};
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
|
@ -0,0 +1,49 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// WriteBatch holds a collection of updates to apply atomically to a DB.
|
||||
//
|
||||
// The updates are applied in the order in which they are added
|
||||
// to the WriteBatch. For example, the value of "key" will be "v3"
|
||||
// after the following batch is written:
|
||||
//
|
||||
// batch.Put("key", "v1");
|
||||
// batch.Delete("key");
|
||||
// batch.Put("key", "v2");
|
||||
// batch.Put("key", "v3");
|
||||
|
||||
#ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ |
||||
#define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ |
||||
|
||||
#include <string> |
||||
|
||||
namespace leveldb { |
||||
|
||||
class Slice; |
||||
|
||||
class WriteBatch {
 public:
  WriteBatch();
  ~WriteBatch();

  // Store the mapping "key->value" in the database.
  void Put(const Slice& key, const Slice& value);

  // If the database contains a mapping for "key", erase it.  Else do nothing.
  void Delete(const Slice& key);

  // Clear all updates buffered in this batch.
  void Clear();

 private:
  // Grants the implementation (db/write_batch_internal.h) direct access
  // to rep_ so batches can be applied without widening the public API.
  friend class WriteBatchInternal;

  std::string rep_;  // See comment in write_batch.cc for the format of rep_

  // Intentionally copyable
};
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
|
@ -0,0 +1,329 @@ |
||||
# Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
||||
# Use of this source code is governed by a BSD-style license that can be |
||||
# found in the LICENSE file. See the AUTHORS file for names of contributors. |
||||
|
||||
{ |
||||
'variables': { |
||||
'use_snappy%': 0, |
||||
}, |
||||
'target_defaults': { |
||||
'defines': [ |
||||
'LEVELDB_PLATFORM_CHROMIUM=1', |
||||
], |
||||
'include_dirs': [ |
||||
# MOE:begin_strip |
||||
'../..', |
||||
# MOE:end_strip_and_replace '.', |
||||
], |
||||
'conditions': [ |
||||
['OS == "win"', { |
||||
'include_dirs': [ |
||||
'port/win', |
||||
], |
||||
}], |
||||
['use_snappy', { |
||||
'defines': [ |
||||
'USE_SNAPPY=1', |
||||
], |
||||
}], |
||||
], |
||||
}, |
||||
'targets': [ |
||||
{ |
||||
'target_name': 'leveldb', |
||||
'type': '<(library)', |
||||
'dependencies': [ |
||||
        # The base library is a lightweight abstraction layer for things like
||||
# threads and IO. http://src.chromium.org/viewvc/chrome/trunk/src/base/ |
||||
# MOE:begin_strip |
||||
'../../../../base/base.gyp:base', |
||||
# MOE:end_strip_and_replace '../../base/base.gyp:base', |
||||
], |
||||
'conditions': [ |
||||
['use_snappy', { |
||||
'dependencies': [ |
||||
'../../../../third_party/snappy/snappy.gyp:snappy', |
||||
], |
||||
}], |
||||
], |
||||
'sources': [ |
||||
# Include and then exclude so that all files show up in IDEs, even if |
||||
# they don't build. |
||||
'db/builder.cc', |
||||
'db/builder.h', |
||||
'db/db_impl.cc', |
||||
'db/db_impl.h', |
||||
'db/db_iter.cc', |
||||
'db/db_iter.h', |
||||
'db/filename.cc', |
||||
'db/filename.h', |
||||
'db/dbformat.cc', |
||||
'db/dbformat.h', |
||||
'db/log_format.h', |
||||
'db/log_reader.cc', |
||||
'db/log_reader.h', |
||||
'db/log_writer.cc', |
||||
'db/log_writer.h', |
||||
'db/memtable.cc', |
||||
'db/memtable.h', |
||||
'db/repair.cc', |
||||
'db/skiplist.h', |
||||
'db/snapshot.h', |
||||
'db/table_cache.cc', |
||||
'db/table_cache.h', |
||||
'db/version_edit.cc', |
||||
'db/version_edit.h', |
||||
'db/version_set.cc', |
||||
'db/version_set.h', |
||||
'db/write_batch.cc', |
||||
'db/write_batch_internal.h', |
||||
'include/cache.h', |
||||
'include/comparator.h', |
||||
'include/db.h', |
||||
'include/env.h', |
||||
'include/iterator.h', |
||||
'include/options.h', |
||||
'include/slice.h', |
||||
'include/status.h', |
||||
'include/table.h', |
||||
'include/table_builder.h', |
||||
'include/write_batch.h', |
||||
'port/port.h', |
||||
'port/port_chromium.cc', |
||||
'port/port_chromium.h', |
||||
'port/port_example.h', |
||||
'port/port_posix.cc', |
||||
'port/port_posix.h', |
||||
'port/sha1_portable.cc', |
||||
'port/sha1_portable.h', |
||||
'table/block.cc', |
||||
'table/block.h', |
||||
'table/block_builder.cc', |
||||
'table/block_builder.h', |
||||
'table/format.cc', |
||||
'table/format.h', |
||||
'table/iterator.cc', |
||||
'table/iterator_wrapper.h', |
||||
'table/merger.cc', |
||||
'table/merger.h', |
||||
'table/table.cc', |
||||
'table/table_builder.cc', |
||||
'table/two_level_iterator.cc', |
||||
'table/two_level_iterator.h', |
||||
'util/arena.cc', |
||||
'util/arena.h', |
||||
'util/cache.cc', |
||||
'util/coding.cc', |
||||
'util/coding.h', |
||||
'util/comparator.cc', |
||||
'util/crc32c.cc', |
||||
'util/crc32c.h', |
||||
'util/env.cc', |
||||
'util/env_chromium.cc', |
||||
'util/env_posix.cc', |
||||
'util/hash.cc', |
||||
'util/hash.h', |
||||
'util/logging.cc', |
||||
'util/logging.h', |
||||
'util/mutexlock.h', |
||||
'util/options.cc', |
||||
'util/random.h', |
||||
'util/status.cc', |
||||
], |
||||
'sources/': [ |
||||
['exclude', '_(android|example|portable|posix)\\.cc$'], |
||||
], |
||||
}, |
||||
{ |
||||
'target_name': 'leveldb_testutil', |
||||
'type': '<(library)', |
||||
'dependencies': [ |
||||
# MOE:begin_strip |
||||
'../../../../base/base.gyp:base', |
||||
# MOE:end_strip_and_replace '../../base/base.gyp:base', |
||||
'leveldb', |
||||
], |
||||
'export_dependent_settings': [ |
||||
# The tests use include directories from these projects. |
||||
# MOE:begin_strip |
||||
'../../../../base/base.gyp:base', |
||||
# MOE:end_strip_and_replace '../../base/base.gyp:base', |
||||
'leveldb', |
||||
], |
||||
'sources': [ |
||||
'util/histogram.cc', |
||||
'util/histogram.h', |
||||
'util/testharness.cc', |
||||
'util/testharness.h', |
||||
'util/testutil.cc', |
||||
'util/testutil.h', |
||||
], |
||||
}, |
||||
{ |
||||
'target_name': 'leveldb_arena_test', |
||||
'type': 'executable', |
||||
'dependencies': [ |
||||
'leveldb_testutil', |
||||
], |
||||
'sources': [ |
||||
'util/arena_test.cc', |
||||
], |
||||
}, |
||||
{ |
||||
'target_name': 'leveldb_cache_test', |
||||
'type': 'executable', |
||||
'dependencies': [ |
||||
'leveldb_testutil', |
||||
], |
||||
'sources': [ |
||||
'util/cache_test.cc', |
||||
], |
||||
}, |
||||
{ |
||||
'target_name': 'leveldb_coding_test', |
||||
'type': 'executable', |
||||
'dependencies': [ |
||||
'leveldb_testutil', |
||||
], |
||||
'sources': [ |
||||
'util/coding_test.cc', |
||||
], |
||||
}, |
||||
{ |
||||
'target_name': 'leveldb_corruption_test', |
||||
'type': 'executable', |
||||
'dependencies': [ |
||||
'leveldb_testutil', |
||||
], |
||||
'sources': [ |
||||
'db/corruption_test.cc', |
||||
], |
||||
}, |
||||
{ |
||||
'target_name': 'leveldb_crc32c_test', |
||||
'type': 'executable', |
||||
'dependencies': [ |
||||
'leveldb_testutil', |
||||
], |
||||
'sources': [ |
||||
'util/crc32c_test.cc', |
||||
], |
||||
}, |
||||
{ |
||||
'target_name': 'leveldb_db_bench', |
||||
'type': 'executable', |
||||
'dependencies': [ |
||||
'leveldb_testutil', |
||||
], |
||||
'sources': [ |
||||
'db/db_bench.cc', |
||||
], |
||||
}, |
||||
{ |
||||
'target_name': 'leveldb_db_test', |
||||
'type': 'executable', |
||||
'dependencies': [ |
||||
'leveldb_testutil', |
||||
], |
||||
'sources': [ |
||||
'db/db_test.cc', |
||||
], |
||||
}, |
||||
{ |
||||
'target_name': 'leveldb_dbformat_test', |
||||
'type': 'executable', |
||||
'dependencies': [ |
||||
'leveldb_testutil', |
||||
], |
||||
'sources': [ |
||||
'db/dbformat_test.cc', |
||||
], |
||||
}, |
||||
{ |
||||
'target_name': 'leveldb_env_test', |
||||
'type': 'executable', |
||||
'dependencies': [ |
||||
'leveldb_testutil', |
||||
], |
||||
'sources': [ |
||||
'util/env_test.cc', |
||||
], |
||||
}, |
||||
{ |
||||
'target_name': 'leveldb_filename_test', |
||||
'type': 'executable', |
||||
'dependencies': [ |
||||
'leveldb_testutil', |
||||
], |
||||
'sources': [ |
||||
'db/filename_test.cc', |
||||
], |
||||
}, |
||||
{ |
||||
'target_name': 'leveldb_log_test', |
||||
'type': 'executable', |
||||
'dependencies': [ |
||||
'leveldb_testutil', |
||||
], |
||||
'sources': [ |
||||
'db/log_test.cc', |
||||
], |
||||
}, |
||||
{ |
||||
'target_name': 'leveldb_sha1_test', |
||||
'type': 'executable', |
||||
'dependencies': [ |
||||
'leveldb_testutil', |
||||
], |
||||
'sources': [ |
||||
'port/sha1_test.cc', |
||||
], |
||||
}, |
||||
{ |
||||
'target_name': 'leveldb_skiplist_test', |
||||
'type': 'executable', |
||||
'dependencies': [ |
||||
'leveldb_testutil', |
||||
], |
||||
'sources': [ |
||||
'db/skiplist_test.cc', |
||||
], |
||||
}, |
||||
{ |
||||
'target_name': 'leveldb_table_test', |
||||
'type': 'executable', |
||||
'dependencies': [ |
||||
'leveldb_testutil', |
||||
], |
||||
'sources': [ |
||||
'table/table_test.cc', |
||||
], |
||||
}, |
||||
{ |
||||
'target_name': 'leveldb_version_edit_test', |
||||
'type': 'executable', |
||||
'dependencies': [ |
||||
'leveldb_testutil', |
||||
], |
||||
'sources': [ |
||||
'db/version_edit_test.cc', |
||||
], |
||||
}, |
||||
{ |
||||
'target_name': 'leveldb_write_batch_test', |
||||
'type': 'executable', |
||||
'dependencies': [ |
||||
'leveldb_testutil', |
||||
], |
||||
'sources': [ |
||||
'db/write_batch_test.cc', |
||||
], |
||||
}, |
||||
], |
||||
} |
||||
|
||||
# Local Variables: |
||||
# tab-width:2 |
||||
# indent-tabs-mode:nil |
||||
# End: |
||||
# vim: set expandtab tabstop=2 shiftwidth=2: |
@ -0,0 +1,10 @@ |
||||
This directory contains interfaces and implementations that isolate the |
||||
rest of the package from platform details. |
||||
|
||||
Code in the rest of the package includes "port.h" from this directory. |
||||
"port.h" in turn includes a platform specific "port_<platform>.h" file |
||||
that provides the platform specific implementation. |
||||
|
||||
See port_posix.h for an example of what must be provided in a platform |
||||
specific header file. |
||||
|
@ -0,0 +1,21 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_PORT_PORT_H_ |
||||
#define STORAGE_LEVELDB_PORT_PORT_H_ |
||||
|
||||
#include <string.h> |
||||
|
||||
// Include the appropriate platform specific file below. If you are
|
||||
// porting to a new platform, see "port_example.h" for documentation
|
||||
// of what the new port_<platform>.h file must provide.
|
||||
#if defined(LEVELDB_PLATFORM_POSIX) |
||||
# include "port/port_posix.h" |
||||
#elif defined(LEVELDB_PLATFORM_CHROMIUM) |
||||
# include "port/port_chromium.h" |
||||
#elif defined(LEVELDB_PLATFORM_ANDROID) |
||||
# include "port/port_android.h" |
||||
#endif |
||||
|
||||
#endif // STORAGE_LEVELDB_PORT_PORT_H_
|
@ -0,0 +1,65 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "port/port_android.h" |
||||
|
||||
#include <cstdlib> |
||||
|
||||
extern "C" { |
||||
size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d) { |
||||
return fread(a, b, c, d); |
||||
} |
||||
|
||||
size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d) { |
||||
return fwrite(a, b, c, d); |
||||
} |
||||
|
||||
int fflush_unlocked(FILE *f) { |
||||
return fflush(f); |
||||
} |
||||
|
||||
int fdatasync(int fd) { |
||||
return fsync(fd); |
||||
} |
||||
} |
||||
|
||||
// TODO(gabor): This is copied from port_posix.cc - not sure if I should do this?
|
||||
namespace leveldb { |
||||
namespace port { |
||||
|
||||
// Dies (after printing the failed call's name and errno text to stderr)
// if a pthreads call returns a non-zero error code.
static void PthreadCall(const char* label, int result) {
  if (result == 0) return;
  fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
  abort();
}
||||
|
||||
// Each wrapper below forwards to the corresponding pthreads primitive and
// aborts the process (via PthreadCall) if the underlying call fails.
Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); }
Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); }
void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); }
void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); }

CondVar::CondVar(Mutex* mu)
    : mu_(mu) {
  PthreadCall("init cv", pthread_cond_init(&cv_, NULL));
}

CondVar::~CondVar() {
  PthreadCall("destroy cv", pthread_cond_destroy(&cv_));
}

// REQUIRES: caller holds *mu_ — pthread_cond_wait releases and
// re-acquires the mutex internally.
void CondVar::Wait() {
  PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_));
}

void CondVar::Signal(){
  PthreadCall("signal", pthread_cond_signal(&cv_));
}

void CondVar::SignalAll() {
  PthreadCall("broadcast", pthread_cond_broadcast(&cv_));
}
||||
|
||||
} |
||||
} |
@ -0,0 +1,131 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// See port_example.h for documentation for the following types/functions.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ |
||||
#define STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ |
||||
|
||||
#include <endian.h> |
||||
#include <pthread.h> |
||||
#include <stdint.h> |
||||
#include <sha1.h> |
||||
#include <cstdatomic> |
||||
#include <string> |
||||
#include <cctype> |
||||
|
||||
extern "C" { |
||||
size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d); |
||||
size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d); |
||||
int fflush_unlocked(FILE *f); |
||||
int fdatasync (int fd); |
||||
} |
||||
|
||||
namespace leveldb { |
||||
namespace port { |
||||
|
||||
static const bool kLittleEndian = __BYTE_ORDER == __LITTLE_ENDIAN; |
||||
|
||||
class CondVar; |
||||
|
||||
// Mutual-exclusion lock backed by pthread_mutex_t; method bodies live in
// port_android.cc and abort the process on any pthreads error.
class Mutex {
 public:
  Mutex();
  ~Mutex();

  void Lock();
  void Unlock();
  // Intended to crash if this thread does not hold the lock; currently a
  // no-op on Android.
  void AssertHeld() {
    // TODO(gabor): How can I implement this?
  }

 private:
  // CondVar::Wait() needs direct access to the raw pthread_mutex_t.
  friend class CondVar;
  pthread_mutex_t mu_;

  // No copying
  Mutex(const Mutex&);
  void operator=(const Mutex&);
};
||||
|
||||
// Condition variable tied to the Mutex passed at construction; see
// port_android.cc for the pthread_cond_t-based implementation.
class CondVar {
 public:
  explicit CondVar(Mutex* mu);
  ~CondVar();
  // REQUIRES: this thread holds *mu (per the port_example.h contract).
  void Wait();
  void Signal();
  void SignalAll();
 private:
  Mutex* mu_;         // Not owned; must outlive this CondVar.
  pthread_cond_t cv_;
};
||||
|
||||
// Storage for a lock-free pointer
|
||||
// Pointer that can be read and written atomically, built on std::atomic.
class AtomicPointer {
 private:
  std::atomic<void*> ptr_;

 public:
  // Default-constructed value is arbitrary (intentionally uninitialized).
  AtomicPointer() { }
  explicit AtomicPointer(void* v) : ptr_(v) { }

  // Acquire load: no later memory access by this thread may be reordered
  // before this read.
  inline void* Acquire_Load() const {
    return ptr_.load(std::memory_order_acquire);
  }

  // Release store: no earlier memory access by this thread may be
  // reordered after this write.
  inline void Release_Store(void* v) {
    ptr_.store(v, std::memory_order_release);
  }

  // Relaxed load/store: no ordering guarantees; callers provide any
  // synchronization they need.
  inline void* NoBarrier_Load() const {
    return ptr_.load(std::memory_order_relaxed);
  }
  inline void NoBarrier_Store(void* v) {
    ptr_.store(v, std::memory_order_relaxed);
  }
};
||||
|
||||
/**
 * TODO(gabor): Implement actual compression.
 * Placeholder: stores an uncompressed copy of the input in *output.
 *
 * BUG FIX: the previous code called output->copy(), which copies bytes
 * *out of* the string into a caller buffer and never modifies *output,
 * so nothing was ever stored.  assign() writes in the correct direction.
 */
inline void Lightweight_Compress(
    const char* input,
    size_t input_length,
    std::string* output) {
  output->assign(input, input_length);
}
||||
|
||||
/**
 * TODO(gabor): Implement actual uncompression.
 * Placeholder: the "compressed" form is the raw bytes, so return them
 * unchanged; always succeeds.
 *
 * BUG FIX: as in Lightweight_Compress, the previous output->copy() call
 * read from *output instead of writing to it, leaving *output untouched.
 */
inline bool Lightweight_Uncompress(
    const char* input_data,
    size_t input_length,
    std::string* output) {
  output->assign(input_data, input_length);
  return true;
}
||||
|
||||
// Computes the 160-bit (20-byte) SHA-1 digest of data[0..len-1] into
// hash_array[0..19], using the SHA1 implementation from Android's
// <sha1.h>.
inline void SHA1_Hash(const char* data, size_t len, char* hash_array) {
  SHA1_CTX sha1_ctx;
  SHA1Init(&sha1_ctx);
  SHA1Update(&sha1_ctx, (const u_char*)data, len);
  SHA1Final((u_char*)hash_array, &sha1_ctx);
}
||||
|
||||
// Returns an integer identifying the calling thread, built from the raw
// bytes of pthread_self().  Only as many bytes as fit in both types are
// copied, so the value is stable for a given thread but may be truncated
// on platforms where pthread_t is wider than 64 bits.
inline uint64_t ThreadIdentifier() {
  pthread_t self = pthread_self();
  uint64_t id = 0;
  const size_t nbytes =
      sizeof(id) < sizeof(self) ? sizeof(id) : sizeof(self);
  memcpy(&id, &self, nbytes);
  return id;
}
||||
|
||||
// Heap profiling is not supported on Android: never invokes the callback
// and reports "unsupported" to the caller.
inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
  (void)func;  // unused
  (void)arg;   // unused
  return false;
}
||||
|
||||
} |
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_PORT_PORT_ANDROID_H_
|
@ -0,0 +1,83 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "port/port_chromium.h" |
||||
|
||||
#include "util/logging.h" |
||||
|
||||
#if defined(USE_SNAPPY) |
||||
# include "third_party/snappy/src/snappy.h" |
||||
# include "third_party/snappy/src/snappy-stubs.h" |
||||
#endif |
||||
|
||||
namespace leveldb { |
||||
namespace port { |
||||
|
||||
// The Chromium port delegates all synchronization to Chromium's base
// library: base::Lock for Mutex and base::ConditionVariable for CondVar.
Mutex::Mutex() {
}

Mutex::~Mutex() {
}

void Mutex::Lock() {
  mu_.Acquire();
}

void Mutex::Unlock() {
  mu_.Release();
}

// Debug-only check that the current thread holds the lock.
void Mutex::AssertHeld() {
  mu_.AssertAcquired();
}

// base::ConditionVariable binds to the lock at construction time.
CondVar::CondVar(Mutex* mu)
    : cv_(&mu->mu_) {
}

CondVar::~CondVar() { }

void CondVar::Wait() {
  cv_.Wait();
}

void CondVar::Signal(){
  cv_.Signal();
}

void CondVar::SignalAll() {
  cv_.Broadcast();
}
||||
|
||||
// Compresses with snappy when USE_SNAPPY is defined; otherwise stores the
// raw input bytes (no compression), which Lightweight_Uncompress mirrors.
void Lightweight_Compress(const char* input, size_t input_length,
                          std::string* output) {
#if defined(USE_SNAPPY)
  // Size the buffer for the worst case, compress in place, then shrink.
  output->resize(snappy::MaxCompressedLength(input_length));
  size_t outlen;
  // NOTE(review): assumes the snappy-stubs RawCompress overload that takes
  // a StringPiece; verify against the bundled third_party/snappy headers.
  snappy::RawCompress(snappy::StringPiece(input, input_length),
                      &(*output)[0], &outlen);
  output->resize(outlen);
#else
  output->assign(input, input_length);
#endif
}

// Returns false iff USE_SNAPPY is defined and the input is not valid
// snappy-compressed data.  In the non-snappy build the stored bytes are
// raw, so they are returned as-is and the call always succeeds.
bool Lightweight_Uncompress(const char* input_data, size_t input_length,
                            std::string* output) {
#if defined(USE_SNAPPY)
  snappy::StringPiece input(input_data, input_length);
  size_t ulength;
  if (!snappy::GetUncompressedLength(input, &ulength)) {
    return false;
  }
  output->resize(ulength);
  return snappy::RawUncompress(input, &(*output)[0]);
#else
  output->assign(input_data, input_length);
  return true;
#endif
}
||||
|
||||
} |
||||
} |
@ -0,0 +1,104 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// See port_example.h for documentation for the following types/functions.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ |
||||
#define STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ |
||||
|
||||
#include <stdint.h> |
||||
#include <string> |
||||
#include <cstring> |
||||
#include "base/atomicops.h" |
||||
#include "base/basictypes.h" |
||||
#include "base/logging.h" |
||||
#include "base/sha1.h" |
||||
#include "base/synchronization/condition_variable.h" |
||||
#include "base/synchronization/lock.h" |
||||
|
||||
// Linux's ThreadIdentifier() needs this.
|
||||
#if defined(OS_LINUX) |
||||
# include <linux/unistd.h> |
||||
#endif |
||||
|
||||
#if defined(OS_WIN) |
||||
#define snprintf _snprintf |
||||
#define va_copy(a, b) do { (a) = (b); } while (0) |
||||
#endif |
||||
|
||||
namespace leveldb { |
||||
namespace port { |
||||
|
||||
// Chromium only supports little endian.
|
||||
static const bool kLittleEndian = true; |
||||
|
||||
// Thin wrapper around Chromium's base::Lock; method bodies are in
// port_chromium.cc.
class Mutex {
 public:
  Mutex();
  ~Mutex();
  void Lock();
  void Unlock();
  void AssertHeld();

 private:
  base::Lock mu_;

  // CondVar's base::ConditionVariable needs direct access to mu_.
  friend class CondVar;
  DISALLOW_COPY_AND_ASSIGN(Mutex);
};
||||
|
||||
// Thin wrapper around Chromium's base::ConditionVariable, bound to the
// Mutex passed at construction; bodies are in port_chromium.cc.
class CondVar {
 public:
  explicit CondVar(Mutex* mu);
  ~CondVar();
  void Wait();
  void Signal();
  void SignalAll();

 private:
  base::ConditionVariable cv_;

  DISALLOW_COPY_AND_ASSIGN(CondVar);
};
||||
|
||||
// Pointer that can be read and written atomically, implemented with
// Chromium's base::subtle atomic operations on an AtomicWord.
class AtomicPointer {
 private:
  typedef base::subtle::AtomicWord Rep;
  Rep rep_;
 public:
  AtomicPointer() { }  // value intentionally left arbitrary
  explicit AtomicPointer(void* p) : rep_(reinterpret_cast<Rep>(p)) {}
  // Acquire semantics: later memory accesses by this thread cannot be
  // reordered before this load.
  inline void* Acquire_Load() const {
    return reinterpret_cast<void*>(::base::subtle::Acquire_Load(&rep_));
  }
  // Release semantics: earlier memory accesses by this thread cannot be
  // reordered after this store.
  inline void Release_Store(void* v) {
    ::base::subtle::Release_Store(&rep_, reinterpret_cast<Rep>(v));
  }
  // Relaxed variants: no ordering guarantees.
  inline void* NoBarrier_Load() const {
    return reinterpret_cast<void*>(::base::subtle::NoBarrier_Load(&rep_));
  }
  inline void NoBarrier_Store(void* v) {
    ::base::subtle::NoBarrier_Store(&rep_, reinterpret_cast<Rep>(v));
  }
};
||||
|
||||
// Computes the 160-bit (20-byte) SHA-1 digest of data[0..len-1] into
// hash_array, delegating to Chromium's base::SHA1HashBytes.  (Returning
// a void expression from a void function is legal C++.)
inline void SHA1_Hash(const char* data, size_t len, char* hash_array) {
  return ::base::SHA1HashBytes(reinterpret_cast<const unsigned char*>(data),
                               len,
                               reinterpret_cast<unsigned char*>(hash_array));
}
||||
|
||||
void Lightweight_Compress(const char* input, size_t input_length, |
||||
std::string* output); |
||||
bool Lightweight_Uncompress(const char* input_data, size_t input_length, |
||||
std::string* output); |
||||
|
||||
// Heap profiling is not wired up for the Chromium port; reports
// "unsupported" without ever calling func.
inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
  static_cast<void>(func);
  static_cast<void>(arg);
  return false;
}
||||
|
||||
} |
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_
|
@ -0,0 +1,119 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// This file contains the specification, but not the implementations,
|
||||
// of the types/operations/etc. that should be defined by a platform
|
||||
// specific port_<platform>.h file. Use this file as a reference for
|
||||
// how to port this package to a new platform.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ |
||||
#define STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ |
||||
|
||||
namespace leveldb { |
||||
namespace port { |
||||
|
||||
// TODO(jorlow): Many of these belong more in the environment class rather than
|
||||
// here. We should try moving them and see if it affects perf.
|
||||
|
||||
// The following boolean constant must be true on a little-endian machine
|
||||
// and false otherwise.
|
||||
static const bool kLittleEndian = true /* or some other expression */; |
||||
|
||||
// ------------------ Threading -------------------
|
||||
|
||||
// A Mutex represents an exclusive lock.
|
||||
class Mutex {
 public:
  Mutex();
  ~Mutex();

  // Lock the mutex.  Waits until other lockers have exited.
  // Will deadlock if the mutex is already locked by this thread
  // (i.e. the mutex need not be recursive).
  void Lock();

  // Unlock the mutex.
  // REQUIRES: This mutex was locked by this thread.
  void Unlock();

  // Optionally crash if this thread does not hold this mutex.
  // The implementation must be fast, especially if NDEBUG is
  // defined.  The implementation is allowed to skip all checks.
  void AssertHeld();
};
||||
|
||||
class CondVar { |
||||
public: |
||||
explicit CondVar(Mutex* mu); |
||||
~CondVar(); |
||||
|
||||
// Atomically release *mu and block on this condition variable until
|
||||
// either a call to SignalAll(), or a call to Signal() that picks
|
||||
// this thread to wakeup.
|
||||
// REQUIRES: this thread holds *mu
|
||||
void Wait(); |
||||
|
||||
// If there are some threads waiting, wake up at least one of them.
|
||||
void Signal(); |
||||
|
||||
// Wake up all waiting threads.
|
||||
void SignallAll(); |
||||
}; |
||||
|
||||
// A type that holds a pointer that can be read or written atomically
|
||||
// (i.e., without word-tearing.)
|
||||
class AtomicPointer {
 private:
  intptr_t rep_;
 public:
  // Initialize to arbitrary value
  AtomicPointer();

  // Initialize to hold v.
  // (Fixed: "rep_(v)" did not compile — there is no implicit conversion
  // from void* to intptr_t; an explicit reinterpret_cast is required,
  // matching what the real ports do.)
  explicit AtomicPointer(void* v) : rep_(reinterpret_cast<intptr_t>(v)) { }

  // Read and return the stored pointer with the guarantee that no
  // later memory access (read or write) by this thread can be
  // reordered ahead of this read.
  void* Acquire_Load() const;

  // Set v as the stored pointer with the guarantee that no earlier
  // memory access (read or write) by this thread can be reordered
  // after this store.
  void Release_Store(void* v);

  // Read the stored pointer with no ordering guarantees.
  void* NoBarrier_Load() const;

  // Set v as the stored pointer with no ordering guarantees.
  void NoBarrier_Store(void* v);
};
||||
|
||||
// ------------------ Checksumming -------------------
|
||||
|
||||
// Store a 160-bit hash of "data[0..len-1]" in "hash_array[0]..hash_array[19]"
|
||||
extern void SHA1_Hash(const char* data, size_t len, char* hash_array); |
||||
|
||||
// ------------------ Compression -------------------
|
||||
|
||||
// Store the lightweight compression of "input[0,input_length-1]" in *output.
|
||||
extern void Lightweight_Compress(const char* input, size_t input_length, |
||||
std::string* output); |
||||
|
||||
// Attempt to lightweight uncompress input[0,input_length-1] into *output.
|
||||
// Returns true if successful, false if the input is invalid lightweight
|
||||
// compressed data.
|
||||
extern bool Lightweight_Uncompress(const char* input_data, size_t input_length, |
||||
std::string* output); |
||||
|
||||
// ------------------ Miscellaneous -------------------
|
||||
|
||||
// If heap profiling is not supported, returns false.
|
||||
// Else repeatedly calls (*func)(arg, data, n) and then returns true.
|
||||
// The concatenation of all "data[0,n-1]" fragments is the heap profile.
|
||||
extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg); |
||||
|
||||
} |
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_
|
@ -0,0 +1,50 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "port/port_posix.h" |
||||
|
||||
#include <cstdlib> |
||||
#include <stdio.h> |
||||
#include <string.h> |
||||
#include "util/logging.h" |
||||
|
||||
namespace leveldb { |
||||
namespace port { |
||||
|
||||
// Abort the process with a diagnostic if a pthread call failed.
// "label" names the operation; "result" is the pthread return code
// (0 means success).
static void PthreadCall(const char* label, int result) {
  if (result == 0) {
    return;  // Success: nothing to report.
  }
  fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
  abort();
}
||||
|
||||
// Thin wrappers over the pthread mutex primitives; any pthread failure
// aborts the process via PthreadCall.
Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); }

Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); }

void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); }

void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); }
||||
|
||||
// A CondVar is bound to a single Mutex (*mu) for its whole lifetime;
// the mutex is not owned.
CondVar::CondVar(Mutex* mu)
    : mu_(mu) {
    PthreadCall("init cv", pthread_cond_init(&cv_, NULL));
}

CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); }

// REQUIRES: this thread holds the associated mutex (pthread_cond_wait
// atomically releases and re-acquires mu_->mu_).
void CondVar::Wait() {
  PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_));
}

// Wake at least one waiting thread, if any.
void CondVar::Signal() {
  PthreadCall("signal", pthread_cond_signal(&cv_));
}

// Wake all waiting threads.
void CondVar::SignalAll() {
  PthreadCall("broadcast", pthread_cond_broadcast(&cv_));
}
||||
|
||||
} |
||||
} |
@ -0,0 +1,108 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// See port_example.h for documentation for the following types/functions.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_ |
||||
#define STORAGE_LEVELDB_PORT_PORT_POSIX_H_ |
||||
|
||||
#include <endian.h> |
||||
#include <pthread.h> |
||||
#include <stdint.h> |
||||
#include <string> |
||||
#include <cstdatomic> |
||||
#include <cstring> |
||||
#include "port/sha1_portable.h" |
||||
|
||||
namespace leveldb { |
||||
namespace port { |
||||
|
||||
static const bool kLittleEndian = (__BYTE_ORDER == __LITTLE_ENDIAN); |
||||
|
||||
class CondVar; |
||||
|
||||
// Mutual-exclusion lock over pthread_mutex_t.  Not copyable.
class Mutex {
 public:
  Mutex();
  ~Mutex();

  void Lock();
  void Unlock();
  // No-op in the POSIX port; the interface (see port_example.h) allows
  // ports to optionally crash if the lock is not held.
  void AssertHeld() { }

 private:
  friend class CondVar;  // CondVar::Wait needs direct access to mu_.
  pthread_mutex_t mu_;

  // No copying
  Mutex(const Mutex&);
  void operator=(const Mutex&);
};
||||
|
||||
// Condition variable over pthread_cond_t, permanently associated with a
// single Mutex (see port_example.h for the contract).
class CondVar {
 public:
  explicit CondVar(Mutex* mu);
  ~CondVar();
  // REQUIRES: this thread holds *mu_.
  void Wait();
  void Signal();     // Wake at least one waiter, if any.
  void SignalAll();  // Wake every waiter.
 private:
  pthread_cond_t cv_;
  Mutex* mu_;  // Not owned.
};
||||
|
||||
// Storage for a lock-free pointer: wraps std::atomic<void*> and exposes
// loads and stores with either acquire/release or relaxed ordering.
class AtomicPointer {
 public:
  AtomicPointer() { }
  explicit AtomicPointer(void* v) : ptr_(v) { }

  // Load with acquire semantics: no later memory access by this thread
  // may be reordered ahead of this read.
  inline void* Acquire_Load() const {
    return ptr_.load(std::memory_order_acquire);
  }

  // Store with release semantics: no earlier memory access by this
  // thread may be reordered after this write.
  inline void Release_Store(void* v) {
    ptr_.store(v, std::memory_order_release);
  }

  // Unordered load; the caller supplies any needed synchronization.
  inline void* NoBarrier_Load() const {
    return ptr_.load(std::memory_order_relaxed);
  }

  // Unordered store; the caller supplies any needed synchronization.
  inline void NoBarrier_Store(void* v) {
    ptr_.store(v, std::memory_order_relaxed);
  }

 private:
  std::atomic<void*> ptr_;
};
||||
|
||||
// Store a 160-bit SHA-1 hash of data[0..len-1] in hash_array[0..19].
// The POSIX port simply delegates to the portable implementation
// (port/sha1_portable.h).
inline void SHA1_Hash(const char* data, size_t len, char* hash_array) {
  SHA1_Hash_Portable(data, len, hash_array);
}
||||
|
||||
// TODO(gabor): Implement actual compress.
// Placeholder: "compression" is the identity transform -- *output
// becomes an exact copy of input[0,input_length-1].
inline void Lightweight_Compress(const char* input, size_t input_length,
                                 std::string* output) {
  output->clear();
  output->append(input, input_length);
}
||||
|
||||
// TODO(gabor): Implement actual uncompress.
// Placeholder counterpart of Lightweight_Compress: copies the input
// verbatim into *output and always reports success.
inline bool Lightweight_Uncompress(const char* input_data, size_t input_length,
                                   std::string* output) {
  output->clear();
  output->append(input_data, input_length);
  return true;
}
||||
|
||||
// Heap profiling is not supported in this port: always reports failure
// and never invokes *func.
inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
  (void)func;
  (void)arg;
  return false;
}
||||
|
||||
} |
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_PORT_PORT_POSIX_H_
|
@ -0,0 +1,298 @@ |
||||
// Portions copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// This module provides a slow but portable implementation of
|
||||
// the SHA1 hash function.
|
||||
//
|
||||
// It is adapted from free code written by Paul E. Jones
|
||||
// <paulej@packetizer.com>. See http://www.packetizer.com/security/sha1/
|
||||
//
|
||||
// The license for the original code is:
|
||||
/*
|
||||
Copyright (C) 1998, 2009 |
||||
Paul E. Jones <paulej@packetizer.com> |
||||
|
||||
Freeware Public License (FPL) |
||||
|
||||
This software is licensed as "freeware." Permission to distribute |
||||
this software in source and binary forms, including incorporation |
||||
into other products, is hereby granted without a fee. THIS SOFTWARE |
||||
IS PROVIDED 'AS IS' AND WITHOUT ANY EXPRESSED OR IMPLIED WARRANTIES, |
||||
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY |
||||
AND FITNESS FOR A PARTICULAR PURPOSE. THE AUTHOR SHALL NOT BE HELD |
||||
LIABLE FOR ANY DAMAGES RESULTING FROM THE USE OF THIS SOFTWARE, EITHER |
||||
DIRECTLY OR INDIRECTLY, INCLUDING, BUT NOT LIMITED TO, LOSS OF DATA |
||||
OR DATA BEING RENDERED INACCURATE. |
||||
*/ |
||||
|
||||
#include "port/sha1_portable.h" |
||||
#include <stdio.h> |
||||
#include <stdlib.h> |
||||
#include <stdint.h> |
||||
|
||||
namespace leveldb { |
||||
namespace port { |
||||
|
||||
/*
|
||||
* Description: |
||||
* This class implements the Secure Hashing Standard as defined |
||||
* in FIPS PUB 180-1 published April 17, 1995. |
||||
*/ |
||||
|
||||
/*
|
||||
* This structure will hold context information for the hashing |
||||
* operation |
||||
*/ |
||||
typedef struct SHA1Context {
  unsigned Message_Digest[5]; /* Message Digest (output): five 32-bit
                                 words = 160 bits */

  unsigned Length_Low;        /* Low 32 bits of message length in bits */
  unsigned Length_High;       /* High 32 bits of message length in bits */

  unsigned char Message_Block[64]; /* 512-bit message blocks */
  int Message_Block_Index;    /* Index into message block array */

  bool Computed;              /* Is the digest computed? */
  bool Corrupted;             /* Is the message digest corrupted? */
} SHA1Context;
||||
|
||||
/*
|
||||
* Portability Issues: |
||||
* SHA-1 is defined in terms of 32-bit "words". This code was |
||||
* written with the expectation that the processor has at least |
||||
* a 32-bit machine word size. If the machine word size is larger, |
||||
* the code should still function properly. One caveat to that |
||||
* is that the input functions taking characters and character |
||||
* arrays assume that only 8 bits of information are stored in each |
||||
* character. |
||||
*/ |
||||
|
||||
/*
|
||||
* Define the circular shift macro |
||||
*/ |
||||
#define SHA1CircularShift(bits,word) \ |
||||
((((word) << (bits)) & 0xFFFFFFFF) | \
|
||||
((word) >> (32-(bits)))) |
||||
|
||||
/* Function prototypes */ |
||||
static void SHA1ProcessMessageBlock(SHA1Context *); |
||||
static void SHA1PadMessage(SHA1Context *); |
||||
|
||||
// Initialize the SHA1Context in preparation for computing a new
|
||||
// message digest.
|
||||
static void SHA1Reset(SHA1Context* context) { |
||||
context->Length_Low = 0; |
||||
context->Length_High = 0; |
||||
context->Message_Block_Index = 0; |
||||
|
||||
context->Message_Digest[0] = 0x67452301; |
||||
context->Message_Digest[1] = 0xEFCDAB89; |
||||
context->Message_Digest[2] = 0x98BADCFE; |
||||
context->Message_Digest[3] = 0x10325476; |
||||
context->Message_Digest[4] = 0xC3D2E1F0; |
||||
|
||||
context->Computed = false; |
||||
context->Corrupted = false; |
||||
} |
||||
|
||||
// This function will return the 160-bit message digest into the
|
||||
// Message_Digest array within the SHA1Context provided
|
||||
static bool SHA1Result(SHA1Context *context) { |
||||
if (context->Corrupted) { |
||||
return false; |
||||
} |
||||
|
||||
if (!context->Computed) { |
||||
SHA1PadMessage(context); |
||||
context->Computed = true; |
||||
} |
||||
return true; |
||||
} |
||||
|
||||
// This function accepts an array of bytes as the next portion of
|
||||
// the message.
|
||||
static void SHA1Input(SHA1Context *context,
                      const unsigned char *message_array,
                      unsigned length) {
  if (!length) return;  // Nothing to add.

  // Feeding more input after the digest was finalized (or after a prior
  // error) is a usage error; flag the context as corrupted.
  if (context->Computed || context->Corrupted) {
    context->Corrupted = true;
    return;
  }

  while(length-- && !context->Corrupted) {
    context->Message_Block[context->Message_Block_Index++] =
        (*message_array & 0xFF);

    // Maintain the total message length in bits as a 64-bit count split
    // across two 32-bit words, carrying from Low into High.
    context->Length_Low += 8;
    /* Force it to 32 bits */
    context->Length_Low &= 0xFFFFFFFF;
    if (context->Length_Low == 0) {
      context->Length_High++;
      /* Force it to 32 bits */
      context->Length_High &= 0xFFFFFFFF;
      if (context->Length_High == 0)
      {
        /* Message is too long */
        context->Corrupted = true;
      }
    }

    // Process each full 512-bit (64-byte) block as it fills up.
    if (context->Message_Block_Index == 64)
    {
      SHA1ProcessMessageBlock(context);
    }

    message_array++;
  }
}
||||
|
||||
// This function will process the next 512 bits of the message stored
|
||||
// in the Message_Block array.
|
||||
static void SHA1ProcessMessageBlock(SHA1Context *context) {
  const unsigned K[] =            // Constants defined in SHA-1
  {
    0x5A827999,
    0x6ED9EBA1,
    0x8F1BBCDC,
    0xCA62C1D6
  };
  int t;                          // Loop counter
  unsigned temp;                  // Temporary word value
  unsigned W[80];                 // Word sequence
  unsigned A, B, C, D, E;         // Word buffers

  // Initialize the first 16 words in the array W: big-endian decode of
  // the 64-byte message block.
  for(t = 0; t < 16; t++) {
    W[t] = ((unsigned) context->Message_Block[t * 4]) << 24;
    W[t] |= ((unsigned) context->Message_Block[t * 4 + 1]) << 16;
    W[t] |= ((unsigned) context->Message_Block[t * 4 + 2]) << 8;
    W[t] |= ((unsigned) context->Message_Block[t * 4 + 3]);
  }

  // Expand to 80 words (the SHA-1 message schedule).
  for(t = 16; t < 80; t++) {
    W[t] = SHA1CircularShift(1,W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]);
  }

  A = context->Message_Digest[0];
  B = context->Message_Digest[1];
  C = context->Message_Digest[2];
  D = context->Message_Digest[3];
  E = context->Message_Digest[4];

  // Rounds 0-19: f(B,C,D) = (B AND C) OR ((NOT B) AND D)
  for(t = 0; t < 20; t++) {
    temp = SHA1CircularShift(5,A) +
        ((B & C) | ((~B) & D)) + E + W[t] + K[0];
    temp &= 0xFFFFFFFF;
    E = D;
    D = C;
    C = SHA1CircularShift(30,B);
    B = A;
    A = temp;
  }

  // Rounds 20-39: f(B,C,D) = B XOR C XOR D
  for(t = 20; t < 40; t++) {
    temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[1];
    temp &= 0xFFFFFFFF;
    E = D;
    D = C;
    C = SHA1CircularShift(30,B);
    B = A;
    A = temp;
  }

  // Rounds 40-59: f(B,C,D) = majority(B, C, D)
  for(t = 40; t < 60; t++) {
    temp = SHA1CircularShift(5,A) +
        ((B & C) | (B & D) | (C & D)) + E + W[t] + K[2];
    temp &= 0xFFFFFFFF;
    E = D;
    D = C;
    C = SHA1CircularShift(30,B);
    B = A;
    A = temp;
  }

  // Rounds 60-79: f(B,C,D) = B XOR C XOR D
  for(t = 60; t < 80; t++) {
    temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[3];
    temp &= 0xFFFFFFFF;
    E = D;
    D = C;
    C = SHA1CircularShift(30,B);
    B = A;
    A = temp;
  }

  // Fold this block's result into the running digest (addition mod 2^32).
  context->Message_Digest[0] = (context->Message_Digest[0] + A) & 0xFFFFFFFF;
  context->Message_Digest[1] = (context->Message_Digest[1] + B) & 0xFFFFFFFF;
  context->Message_Digest[2] = (context->Message_Digest[2] + C) & 0xFFFFFFFF;
  context->Message_Digest[3] = (context->Message_Digest[3] + D) & 0xFFFFFFFF;
  context->Message_Digest[4] = (context->Message_Digest[4] + E) & 0xFFFFFFFF;

  context->Message_Block_Index = 0;
}
||||
|
||||
// According to the standard, the message must be padded to an even
|
||||
// 512 bits. The first padding bit must be a '1'. The last 64 bits
|
||||
// represent the length of the original message. All bits in between
|
||||
// should be 0. This function will pad the message according to those
|
||||
// rules by filling the Message_Block array accordingly. It will also
|
||||
// call SHA1ProcessMessageBlock() appropriately. When it returns, it
|
||||
// can be assumed that the message digest has been computed.
|
||||
static void SHA1PadMessage(SHA1Context *context) {
  // Check to see if the current message block is too small to hold
  // the initial padding bits and length.  If so, we will pad the
  // block, process it, and then continue padding into a second block.
  if (context->Message_Block_Index > 55) {
    // Not enough room for the 8-byte length field: finish this block...
    context->Message_Block[context->Message_Block_Index++] = 0x80;
    while(context->Message_Block_Index < 64) {
      context->Message_Block[context->Message_Block_Index++] = 0;
    }

    SHA1ProcessMessageBlock(context);

    // ...then zero-fill a second block up to the length field.
    while(context->Message_Block_Index < 56) {
      context->Message_Block[context->Message_Block_Index++] = 0;
    }
  } else {
    // The 0x80 pad byte and zero fill fit before the length field.
    context->Message_Block[context->Message_Block_Index++] = 0x80;
    while(context->Message_Block_Index < 56) {
      context->Message_Block[context->Message_Block_Index++] = 0;
    }
  }

  // Store the message length (in bits) as the last 8 octets, big-endian.
  context->Message_Block[56] = (context->Length_High >> 24) & 0xFF;
  context->Message_Block[57] = (context->Length_High >> 16) & 0xFF;
  context->Message_Block[58] = (context->Length_High >> 8) & 0xFF;
  context->Message_Block[59] = (context->Length_High) & 0xFF;
  context->Message_Block[60] = (context->Length_Low >> 24) & 0xFF;
  context->Message_Block[61] = (context->Length_Low >> 16) & 0xFF;
  context->Message_Block[62] = (context->Length_Low >> 8) & 0xFF;
  context->Message_Block[63] = (context->Length_Low) & 0xFF;

  SHA1ProcessMessageBlock(context);
}
||||
|
||||
|
||||
void SHA1_Hash_Portable(const char* data, size_t len, char* hash_array) { |
||||
SHA1Context context; |
||||
SHA1Reset(&context); |
||||
SHA1Input(&context, reinterpret_cast<const unsigned char*>(data), len); |
||||
bool ok = SHA1Result(&context); |
||||
if (!ok) { |
||||
fprintf(stderr, "Unexpected error in SHA1_Hash_Portable code\n"); |
||||
exit(1); |
||||
} |
||||
for (int i = 0; i < 5; i++) { |
||||
uint32_t value = context.Message_Digest[i]; |
||||
hash_array[i*4 + 0] = (value >> 24) & 0xff; |
||||
hash_array[i*4 + 1] = (value >> 16) & 0xff; |
||||
hash_array[i*4 + 2] = (value >> 8) & 0xff; |
||||
hash_array[i*4 + 3] = value & 0xff; |
||||
} |
||||
} |
||||
|
||||
} |
||||
} |
@ -0,0 +1,25 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_ |
||||
#define STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_ |
||||
|
||||
#include <stddef.h> |
||||
|
||||
namespace leveldb { |
||||
namespace port { |
||||
|
||||
// Compute the SHA1 hash value of "data[0..len-1]" and store it in
|
||||
// "hash_array[0..19]". hash_array must have 20 bytes of space available.
|
||||
//
|
||||
// This function is portable but may not be as fast as a version
|
||||
// optimized for your platform. It is provided as a default method
|
||||
// that can be used when porting leveldb to a new platform if no
|
||||
// better SHA1 hash implementation is available.
|
||||
void SHA1_Hash_Portable(const char* data, size_t len, char* hash_array); |
||||
|
||||
} |
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_
|
@ -0,0 +1,55 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "port/port.h" |
||||
#include "util/testharness.h" |
||||
|
||||
namespace leveldb { |
||||
namespace port { |
||||
|
||||
class SHA1 { }; |
||||
|
||||
// Hash data[0..len-1] and return the digest as a 40-character lowercase
// hexadecimal string.
static std::string TestSHA1(const char* data, size_t len) {
  char hash_val[20];
  SHA1_Hash(data, len, hash_val);
  std::string hex;
  for (int i = 0; i < 20; i++) {
    char byte_buf[3];
    snprintf(byte_buf, sizeof(byte_buf), "%02x",
             static_cast<unsigned int>(static_cast<unsigned char>(
                 hash_val[i])));
    hex.append(byte_buf, 2);
  }
  return hex;
}
||||
|
||||
// Known-answer tests: empty input, a short ASCII string, and a
// 10000-byte repeated-character input.
TEST(SHA1, Simple) {
  ASSERT_EQ("da39a3ee5e6b4b0d3255bfef95601890afd80709", TestSHA1("", 0));
  ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d", TestSHA1("hello", 5));
  std::string x(10000, 'x');
  ASSERT_EQ("f8c5cde791c5056cf515881e701c8a9ecb439a75",
            TestSHA1(x.data(), x.size()));
}
||||
|
||||
// Rough throughput measurement: hash a 100 MB buffer kIters times and
// report MB/s to stderr.
TEST(SHA1, Benchmark) {
  std::string data(1048576 * 100, 'x');
  double start = Env::Default()->NowMicros() * 1e-6;
  static const int kIters = 10;
  uint32_t sha1 = 0;  // Folds in a digest byte each iteration so the
                      // hashing cannot be optimized away.
  for (int i = 0; i < kIters; i++) {
    char hash_val[20];
    SHA1_Hash(data.data(), data.size(), hash_val);
    sha1 |= hash_val[0];
  }
  double finish = Env::Default()->NowMicros() * 1e-6;
  double mb = (static_cast<long long int>(data.size()) * kIters) / 1048576.0;
  fprintf(stderr, "SHA1 %0.0f MB: %.3f secs; %.1f MB/s, dummy=0x%02x\n",
          mb, (finish - start), mb / (finish - start), sha1);
}
||||
|
||||
} |
||||
} |
||||
|
||||
// Test driver: runs every TEST registered in this file.
int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
}
@ -0,0 +1,261 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// Decodes the blocks generated by block_builder.cc.
|
||||
|
||||
#include "table/block.h" |
||||
|
||||
#include <vector> |
||||
#include <algorithm> |
||||
#include "include/comparator.h" |
||||
#include "util/coding.h" |
||||
#include "util/logging.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
// Number of restart points, stored as the last fixed32 of the block.
// REQUIRES: the block is large enough to hold the trailer (callers
// validate size_ before invoking this).
inline uint32_t Block::NumRestarts() const {
  assert(size_ >= 2*sizeof(uint32_t));
  return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
}
||||
|
||||
// Initialize the block with the specified contents.  Malformed input is
// recorded by clearing size_ to 0, which NewIterator() later reports as
// corruption.
Block::Block(const char* data, size_t size)
    : data_(data),
      size_(size) {
  // A well-formed block holds at least the restart-count fixed32 plus
  // one restart entry.  Checking against 2*sizeof(uint32_t) (the
  // original checked only sizeof(uint32_t)) also preserves the
  // size_ >= 2*sizeof(uint32_t) precondition asserted by NumRestarts(),
  // which a 4-7 byte block would otherwise violate.
  if (size_ < 2*sizeof(uint32_t)) {
    size_ = 0;  // Error marker
  } else {
    restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t);
    if (restart_offset_ > size_ - sizeof(uint32_t)) {
      // The size is too small for NumRestarts() and therefore
      // restart_offset_ wrapped around.
      size_ = 0;
    }
  }
}
||||
|
||||
// The block takes ownership of data_ at construction (see block.h) and
// releases it here.
Block::~Block() {
  delete[] data_;
}
||||
|
||||
// Helper routine: decode the next block entry starting at "p",
|
||||
// storing the number of shared key bytes, non_shared key bytes,
|
||||
// and the length of the value in "*shared", "*non_shared", and
|
||||
// "*value_length", respectively. Will not derefence past "limit".
|
||||
//
|
||||
// If any errors are detected, returns NULL. Otherwise, returns a
|
||||
// pointer to the key delta (just past the three decoded values).
|
||||
static inline const char* DecodeEntry(const char* p, const char* limit,
                                      uint32_t* shared,
                                      uint32_t* non_shared,
                                      uint32_t* value_length) {
  if (limit - p < 3) return NULL;  // Need at least one byte per varint32.
  *shared = reinterpret_cast<const unsigned char*>(p)[0];
  *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
  *value_length = reinterpret_cast<const unsigned char*>(p)[2];
  if ((*shared | *non_shared | *value_length) < 128) {
    // Fast path: all three values are encoded in one byte each
    p += 3;
  } else {
    if ((p = GetVarint32Ptr(p, limit, shared)) == NULL) return NULL;
    if ((p = GetVarint32Ptr(p, limit, non_shared)) == NULL) return NULL;
    if ((p = GetVarint32Ptr(p, limit, value_length)) == NULL) return NULL;
  }

  // The key delta and value must fit entirely before "limit".
  if (limit - p < (*non_shared + *value_length)) return NULL;
  return p;
}
||||
|
||||
// Iterator over a block's prefix-compressed entries.  Keys are
// materialized into key_ by applying each entry's delta to the previous
// key; value_ points directly into the block data.
class Block::Iter : public Iterator {
 private:
  const Comparator* const comparator_;
  const char* const data_;      // underlying block contents
  uint32_t const restarts_;     // Offset of restart array (list of fixed32)
  uint32_t const num_restarts_; // Number of uint32_t entries in restart array

  // current_ is offset in data_ of current entry.  >= restarts_ if !Valid
  uint32_t current_;
  uint32_t restart_index_;  // Index of restart block in which current_ falls
  std::string key_;         // Fully reconstructed current key
  Slice value_;             // Points into data_
  Status status_;

  inline int Compare(const Slice& a, const Slice& b) const {
    return comparator_->Compare(a, b);
  }

  // Return the offset in data_ just past the end of the current entry.
  inline uint32_t NextEntryOffset() const {
    return (value_.data() + value_.size()) - data_;
  }

  // Offset of the index-th restart point within data_.
  uint32_t GetRestartPoint(uint32_t index) {
    assert(index < num_restarts_);
    return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
  }

  void SeekToRestartPoint(uint32_t index) {
    key_.clear();
    restart_index_ = index;
    // current_ will be fixed by ParseNextKey();

    // ParseNextKey() starts at the end of value_, so set value_ accordingly
    uint32_t offset = GetRestartPoint(index);
    value_ = Slice(data_ + offset, 0);
  }

 public:
  Iter(const Comparator* comparator,
       const char* data,
       uint32_t restarts,
       uint32_t num_restarts)
      : comparator_(comparator),
        data_(data),
        restarts_(restarts),
        num_restarts_(num_restarts),
        current_(restarts_),          // Starts invalid (not positioned)
        restart_index_(num_restarts_) {
    assert(num_restarts_ > 0);
  }

  virtual bool Valid() const { return current_ < restarts_; }
  virtual Status status() const { return status_; }
  virtual Slice key() const {
    assert(Valid());
    return key_;
  }
  virtual Slice value() const {
    assert(Valid());
    return value_;
  }

  virtual void Next() {
    assert(Valid());
    ParseNextKey();
  }

  virtual void Prev() {
    assert(Valid());

    // Scan backwards to a restart point before current_
    const uint32_t original = current_;
    while (GetRestartPoint(restart_index_) >= original) {
      if (restart_index_ == 0) {
        // No more entries
        current_ = restarts_;
        restart_index_ = num_restarts_;
        return;
      }
      restart_index_--;
    }

    // Walk forward from the restart point to the entry that precedes
    // the original position.
    SeekToRestartPoint(restart_index_);
    do {
      // Loop until end of current entry hits the start of original entry
    } while (ParseNextKey() && NextEntryOffset() < original);
  }

  virtual void Seek(const Slice& target) {
    // Binary search in restart array to find the first restart point
    // with a key >= target
    uint32_t left = 0;
    uint32_t right = num_restarts_ - 1;
    while (left < right) {
      uint32_t mid = (left + right + 1) / 2;
      uint32_t region_offset = GetRestartPoint(mid);
      uint32_t shared, non_shared, value_length;
      const char* key_ptr = DecodeEntry(data_ + region_offset,
                                        data_ + restarts_,
                                        &shared, &non_shared, &value_length);
      if (key_ptr == NULL || (shared != 0)) {
        // Restart-point entries must store a full key (shared == 0).
        CorruptionError();
        return;
      }
      Slice mid_key(key_ptr, non_shared);
      if (Compare(mid_key, target) < 0) {
        // Key at "mid" is smaller than "target".  Therefore all
        // blocks before "mid" are uninteresting.
        left = mid;
      } else {
        // Key at "mid" is >= "target".  Therefore all blocks at or
        // after "mid" are uninteresting.
        right = mid - 1;
      }
    }

    // Linear search (within restart block) for first key >= target
    SeekToRestartPoint(left);
    while (true) {
      if (!ParseNextKey()) {
        return;
      }
      if (Compare(key_, target) >= 0) {
        return;
      }
    }
  }

  virtual void SeekToFirst() {
    SeekToRestartPoint(0);
    ParseNextKey();
  }

  virtual void SeekToLast() {
    SeekToRestartPoint(num_restarts_ - 1);
    while (ParseNextKey() && NextEntryOffset() < restarts_) {
      // Keep skipping
    }
  }

 private:
  // Mark the iterator invalid and record a corruption status.
  void CorruptionError() {
    current_ = restarts_;
    restart_index_ = num_restarts_;
    status_ = Status::Corruption("bad entry in block");
    key_.clear();
    value_.clear();
  }

  // Advance to the entry following value_; returns false (and
  // invalidates the iterator) at end of data or on corruption.
  bool ParseNextKey() {
    current_ = NextEntryOffset();
    const char* p = data_ + current_;
    const char* limit = data_ + restarts_;  // Restarts come right after data
    if (p >= limit) {
      // No more entries to return.  Mark as invalid.
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return false;
    }

    // Decode next entry
    uint32_t shared, non_shared, value_length;
    p = DecodeEntry(p, limit, &shared, &non_shared, &value_length);
    if (p == NULL || key_.size() < shared) {
      CorruptionError();
      return false;
    } else {
      // Rebuild the key from the shared prefix plus the stored delta.
      key_.resize(shared);
      key_.append(p, non_shared);
      value_ = Slice(p + non_shared, value_length);
      // Keep restart_index_ pointing at the restart region containing
      // current_.
      while (restart_index_ + 1 < num_restarts_ &&
             GetRestartPoint(restart_index_ + 1) < current_) {
        ++restart_index_;
      }
      return true;
    }
  }
};
||||
|
||||
// Return a fresh iterator over the block.  Malformed blocks (including
// those marked with size_ == 0 by the constructor) yield an error
// iterator; a block with no restart points is empty.
Iterator* Block::NewIterator(const Comparator* cmp) {
  if (size_ < 2*sizeof(uint32_t)) {
    return NewErrorIterator(Status::Corruption("bad block contents"));
  }
  const uint32_t num_restarts = NumRestarts();
  if (num_restarts == 0) {
    return NewEmptyIterator();
  }
  return new Iter(cmp, data_, restart_offset_, num_restarts);
}
||||
|
||||
} |
@ -0,0 +1,43 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_TABLE_BLOCK_H_ |
||||
#define STORAGE_LEVELDB_TABLE_BLOCK_H_ |
||||
|
||||
#include <stddef.h> |
||||
#include <stdint.h> |
||||
#include "include/iterator.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
class Comparator; |
||||
|
||||
// An immutable, prefix-compressed sequence of key/value entries, decoded
// by block.cc.
class Block {
 public:
  // Initialize the block with the specified contents.
  // Takes ownership of data[] and will delete[] it when done.
  Block(const char* data, size_t size);

  ~Block();

  size_t size() const { return size_; }
  // Iterate entries in the order defined by "comparator".  The iterator
  // reads from data_, so it must not outlive this Block.
  Iterator* NewIterator(const Comparator* comparator);

 private:
  // Decodes the restart count from the block trailer.
  uint32_t NumRestarts() const;

  const char* data_;        // Block contents (owned; see constructor)
  size_t size_;             // 0 serves as an error marker (see block.cc)
  uint32_t restart_offset_;     // Offset in data_ of restart array

  // No copying allowed
  Block(const Block&);
  void operator=(const Block&);

  class Iter;  // Iterator implementation, defined in block.cc
};
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_TABLE_BLOCK_H_
|
@ -0,0 +1,109 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// BlockBuilder generates blocks where keys are prefix-compressed:
|
||||
//
|
||||
// When we store a key, we drop the prefix shared with the previous
|
||||
// string. This helps reduce the space requirement significantly.
|
||||
// Furthermore, once every K keys, we do not apply the prefix
|
||||
// compression and store the entire key. We call this a "restart
|
||||
// point". The tail end of the block stores the offsets of all of the
|
||||
// restart points, and can be used to do a binary search when looking
|
||||
// for a particular key. Values are stored as-is (without compression)
|
||||
// immediately following the corresponding key.
|
||||
//
|
||||
// An entry for a particular key-value pair has the form:
|
||||
// shared_bytes: varint32
|
||||
// unshared_bytes: varint32
|
||||
// value_length: varint32
|
||||
// key_delta: char[unshared_bytes]
|
||||
// value: char[value_length]
|
||||
// shared_bytes == 0 for restart points.
|
||||
//
|
||||
// The trailer of the block has the form:
|
||||
// restarts: uint32[num_restarts]
|
||||
// num_restarts: uint32
|
||||
// restarts[i] contains the offset within the block of the ith restart point.
|
||||
|
||||
#include "table/block_builder.h" |
||||
|
||||
#include <algorithm> |
||||
#include <assert.h> |
||||
#include "include/comparator.h" |
||||
#include "include/table_builder.h" |
||||
#include "util/coding.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
// A freshly-constructed builder contains a single restart point at
// offset 0, so the first Add() always stores a complete key.
BlockBuilder::BlockBuilder(const Options* options)
    : options_(options),
      restarts_(),
      counter_(0),
      finished_(false) {
  assert(options->block_restart_interval >= 1);
  restarts_.push_back(0);       // First restart point is at offset 0
}
||||
|
||||
// Discard all buffered entries so the builder can be reused for a new
// block.  Restores the freshly-constructed state.
void BlockBuilder::Reset() {
  buffer_.clear();
  last_key_.clear();
  restarts_.clear();
  restarts_.push_back(0);       // First restart point is at offset 0
  counter_ = 0;
  finished_ = false;
}
||||
|
||||
// Size of the serialized block if Finish() were called right now:
// entry data plus the restart-array trailer.
size_t BlockBuilder::CurrentSizeEstimate() const {
  return (buffer_.size() +                        // Raw data buffer
          restarts_.size() * sizeof(uint32_t) +   // Restart array
          sizeof(uint32_t));                      // Restart array length
}
||||
|
||||
// Finish building the block: append the restart offset array and its
// length, and return a slice referencing the builder-owned buffer.
// No further Add() calls are allowed until Reset().
Slice BlockBuilder::Finish() {
  // Append restart array.  Index with size_t to avoid the
  // signed/unsigned comparison the original "int i" loop performed
  // against restarts_.size().
  for (size_t i = 0; i < restarts_.size(); i++) {
    PutFixed32(&buffer_, restarts_[i]);
  }
  PutFixed32(&buffer_, restarts_.size());
  finished_ = true;
  return Slice(buffer_);
}
||||
|
||||
// Append one key/value entry.  Keys within a block are prefix-compressed
// against the previous key, except that every block_restart_interval
// entries a full key is stored (a "restart point") so readers can
// binary-search the block.
// REQUIRES: Finish() has not been called; key > all previously added keys.
void BlockBuilder::Add(const Slice& key, const Slice& value) {
  Slice last_key_piece(last_key_);
  assert(!finished_);
  assert(counter_ <= options_->block_restart_interval);
  assert(buffer_.empty() // No values yet?
         || options_->comparator->Compare(key, last_key_piece) > 0);
  size_t shared = 0;
  if (counter_ < options_->block_restart_interval) {
    // See how much sharing to do with previous string
    const size_t min_length = std::min(last_key_piece.size(), key.size());
    while ((shared < min_length) && (last_key_[shared] == key[shared])) {
      shared++;
    }
  } else {
    // Restart compression: record a new restart point and store the
    // full key (shared stays 0).
    restarts_.push_back(buffer_.size());
    counter_ = 0;
  }
  const size_t non_shared = key.size() - shared;

  // Add "<shared><non_shared><value_size>" to buffer_
  PutVarint32(&buffer_, shared);
  PutVarint32(&buffer_, non_shared);
  PutVarint32(&buffer_, value.size());

  // Add string delta to buffer_ followed by value
  buffer_.append(key.data() + shared, non_shared);
  buffer_.append(value.data(), value.size());

  // Update state: rebuild last_key_ in place as prefix + delta to
  // avoid copying the whole key when most of it is shared.
  last_key_.resize(shared);
  last_key_.append(key.data() + shared, non_shared);
  assert(Slice(last_key_) == key);
  counter_++;
}
||||
|
||||
} |
@ -0,0 +1,57 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_ |
||||
#define STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_ |
||||
|
||||
#include <vector> |
||||
|
||||
#include <stdint.h> |
||||
#include "include/slice.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
struct Options; |
||||
|
||||
// Builds the byte representation of a single table block: a sequence of
// prefix-compressed key/value entries followed by a restart-offset
// array (see block_builder.cc for the exact layout).
class BlockBuilder {
 public:
  explicit BlockBuilder(const Options* options);

  // Reset the contents as if the BlockBuilder was just constructed.
  void Reset();

  // REQUIRES: Finish() has not been called since the last call to Reset().
  // REQUIRES: key is larger than any previously added key
  void Add(const Slice& key, const Slice& value);

  // Finish building the block and return a slice that refers to the
  // block contents.  The returned slice will remain valid for the
  // lifetime of this builder or until Reset() is called.
  Slice Finish();

  // Returns an estimate of the current (uncompressed) size of the block
  // we are building.
  size_t CurrentSizeEstimate() const;

  // Return true iff no entries have been added since the last Reset()
  bool empty() const {
    return buffer_.empty();
  }

 private:
  const Options* options_;             // Not owned; supplies restart interval
  std::string buffer_;                 // Destination buffer
  std::vector<uint32_t> restarts_;     // Restart points (offsets into buffer_)
  int counter_;                        // Number of entries emitted since restart
  bool finished_;                      // Has Finish() been called?
  std::string last_key_;               // Previously added key, for prefix sharing

  // No copying allowed
  BlockBuilder(const BlockBuilder&);
  void operator=(const BlockBuilder&);
};
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_
|
@ -0,0 +1,131 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "table/format.h" |
||||
|
||||
#include "include/env.h" |
||||
#include "port/port.h" |
||||
#include "table/block.h" |
||||
#include "util/coding.h" |
||||
#include "util/crc32c.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
// Append the varint64 (offset, size) pair to *dst.
void BlockHandle::EncodeTo(std::string* dst) const {
  // Sanity check that all fields have been set (the default constructor
  // initializes both fields to ~0 as a "not set" sentinel).
  assert(offset_ != ~static_cast<uint64_t>(0));
  assert(size_ != ~static_cast<uint64_t>(0));
  PutVarint64(dst, offset_);
  PutVarint64(dst, size_);
}
||||
|
||||
// Parse a varint64 (offset, size) pair from the front of *input,
// advancing *input past the consumed bytes.  Returns Corruption if
// either varint is malformed or truncated.
Status BlockHandle::DecodeFrom(Slice* input) {
  if (!GetVarint64(input, &offset_)) {
    return Status::Corruption("bad block handle");
  }
  if (!GetVarint64(input, &size_)) {
    return Status::Corruption("bad block handle");
  }
  return Status::OK();
}
||||
|
||||
// Append the fixed-size footer encoding (two padded block handles plus
// the 64-bit magic number) to *dst.
void Footer::EncodeTo(std::string* dst) const {
  const size_t original_size = dst->size();
  metaindex_handle_.EncodeTo(dst);
  index_handle_.EncodeTo(dst);
  // Pad relative to the original size so that appending to a non-empty
  // string is also correct (the previous absolute resize silently
  // truncated any pre-existing contents).
  dst->resize(original_size + 2 * BlockHandle::kMaxEncodedLength);  // Padding
  PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber));
  PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber >> 32));
  assert(dst->size() == original_size + kEncodedLength);
}
||||
|
||||
// Decode a footer from *input, which must contain at least
// kEncodedLength bytes.  On success *input is advanced past the footer
// (any leftover padding is skipped).
Status Footer::DecodeFrom(Slice* input) {
  // Guard the fixed-position magic read below: without this check a
  // short input leads to reading past the end of the buffer.
  if (input->size() < kEncodedLength) {
    return Status::InvalidArgument("not an sstable (footer too short)");
  }

  const char* magic_ptr = input->data() + kEncodedLength - 8;
  const uint32_t magic_lo = DecodeFixed32(magic_ptr);
  const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4);
  const uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
                          (static_cast<uint64_t>(magic_lo)));
  if (magic != kTableMagicNumber) {
    return Status::InvalidArgument("not an sstable (bad magic number)");
  }

  Status result = metaindex_handle_.DecodeFrom(input);
  if (result.ok()) {
    result = index_handle_.DecodeFrom(input);
  }
  if (result.ok()) {
    // We skip over any leftover data (just padding for now) in "input"
    const char* end = magic_ptr + 8;
    *input = Slice(end, input->data() + input->size() - end);
  }
  return result;
}
||||
|
||||
// Read, verify, and (if needed) decompress the block identified by
// "handle".  On success *block owns a heap-allocated Block; on failure
// *block is NULL and the temporary buffer is released.
Status ReadBlock(RandomAccessFile* file,
                 const ReadOptions& options,
                 const BlockHandle& handle,
                 Block** block) {
  *block = NULL;

  // Read the block contents as well as the type/crc footer.
  // See table_builder.cc for the code that built this structure.
  size_t n = handle.size();
  char* buf = new char[n + kBlockTrailerSize];
  Slice contents;
  Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);
  if (!s.ok()) {
    delete[] buf;
    return s;
  }
  if (contents.size() != n + kBlockTrailerSize) {
    delete[] buf;
    return Status::Corruption("truncated block read");
  }

  // Check the crc of the type and the block contents.  The stored crc
  // covers the n data bytes plus the 1-byte type, and is masked on disk
  // (see util/crc32c), hence the Unmask here.
  const char* data = contents.data();    // Pointer to where Read put the data
  if (options.verify_checksums) {
    const uint32_t crc = crc32c::Unmask(DecodeFixed32(data + n + 1));
    const uint32_t actual = crc32c::Value(data, n + 1);
    if (actual != crc) {
      delete[] buf;
      s = Status::Corruption("block checksum mismatch");
      return s;
    }
  }

  // data[n] is the 1-byte compression type byte of the trailer.
  switch (data[n]) {
    case kNoCompression:
      if (data != buf) {
        // File implementation gave us pointer to some other data
        // (e.g. an mmap'd region).  Copy into buf[] so the Block we
        // return owns storage with a known lifetime.
        memcpy(buf, data, n + kBlockTrailerSize);
      }

      // Ok
      break;
    case kLightweightCompression: {
      std::string decompressed;
      if (!port::Lightweight_Uncompress(data, n, &decompressed)) {
        delete[] buf;
        s = Status::Corruption("corrupted compressed block contents");
        return s;
      }
      delete[] buf;   // Done with the compressed input buffer
      // Re-allocate buf to hold the decompressed bytes; Block takes
      // ownership of it below.
      buf = new char[decompressed.size()];
      memcpy(buf, decompressed.data(), decompressed.size());
      n = decompressed.size();
      break;
    }
    default:
      delete[] buf;
      return Status::Corruption("bad block type");
  }

  *block = new Block(buf, n);  // Block takes ownership of buf[]
  return Status::OK();
}
||||
|
||||
} |
@ -0,0 +1,103 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_TABLE_FORMAT_H_ |
||||
#define STORAGE_LEVELDB_TABLE_FORMAT_H_ |
||||
|
||||
#include <string> |
||||
#include <stdint.h> |
||||
#include "include/slice.h" |
||||
#include "include/status.h" |
||||
#include "include/table_builder.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
class Block; |
||||
class RandomAccessFile; |
||||
struct ReadOptions; |
||||
|
||||
// BlockHandle is a pointer to the extent of a file that stores a data
|
||||
// block or a meta block.
|
||||
// BlockHandle is a pointer to the extent of a file that stores a data
// block or a meta block: a (file offset, byte length) pair.
class BlockHandle {
 public:
  // Default-constructs with ~0 sentinels; see inline definition below.
  BlockHandle();

  // The offset of the block in the file.
  uint64_t offset() const { return offset_; }
  void set_offset(uint64_t offset) { offset_ = offset; }

  // The size of the stored block (excludes the type/crc trailer).
  uint64_t size() const { return size_; }
  void set_size(uint64_t size) { size_ = size; }

  void EncodeTo(std::string* dst) const;
  Status DecodeFrom(Slice* input);

  // Maximum encoding length of a BlockHandle: two varint64s of up to
  // 10 bytes each.
  enum { kMaxEncodedLength = 10 + 10 };

 private:
  uint64_t offset_;
  uint64_t size_;
};
||||
|
||||
// Footer encapsulates the fixed information stored at the tail
|
||||
// end of every table file.
|
||||
// Footer encapsulates the fixed information stored at the tail
// end of every table file: handles to the metaindex and index blocks,
// padded to a fixed width, followed by an 8-byte magic number.
class Footer {
 public:
  Footer() { }

  // The block handle for the metaindex block of the table
  const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
  void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; }

  // The block handle for the index block of the table
  const BlockHandle& index_handle() const {
    return index_handle_;
  }
  void set_index_handle(const BlockHandle& h) {
    index_handle_ = h;
  }

  void EncodeTo(std::string* dst) const;
  Status DecodeFrom(Slice* input);

  // Encoded length of a Footer.  Note that the serialization of a
  // Footer will always occupy exactly this many bytes.  It consists
  // of two block handles and a magic number.
  enum {
    kEncodedLength = 2*BlockHandle::kMaxEncodedLength + 8
  };

 private:
  BlockHandle metaindex_handle_;
  BlockHandle index_handle_;
};
||||
|
||||
// kTableMagicNumber was picked by running
|
||||
// echo http://code.google.com/p/leveldb/ | sha1sum
|
||||
// and taking the leading 64 bits.
|
||||
static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull; |
||||
|
||||
// 1-byte type + 32-bit crc
|
||||
static const size_t kBlockTrailerSize = 5; |
||||
|
||||
// Read the block identified by "handle" from "file". On success,
|
||||
// store a pointer to the heap-allocated result in *block and return
|
||||
// OK. On failure store NULL in *block and return non-OK.
|
||||
extern Status ReadBlock(RandomAccessFile* file, |
||||
const ReadOptions& options, |
||||
const BlockHandle& handle, |
||||
Block** block); |
||||
|
||||
// Implementation details follow.  Clients should ignore.
|
||||
|
||||
// Initialize both fields to the all-bits-set sentinel so that
// EncodeTo() can assert they were explicitly set before encoding.
inline BlockHandle::BlockHandle()
    : offset_(~static_cast<uint64_t>(0)),
      size_(~static_cast<uint64_t>(0)) {
}
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_TABLE_FORMAT_H_
|
@ -0,0 +1,68 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "include/iterator.h" |
||||
#include "util/logging.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
// Start with an empty cleanup list.  The head node is stored inline in
// the iterator; function == NULL marks "no cleanups registered".
Iterator::Iterator() {
  cleanup_.function = NULL;
  cleanup_.next = NULL;
}
||||
|
||||
// Invoke every cleanup registered via RegisterCleanup().  The head
// node is embedded in the iterator; the rest are heap nodes freed here.
Iterator::~Iterator() {
  if (cleanup_.function == NULL) {
    return;  // Nothing was ever registered
  }
  (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2);
  Cleanup* node = cleanup_.next;
  while (node != NULL) {
    Cleanup* following = node->next;
    (*node->function)(node->arg1, node->arg2);
    delete node;
    node = following;
  }
}
||||
|
||||
// Arrange for (*func)(arg1, arg2) to run when this iterator is
// destroyed.  The first registration fills the inline head node;
// subsequent ones are linked in immediately after the head.
void Iterator::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) {
  assert(func != NULL);
  Cleanup* slot = &cleanup_;
  if (cleanup_.function != NULL) {
    slot = new Cleanup;
    slot->next = cleanup_.next;
    cleanup_.next = slot;
  }
  slot->function = func;
  slot->arg1 = arg1;
  slot->arg2 = arg2;
}
||||
|
||||
namespace { |
||||
class EmptyIterator : public Iterator { |
||||
public: |
||||
EmptyIterator(const Status& s) : status_(s) { } |
||||
virtual bool Valid() const { return false; } |
||||
virtual void Seek(const Slice& target) { } |
||||
virtual void SeekToFirst() { } |
||||
virtual void SeekToLast() { } |
||||
virtual void Next() { assert(false); } |
||||
virtual void Prev() { assert(false); } |
||||
Slice key() const { assert(false); return Slice(); } |
||||
Slice value() const { assert(false); return Slice(); } |
||||
virtual Status status() const { return status_; } |
||||
private: |
||||
Status status_; |
||||
}; |
||||
} |
||||
|
||||
// Return a heap-allocated iterator over no entries with an OK status.
// Caller owns the result.
Iterator* NewEmptyIterator() {
  return new EmptyIterator(Status::OK());
}
||||
|
||||
// Return a heap-allocated iterator over no entries whose status()
// reports the given error.  Caller owns the result.
Iterator* NewErrorIterator(const Status& status) {
  return new EmptyIterator(status);
}
||||
|
||||
} |
@ -0,0 +1,64 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ |
||||
#define STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ |
||||
|
||||
namespace leveldb { |
||||
|
||||
// A internal wrapper class with an interface similar to Iterator that
|
||||
// caches the valid() and key() results for an underlying iterator.
|
||||
// This can help avoid virtual function calls and also gives better
|
||||
// cache locality.
|
||||
// An internal wrapper class with an interface similar to Iterator that
// caches the valid() and key() results for an underlying iterator.
// This can help avoid virtual function calls and also gives better
// cache locality.
// NOTE(review): this class owns iter_ (deleted in ~IteratorWrapper and
// Set()) yet has implicit copy operations; copying one would
// double-delete iter_.  Callers appear to use it only in arrays via
// Set() — confirm before adding new uses that copy.
class IteratorWrapper {
 private:
  Iterator* iter_;   // Owned; may be NULL
  bool valid_;       // Cached iter_->Valid()
  Slice key_;        // Cached iter_->key(); only meaningful when valid_
 public:
  IteratorWrapper(): iter_(NULL), valid_(false) { }
  explicit IteratorWrapper(Iterator* iter): iter_(NULL) {
    Set(iter);
  }
  ~IteratorWrapper() { delete iter_; }
  Iterator* iter() const { return iter_; }

  // Takes ownership of "iter" and will delete it when destroyed, or
  // when Set() is invoked again.
  void Set(Iterator* iter) {
    delete iter_;
    iter_ = iter;
    if (iter_ == NULL) {
      valid_ = false;
    } else {
      Update();
    }
  }


  // Iterator interface methods
  bool Valid() const        { return valid_; }
  Slice key() const         { assert(Valid()); return key_; }
  Slice value() const       { assert(Valid()); return iter_->value(); }
  // Methods below require iter() != NULL
  Status status() const     { assert(iter_); return iter_->status(); }
  void Next()               { assert(iter_); iter_->Next();        Update(); }
  void Prev()               { assert(iter_); iter_->Prev();        Update(); }
  void Seek(const Slice& k) { assert(iter_); iter_->Seek(k);       Update(); }
  void SeekToFirst()        { assert(iter_); iter_->SeekToFirst(); Update(); }
  void SeekToLast()         { assert(iter_); iter_->SeekToLast();  Update(); }

 private:
  // Refresh the cached valid_/key_ state after any repositioning.
  void Update() {
    valid_ = iter_->Valid();
    if (valid_) {
      key_ = iter_->key();
    }
  }
};
||||
|
||||
} |
||||
|
||||
|
||||
#endif // STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_
|
@ -0,0 +1,143 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "table/merger.h" |
||||
|
||||
#include "include/comparator.h" |
||||
#include "include/iterator.h" |
||||
#include "table/iterator_wrapper.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
namespace { |
||||
// Iterator over the union of n child iterators.  The merged sequence is
// ordered by comparator_; current_ always points at the child whose key
// is next in the current scan direction (NULL when all are exhausted).
// Takes ownership of the children.
class MergingIterator : public Iterator {
 public:
  MergingIterator(const Comparator* comparator, Iterator** children, int n)
      : comparator_(comparator),
        children_(new IteratorWrapper[n]),
        n_(n),
        current_(NULL) {
    for (int i = 0; i < n; i++) {
      children_[i].Set(children[i]);
    }
  }

  virtual ~MergingIterator() {
    // IteratorWrapper's destructor deletes each owned child.
    delete[] children_;
  }

  virtual bool Valid() const {
    return (current_ != NULL);
  }

  virtual void SeekToFirst() {
    for (int i = 0; i < n_; i++) {
      children_[i].SeekToFirst();
    }
    FindSmallest();
  }

  virtual void SeekToLast() {
    for (int i = 0; i < n_; i++) {
      children_[i].SeekToLast();
    }
    FindLargest();
  }

  virtual void Seek(const Slice& target) {
    for (int i = 0; i < n_; i++) {
      children_[i].Seek(target);
    }
    FindSmallest();
  }

  virtual void Next() {
    assert(Valid());
    // Advance only the child we just yielded from, then re-select.
    current_->Next();
    FindSmallest();
  }

  virtual void Prev() {
    assert(Valid());
    current_->Prev();
    FindLargest();
  }

  virtual Slice key() const {
    assert(Valid());
    return current_->key();
  }

  virtual Slice value() const {
    assert(Valid());
    return current_->value();
  }

  virtual Status status() const {
    // Report the first non-OK child status, if any.
    Status status;
    for (int i = 0; i < n_; i++) {
      status = children_[i].status();
      if (!status.ok()) {
        break;
      }
    }
    return status;
  }

 private:
  void FindSmallest();
  void FindLargest();

  // We might want to use a heap in case there are lots of children.
  // For now we use a simple array since we expect a very small number
  // of children in leveldb.
  const Comparator* comparator_;
  IteratorWrapper* children_;   // Owned array of n_ wrappers
  int n_;
  IteratorWrapper* current_;    // Points into children_; NULL == !Valid()
};
||||
|
||||
void MergingIterator::FindSmallest() { |
||||
IteratorWrapper* smallest = NULL; |
||||
for (int i = 0; i < n_; i++) { |
||||
IteratorWrapper* child = &children_[i]; |
||||
if (child->Valid()) { |
||||
if (smallest == NULL) { |
||||
smallest = child; |
||||
} else if (comparator_->Compare(child->key(), smallest->key()) < 0) { |
||||
smallest = child; |
||||
} |
||||
} |
||||
} |
||||
current_ = smallest; |
||||
} |
||||
|
||||
void MergingIterator::FindLargest() { |
||||
IteratorWrapper* largest = NULL; |
||||
for (int i = n_-1; i >= 0; i--) { |
||||
IteratorWrapper* child = &children_[i]; |
||||
if (child->Valid()) { |
||||
if (largest == NULL) { |
||||
largest = child; |
||||
} else if (comparator_->Compare(child->key(), largest->key()) > 0) { |
||||
largest = child; |
||||
} |
||||
} |
||||
} |
||||
current_ = largest; |
||||
} |
||||
} |
||||
|
||||
// Build an iterator over the union of list[0..n-1]; ownership of the
// children passes to the returned iterator.  The n == 1 case hands the
// single child straight back to avoid a wrapper.
Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) {
  assert(n >= 0);
  switch (n) {
    case 0:
      return NewEmptyIterator();
    case 1:
      return list[0];
    default:
      return new MergingIterator(cmp, list, n);
  }
}
||||
|
||||
} |
@ -0,0 +1,26 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_TABLE_MERGER_H_ |
||||
#define STORAGE_LEVELDB_TABLE_MERGER_H_ |
||||
|
||||
namespace leveldb { |
||||
|
||||
class Comparator; |
||||
class Iterator; |
||||
|
||||
// Return an iterator that provides the union of the data in
|
||||
// children[0,n-1]. Takes ownership of the child iterators and
|
||||
// will delete them when the result iterator is deleted.
|
||||
//
|
||||
// The result does no duplicate suppression. I.e., if a particular
|
||||
// key is present in K child iterators, it will be yielded K times.
|
||||
//
|
||||
// REQUIRES: n >= 0
|
||||
extern Iterator* NewMergingIterator( |
||||
const Comparator* comparator, Iterator** children, int n); |
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_TABLE_MERGER_H_
|
@ -0,0 +1,175 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "include/table.h" |
||||
|
||||
#include "include/cache.h" |
||||
#include "include/env.h" |
||||
#include "table/block.h" |
||||
#include "table/format.h" |
||||
#include "table/two_level_iterator.h" |
||||
#include "util/coding.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
// Pimpl state for Table.  Owns index_block; file is not deleted here
// (presumably owned by the caller that opened it — confirm at call sites).
struct Table::Rep {
  ~Rep() {
    delete index_block;
  }

  Options options;
  Status status;
  RandomAccessFile* file;
  uint64_t cache_id;   // Per-table prefix for block-cache keys (see BlockReader)

  BlockHandle metaindex_handle;  // Handle to metaindex_block: saved from footer
  Block* index_block;
};
||||
|
||||
// Open a table over "file": read the footer, then the index block, and
// build a Table ready to serve reads.  On success *table owns a new
// Table; on failure *table is NULL and no memory is leaked.
Status Table::Open(const Options& options,
                   RandomAccessFile* file,
                   Table** table) {
  *table = NULL;
  const uint64_t size = file->Size();
  if (size < Footer::kEncodedLength) {
    return Status::InvalidArgument("file is too short to be an sstable");
  }

  // Read and decode the fixed-size footer at the tail of the file.
  char footer_space[Footer::kEncodedLength];
  Slice footer_input;
  Status s = file->Read(size - Footer::kEncodedLength, Footer::kEncodedLength,
                        &footer_input, footer_space);
  if (!s.ok()) return s;

  Footer footer;
  s = footer.DecodeFrom(&footer_input);
  if (!s.ok()) return s;

  // Read the index block.  (Any earlier failure already returned, so
  // no s.ok() guard is needed here.)
  Block* index_block = NULL;
  s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_block);

  if (s.ok()) {
    // We've successfully read the footer and the index block: we're
    // ready to serve requests.
    Rep* rep = new Table::Rep;
    rep->options = options;
    rep->file = file;
    rep->metaindex_handle = footer.metaindex_handle();
    rep->index_block = index_block;
    rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0);
    *table = new Table(rep);
  } else {
    delete index_block;  // deleting NULL is a no-op
  }

  return s;
}
||||
|
||||
// Rep's destructor releases the index block; see Table::Rep.
Table::~Table() {
  delete rep_;
}
||||
|
||||
// Iterator cleanup callback for blocks that are NOT in the block cache:
// the iterator owns the block outright and frees it here.
static void DeleteBlock(void* arg, void* ignored) {
  delete reinterpret_cast<Block*>(arg);
}
||||
|
||||
// Block-cache deleter: each cached value is a heap-allocated Block,
// freed when the cache evicts the entry.
static void DeleteCachedBlock(const Slice& key, void* value) {
  delete reinterpret_cast<Block*>(value);
}
||||
|
||||
// Iterator cleanup callback for blocks that ARE in the block cache:
// drop the cache handle's reference instead of deleting the block.
static void ReleaseBlock(void* arg, void* h) {
  Cache* cache = reinterpret_cast<Cache*>(arg);
  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
  cache->Release(handle);
}
||||
|
||||
// Convert an index iterator value (i.e., an encoded BlockHandle)
// into an iterator over the contents of the corresponding block,
// consulting the optional block cache first.  On any failure an error
// iterator carrying the status is returned instead.
Iterator* Table::BlockReader(void* arg,
                             const ReadOptions& options,
                             const Slice& index_value) {
  Table* table = reinterpret_cast<Table*>(arg);
  Cache* block_cache = table->rep_->options.block_cache;
  Block* block = NULL;
  Cache::Handle* cache_handle = NULL;

  BlockHandle handle;
  Slice input = index_value;
  Status s = handle.DecodeFrom(&input);
  // We intentionally allow extra stuff in index_value so that we
  // can add more features in the future.

  if (s.ok()) {
    if (block_cache != NULL) {
      // Cache key = 8-byte per-table cache_id + 8-byte block offset.
      char cache_key_buffer[16];
      EncodeFixed64(cache_key_buffer, table->rep_->cache_id);
      EncodeFixed64(cache_key_buffer+8, handle.offset());
      Slice key(cache_key_buffer, sizeof(cache_key_buffer));
      cache_handle = block_cache->Lookup(key);
      if (cache_handle != NULL) {
        block = reinterpret_cast<Block*>(block_cache->Value(cache_handle));
      } else {
        s = ReadBlock(table->rep_->file, options, handle, &block);
        if (s.ok() && options.fill_cache) {
          // Cache takes a reference; DeleteCachedBlock frees the block
          // on eviction.
          cache_handle = block_cache->Insert(
              key, block, block->size(), &DeleteCachedBlock);
        }
      }
    } else {
      s = ReadBlock(table->rep_->file, options, handle, &block);
    }
  }

  Iterator* iter;
  if (block != NULL) {
    iter = block->NewIterator(table->rep_->options.comparator);
    // Tie the block's lifetime to the iterator: either delete it
    // outright (uncached) or release the cache handle (cached).
    if (cache_handle == NULL) {
      iter->RegisterCleanup(&DeleteBlock, block, NULL);
    } else {
      iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle);
    }
  } else {
    iter = NewErrorIterator(s);
  }
  return iter;
}
||||
|
||||
// Return a two-level iterator: the outer level walks the index block,
// the inner level (produced by BlockReader) walks each data block.
// Caller owns the result; it must not outlive this Table.
Iterator* Table::NewIterator(const ReadOptions& options) const {
  return NewTwoLevelIterator(
      rep_->index_block->NewIterator(rep_->options.comparator),
      &Table::BlockReader, const_cast<Table*>(this), options);
}
||||
|
||||
uint64_t Table::ApproximateOffsetOf(const Slice& key) const { |
||||
Iterator* index_iter = |
||||
rep_->index_block->NewIterator(rep_->options.comparator); |
||||
index_iter->Seek(key); |
||||
uint64_t result; |
||||
if (index_iter->Valid()) { |
||||
BlockHandle handle; |
||||
Slice input = index_iter->value(); |
||||
Status s = handle.DecodeFrom(&input); |
||||
if (s.ok()) { |
||||
result = handle.offset(); |
||||
} else { |
||||
// Strange: we can't decode the block handle in the index block.
|
||||
// We'll just return the offset of the metaindex block, which is
|
||||
// close to the whole file size for this case.
|
||||
result = rep_->metaindex_handle.offset(); |
||||
} |
||||
} else { |
||||
// key is past the last key in the file. Approximate the offset
|
||||
// by returning the offset of the metaindex block (which is
|
||||
// right near the end of the file).
|
||||
result = rep_->metaindex_handle.offset(); |
||||
} |
||||
delete index_iter; |
||||
return result; |
||||
} |
||||
|
||||
} |
@ -0,0 +1,224 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "include/table_builder.h" |
||||
|
||||
#include <assert.h> |
||||
#include <stdio.h> |
||||
#include "include/comparator.h" |
||||
#include "include/env.h" |
||||
#include "table/block_builder.h" |
||||
#include "table/format.h" |
||||
#include "util/coding.h" |
||||
#include "util/crc32c.h" |
||||
#include "util/logging.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
// Pimpl state for TableBuilder.  file is not owned here.
struct TableBuilder::Rep {
  Options options;                // Options for data blocks
  Options index_block_options;    // Same, but restart interval forced to 1
  WritableFile* file;
  uint64_t offset;                // Bytes written to file so far
  Status status;                  // First error encountered, if any
  BlockBuilder data_block;
  BlockBuilder index_block;
  std::string last_key;           // Largest key added so far
  int64_t num_entries;
  bool closed;          // Either Finish() or Abandon() has been called.

  // We do not emit the index entry for a block until we have seen the
  // first key for the next data block.  This allows us to use shorter
  // keys in the index block.  For example, consider a block boundary
  // between the keys "the quick brown fox" and "the who".  We can use
  // "the r" as the key for the index block entry since it is >= all
  // entries in the first block and < all entries in subsequent
  // blocks.
  //
  // Invariant: r->pending_index_entry is true only if data_block is empty.
  bool pending_index_entry;
  BlockHandle pending_handle;  // Handle to add to index block

  std::string compressed_output;  // Scratch buffer reused across blocks

  Rep(const Options& opt, WritableFile* f)
      : options(opt),
        index_block_options(opt),
        file(f),
        offset(0),
        data_block(&options),
        index_block(&index_block_options),
        num_entries(0),
        closed(false),
        pending_index_entry(false) {
    // Index entries are all full keys; restart every entry so each can
    // be binary-searched directly.
    index_block_options.block_restart_interval = 1;
  }
};
||||
|
||||
// All state lives in Rep; see TableBuilder::Rep above.
TableBuilder::TableBuilder(const Options& options, WritableFile* file)
    : rep_(new Rep(options, file)) {
}
||||
|
||||
TableBuilder::~TableBuilder() {
  assert(rep_->closed);  // Catch errors where caller forgot to call Finish()
  delete rep_;
}
||||
|
||||
// Replace the builder's options mid-build.  The comparator cannot
// change because keys already written were ordered by the original one.
Status TableBuilder::ChangeOptions(const Options& options) {
  // Note: if more fields are added to Options, update
  // this function to catch changes that should not be allowed to
  // change in the middle of building a Table.
  if (options.comparator != rep_->options.comparator) {
    return Status::InvalidArgument("changing comparator while building table");
  }

  // Note that any live BlockBuilders point to rep_->options and therefore
  // will automatically pick up the updated options.
  rep_->options = options;
  rep_->index_block_options = options;
  rep_->index_block_options.block_restart_interval = 1;
  return Status::OK();
}
||||
|
||||
// Append one key/value entry to the table.
// REQUIRES: key > all previously added keys; Finish()/Abandon() not called.
void TableBuilder::Add(const Slice& key, const Slice& value) {
  Rep* r = rep_;
  assert(!r->closed);
  if (!ok()) return;
  if (r->num_entries > 0) {
    assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0);
  }

  if (r->pending_index_entry) {
    // The previous data block was flushed; now that we know the next
    // key, emit its deferred index entry using a shortened separator
    // (>= everything in the flushed block, < this key).
    assert(r->data_block.empty());
    r->options.comparator->FindShortestSeparator(&r->last_key, key);
    std::string handle_encoding;
    r->pending_handle.EncodeTo(&handle_encoding);
    r->index_block.Add(r->last_key, Slice(handle_encoding));
    r->pending_index_entry = false;
  }

  r->last_key.assign(key.data(), key.size());
  r->num_entries++;
  r->data_block.Add(key, value);

  // Flush once the in-progress data block reaches the target size.
  const size_t estimated_block_size = r->data_block.CurrentSizeEstimate();
  if (estimated_block_size >= r->options.block_size) {
    Flush();
  }
}
||||
|
||||
void TableBuilder::Flush() { |
||||
Rep* r = rep_; |
||||
assert(!r->closed); |
||||
if (!ok()) return; |
||||
if (r->data_block.empty()) return; |
||||
assert(!r->pending_index_entry); |
||||
WriteBlock(&r->data_block, &r->pending_handle); |
||||
if (ok()) { |
||||
r->pending_index_entry = true; |
||||
r->status = r->file->Flush(); |
||||
} |
||||
} |
||||
|
||||
// Serialize *block to the file (optionally compressed), append the
// type/crc trailer, record the block's (offset, size) in *handle, and
// reset the builder for reuse.
void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) {
  // File format contains a sequence of blocks where each block has:
  //    block_data: uint8[n]
  //    type: uint8
  //    crc: uint32
  assert(ok());
  Rep* r = rep_;
  Slice raw = block->Finish();

  Slice block_contents;
  CompressionType type = r->options.compression;
  // TODO(postrelease): Support more compression options: zlib?
  switch (type) {
    case kNoCompression:
      block_contents = raw;
      break;

    case kLightweightCompression: {
      port::Lightweight_Compress(raw.data(), raw.size(), &r->compressed_output);
      block_contents = r->compressed_output;
      if (block_contents.size() >= raw.size() - (raw.size() / 8u)) {
        // Compressed less than 12.5%, so just store uncompressed form
        block_contents = raw;
        type = kNoCompression;
      }
      break;
    }
  }
  // handle->size() excludes the trailer; readers add kBlockTrailerSize.
  handle->set_offset(r->offset);
  handle->set_size(block_contents.size());
  r->status = r->file->Append(block_contents);
  if (r->status.ok()) {
    char trailer[kBlockTrailerSize];
    trailer[0] = type;
    uint32_t crc = crc32c::Value(block_contents.data(), block_contents.size());
    crc = crc32c::Extend(crc, trailer, 1);  // Extend crc to cover block type
    // Store the masked crc (see util/crc32c); ReadBlock unmasks it.
    EncodeFixed32(trailer+1, crc32c::Mask(crc));
    r->status = r->file->Append(Slice(trailer, kBlockTrailerSize));
    if (r->status.ok()) {
      r->offset += block_contents.size() + kBlockTrailerSize;
    }
  }
  r->compressed_output.clear();
  block->Reset();
}
||||
|
||||
// Returns the first error encountered while building, or OK.
Status TableBuilder::status() const {
  return rep_->status;
}
||||
|
||||
// Finalize the table: flush the last data block, then write the metaindex
// block, the index block, and the fixed-size footer, in that order.
// After Finish() returns the builder may not be used again.
Status TableBuilder::Finish() {
  Rep* r = rep_;
  Flush();                  // Emit any buffered key/value data first
  assert(!r->closed);
  r->closed = true;         // Mark finished even if a write below fails
  BlockHandle metaindex_block_handle;
  BlockHandle index_block_handle;
  if (ok()) {
    // Currently empty; reserved for future metadata.
    BlockBuilder meta_index_block(&r->options);
    // TODO(postrelease): Add stats and other meta blocks
    WriteBlock(&meta_index_block, &metaindex_block_handle);
  }
  if (ok()) {
    // Emit the index entry for the final data block, if still pending.
    if (r->pending_index_entry) {
      r->options.comparator->FindShortSuccessor(&r->last_key);
      std::string handle_encoding;
      r->pending_handle.EncodeTo(&handle_encoding);
      r->index_block.Add(r->last_key, Slice(handle_encoding));
      r->pending_index_entry = false;
    }
    WriteBlock(&r->index_block, &index_block_handle);
  }
  if (ok()) {
    // The footer locates the metaindex and index blocks for readers.
    Footer footer;
    footer.set_metaindex_handle(metaindex_block_handle);
    footer.set_index_handle(index_block_handle);
    std::string footer_encoding;
    footer.EncodeTo(&footer_encoding);
    r->status = r->file->Append(footer_encoding);
    if (r->status.ok()) {
      r->offset += footer_encoding.size();
    }
  }
  return r->status;
}
||||
|
||||
// Stop building without writing any further data; the caller is expected
// to discard the partially-written file.
void TableBuilder::Abandon() {
  Rep* r = rep_;
  assert(!r->closed);
  r->closed = true;
}
||||
|
||||
// Number of key/value pairs added via Add() so far.
uint64_t TableBuilder::NumEntries() const {
  return rep_->num_entries;
}
||||
|
||||
// Bytes written to the file so far; equals the final file size after a
// successful Finish().
uint64_t TableBuilder::FileSize() const {
  return rep_->offset;
}
||||
|
||||
} |
@ -0,0 +1,808 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "include/table.h" |
||||
|
||||
#include <map> |
||||
#include "db/dbformat.h" |
||||
#include "db/memtable.h" |
||||
#include "db/write_batch_internal.h" |
||||
#include "include/db.h" |
||||
#include "include/env.h" |
||||
#include "include/iterator.h" |
||||
#include "include/table_builder.h" |
||||
#include "table/block.h" |
||||
#include "table/block_builder.h" |
||||
#include "table/format.h" |
||||
#include "util/random.h" |
||||
#include "util/testharness.h" |
||||
#include "util/testutil.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
// Return reverse of "key".
|
||||
// Used to test non-lexicographic comparators.
|
||||
static std::string Reverse(const Slice& key) { |
||||
std::string str(key.ToString()); |
||||
std::string rev(str.rbegin(), str.rend()); |
||||
return rev; |
||||
} |
||||
|
||||
namespace {
// A comparator that orders keys by their byte-reversed form.  Exercises
// code paths that must not assume lexicographic ordering.
class ReverseKeyComparator : public Comparator {
 public:
  virtual const char* Name() const {
    return "leveldb.ReverseBytewiseComparator";
  }

  virtual int Compare(const Slice& a, const Slice& b) const {
    // Compare in reversed key space using the standard comparator.
    return BytewiseComparator()->Compare(Reverse(a), Reverse(b));
  }

  virtual void FindShortestSeparator(
      std::string* start,
      const Slice& limit) const {
    // Map into reversed space, shorten there, then map back.
    std::string s = Reverse(*start);
    std::string l = Reverse(limit);
    BytewiseComparator()->FindShortestSeparator(&s, l);
    *start = Reverse(s);
  }

  virtual void FindShortSuccessor(std::string* key) const {
    std::string s = Reverse(*key);
    BytewiseComparator()->FindShortSuccessor(&s);
    *key = Reverse(s);
  }
};
}
// Shared instance used by all the reverse-order test configurations.
static ReverseKeyComparator reverse_key_comparator;
||||
|
||||
static void Increment(const Comparator* cmp, std::string* key) { |
||||
if (cmp == BytewiseComparator()) { |
||||
key->push_back('\0'); |
||||
} else { |
||||
assert(cmp == &reverse_key_comparator); |
||||
std::string rev = Reverse(*key); |
||||
rev.push_back('\0'); |
||||
*key = Reverse(rev); |
||||
} |
||||
} |
||||
|
||||
// An STL comparator that uses a Comparator
|
||||
namespace { |
||||
struct STLLessThan { |
||||
const Comparator* cmp; |
||||
|
||||
STLLessThan() : cmp(BytewiseComparator()) { } |
||||
STLLessThan(const Comparator* c) : cmp(c) { } |
||||
bool operator()(const std::string& a, const std::string& b) const { |
||||
return cmp->Compare(Slice(a), Slice(b)) < 0; |
||||
} |
||||
}; |
||||
} |
||||
|
||||
// An in-memory WritableFile: appended data accumulates in a std::string
// that tests can inspect via contents().
class StringSink: public WritableFile {
 public:
  ~StringSink() { }

  const std::string& contents() const { return contents_; }

  // Close/Flush/Sync are no-ops for an in-memory file.
  virtual Status Close() { return Status::OK(); }
  virtual Status Flush() { return Status::OK(); }
  virtual Status Sync() { return Status::OK(); }

  virtual Status Append(const Slice& data) {
    contents_.append(data.data(), data.size());
    return Status::OK();
  }

 private:
  std::string contents_;
};
||||
|
||||
|
||||
class StringSource: public RandomAccessFile { |
||||
public: |
||||
StringSource(const Slice& contents) |
||||
: contents_(contents.data(), contents.size()) { |
||||
} |
||||
|
||||
virtual ~StringSource() { } |
||||
|
||||
virtual uint64_t Size() const { return contents_.size(); } |
||||
|
||||
virtual Status Read(uint64_t offset, size_t n, Slice* result, |
||||
char* scratch) const { |
||||
if (offset > contents_.size()) { |
||||
return Status::InvalidArgument("invalid Read offset"); |
||||
} |
||||
if (offset + n > contents_.size()) { |
||||
n = contents_.size() - offset; |
||||
} |
||||
memcpy(scratch, &contents_[offset], n); |
||||
*result = Slice(scratch, n); |
||||
return Status::OK(); |
||||
} |
||||
|
||||
private: |
||||
std::string contents_; |
||||
}; |
||||
|
||||
// Ordered in-memory model of the test data; serves as the expected value
// when checking real iterators.
typedef std::map<std::string, std::string, STLLessThan> KVMap;
||||
|
||||
// Helper class for tests to unify the interface between
// BlockBuilder/TableBuilder and Block/Table.
class Constructor {
 public:
  explicit Constructor(const Comparator* cmp) : data_(STLLessThan(cmp)) { }
  virtual ~Constructor() { }

  // Buffer a key/value pair; nothing is built until Finish().
  void Add(const std::string& key, const Slice& value) {
    data_[key] = value.ToString();
  }

  // Finish constructing the data structure with all the keys that have
  // been added so far.  Returns the keys in sorted order in "*keys"
  // and stores the key/value pairs in "*kvmap"
  void Finish(const Options& options,
              std::vector<std::string>* keys,
              KVMap* kvmap) {
    *kvmap = data_;
    keys->clear();
    for (KVMap::const_iterator it = data_.begin();
         it != data_.end();
         ++it) {
      keys->push_back(it->first);
    }
    // Hand the buffered data to the concrete implementation.
    data_.clear();
    Status s = FinishImpl(options, *kvmap);
    ASSERT_TRUE(s.ok()) << s.ToString();
  }

  // Construct the data structure from the data in "data"
  virtual Status FinishImpl(const Options& options, const KVMap& data) = 0;

  // Approximate size of the constructed structure, in bytes.
  virtual size_t NumBytes() const = 0;

  virtual Iterator* NewIterator() const = 0;

  virtual const KVMap& data() { return data_; }

 private:
  KVMap data_;
};
||||
|
||||
class BlockConstructor: public Constructor { |
||||
public: |
||||
explicit BlockConstructor(const Comparator* cmp) |
||||
: Constructor(cmp), |
||||
comparator_(cmp), |
||||
block_size_(-1), |
||||
block_(NULL) { } |
||||
~BlockConstructor() { |
||||
delete block_; |
||||
} |
||||
virtual Status FinishImpl(const Options& options, const KVMap& data) { |
||||
delete block_; |
||||
block_ = NULL; |
||||
BlockBuilder builder(&options); |
||||
|
||||
for (KVMap::const_iterator it = data.begin(); |
||||
it != data.end(); |
||||
++it) { |
||||
builder.Add(it->first, it->second); |
||||
} |
||||
// Open the block
|
||||
Slice block_data = builder.Finish(); |
||||
block_size_ = block_data.size(); |
||||
char* block_data_copy = new char[block_size_]; |
||||
memcpy(block_data_copy, block_data.data(), block_size_); |
||||
block_ = new Block(block_data_copy, block_size_); |
||||
return Status::OK(); |
||||
} |
||||
virtual size_t NumBytes() const { return block_size_; } |
||||
|
||||
virtual Iterator* NewIterator() const { |
||||
return block_->NewIterator(comparator_); |
||||
} |
||||
|
||||
private: |
||||
const Comparator* comparator_; |
||||
int block_size_; |
||||
Block* block_; |
||||
|
||||
BlockConstructor(); |
||||
}; |
||||
|
||||
// Builds a complete Table file into an in-memory sink and reopens it
// through an in-memory source.
class TableConstructor: public Constructor {
 public:
  TableConstructor(const Comparator* cmp)
      : Constructor(cmp),
        source_(NULL), table_(NULL) {
  }
  ~TableConstructor() {
    Reset();
  }
  virtual Status FinishImpl(const Options& options, const KVMap& data) {
    Reset();
    StringSink sink;
    TableBuilder builder(options, &sink);

    for (KVMap::const_iterator it = data.begin();
         it != data.end();
         ++it) {
      builder.Add(it->first, it->second);
      ASSERT_TRUE(builder.status().ok());
    }
    Status s = builder.Finish();
    ASSERT_TRUE(s.ok()) << s.ToString();

    // The builder's reported size must match the bytes actually written.
    ASSERT_EQ(sink.contents().size(), builder.FileSize());

    // Open the table
    source_ = new StringSource(sink.contents());
    Options table_options;
    table_options.comparator = options.comparator;
    return Table::Open(table_options, source_, &table_);
  }
  virtual size_t NumBytes() const { return source_->Size(); }

  virtual Iterator* NewIterator() const {
    return table_->NewIterator(ReadOptions());
  }

  // Forwards to Table::ApproximateOffsetOf() for the offset tests below.
  uint64_t ApproximateOffsetOf(const Slice& key) const {
    return table_->ApproximateOffsetOf(key);
  }

 private:
  // Delete the current table/source pair, if any.
  void Reset() {
    delete table_;
    delete source_;
    table_ = NULL;
    source_ = NULL;
  }

  StringSource* source_;
  Table* table_;

  TableConstructor();
};
||||
|
||||
// A helper class that converts internal format keys into user keys
class KeyConvertingIterator: public Iterator {
 public:
  explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { }
  virtual ~KeyConvertingIterator() { delete iter_; }
  virtual bool Valid() const { return iter_->Valid(); }
  // Seek by wrapping the user key in a maximal-sequence internal key so
  // the first entry whose user key is >= target is found.
  virtual void Seek(const Slice& target) {
    ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue);
    std::string encoded;
    AppendInternalKey(&encoded, ikey);
    iter_->Seek(encoded);
  }
  virtual void SeekToFirst() { iter_->SeekToFirst(); }
  virtual void SeekToLast() { iter_->SeekToLast(); }
  virtual void Next() { iter_->Next(); }
  virtual void Prev() { iter_->Prev(); }

  virtual Slice key() const {
    assert(Valid());
    ParsedInternalKey key;
    if (!ParseInternalKey(iter_->key(), &key)) {
      // Record the corruption; status() will report it to the caller.
      status_ = Status::Corruption("malformed internal key");
      return Slice("corrupted key");
    }
    return key.user_key;
  }

  virtual Slice value() const { return iter_->value(); }
  virtual Status status() const {
    return status_.ok() ? iter_->status() : status_;
  }

 private:
  mutable Status status_;  // Set lazily by key() on a parse failure.
  Iterator* iter_;

  // No copying allowed
  KeyConvertingIterator(const KeyConvertingIterator&);
  void operator=(const KeyConvertingIterator&);
};
||||
|
||||
// Loads the key/value pairs into a MemTable and iterates it through a
// KeyConvertingIterator (memtables store internal-format keys).
class MemTableConstructor: public Constructor {
 public:
  explicit MemTableConstructor(const Comparator* cmp)
      : Constructor(cmp),
        internal_comparator_(cmp) {
    memtable_ = new MemTable(internal_comparator_);
  }
  ~MemTableConstructor() {
    delete memtable_;
  }
  virtual Status FinishImpl(const Options& options, const KVMap& data) {
    // Rebuild from scratch so repeated Finish() calls stay independent.
    delete memtable_;
    memtable_ = new MemTable(internal_comparator_);
    int seq = 1;
    for (KVMap::const_iterator it = data.begin();
         it != data.end();
         ++it) {
      memtable_->Add(seq, kTypeValue, it->first, it->second);
      seq++;
    }
    return Status::OK();
  }
  virtual size_t NumBytes() const {
    return memtable_->ApproximateMemoryUsage();
  }

  virtual Iterator* NewIterator() const {
    return new KeyConvertingIterator(memtable_->NewIterator());
  }

 private:
  InternalKeyComparator internal_comparator_;
  MemTable* memtable_;
};
||||
|
||||
// Loads the key/value pairs into a freshly-created on-disk DB.
class DBConstructor: public Constructor {
 public:
  explicit DBConstructor(const Comparator* cmp)
      : Constructor(cmp),
        comparator_(cmp) {
    db_ = NULL;
    NewDB();
  }
  ~DBConstructor() {
    delete db_;
  }
  virtual Status FinishImpl(const Options& options, const KVMap& data) {
    // Recreate the DB so repeated Finish() calls stay independent.
    delete db_;
    db_ = NULL;
    NewDB();
    for (KVMap::const_iterator it = data.begin();
         it != data.end();
         ++it) {
      WriteBatch batch;
      batch.Put(it->first, it->second);
      ASSERT_TRUE(db_->Write(WriteOptions(), &batch).ok());
    }
    return Status::OK();
  }
  virtual size_t NumBytes() const {
    // Span the whole key range used by these tests.
    Range r("", "\xff\xff");
    uint64_t size;
    db_->GetApproximateSizes(&r, 1, &size);
    return size;
  }

  virtual Iterator* NewIterator() const {
    return db_->NewIterator(ReadOptions());
  }

 private:
  // Destroy any existing DB at the test path and open a fresh one.
  void NewDB() {
    std::string name = test::TmpDir() + "/table_testdb";

    Options options;
    options.comparator = comparator_;
    Status status = DestroyDB(name, options);
    ASSERT_TRUE(status.ok()) << status.ToString();

    options.create_if_missing = true;
    options.error_if_exists = true;
    status = DB::Open(options, name, &db_);
    ASSERT_TRUE(status.ok()) << status.ToString();
  }

  const Comparator* comparator_;
  DB* db_;
};
||||
|
||||
// Which implementation a Harness configuration exercises.
enum TestType {
  TABLE_TEST,
  BLOCK_TEST,
  MEMTABLE_TEST,
  DB_TEST,
};
||||
|
||||
// One harness configuration: implementation, key ordering, and block
// restart interval.
struct TestArgs {
  TestType type;
  bool reverse_compare;   // Use the reverse-bytewise comparator?
  int restart_interval;   // BlockBuilder restart interval (block tests only)
};
||||
|
||||
// Every configuration the harness tests run against.
static const TestArgs kTestArgList[] = {
  { TABLE_TEST, false, 16 },
  { TABLE_TEST, false, 1 },
  { TABLE_TEST, false, 1024 },
  { TABLE_TEST, true, 16 },
  { TABLE_TEST, true, 1 },
  { TABLE_TEST, true, 1024 },

  { BLOCK_TEST, false, 16 },
  { BLOCK_TEST, false, 1 },
  { BLOCK_TEST, false, 1024 },
  { BLOCK_TEST, true, 16 },
  { BLOCK_TEST, true, 1 },
  { BLOCK_TEST, true, 1024 },

  // Restart interval does not matter for memtables
  { MEMTABLE_TEST, false, 16 },
  { MEMTABLE_TEST, true, 16 },

  // Do not bother with restart interval variations for DB
  { DB_TEST, false, 16 },
  { DB_TEST, true, 16 },
};
static const int kNumTestArgs = sizeof(kTestArgList) / sizeof(kTestArgList[0]);
||||
|
||||
// Model-based test driver: runs forward, backward, and random-access
// iteration over a Constructor and checks every observation against the
// in-memory KVMap model.
class Harness {
 public:
  Harness() : constructor_(NULL) { }

  // Configure options and create the Constructor for one TestArgs entry.
  void Init(const TestArgs& args) {
    delete constructor_;
    constructor_ = NULL;
    options_ = Options();

    options_.block_restart_interval = args.restart_interval;
    // Use shorter block size for tests to exercise block boundary
    // conditions more.
    options_.block_size = 256;
    if (args.reverse_compare) {
      options_.comparator = &reverse_key_comparator;
    }
    switch (args.type) {
      case TABLE_TEST:
        constructor_ = new TableConstructor(options_.comparator);
        break;
      case BLOCK_TEST:
        constructor_ = new BlockConstructor(options_.comparator);
        break;
      case MEMTABLE_TEST:
        constructor_ = new MemTableConstructor(options_.comparator);
        break;
      case DB_TEST:
        constructor_ = new DBConstructor(options_.comparator);
        break;
    }
  }

  ~Harness() {
    delete constructor_;
  }

  void Add(const std::string& key, const std::string& value) {
    constructor_->Add(key, value);
  }

  // Build the structure, then run all three scan strategies against it.
  void Test(Random* rnd) {
    std::vector<std::string> keys;
    KVMap data;
    constructor_->Finish(options_, &keys, &data);

    TestForwardScan(keys, data);
    TestBackwardScan(keys, data);
    TestRandomAccess(rnd, keys, data);
  }

  // Full forward scan must visit exactly the model's entries, in order.
  void TestForwardScan(const std::vector<std::string>& keys,
                       const KVMap& data) {
    Iterator* iter = constructor_->NewIterator();
    ASSERT_TRUE(!iter->Valid());
    iter->SeekToFirst();
    for (KVMap::const_iterator model_iter = data.begin();
         model_iter != data.end();
         ++model_iter) {
      ASSERT_EQ(ToString(data, model_iter), ToString(iter));
      iter->Next();
    }
    ASSERT_TRUE(!iter->Valid());
    delete iter;
  }

  // Full backward scan must visit the model's entries in reverse order.
  void TestBackwardScan(const std::vector<std::string>& keys,
                        const KVMap& data) {
    Iterator* iter = constructor_->NewIterator();
    ASSERT_TRUE(!iter->Valid());
    iter->SeekToLast();
    for (KVMap::const_reverse_iterator model_iter = data.rbegin();
         model_iter != data.rend();
         ++model_iter) {
      ASSERT_EQ(ToString(data, model_iter), ToString(iter));
      iter->Prev();
    }
    ASSERT_TRUE(!iter->Valid());
    delete iter;
  }

  // Perform 200 random operations (Next/Prev/Seek/SeekToFirst/SeekToLast),
  // mirroring each on a model iterator and checking agreement after each.
  void TestRandomAccess(Random* rnd,
                        const std::vector<std::string>& keys,
                        const KVMap& data) {
    static const bool kVerbose = false;
    Iterator* iter = constructor_->NewIterator();
    ASSERT_TRUE(!iter->Valid());
    KVMap::const_iterator model_iter = data.begin();
    if (kVerbose) fprintf(stderr, "---\n");
    for (int i = 0; i < 200; i++) {
      const int toss = rnd->Uniform(5);
      switch (toss) {
        case 0: {
          if (iter->Valid()) {
            if (kVerbose) fprintf(stderr, "Next\n");
            iter->Next();
            ++model_iter;
            ASSERT_EQ(ToString(data, model_iter), ToString(iter));
          }
          break;
        }

        case 1: {
          if (kVerbose) fprintf(stderr, "SeekToFirst\n");
          iter->SeekToFirst();
          model_iter = data.begin();
          ASSERT_EQ(ToString(data, model_iter), ToString(iter));
          break;
        }

        case 2: {
          std::string key = PickRandomKey(rnd, keys);
          model_iter = data.lower_bound(key);
          if (kVerbose) fprintf(stderr, "Seek '%s'\n",
                                EscapeString(key).c_str());
          iter->Seek(Slice(key));
          ASSERT_EQ(ToString(data, model_iter), ToString(iter));
          break;
        }

        case 3: {
          if (iter->Valid()) {
            if (kVerbose) fprintf(stderr, "Prev\n");
            iter->Prev();
            if (model_iter == data.begin()) {
              model_iter = data.end();  // Wrap around to invalid value
            } else {
              --model_iter;
            }
            ASSERT_EQ(ToString(data, model_iter), ToString(iter));
          }
          break;
        }

        case 4: {
          if (kVerbose) fprintf(stderr, "SeekToLast\n");
          iter->SeekToLast();
          if (keys.empty()) {
            model_iter = data.end();
          } else {
            std::string last = data.rbegin()->first;
            model_iter = data.lower_bound(last);
          }
          ASSERT_EQ(ToString(data, model_iter), ToString(iter));
          break;
        }
      }
    }
    delete iter;
  }

  // Render a model position (or "END") for comparison with the iterator.
  std::string ToString(const KVMap& data, const KVMap::const_iterator& it) {
    if (it == data.end()) {
      return "END";
    } else {
      return "'" + it->first + "->" + it->second + "'";
    }
  }

  std::string ToString(const KVMap& data,
                       const KVMap::const_reverse_iterator& it) {
    if (it == data.rend()) {
      return "END";
    } else {
      return "'" + it->first + "->" + it->second + "'";
    }
  }

  std::string ToString(const Iterator* it) {
    if (!it->Valid()) {
      return "END";
    } else {
      return "'" + it->key().ToString() + "->" + it->value().ToString() + "'";
    }
  }

  // Pick a seek target: an existing key, or something slightly smaller or
  // slightly larger than one, to exercise all lower_bound outcomes.
  std::string PickRandomKey(Random* rnd, const std::vector<std::string>& keys) {
    if (keys.empty()) {
      return "foo";
    } else {
      const int index = rnd->Uniform(keys.size());
      std::string result = keys[index];
      switch (rnd->Uniform(3)) {
        case 0:
          // Return an existing key
          break;
        case 1: {
          // Attempt to return something smaller than an existing key
          if (result.size() > 0 && result[result.size()-1] > '\0') {
            result[result.size()-1]--;
          }
          break;
        }
        case 2: {
          // Return something larger than an existing key
          Increment(options_.comparator, &result);
          break;
        }
      }
      return result;
    }
  }

 private:
  Options options_;
  Constructor* constructor_;
};
||||
|
||||
// Test the empty key
TEST(Harness, SimpleEmptyKey) {
  for (int i = 0; i < kNumTestArgs; i++) {
    Init(kTestArgList[i]);
    Random rnd(test::RandomSeed() + 1);
    Add("", "v");
    Test(&rnd);
  }
}
||||
|
||||
// Test a single ordinary key.
TEST(Harness, SimpleSingle) {
  for (int i = 0; i < kNumTestArgs; i++) {
    Init(kTestArgList[i]);
    Random rnd(test::RandomSeed() + 2);
    Add("abc", "v");
    Test(&rnd);
  }
}
||||
|
||||
// Test a few keys with shared prefixes (exercises prefix compression).
TEST(Harness, SimpleMulti) {
  for (int i = 0; i < kNumTestArgs; i++) {
    Init(kTestArgList[i]);
    Random rnd(test::RandomSeed() + 3);
    Add("abc", "v");
    Add("abcd", "v");
    Add("ac", "v2");
    Test(&rnd);
  }
}
||||
|
||||
// Test the maximal two-byte key.
TEST(Harness, SimpleSpecialKey) {
  for (int i = 0; i < kNumTestArgs; i++) {
    Init(kTestArgList[i]);
    Random rnd(test::RandomSeed() + 4);
    Add("\xff\xff", "v3");
    Test(&rnd);
  }
}
||||
|
||||
// Randomized model check over growing datasets for every configuration.
TEST(Harness, Randomized) {
  for (int i = 0; i < kNumTestArgs; i++) {
    Init(kTestArgList[i]);
    Random rnd(test::RandomSeed() + 5);
    // Grow densely near zero to probe edge cases, then in larger jumps.
    for (int num_entries = 0; num_entries < 2000;
         num_entries += (num_entries < 50 ? 1 : 200)) {
      if ((num_entries % 10) == 0) {
        fprintf(stderr, "case %d of %d: num_entries = %d\n",
                (i + 1), int(kNumTestArgs), num_entries);
      }
      for (int e = 0; e < num_entries; e++) {
        std::string v;
        Add(test::RandomKey(&rnd, rnd.Skewed(4)),
            test::RandomString(&rnd, rnd.Skewed(5), &v).ToString());
      }
      Test(&rnd);
    }
  }
}
||||
|
||||
// Tag class required by the test framework's TEST macro below.
class MemTableTest { };
||||
|
||||
// Smoke test: insert a batch into a MemTable and dump it to stderr.
TEST(MemTableTest, Simple) {
  InternalKeyComparator cmp(BytewiseComparator());
  MemTable memtable(cmp);
  WriteBatch batch;
  WriteBatchInternal::SetSequence(&batch, 100);
  batch.Put(std::string("k1"), std::string("v1"));
  batch.Put(std::string("k2"), std::string("v2"));
  batch.Put(std::string("k3"), std::string("v3"));
  batch.Put(std::string("largekey"), std::string("vlarge"));
  ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &memtable).ok());

  Iterator* iter = memtable.NewIterator();
  iter->SeekToFirst();
  while (iter->Valid()) {
    fprintf(stderr, "key: '%s' -> '%s'\n",
            iter->key().ToString().c_str(),
            iter->value().ToString().c_str());
    iter->Next();
  }

  delete iter;
}
||||
|
||||
// Returns true iff low <= val <= high; logs the violation to stderr so
// the failing ASSERT in the caller has context.
static bool Between(uint64_t val, uint64_t low, uint64_t high) {
  const bool in_range = (val >= low) && (val <= high);
  if (!in_range) {
    fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
            (unsigned long long)(val),
            (unsigned long long)(low),
            (unsigned long long)(high));
  }
  return in_range;
}
||||
|
||||
// Tag class required by the test framework's TEST macros below.
class TableTest { };
||||
|
||||
// With compression off, ApproximateOffsetOf should track the cumulative
// size of the (incompressible) values preceding each key.
TEST(TableTest, ApproximateOffsetOfPlain) {
  TableConstructor c(BytewiseComparator());
  c.Add("k01", "hello");
  c.Add("k02", "hello2");
  c.Add("k03", std::string(10000, 'x'));
  c.Add("k04", std::string(200000, 'x'));
  c.Add("k05", std::string(300000, 'x'));
  c.Add("k06", "hello3");
  c.Add("k07", std::string(100000, 'x'));
  std::vector<std::string> keys;
  KVMap kvmap;
  Options options;
  options.block_size = 1024;
  options.compression = kNoCompression;
  c.Finish(options, &keys, &kvmap);

  ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"),       0,      0));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"),       0,      0));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"),      0,      0));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"),       0,      0));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"),       0,      0));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"),   10000,  11000));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 210000, 211000));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"),  210000, 211000));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"),  510000, 511000));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"),  510000, 511000));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"),  610000, 611000));

}
||||
|
||||
// With compression on, offsets should track the ~4x-compressed sizes of
// the synthetic compressible values.
TEST(TableTest, ApproximateOffsetOfCompressed) {
#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_CHROMIUM)
  // Compression not supported yet, so skip this test.
  // TODO(sanjay) Reenable after compression support is added
  return;
#endif

  Random rnd(301);
  TableConstructor c(BytewiseComparator());
  std::string tmp;
  c.Add("k01", "hello");
  c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp));
  c.Add("k03", "hello3");
  c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp));
  std::vector<std::string> keys;
  KVMap kvmap;
  Options options;
  options.block_size = 1024;
  options.compression = kLightweightCompression;
  c.Finish(options, &keys, &kvmap);

  ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"),    0,    0));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"),    0,    0));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"),    0,    0));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6000));
}
||||
|
||||
} |
||||
|
||||
// Run every TEST registered above.
int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
}
@ -0,0 +1,182 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "table/two_level_iterator.h" |
||||
|
||||
#include "include/table.h" |
||||
#include "table/block.h" |
||||
#include "table/format.h" |
||||
#include "table/iterator_wrapper.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
namespace { |
||||
|
||||
// Factory that materializes an iterator over one block, given the opaque
// "index value" stored in the index iterator.
typedef Iterator* (*BlockFunction)(void*, const ReadOptions&, const Slice&);
||||
|
||||
// Iterates over a sequence of blocks: index_iter yields opaque "index
// values" which block_function turns into per-block data iterators.
class TwoLevelIterator: public Iterator {
 public:
  TwoLevelIterator(
      Iterator* index_iter,
      BlockFunction block_function,
      void* arg,
      const ReadOptions& options);

  virtual ~TwoLevelIterator();

  virtual void Seek(const Slice& target);
  virtual void SeekToFirst();
  virtual void SeekToLast();
  virtual void Next();
  virtual void Prev();

  // Valid iff positioned on an entry of some data block.
  virtual bool Valid() const {
    return data_iter_.Valid();
  }
  virtual Slice key() const {
    assert(Valid());
    return data_iter_.key();
  }
  virtual Slice value() const {
    assert(Valid());
    return data_iter_.value();
  }
  // Index errors take precedence, then the live data iterator's error,
  // then any error saved from a previously discarded data iterator.
  virtual Status status() const {
    // It'd be nice if status() returned a const Status& instead of a Status
    if (!index_iter_.status().ok()) {
      return index_iter_.status();
    } else if (data_iter_.iter() != NULL && !data_iter_.status().ok()) {
      return data_iter_.status();
    } else {
      return status_;
    }
  }

 private:
  // Remember the first error seen from a data iterator being discarded.
  void SaveError(const Status& s) {
    if (status_.ok() && !s.ok()) status_ = s;
  }
  void SkipEmptyDataBlocksForward();
  void SkipEmptyDataBlocksBackward();
  void SetDataIterator(Iterator* data_iter);
  void InitDataBlock();

  BlockFunction block_function_;
  void* arg_;                  // Opaque first argument for block_function_
  const ReadOptions options_;
  Status status_;
  IteratorWrapper index_iter_;
  IteratorWrapper data_iter_;  // data_iter_.iter() may be NULL
  // If data_iter_ is non-NULL, then "data_block_handle_" holds the
  // "index_value" passed to block_function_ to create the data_iter_.
  std::string data_block_handle_;
};
||||
|
||||
// Takes ownership of index_iter; starts with no data block loaded.
TwoLevelIterator::TwoLevelIterator(
    Iterator* index_iter,
    BlockFunction block_function,
    void* arg,
    const ReadOptions& options)
    : block_function_(block_function),
      arg_(arg),
      options_(options),
      index_iter_(index_iter),
      data_iter_(NULL) {
}
||||
|
||||
// IteratorWrapper members delete the wrapped iterators.
TwoLevelIterator::~TwoLevelIterator() {
}
||||
|
||||
// Position at the first entry >= target: seek the index to the right
// block, open it, seek within it, then skip past any empty blocks.
void TwoLevelIterator::Seek(const Slice& target) {
  index_iter_.Seek(target);
  InitDataBlock();
  if (data_iter_.iter() != NULL) data_iter_.Seek(target);
  SkipEmptyDataBlocksForward();
}
||||
|
||||
// Position at the first entry of the first non-empty block.
void TwoLevelIterator::SeekToFirst() {
  index_iter_.SeekToFirst();
  InitDataBlock();
  if (data_iter_.iter() != NULL) data_iter_.SeekToFirst();
  SkipEmptyDataBlocksForward();
}
||||
|
||||
// Position at the last entry of the last non-empty block.
void TwoLevelIterator::SeekToLast() {
  index_iter_.SeekToLast();
  InitDataBlock();
  if (data_iter_.iter() != NULL) data_iter_.SeekToLast();
  SkipEmptyDataBlocksBackward();
}
||||
|
||||
// Advance within the current block, rolling over to the next non-empty
// block when it is exhausted.
void TwoLevelIterator::Next() {
  assert(Valid());
  data_iter_.Next();
  SkipEmptyDataBlocksForward();
}
||||
|
||||
// Step back within the current block, rolling back to the previous
// non-empty block when its start is passed.
void TwoLevelIterator::Prev() {
  assert(Valid());
  data_iter_.Prev();
  SkipEmptyDataBlocksBackward();
}
||||
|
||||
|
||||
// Advance through index entries until the data iterator lands on a valid
// entry, or the index is exhausted (then the iterator becomes invalid).
void TwoLevelIterator::SkipEmptyDataBlocksForward() {
  while (data_iter_.iter() == NULL || !data_iter_.Valid()) {
    // Move to next block
    if (!index_iter_.Valid()) {
      SetDataIterator(NULL);
      return;
    }
    index_iter_.Next();
    InitDataBlock();
    if (data_iter_.iter() != NULL) data_iter_.SeekToFirst();
  }
}
||||
|
||||
// Mirror of SkipEmptyDataBlocksForward() for reverse iteration.
void TwoLevelIterator::SkipEmptyDataBlocksBackward() {
  while (data_iter_.iter() == NULL || !data_iter_.Valid()) {
    // Move to previous block (comment previously said "next" in error)
    if (!index_iter_.Valid()) {
      SetDataIterator(NULL);
      return;
    }
    index_iter_.Prev();
    InitDataBlock();
    if (data_iter_.iter() != NULL) data_iter_.SeekToLast();
  }
}
||||
|
||||
void TwoLevelIterator::SetDataIterator(Iterator* data_iter) {
  // Preserve any error from the iterator being replaced before it is
  // destroyed, then install the new one (which may be NULL).
  if (data_iter_.iter() != NULL) {
    SaveError(data_iter_.status());
  }
  data_iter_.Set(data_iter);
}
||||
|
||||
// Open the data block named by the current index entry, reusing the
// existing data_iter_ when the index still points at the same block.
void TwoLevelIterator::InitDataBlock() {
  if (!index_iter_.Valid()) {
    SetDataIterator(NULL);
  } else {
    Slice handle = index_iter_.value();
    if (data_iter_.iter() != NULL && handle.compare(data_block_handle_) == 0) {
      // data_iter_ is already constructed with this iterator, so
      // no need to change anything
    } else {
      Iterator* iter = (*block_function_)(arg_, options_, handle);
      // Remember which block handle data_iter_ was built from so the
      // check above can detect reuse.
      data_block_handle_.assign(handle.data(), handle.size());
      SetDataIterator(iter);
    }
  }
}
||||
|
||||
} |
||||
|
||||
// Public factory; the concrete TwoLevelIterator type stays private to
// this file.  Takes ownership of "index_iter".
Iterator* NewTwoLevelIterator(
    Iterator* index_iter,
    BlockFunction block_function,
    void* arg,
    const ReadOptions& options) {
  return new TwoLevelIterator(index_iter, block_function, arg, options);
}
||||
|
||||
} |
@ -0,0 +1,34 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ |
||||
#define STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ |
||||
|
||||
#include "include/iterator.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
struct ReadOptions; |
||||
|
||||
// Return a new two level iterator. A two-level iterator contains an
|
||||
// index iterator whose values point to a sequence of blocks where
|
||||
// each block is itself a sequence of key,value pairs. The returned
|
||||
// two-level iterator yields the concatenation of all key/value pairs
|
||||
// in the sequence of blocks. Takes ownership of "index_iter" and
|
||||
// will delete it when no longer needed.
|
||||
//
|
||||
// Uses a supplied function to convert an index_iter value into
|
||||
// an iterator over the contents of the corresponding block.
|
||||
extern Iterator* NewTwoLevelIterator( |
||||
Iterator* index_iter, |
||||
Iterator* (*block_function)( |
||||
void* arg, |
||||
const ReadOptions& options, |
||||
const Slice& index_value), |
||||
void* arg, |
||||
const ReadOptions& options); |
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_
|
@ -0,0 +1,68 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "util/arena.h" |
||||
#include <assert.h> |
||||
|
||||
namespace leveldb { |
||||
|
||||
static const int kBlockSize = 4096; |
||||
|
||||
Arena::Arena() {
  // Start with no current block; the first Allocate() call will grab
  // one lazily via AllocateFallback().
  alloc_ptr_ = NULL;
  alloc_bytes_remaining_ = 0;
  blocks_memory_ = 0;
}
||||
|
||||
Arena::~Arena() {
  // Free every block handed out by AllocateNewBlock().  The index is
  // size_t: blocks_.size() is unsigned, and the original "int" loop
  // variable caused a signed/unsigned comparison.
  for (size_t i = 0; i < blocks_.size(); i++) {
    delete[] blocks_[i];
  }
}
||||
|
||||
char* Arena::AllocateFallback(size_t bytes) {
  // Requests larger than a quarter block get a dedicated block so the
  // leftover bytes of the current block are not wasted on them.
  if (bytes > kBlockSize / 4) {
    return AllocateNewBlock(bytes);
  }

  // Otherwise abandon the remainder of the current block and carve the
  // request out of a fresh full-size block.
  alloc_ptr_ = AllocateNewBlock(kBlockSize);
  alloc_bytes_remaining_ = kBlockSize;

  char* result = alloc_ptr_;
  alloc_ptr_ += bytes;
  alloc_bytes_remaining_ -= bytes;
  return result;
}
||||
|
||||
char* Arena::AllocateAligned(size_t bytes) { |
||||
const int align = sizeof(void*); // We'll align to pointer size
|
||||
assert((align & (align-1)) == 0); // Pointer size should be a power of 2
|
||||
size_t current_mod = reinterpret_cast<uintptr_t>(alloc_ptr_) & (align-1); |
||||
size_t slop = (current_mod == 0 ? 0 : align - current_mod); |
||||
size_t needed = bytes + slop; |
||||
char* result; |
||||
if (needed <= alloc_bytes_remaining_) { |
||||
result = alloc_ptr_ + slop; |
||||
alloc_ptr_ += needed; |
||||
alloc_bytes_remaining_ -= needed; |
||||
} else { |
||||
// AllocateFallback always returned aligned memory
|
||||
result = AllocateFallback(bytes); |
||||
} |
||||
assert((reinterpret_cast<uintptr_t>(result) & (align-1)) == 0); |
||||
return result; |
||||
} |
||||
|
||||
char* Arena::AllocateNewBlock(size_t block_bytes) { |
||||
char* result = new char[block_bytes]; |
||||
blocks_memory_ += block_bytes; |
||||
blocks_.push_back(result); |
||||
return result; |
||||
} |
||||
|
||||
} |
@ -0,0 +1,68 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_UTIL_ARENA_H_ |
||||
#define STORAGE_LEVELDB_UTIL_ARENA_H_ |
||||
|
||||
#include <cstddef> |
||||
#include <vector> |
||||
#include <assert.h> |
||||
#include <stdint.h> |
||||
|
||||
namespace leveldb { |
||||
|
||||
// Bump-pointer allocator: memory is carved out of large new[]-ed
// blocks and only freed all at once when the Arena is destroyed.
// Not thread-safe (no internal synchronization is visible here).
class Arena {
 public:
  Arena();
  ~Arena();

  // Return a pointer to a newly allocated memory block of "bytes" bytes.
  // REQUIRES: bytes > 0 (enforced by an assert in the inline body).
  char* Allocate(size_t bytes);

  // Allocate memory with the normal alignment guarantees provided by malloc
  // (here: aligned to sizeof(void*)).
  char* AllocateAligned(size_t bytes);

  // Returns an estimate of the total memory usage of data allocated
  // by the arena (including space allocated but not yet used for user
  // allocations).
  size_t MemoryUsage() const {
    // Block payload bytes plus the pointer vector's own storage.
    return blocks_memory_ + blocks_.capacity() * sizeof(char*);
  }

 private:
  // Slow path of Allocate(): obtains a new block when the current one
  // cannot satisfy the request.
  char* AllocateFallback(size_t bytes);
  // new[]s a block of "block_bytes" bytes and records it for cleanup.
  char* AllocateNewBlock(size_t block_bytes);

  // Allocation state: bump pointer into the current block.
  char* alloc_ptr_;
  size_t alloc_bytes_remaining_;

  // Array of new[] allocated memory blocks, freed in the destructor.
  std::vector<char*> blocks_;

  // Bytes of memory in blocks allocated so far
  size_t blocks_memory_;

  // No copying allowed
  Arena(const Arena&);
  void operator=(const Arena&);
};
||||
|
||||
inline char* Arena::Allocate(size_t bytes) { |
||||
// The semantics of what to return are a bit messy if we allow
|
||||
// 0-byte allocations, so we disallow them here (we don't need
|
||||
// them for our internal use).
|
||||
assert(bytes > 0); |
||||
if (bytes <= alloc_bytes_remaining_) { |
||||
char* result = alloc_ptr_; |
||||
alloc_ptr_ += bytes; |
||||
alloc_bytes_remaining_ -= bytes; |
||||
return result; |
||||
} |
||||
return AllocateFallback(bytes); |
||||
} |
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_UTIL_ARENA_H_
|
@ -0,0 +1,68 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "util/arena.h" |
||||
|
||||
#include "util/random.h" |
||||
#include "util/testharness.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
class ArenaTest { }; |
||||
|
||||
TEST(ArenaTest, Empty) { |
||||
Arena arena; |
||||
} |
||||
|
||||
TEST(ArenaTest, Simple) {
  // Stress Allocate/AllocateAligned with a mix of sizes, then verify
  // every allocation still holds its fill pattern and that
  // MemoryUsage() tracks bytes handed out without excessive overhead.
  std::vector<std::pair<size_t, char*> > allocated;
  Arena arena;
  const int N = 100000;
  size_t bytes = 0;
  Random rnd(301);
  for (int i = 0; i < N; i++) {
    size_t s;
    if (i % (N / 10) == 0) {
      s = i;
    } else {
      s = rnd.OneIn(4000) ? rnd.Uniform(6000) :
          (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20));
    }
    if (s == 0) {
      // Our arena disallows size 0 allocations.
      s = 1;
    }
    char* r;
    if (rnd.OneIn(10)) {
      r = arena.AllocateAligned(s);
    } else {
      r = arena.Allocate(s);
    }

    // Use size_t for byte indices: "s" is unsigned, and the original
    // "int" loop variables triggered signed/unsigned comparisons.
    for (size_t b = 0; b < s; b++) {
      // Fill the "i"th allocation with a known bit pattern
      r[b] = i % 256;
    }
    bytes += s;
    allocated.push_back(std::make_pair(s, r));
    ASSERT_GE(arena.MemoryUsage(), bytes);
    if (i > N/10) {
      ASSERT_LE(arena.MemoryUsage(), bytes * 1.10);
    }
  }
  for (size_t i = 0; i < allocated.size(); i++) {
    size_t num_bytes = allocated[i].first;
    const char* p = allocated[i].second;
    for (size_t b = 0; b < num_bytes; b++) {
      // Check the "i"th allocation for the known bit pattern
      ASSERT_EQ(int(p[b]) & 0xff, static_cast<int>(i % 256));
    }
  }
}
||||
|
||||
} |
||||
|
||||
int main(int argc, char** argv) { |
||||
return leveldb::test::RunAllTests(); |
||||
} |
@ -0,0 +1,253 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID) |
||||
#include <unordered_set> |
||||
#elif defined(LEVELDB_PLATFORM_CHROMIUM) |
||||
#include "base/hash_tables.h" |
||||
#else |
||||
#include <hash_set> // TODO(sanjay): Switch to unordered_set when possible. |
||||
#endif |
||||
|
||||
#include <assert.h> |
||||
|
||||
#include "include/cache.h" |
||||
#include "port/port.h" |
||||
#include "util/hash.h" |
||||
#include "util/mutexlock.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
// Out-of-line virtual destructor anchors Cache's vtable in this file.
Cache::~Cache() {
}
||||
|
||||
namespace { |
||||
|
||||
// LRU cache implementation
|
||||
|
||||
// An entry is a variable length heap-allocated structure.  Entries
// are kept in a circular doubly linked list ordered by access time.
struct LRUHandle {
  void* value;                                 // user data; released via deleter
  void (*deleter)(const Slice&, void* value);  // invoked when refs reaches 0
  LRUHandle* next;
  LRUHandle* prev;
  size_t charge;      // TODO(opt): Only allow uint32_t?
  size_t key_length;
  size_t refs;        // TODO(opt): Pack with "key_length"?
  char key_data[1];   // Beginning of key (struct is over-allocated to hold it)

  Slice key() const {
    // For cheaper lookups, we allow a temporary Handle object
    // to store a pointer to a key in "value".  Such stack "dummy"
    // handles mark themselves by pointing "next" at themselves; real
    // entries always live in the circular LRU list and never do.
    if (next == this) {
      return *(reinterpret_cast<Slice*>(value));
    } else {
      return Slice(key_data, key_length);
    }
  }
};
||||
|
||||
// Pick a platform specific hash_set instantiation
|
||||
#if defined(LEVELDB_PLATFORM_CHROMIUM) && defined(OS_WIN) |
||||
// Microsoft's hash_set deviates from the standard. See
|
||||
// http://msdn.microsoft.com/en-us/library/1t4xas78(v=vs.80).aspx
|
||||
// for details. Basically the 2 param () operator is a less than and
|
||||
// the 1 param () operator is a hash function.
|
||||
struct HandleHashCompare : public stdext::hash_compare<LRUHandle*> { |
||||
size_t operator() (LRUHandle* h) const { |
||||
Slice k = h->key(); |
||||
return Hash(k.data(), k.size(), 0); |
||||
} |
||||
bool operator() (LRUHandle* a, LRUHandle* b) const { |
||||
return a->key().compare(b->key()) < 0; |
||||
} |
||||
}; |
||||
typedef base::hash_set<LRUHandle*, HandleHashCompare> HandleTable; |
||||
#else |
||||
struct HandleHash { |
||||
inline size_t operator()(LRUHandle* h) const { |
||||
Slice k = h->key(); |
||||
return Hash(k.data(), k.size(), 0); |
||||
} |
||||
}; |
||||
|
||||
struct HandleEq { |
||||
inline bool operator()(LRUHandle* a, LRUHandle* b) const { |
||||
return a->key() == b->key(); |
||||
} |
||||
}; |
||||
# if defined(LEVELDB_PLATFORM_CHROMIUM) |
||||
typedef base::hash_set<LRUHandle*, HandleHash, HandleEq> HandleTable; |
||||
# elif defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID) |
||||
typedef std::unordered_set<LRUHandle*, HandleHash, HandleEq> HandleTable; |
||||
# else |
||||
typedef __gnu_cxx::hash_set<LRUHandle*, HandleHash, HandleEq> HandleTable; |
||||
# endif |
||||
#endif |
||||
|
||||
// Fixed-capacity LRU cache; a single mutex guards all mutable state.
class LRUCache : public Cache {
 public:
  explicit LRUCache(size_t capacity);
  virtual ~LRUCache();

  // Cache interface; see include/cache.h for each method's contract.
  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
                         void (*deleter)(const Slice& key, void* value));
  virtual Handle* Lookup(const Slice& key);
  virtual void Release(Handle* handle);
  virtual void* Value(Handle* handle);
  virtual void Erase(const Slice& key);
  virtual uint64_t NewId();

 private:
  void LRU_Remove(LRUHandle* e);   // unlink "e" from the circular list
  void LRU_Append(LRUHandle* e);   // link "e" in as the newest entry
  void Unref(LRUHandle* e);        // drop one reference; frees at zero

  // Constructor parameters
  const size_t capacity_;

  // mutex_ protects the following state.
  port::Mutex mutex_;
  size_t usage_;      // sum of "charge" over resident entries
  uint64_t last_id_;  // counter backing NewId()

  // Dummy head of LRU list.
  // lru.prev is newest entry, lru.next is oldest entry.
  LRUHandle lru_;

  HandleTable table_;
};
||||
|
||||
LRUCache::LRUCache(size_t capacity)
    : capacity_(capacity),
      usage_(0),
      last_id_(0) {
  // Make empty circular linked list: the dummy head points at itself
  // in both directions.
  lru_.next = &lru_;
  lru_.prev = &lru_;
}
||||
|
||||
LRUCache::~LRUCache() {
  // Drop the table first, then walk the circular list releasing the
  // cache's own reference on each entry.
  table_.clear();
  for (LRUHandle* e = lru_.next; e != &lru_; ) {
    LRUHandle* next = e->next;  // save before Unref may free "e"
    assert(e->refs == 1);  // Error if caller has an unreleased handle
    Unref(e);
    e = next;
  }
}
||||
|
||||
void LRUCache::Unref(LRUHandle* e) {
  assert(e->refs > 0);
  // Drop one reference.  On the last one, return the charge to the
  // cache and let the user-supplied deleter dispose of the value.
  if (--e->refs == 0) {
    usage_ -= e->charge;
    (*e->deleter)(e->key(), e->value);
    free(e);  // entries are malloc()-ed in Insert()
  }
}
||||
|
||||
void LRUCache::LRU_Remove(LRUHandle* e) {
  // Splice "e" out of the circular list; its own links are left stale.
  LRUHandle* const before = e->prev;
  LRUHandle* const after = e->next;
  before->next = after;
  after->prev = before;
}
||||
|
||||
void LRUCache::LRU_Append(LRUHandle* e) {
  // Insert just before the dummy head, making "e" the newest entry.
  LRUHandle* const newest = lru_.prev;
  e->prev = newest;
  e->next = &lru_;
  newest->next = e;
  lru_.prev = e;
}
||||
|
||||
Cache::Handle* LRUCache::Lookup(const Slice& key) {
  MutexLock l(&mutex_);

  // Probe the table with a stack-allocated dummy handle: pointing
  // "next" at itself makes LRUHandle::key() read the key out of
  // "value", so no temporary heap entry is needed.
  LRUHandle dummy;
  dummy.next = &dummy;
  dummy.value = const_cast<Slice*>(&key);
  HandleTable::iterator iter = table_.find(&dummy);
  if (iter == table_.end()) {
    return NULL;
  } else {
    LRUHandle* e = const_cast<LRUHandle*>(*iter);
    e->refs++;  // pin for the caller; dropped via Release()
    // Move the entry to the newest end of the LRU list.
    LRU_Remove(e);
    LRU_Append(e);
    return reinterpret_cast<Handle*>(e);
  }
}
||||
|
||||
void* LRUCache::Value(Handle* handle) {
  // Handles returned by this cache are really LRUHandle pointers.
  LRUHandle* h = reinterpret_cast<LRUHandle*>(handle);
  return h->value;
}
||||
|
||||
// Give back a handle obtained from Insert()/Lookup(); may free the
// entry if this was the last reference.
void LRUCache::Release(Handle* handle) {
  MutexLock l(&mutex_);
  Unref(reinterpret_cast<LRUHandle*>(handle));
}
||||
|
||||
Cache::Handle* LRUCache::Insert(const Slice& key, void* value, size_t charge,
                                void (*deleter)(const Slice& key, void* value)) {
  MutexLock l(&mutex_);

  // Build the variable-length entry: the struct is over-allocated so
  // the key bytes land in key_data[] (the -1 drops key_data's own byte).
  LRUHandle* e = reinterpret_cast<LRUHandle*>(
      malloc(sizeof(LRUHandle)-1 + key.size()));
  e->value = value;
  e->deleter = deleter;
  e->charge = charge;
  e->key_length = key.size();
  e->refs = 2;  // One from LRUCache, one for the returned handle
  memcpy(e->key_data, key.data(), key.size());
  LRU_Append(e);
  usage_ += charge;

  std::pair<HandleTable::iterator,bool> p = table_.insert(e);
  if (!p.second) {
    // Kill existing entry: an insert under the same key replaces the
    // old entry, which stays alive until its outstanding refs drop.
    LRUHandle* old = const_cast<LRUHandle*>(*(p.first));
    LRU_Remove(old);
    table_.erase(p.first);
    table_.insert(e);
    Unref(old);
  }

  // Evict from the cold end (lru_.next) until within capacity.
  // Entries pinned by callers are unlinked here but only freed when
  // their last reference is released.
  while (usage_ > capacity_ && lru_.next != &lru_) {
    LRUHandle* old = lru_.next;
    LRU_Remove(old);
    table_.erase(old);
    Unref(old);
  }

  return reinterpret_cast<Handle*>(e);
}
||||
|
||||
void LRUCache::Erase(const Slice& key) {
  MutexLock l(&mutex_);

  // Same stack-dummy probe trick as Lookup(); see LRUHandle::key().
  LRUHandle dummy;
  dummy.next = &dummy;
  dummy.value = const_cast<Slice*>(&key);
  HandleTable::iterator iter = table_.find(&dummy);
  if (iter != table_.end()) {
    LRUHandle* e = const_cast<LRUHandle*>(*iter);
    LRU_Remove(e);
    table_.erase(iter);
    Unref(e);  // drops the cache's reference; callers may still pin it
  }
}
||||
|
||||
uint64_t LRUCache::NewId() { |
||||
MutexLock l(&mutex_); |
||||
return ++(last_id_); |
||||
} |
||||
|
||||
} // end anonymous namespace
|
||||
|
||||
// Public factory; LRUCache itself stays private to this file.
Cache* NewLRUCache(size_t capacity) {
  return new LRUCache(capacity);
}
||||
|
||||
} |
@ -0,0 +1,169 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "include/cache.h" |
||||
|
||||
#include <vector> |
||||
#include "util/coding.h" |
||||
#include "util/testharness.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
// Conversions between numeric keys/values and the types expected by Cache.
|
||||
static std::string EncodeKey(int k) { |
||||
std::string result; |
||||
PutFixed32(&result, k); |
||||
return result; |
||||
} |
||||
static int DecodeKey(const Slice& k) { |
||||
assert(k.size() == 4); |
||||
return DecodeFixed32(k.data()); |
||||
} |
||||
static void* EncodeValue(uintptr_t v) { return reinterpret_cast<void*>(v); } |
||||
static int DecodeValue(void* v) { return reinterpret_cast<uintptr_t>(v); } |
||||
|
||||
class CacheTest { |
||||
public: |
||||
static CacheTest* current_; |
||||
|
||||
static void Deleter(const Slice& key, void* v) { |
||||
current_->deleted_keys_.push_back(DecodeKey(key)); |
||||
current_->deleted_values_.push_back(DecodeValue(v)); |
||||
} |
||||
|
||||
static const int kCacheSize = 100; |
||||
std::vector<int> deleted_keys_; |
||||
std::vector<int> deleted_values_; |
||||
Cache* cache_; |
||||
|
||||
CacheTest() : cache_(NewLRUCache(kCacheSize)) { |
||||
current_ = this; |
||||
} |
||||
|
||||
~CacheTest() { |
||||
delete cache_; |
||||
} |
||||
|
||||
int Lookup(int key) { |
||||
Cache::Handle* handle = cache_->Lookup(EncodeKey(key)); |
||||
const int r = (handle == NULL) ? -1 : DecodeValue(cache_->Value(handle)); |
||||
if (handle != NULL) { |
||||
cache_->Release(handle); |
||||
} |
||||
return r; |
||||
} |
||||
|
||||
void Insert(int key, int value, int charge = 1) { |
||||
cache_->Release(cache_->Insert(EncodeKey(key), EncodeValue(value), charge, |
||||
&CacheTest::Deleter)); |
||||
} |
||||
|
||||
void Erase(int key) { |
||||
cache_->Erase(EncodeKey(key)); |
||||
} |
||||
}; |
||||
CacheTest* CacheTest::current_; |
||||
|
||||
TEST(CacheTest, HitAndMiss) { |
||||
ASSERT_EQ(-1, Lookup(100)); |
||||
|
||||
Insert(100, 101); |
||||
ASSERT_EQ(101, Lookup(100)); |
||||
ASSERT_EQ(-1, Lookup(200)); |
||||
ASSERT_EQ(-1, Lookup(300)); |
||||
|
||||
Insert(200, 201); |
||||
ASSERT_EQ(101, Lookup(100)); |
||||
ASSERT_EQ(201, Lookup(200)); |
||||
ASSERT_EQ(-1, Lookup(300)); |
||||
|
||||
Insert(100, 102); |
||||
ASSERT_EQ(102, Lookup(100)); |
||||
ASSERT_EQ(201, Lookup(200)); |
||||
ASSERT_EQ(-1, Lookup(300)); |
||||
|
||||
ASSERT_EQ(1, deleted_keys_.size()); |
||||
ASSERT_EQ(100, deleted_keys_[0]); |
||||
ASSERT_EQ(101, deleted_values_[0]); |
||||
} |
||||
|
||||
TEST(CacheTest, Erase) { |
||||
Erase(200); |
||||
ASSERT_EQ(0, deleted_keys_.size()); |
||||
|
||||
Insert(100, 101); |
||||
Insert(200, 201); |
||||
Erase(100); |
||||
ASSERT_EQ(-1, Lookup(100)); |
||||
ASSERT_EQ(201, Lookup(200)); |
||||
ASSERT_EQ(1, deleted_keys_.size()); |
||||
ASSERT_EQ(100, deleted_keys_[0]); |
||||
ASSERT_EQ(101, deleted_values_[0]); |
||||
|
||||
Erase(100); |
||||
ASSERT_EQ(-1, Lookup(100)); |
||||
ASSERT_EQ(201, Lookup(200)); |
||||
ASSERT_EQ(1, deleted_keys_.size()); |
||||
} |
||||
|
||||
TEST(CacheTest, EntriesArePinned) { |
||||
Insert(100, 101); |
||||
Cache::Handle* h1 = cache_->Lookup(EncodeKey(100)); |
||||
ASSERT_EQ(101, DecodeValue(cache_->Value(h1))); |
||||
|
||||
Insert(100, 102); |
||||
Cache::Handle* h2 = cache_->Lookup(EncodeKey(100)); |
||||
ASSERT_EQ(102, DecodeValue(cache_->Value(h2))); |
||||
ASSERT_EQ(0, deleted_keys_.size()); |
||||
|
||||
cache_->Release(h1); |
||||
ASSERT_EQ(1, deleted_keys_.size()); |
||||
ASSERT_EQ(100, deleted_keys_[0]); |
||||
ASSERT_EQ(101, deleted_values_[0]); |
||||
|
||||
Erase(100); |
||||
ASSERT_EQ(-1, Lookup(100)); |
||||
ASSERT_EQ(1, deleted_keys_.size()); |
||||
|
||||
cache_->Release(h2); |
||||
ASSERT_EQ(2, deleted_keys_.size()); |
||||
ASSERT_EQ(100, deleted_keys_[1]); |
||||
ASSERT_EQ(102, deleted_values_[1]); |
||||
} |
||||
|
||||
TEST(CacheTest, EvictionPolicy) { |
||||
Insert(100, 101); |
||||
Insert(200, 201); |
||||
|
||||
// Frequently used entry must be kept around
|
||||
for (int i = 0; i < kCacheSize; i++) { |
||||
Insert(1000+i, 2000+i); |
||||
ASSERT_EQ(2000+i, Lookup(1000+i)); |
||||
ASSERT_EQ(101, Lookup(100)); |
||||
} |
||||
ASSERT_EQ(101, Lookup(100)); |
||||
ASSERT_EQ(2, deleted_keys_.size()); |
||||
ASSERT_EQ(200, deleted_keys_[0]); |
||||
ASSERT_EQ(201, deleted_values_[0]); |
||||
} |
||||
|
||||
TEST(CacheTest, HeavyEntry) { |
||||
Insert(100, 101); |
||||
Insert(200, 201, kCacheSize); |
||||
ASSERT_EQ(1, deleted_keys_.size()); |
||||
ASSERT_EQ(100, deleted_keys_[0]); |
||||
ASSERT_EQ(101, deleted_values_[0]); |
||||
} |
||||
|
||||
TEST(CacheTest, NewId) { |
||||
uint64_t a = cache_->NewId(); |
||||
uint64_t b = cache_->NewId(); |
||||
ASSERT_NE(a, b); |
||||
} |
||||
|
||||
} |
||||
|
||||
int main(int argc, char** argv) { |
||||
return leveldb::test::RunAllTests(); |
||||
} |
@ -0,0 +1,194 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "util/coding.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
// Write "value" into buf[0..3] in little-endian byte order.  On
// little-endian hosts this is a single 4-byte store; otherwise the
// bytes are emitted explicitly, least-significant first.
void EncodeFixed32(char* buf, uint32_t value) {
#if __BYTE_ORDER == __LITTLE_ENDIAN
  memcpy(buf, &value, sizeof(value));
#else
  for (int i = 0; i < 4; i++) {
    buf[i] = (value >> (8 * i)) & 0xff;
  }
#endif
}
||||
|
||||
// Write "value" into buf[0..7] in little-endian byte order; see
// EncodeFixed32 for the host-endianness fast path.
void EncodeFixed64(char* buf, uint64_t value) {
#if __BYTE_ORDER == __LITTLE_ENDIAN
  memcpy(buf, &value, sizeof(value));
#else
  for (int i = 0; i < 8; i++) {
    buf[i] = (value >> (8 * i)) & 0xff;
  }
#endif
}
||||
|
||||
// Append the 4-byte little-endian encoding of "value" to *dst.
void PutFixed32(std::string* dst, uint32_t value) {
  char encoded[sizeof(value)];
  EncodeFixed32(encoded, value);
  dst->append(encoded, sizeof(encoded));
}
||||
|
||||
// Append the 8-byte little-endian encoding of "value" to *dst.
void PutFixed64(std::string* dst, uint64_t value) {
  char encoded[sizeof(value)];
  EncodeFixed64(encoded, value);
  dst->append(encoded, sizeof(encoded));
}
||||
|
||||
// Encode "v" as a varint32 at dst and return a pointer just past the
// last byte written (at most 5 bytes).  Each output byte carries 7
// payload bits, least-significant group first; the high bit marks
// "more bytes follow".
char* EncodeVarint32(char* dst, uint32_t v) {
  // Operate on characters as unsigneds
  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
  while (v >= 128) {
    *(ptr++) = (v & 127) | 128;
    v >>= 7;
  }
  *(ptr++) = static_cast<unsigned char>(v);
  return reinterpret_cast<char*>(ptr);
}
||||
|
||||
void PutVarint32(std::string* dst, uint32_t v) { |
||||
char buf[5]; |
||||
char* ptr = EncodeVarint32(buf, v); |
||||
dst->append(buf, ptr - buf); |
||||
} |
||||
|
||||
// Encode "v" as a varint64 at dst and return a pointer just past the
// last byte written (at most 10 bytes).
char* EncodeVarint64(char* dst, uint64_t v) {
  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
  // Emit 7-bit groups, lowest group first; 0x80 flags a continuation.
  do {
    unsigned char byte = v & 127;
    v >>= 7;
    if (v != 0) byte |= 128;
    *(ptr++) = byte;
  } while (v != 0);
  return reinterpret_cast<char*>(ptr);
}
||||
|
||||
void PutVarint64(std::string* dst, uint64_t v) { |
||||
char buf[10]; |
||||
char* ptr = EncodeVarint64(buf, v); |
||||
dst->append(buf, ptr - buf); |
||||
} |
||||
|
||||
// Append "value" to *dst as a varint32 length followed by the payload.
void PutLengthPrefixedSlice(std::string* dst, const Slice& value) {
  const size_t n = value.size();
  PutVarint32(dst, n);
  dst->append(value.data(), n);
}
||||
|
||||
// Number of bytes the varint encoding of "v" occupies: one output
// byte per 7 payload bits, and zero still takes one byte.
int VarintLength(uint64_t v) {
  int len = 1;
  for (v >>= 7; v != 0; v >>= 7) {
    len++;
  }
  return len;
}
||||
|
||||
// Decode a multi-byte varint32 starting at p.  Accumulates up to five
// 7-bit groups (shift 0..28); a byte with the high bit clear ends the
// varint.  Running past "limit" or the maximum width returns NULL.
const char* GetVarint32PtrFallback(const char* p,
                                   const char* limit,
                                   uint32_t* value) {
  uint32_t result = 0;
  for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) {
    const uint32_t byte = static_cast<unsigned char>(*p);
    p++;
    if ((byte & 128) == 0) {
      // Final byte: merge it in and report success.
      *value = result | (byte << shift);
      return p;
    }
    result |= (byte & 127) << shift;
  }
  return NULL;
}
||||
|
||||
// Parse a varint32 from the front of *input; on success advance the
// slice past the parsed bytes and return true.
bool GetVarint32(Slice* input, uint32_t* value) {
  const char* start = input->data();
  const char* limit = start + input->size();
  const char* rest = GetVarint32Ptr(start, limit, value);
  if (rest == NULL) {
    return false;
  }
  *input = Slice(rest, limit - rest);
  return true;
}
||||
|
||||
// Decode a varint64 starting at p; same scheme as the 32-bit decoder
// but up to ten bytes (shift 0..63).  Returns NULL on malformed or
// truncated input.
const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) {
  uint64_t result = 0;
  for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) {
    const uint64_t byte = static_cast<unsigned char>(*p);
    p++;
    if ((byte & 128) == 0) {
      *value = result | (byte << shift);
      return p;
    }
    result |= (byte & 127) << shift;
  }
  return NULL;
}
||||
|
||||
// Parse a varint64 from the front of *input; on success advance the
// slice past the parsed bytes and return true.
bool GetVarint64(Slice* input, uint64_t* value) {
  const char* start = input->data();
  const char* limit = start + input->size();
  const char* rest = GetVarint64Ptr(start, limit, value);
  if (rest == NULL) {
    return false;
  }
  *input = Slice(rest, limit - rest);
  return true;
}
||||
|
||||
const char* GetLengthPrefixedSlice(const char* p, const char* limit, |
||||
Slice* result) { |
||||
uint32_t len; |
||||
p = GetVarint32Ptr(p, limit, &len); |
||||
if (p == NULL) return NULL; |
||||
if (p + len > limit) return NULL; |
||||
*result = Slice(p, len); |
||||
return p + len; |
||||
} |
||||
|
||||
// Parse a length-prefixed slice from the front of *input, advancing
// it past the prefix and payload on success.  Note: when the varint
// parses but fewer than "len" payload bytes remain, the varint has
// already been consumed from *input (matching the original behavior).
bool GetLengthPrefixedSlice(Slice* input, Slice* result) {
  uint32_t len;
  if (!GetVarint32(input, &len)) {
    return false;
  }
  if (input->size() < len) {
    return false;
  }
  *result = Slice(input->data(), len);
  input->remove_prefix(len);
  return true;
}
||||
|
||||
} |
@ -0,0 +1,104 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// Endian-neutral encoding:
|
||||
// * Fixed-length numbers are encoded with least-significant byte first
|
||||
// * In addition we support variable length "varint" encoding
|
||||
// * Strings are encoded prefixed by their length in varint format
|
||||
|
||||
#ifndef STORAGE_LEVELDB_UTIL_CODING_H_ |
||||
#define STORAGE_LEVELDB_UTIL_CODING_H_ |
||||
|
||||
#include <stdint.h> |
||||
#include <string.h> |
||||
#include <string> |
||||
#include "include/slice.h" |
||||
#include "port/port.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
// Standard Put... routines append to a string
|
||||
extern void PutFixed32(std::string* dst, uint32_t value); |
||||
extern void PutFixed64(std::string* dst, uint64_t value); |
||||
extern void PutVarint32(std::string* dst, uint32_t value); |
||||
extern void PutVarint64(std::string* dst, uint64_t value); |
||||
extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value); |
||||
|
||||
// Standard Get... routines parse a value from the beginning of a Slice
|
||||
// and advance the slice past the parsed value.
|
||||
extern bool GetVarint32(Slice* input, uint32_t* value); |
||||
extern bool GetVarint64(Slice* input, uint64_t* value); |
||||
extern bool GetLengthPrefixedSlice(Slice* input, Slice* result); |
||||
|
||||
// Pointer-based variants of GetVarint... These either store a value
|
||||
// in *v and return a pointer just past the parsed value, or return
|
||||
// NULL on error. These routines only look at bytes in the range
|
||||
// [p..limit-1]
|
||||
extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v); |
||||
extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v); |
||||
|
||||
// Returns the length of the varint32 or varint64 encoding of "v"
|
||||
extern int VarintLength(uint64_t v); |
||||
|
||||
// Lower-level versions of Put... that write directly into a character buffer
|
||||
// REQUIRES: dst has enough space for the value being written
|
||||
extern void EncodeFixed32(char* dst, uint32_t value); |
||||
extern void EncodeFixed64(char* dst, uint64_t value); |
||||
|
||||
// Lower-level versions of Put... that write directly into a character buffer
|
||||
// and return a pointer just past the last byte written.
|
||||
// REQUIRES: dst has enough space for the value being written
|
||||
extern char* EncodeVarint32(char* dst, uint32_t value); |
||||
extern char* EncodeVarint64(char* dst, uint64_t value); |
||||
|
||||
// Lower-level versions of Get... that read directly from a character buffer
|
||||
// without any bounds checking.
|
||||
|
||||
inline uint32_t DecodeFixed32(const char* ptr) { |
||||
if (port::kLittleEndian) { |
||||
// Load the raw bytes
|
||||
uint32_t result; |
||||
memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load
|
||||
return result; |
||||
} else { |
||||
return ((static_cast<uint32_t>(ptr[0])) |
||||
| (static_cast<uint32_t>(ptr[1]) << 8) |
||||
| (static_cast<uint32_t>(ptr[2]) << 16) |
||||
| (static_cast<uint32_t>(ptr[3]) << 24)); |
||||
} |
||||
} |
||||
|
||||
inline uint64_t DecodeFixed64(const char* ptr) { |
||||
if (port::kLittleEndian) { |
||||
// Load the raw bytes
|
||||
uint64_t result; |
||||
memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load
|
||||
return result; |
||||
} else { |
||||
uint64_t lo = DecodeFixed32(ptr); |
||||
uint64_t hi = DecodeFixed32(ptr + 4); |
||||
return (hi << 32) | lo; |
||||
} |
||||
} |
||||
|
||||
// Internal routine for use by fallback path of GetVarint32Ptr
|
||||
// Internal routine for use by fallback path of GetVarint32Ptr
extern const char* GetVarint32PtrFallback(const char* p,
                                          const char* limit,
                                          uint32_t* value);
// Fast path for decoding a varint32: a single byte with the high bit
// clear (values 0..127) is decoded inline; anything longer defers to
// the out-of-line multi-byte fallback.
inline const char* GetVarint32Ptr(const char* p,
                                  const char* limit,
                                  uint32_t* value) {
  if (p < limit) {
    uint32_t result = *(reinterpret_cast<const unsigned char*>(p));
    if ((result & 128) == 0) {
      *value = result;
      return p + 1;
    }
  }
  return GetVarint32PtrFallback(p, limit, value);
}
||||
|
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_UTIL_CODING_H_
|
@ -0,0 +1,173 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "util/coding.h" |
||||
|
||||
#include "util/testharness.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
class Coding { };  // empty tag type naming the test suite for the TEST() macro
||||
|
||||
// Round-trip the fixed-width 32-bit encoding for 100000 consecutive
// values: encode them all, then decode and compare.
TEST(Coding, Fixed32) {
  std::string s;
  for (uint32_t v = 0; v < 100000; v++) {
    PutFixed32(&s, v);
  }

  const char* p = s.data();
  for (uint32_t v = 0; v < 100000; v++) {
    uint32_t actual = DecodeFixed32(p);
    ASSERT_EQ(v, actual);
    p += sizeof(uint32_t);  // each value occupies exactly 4 bytes
  }
}
||||
|
||||
// Round-trip the fixed-width 64-bit encoding for values clustered
// around every power of two: v-1, v, v+1 for v = 2^0 .. 2^63.
TEST(Coding, Fixed64) {
  std::string s;
  for (int power = 0; power <= 63; power++) {
    uint64_t v = static_cast<uint64_t>(1) << power;
    PutFixed64(&s, v - 1);
    PutFixed64(&s, v + 0);
    PutFixed64(&s, v + 1);
  }

  const char* p = s.data();
  for (int power = 0; power <= 63; power++) {
    uint64_t v = static_cast<uint64_t>(1) << power;
    uint64_t actual;
    actual = DecodeFixed64(p);
    ASSERT_EQ(v-1, actual);
    p += sizeof(uint64_t);

    actual = DecodeFixed64(p);
    ASSERT_EQ(v+0, actual);
    p += sizeof(uint64_t);

    actual = DecodeFixed64(p);
    ASSERT_EQ(v+1, actual);
    p += sizeof(uint64_t);
  }
}
||||
|
||||
// Round-trip varint32 encodings.  The values (i/32) << (i%32) cover
// small multipliers at every possible shift position, exercising all
// encoded lengths from 1 to 5 bytes.
TEST(Coding, Varint32) {
  std::string s;
  for (uint32_t i = 0; i < (32 * 32); i++) {
    uint32_t v = (i / 32) << (i % 32);
    PutVarint32(&s, v);
  }

  const char* p = s.data();
  const char* limit = p + s.size();
  for (uint32_t i = 0; i < (32 * 32); i++) {
    uint32_t expected = (i / 32) << (i % 32);
    uint32_t actual;
    const char* start = p;
    p = GetVarint32Ptr(p, limit, &actual);
    ASSERT_TRUE(p != NULL);  // NULL would indicate a decode failure
    ASSERT_EQ(expected, actual);
    ASSERT_EQ(VarintLength(actual), p - start);  // consumed == predicted length
  }
  ASSERT_EQ(p, s.data() + s.size());  // every encoded byte was consumed
}
||||
|
||||
// Round-trip varint64 encodings for a list of boundary values: 0, a
// small value, the two largest values, and v-1/v/v+1 around every
// power of two.
TEST(Coding, Varint64) {
  // Construct the list of values to check
  std::vector<uint64_t> values;
  // Some special values
  values.push_back(0);
  values.push_back(100);
  values.push_back(~static_cast<uint64_t>(0));
  values.push_back(~static_cast<uint64_t>(0) - 1);
  for (uint32_t k = 0; k < 64; k++) {
    // Test values near powers of two
    const uint64_t power = 1ull << k;
    values.push_back(power);
    values.push_back(power-1);
    values.push_back(power+1);
  }  // was "};" — stray semicolon removed

  std::string s;
  // size_t index avoids the signed/unsigned comparison with .size()
  for (size_t i = 0; i < values.size(); i++) {
    PutVarint64(&s, values[i]);
  }

  const char* p = s.data();
  const char* limit = p + s.size();
  for (size_t i = 0; i < values.size(); i++) {
    ASSERT_TRUE(p < limit);
    uint64_t actual;
    const char* start = p;
    p = GetVarint64Ptr(p, limit, &actual);
    ASSERT_TRUE(p != NULL);  // NULL would indicate a decode failure
    ASSERT_EQ(values[i], actual);
    ASSERT_EQ(VarintLength(actual), p - start);  // consumed == predicted length
  }
  ASSERT_EQ(p, limit);  // every encoded byte was consumed
}
||||
|
||||
// A 6-byte sequence whose continuation bits run past the 5-byte
// varint32 maximum must be rejected (decoder returns NULL).
TEST(Coding, Varint32Overflow) {
  uint32_t result;
  std::string input("\x81\x82\x83\x84\x85\x11");
  ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(), &result)
              == NULL);
}
||||
|
||||
// Every strict prefix of a valid varint32 encoding must fail to
// decode, while the complete encoding must succeed and round-trip.
TEST(Coding, Varint32Truncation) {
  uint32_t large_value = (1u << 31) + 100;  // needs the full 5 bytes
  std::string s;
  PutVarint32(&s, large_value);
  uint32_t result;
  for (int len = 0; len < s.size() - 1; len++) {
    ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == NULL);
  }
  ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != NULL);
  ASSERT_EQ(large_value, result);
}
||||
|
||||
// An 11-byte sequence whose continuation bits run past the 10-byte
// varint64 maximum must be rejected (decoder returns NULL).
TEST(Coding, Varint64Overflow) {
  uint64_t result;
  std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11");
  ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(), &result)
              == NULL);
}
||||
|
||||
// Every strict prefix of a valid varint64 encoding must fail to
// decode, while the complete encoding must succeed and round-trip.
TEST(Coding, Varint64Truncation) {
  uint64_t large_value = (1ull << 63) + 100ull;  // needs the full 10 bytes
  std::string s;
  PutVarint64(&s, large_value);
  uint64_t result;
  for (int len = 0; len < s.size() - 1; len++) {
    ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == NULL);
  }
  ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + s.size(), &result) != NULL);
  ASSERT_EQ(large_value, result);
}
||||
|
||||
// Length-prefixed slices concatenated into one buffer must parse back
// in order, leaving the input slice empty when fully consumed.
TEST(Coding, Strings) {
  std::string s;
  PutLengthPrefixedSlice(&s, Slice(""));
  PutLengthPrefixedSlice(&s, Slice("foo"));
  PutLengthPrefixedSlice(&s, Slice("bar"));
  PutLengthPrefixedSlice(&s, Slice(std::string(200, 'x')));  // multi-byte length prefix

  Slice input(s);
  Slice v;
  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
  ASSERT_EQ("", v.ToString());
  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
  ASSERT_EQ("foo", v.ToString());
  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
  ASSERT_EQ("bar", v.ToString());
  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
  ASSERT_EQ(std::string(200, 'x'), v.ToString());
  ASSERT_EQ("", input.ToString());  // nothing left over
}
||||
|
||||
} |
||||
|
||||
int main(int argc, char** argv) {
  // argc/argv are unused; the harness runs every registered TEST().
  return leveldb::test::RunAllTests();
}
@ -0,0 +1,72 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include <stdint.h> |
||||
#include "include/comparator.h" |
||||
#include "include/slice.h" |
||||
#include "util/logging.h" |
||||
|
||||
namespace leveldb { |
||||
|
||||
Comparator::~Comparator() { }  // out-of-line definition of the abstract base's destructor
||||
|
||||
namespace { |
||||
class BytewiseComparatorImpl : public Comparator { |
||||
public: |
||||
BytewiseComparatorImpl() { } |
||||
|
||||
virtual const char* Name() const { |
||||
return "leveldb.BytewiseComparator"; |
||||
} |
||||
|
||||
virtual int Compare(const Slice& a, const Slice& b) const { |
||||
return a.compare(b); |
||||
} |
||||
|
||||
virtual void FindShortestSeparator( |
||||
std::string* start, |
||||
const Slice& limit) const { |
||||
// Find length of common prefix
|
||||
size_t min_length = std::min(start->size(), limit.size()); |
||||
size_t diff_index = 0; |
||||
while ((diff_index < min_length) && |
||||
((*start)[diff_index] == limit[diff_index])) { |
||||
diff_index++; |
||||
} |
||||
|
||||
if (diff_index >= min_length) { |
||||
// Do not shorten if one string is a prefix of the other
|
||||
} else { |
||||
uint8_t diff_byte = static_cast<uint8_t>((*start)[diff_index]); |
||||
if (diff_byte < static_cast<uint8_t>(0xff) && |
||||
diff_byte + 1 < static_cast<uint8_t>(limit[diff_index])) { |
||||
(*start)[diff_index]++; |
||||
start->resize(diff_index + 1); |
||||
assert(Compare(*start, limit) < 0); |
||||
} |
||||
} |
||||
} |
||||
|
||||
virtual void FindShortSuccessor(std::string* key) const { |
||||
// Find first character that can be incremented
|
||||
size_t n = key->size(); |
||||
for (int i = 0; i < n; i++) { |
||||
const uint8_t byte = (*key)[i]; |
||||
if (byte != static_cast<uint8_t>(0xff)) { |
||||
(*key)[i] = byte + 1; |
||||
key->resize(i+1); |
||||
return; |
||||
} |
||||
} |
||||
// *key is a run of 0xffs. Leave it alone.
|
||||
} |
||||
}; |
||||
} |
||||
// Single shared instance: the comparator holds no state, so one
// immutable file-scope object can be handed to all callers.
static const BytewiseComparatorImpl bytewise;

// Return the singleton bytewise comparator.  Callers must not delete
// the returned pointer.
const Comparator* BytewiseComparator() {
  return &bytewise;
}
||||
|
||||
} |
@ -0,0 +1,332 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// A portable implementation of crc32c, optimized to handle
|
||||
// four bytes at a time.
|
||||
|
||||
#include "util/crc32c.h" |
||||
|
||||
#include <stdint.h> |
||||
#include "util/coding.h" |
||||
|
||||
namespace leveldb { |
||||
namespace crc32c { |
||||
|
||||
// Precomputed lookup tables for the slicing-by-4 CRC32C computation in
// Extend() below.  table0_ is the classic byte-at-a-time CRC table
// (used alone by STEP1); table1_..table3_ hold the contribution of a
// byte appearing 1..3 positions earlier within a 4-byte group, letting
// STEP4 fold four input bytes with four independent lookups.
// NOTE(review): presumably generated for the Castagnoli polynomial —
// regenerate rather than hand-edit if these ever need to change.
static const uint32_t table0_[256] = {
  0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4,
  0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
  0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b,
  0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
  0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b,
  0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
  0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54,
  0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
  0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a,
  0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35,
  0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5,
  0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
  0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45,
  0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a,
  0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a,
  0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
  0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48,
  0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
  0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687,
  0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
  0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927,
  0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
  0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8,
  0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7,
  0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096,
  0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
  0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859,
  0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
  0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9,
  0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
  0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36,
  0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
  0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c,
  0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
  0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043,
  0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
  0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3,
  0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
  0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c,
  0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
  0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652,
  0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
  0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d,
  0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982,
  0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d,
  0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
  0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2,
  0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed,
  0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530,
  0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
  0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff,
  0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
  0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f,
  0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540,
  0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90,
  0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
  0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee,
  0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
  0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321,
  0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
  0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81,
  0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
  0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e,
  0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351
};
static const uint32_t table1_[256] = {
  0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899,
  0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945,
  0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21,
  0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd,
  0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918,
  0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4,
  0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0,
  0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c,
  0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b,
  0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47,
  0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823,
  0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff,
  0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a,
  0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6,
  0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2,
  0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e,
  0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d,
  0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41,
  0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25,
  0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9,
  0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c,
  0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0,
  0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4,
  0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78,
  0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f,
  0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43,
  0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27,
  0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb,
  0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e,
  0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2,
  0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6,
  0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a,
  0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260,
  0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc,
  0x66d73941, 0x7575a136, 0x419209af, 0x523091d8,
  0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004,
  0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1,
  0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d,
  0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059,
  0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185,
  0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162,
  0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be,
  0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da,
  0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306,
  0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3,
  0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f,
  0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b,
  0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287,
  0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464,
  0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8,
  0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc,
  0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600,
  0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5,
  0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439,
  0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d,
  0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781,
  0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766,
  0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba,
  0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de,
  0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502,
  0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7,
  0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b,
  0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f,
  0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483
};
static const uint32_t table2_[256] = {
  0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073,
  0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469,
  0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6,
  0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac,
  0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9,
  0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3,
  0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c,
  0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726,
  0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67,
  0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d,
  0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2,
  0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8,
  0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed,
  0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7,
  0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828,
  0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32,
  0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa,
  0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0,
  0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f,
  0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75,
  0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20,
  0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a,
  0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5,
  0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff,
  0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe,
  0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4,
  0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b,
  0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161,
  0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634,
  0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e,
  0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1,
  0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb,
  0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730,
  0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a,
  0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5,
  0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def,
  0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba,
  0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0,
  0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f,
  0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065,
  0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24,
  0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e,
  0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1,
  0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb,
  0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae,
  0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4,
  0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b,
  0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71,
  0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9,
  0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3,
  0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c,
  0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36,
  0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63,
  0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79,
  0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6,
  0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc,
  0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd,
  0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7,
  0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238,
  0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622,
  0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177,
  0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d,
  0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2,
  0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8
};
static const uint32_t table3_[256] = {
  0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939,
  0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca,
  0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf,
  0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c,
  0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804,
  0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7,
  0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2,
  0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11,
  0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2,
  0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41,
  0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54,
  0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7,
  0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f,
  0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c,
  0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69,
  0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a,
  0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de,
  0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d,
  0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538,
  0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb,
  0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3,
  0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610,
  0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405,
  0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6,
  0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255,
  0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6,
  0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3,
  0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040,
  0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368,
  0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b,
  0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e,
  0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d,
  0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006,
  0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5,
  0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0,
  0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213,
  0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b,
  0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8,
  0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd,
  0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e,
  0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d,
  0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e,
  0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b,
  0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698,
  0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0,
  0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443,
  0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656,
  0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5,
  0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1,
  0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12,
  0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07,
  0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4,
  0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc,
  0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f,
  0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a,
  0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9,
  0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a,
  0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99,
  0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c,
  0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f,
  0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57,
  0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4,
  0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1,
  0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842
};
||||
|
||||
// Used to fetch a naturally-aligned 32-bit word in little endian byte-order
|
||||
static inline uint32_t LE_LOAD32(const uint8_t *p) {
  // DecodeFixed32 handles byte order (and uses memcpy on little-endian
  // hosts, so unaligned pointers are also safe).
  return DecodeFixed32(reinterpret_cast<const char*>(p));
}
||||
|
||||
// Extend `crc` (the crc32c of some prefix string) with `size` more
// bytes from `buf` and return the combined crc32c, per the contract
// declared in util/crc32c.h.  Implementation: slicing-by-4 — align,
// fold 4 bytes per step (unrolled x4), finish byte-at-a-time.
uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
  const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
  const uint8_t *e = p + size;
  uint32_t l = crc ^ 0xffffffffu;  // crc32c works on the inverted state

// STEP1: consume one input byte with the classic single-table update.
#define STEP1 do {                              \
    int c = (l & 0xff) ^ *p++;                  \
    l = table0_[c] ^ (l >> 8);                  \
} while (0)
// STEP4: consume four input bytes at once — XOR the next word into the
// state, then combine one table lookup per byte of the mixed word.
#define STEP4 do {                              \
    uint32_t c = l ^ LE_LOAD32(p);              \
    p += 4;                                     \
    l = table3_[c & 0xff] ^                     \
        table2_[(c >> 8) & 0xff] ^              \
        table1_[(c >> 16) & 0xff] ^             \
        table0_[c >> 24];                       \
} while (0)

  // Point x at first 4-byte aligned byte in string.  This might be
  // just past the end of the string.
  const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
  const uint8_t* x = reinterpret_cast<const uint8_t*>(((pval + 3) >> 2) << 2);
  if (x <= e) {
    // Process bytes until finished or p is 4-byte aligned
    while (p != x) {
      STEP1;
    }
  }
  // Process bytes 16 at a time
  while ((e-p) >= 16) {
    STEP4; STEP4; STEP4; STEP4;
  }
  // Process bytes 4 at a time
  while ((e-p) >= 4) {
    STEP4;
  }
  // Process the last few bytes
  while (p != e) {
    STEP1;
  }
#undef STEP4
#undef STEP1
  return l ^ 0xffffffffu;  // undo the inversion for the final value
}
||||
|
||||
} |
||||
} |
@ -0,0 +1,45 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_UTIL_CRC32C_H_ |
||||
#define STORAGE_LEVELDB_UTIL_CRC32C_H_ |
||||
|
||||
#include <stddef.h> |
||||
#include <stdint.h> |
||||
|
||||
namespace leveldb { |
||||
namespace crc32c { |
||||
|
||||
// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the
|
||||
// crc32c of some string A. Extend() is often used to maintain the
|
||||
// crc32c of a stream of data.
|
||||
extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n); |
||||
|
||||
// Return the crc32c of data[0,n-1]
|
||||
inline uint32_t Value(const char* data, size_t n) {
  // Equivalent to extending the crc of an empty prefix.
  return Extend(0, data, n);
}
||||
|
||||
static const uint32_t kMaskDelta = 0xa282ead8ul;

// Return a masked representation of crc.
//
// Motivation: it is problematic to compute the CRC of a string that
// contains embedded CRCs.  Therefore we recommend that CRCs stored
// somewhere (e.g., in files) should be masked before being stored.
inline uint32_t Mask(uint32_t crc) {
  // Rotate right by 15 bits, then add a constant.
  const uint32_t rotated = (crc >> 15) | (crc << 17);
  return rotated + kMaskDelta;
}
||||
|
||||
// Return the crc whose masked representation is masked_crc.
|
||||
inline uint32_t Unmask(uint32_t masked_crc) { |
||||
uint32_t rot = masked_crc - kMaskDelta; |
||||
return ((rot >> 17) | (rot << 15)); |
||||
} |
||||
|
||||
} |
||||
} |
||||
|
||||
#endif // STORAGE_LEVELDB_UTIL_CRC32C_H_
|
@ -0,0 +1,86 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "util/crc32c.h" |
||||
#include "util/testharness.h" |
||||
|
||||
namespace leveldb { |
||||
namespace crc32c { |
||||
|
||||
class CRC { };  // empty tag type naming the test suite for the TEST() macro
||||
|
||||
// Check Value() against the published crc32c test vectors.
TEST(CRC, StandardResults) {
  // From rfc3720 section B.4.
  char buf[32];

  // 32 zero bytes
  memset(buf, 0, sizeof(buf));
  ASSERT_EQ(0x8a9136aa, Value(buf, sizeof(buf)));

  // 32 bytes of 0xff
  memset(buf, 0xff, sizeof(buf));
  ASSERT_EQ(0x62a8ab43, Value(buf, sizeof(buf)));

  // ascending byte values 0..31
  for (int i = 0; i < 32; i++) {
    buf[i] = i;
  }
  ASSERT_EQ(0x46dd794e, Value(buf, sizeof(buf)));

  // descending byte values 31..0
  for (int i = 0; i < 32; i++) {
    buf[i] = 31 - i;
  }
  ASSERT_EQ(0x113fdb5c, Value(buf, sizeof(buf)));

  // 48-byte sample buffer; expected value also from rfc3720 B.4
  unsigned char data[48] = {
    0x01, 0xc0, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00,
    0x14, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x04, 0x00,
    0x00, 0x00, 0x00, 0x14,
    0x00, 0x00, 0x00, 0x18,
    0x28, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00,
    0x02, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00,
  };
  ASSERT_EQ(0xd9963a56, Value(reinterpret_cast<char*>(data), sizeof(data)));
}
||||
|
||||
// Different inputs should yield different CRCs.
TEST(CRC, Values) {
  ASSERT_NE(Value("a", 1), Value("foo", 3));
}
||||
|
||||
// Extending the CRC of a prefix with the remaining bytes must equal
// the CRC of the whole string.
TEST(CRC, Extend) {
  ASSERT_EQ(Value("hello world", 11),
            Extend(Value("hello ", 6), "world", 5));
}
||||
|
||||
// Mask() must change the value, must not be idempotent, and must be
// exactly inverted by Unmask().
TEST(CRC, Mask) {
  uint32_t crc = Value("foo", 3);
  ASSERT_NE(crc, Mask(crc));
  ASSERT_NE(crc, Mask(Mask(crc)));
  ASSERT_EQ(crc, Unmask(Mask(crc)));
  ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc)))));
}
||||
|
||||
// Throughput micro-benchmark, not a correctness check: CRCs a 100 MB
// buffer kIters times and prints MB/s to stderr.
TEST(CRC, Benchmark) {
  std::string data(1048576 * 100, 'x');
  double start = Env::Default()->NowMicros() * 1e-6;  // seconds
  static const int kIters = 10;
  uint32_t crc = 0;
  for (int i = 0; i < kIters; i++) {
    // "|=" keeps crc live — presumably so the calls cannot be
    // optimized away; verify if the benchmark numbers look off.
    crc |= Value(data.data(), data.size());
  }
  double finish = Env::Default()->NowMicros() * 1e-6;
  double mb = (static_cast<long long int>(data.size()) * kIters) / 1048576.0;
  fprintf(stderr, "CRC %0.0f MB: %.3f secs; %.1f MB/s, crc=0x%08x\n",
          mb, (finish - start), mb / (finish - start), crc);
}
||||
|
||||
} |
||||
} |
||||
|
||||
int main(int argc, char** argv) {
  // argc/argv are unused; the harness runs every registered TEST().
  return leveldb::test::RunAllTests();
}
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue