use mmap on 64-bit machines to speed-up reads; small build fixes

main
Sanjay Ghemawat 13 years ago
parent 583f1499c0
commit 9013f13b15
  1. 36
      Makefile
  2. 7
      table/block.cc
  3. 6
      table/block.h
  4. 19
      table/format.cc
  5. 5
      table/format.h
  6. 11
      table/table.cc
  7. 2
      table/table_test.cc
  8. 48
      util/env_posix.cc

@ -124,65 +124,65 @@ db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL)
$(CXX) $(LDFLAGS) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@
db_bench_sqlite3: doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL)
$(CXX) $(LDFLAGS) -lsqlite3 doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) -o $@
$(CXX) -lsqlite3 doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) -o $@ $(LDFLAGS
db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL)
$(CXX) $(LDFLAGS) -lkyotocabinet doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) -o $@
arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
$(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
$(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
$(CXX) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
$(CXX) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
$(CXX) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
$(CXX) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
$(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
$(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
$(CXX) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
$(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
$(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
$(CXX) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
$(CXX) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
$(CXX) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
$(CXX) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
$(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
$(MEMENVLIBRARY) : helpers/memenv/memenv.o
rm -f $@
$(AR) -rs $@ helpers/memenv/memenv.o
memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS)
$(CXX) $(LDFLAGS) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@
$(CXX) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@ $(LDFLAGS)
ifeq ($(PLATFORM), IOS)
# For iOS, create universal object files to be used on both the simulator and

@ -19,9 +19,10 @@ inline uint32_t Block::NumRestarts() const {
return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
}
Block::Block(const char* data, size_t size)
Block::Block(const char* data, size_t size, bool take_ownership)
: data_(data),
size_(size) {
size_(size),
owned_(take_ownership) {
if (size_ < sizeof(uint32_t)) {
size_ = 0; // Error marker
} else {
@ -35,8 +36,10 @@ Block::Block(const char* data, size_t size)
}
Block::~Block() {
if (owned_) {
delete[] data_;
}
}
// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,

@ -16,8 +16,9 @@ class Comparator;
class Block {
public:
// Initialize the block with the specified contents.
// Takes ownership of data[] and will delete[] it when done.
Block(const char* data, size_t size);
// Takes ownership of data[] and will delete[] it when done iff
// "take_ownership is true.
Block(const char* data, size_t size, bool take_ownership);
~Block();
@ -30,6 +31,7 @@ class Block {
const char* data_;
size_t size_;
uint32_t restart_offset_; // Offset in data_ of restart array
bool owned_; // Block owns data_[]
// No copying allowed
Block(const Block&);

@ -66,8 +66,10 @@ Status Footer::DecodeFrom(Slice* input) {
Status ReadBlock(RandomAccessFile* file,
const ReadOptions& options,
const BlockHandle& handle,
Block** block) {
Block** block,
bool* may_cache) {
*block = NULL;
*may_cache = false;
// Read the block contents as well as the type/crc footer.
// See table_builder.cc for the code that built this structure.
@ -100,8 +102,14 @@ Status ReadBlock(RandomAccessFile* file,
case kNoCompression:
if (data != buf) {
// File implementation gave us pointer to some other data.
// Copy into buf[].
memcpy(buf, data, n + kBlockTrailerSize);
// Use it directly under the assumption that it will be live
// while the file is open.
delete[] buf;
*block = new Block(data, n, false /* do not take ownership */);
*may_cache = false; // Do not double-cache
} else {
*block = new Block(buf, n, true /* take ownership */);
*may_cache = true;
}
// Ok
@ -119,8 +127,8 @@ Status ReadBlock(RandomAccessFile* file,
return Status::Corruption("corrupted compressed block contents");
}
delete[] buf;
buf = ubuf;
n = ulength;
*block = new Block(ubuf, ulength, true /* take ownership */);
*may_cache = true;
break;
}
default:
@ -128,7 +136,6 @@ Status ReadBlock(RandomAccessFile* file,
return Status::Corruption("bad block type");
}
*block = new Block(buf, n); // Block takes ownership of buf[]
return Status::OK();
}

@ -86,10 +86,13 @@ static const size_t kBlockTrailerSize = 5;
// Read the block identified by "handle" from "file". On success,
// store a pointer to the heap-allocated result in *block and return
// OK. On failure store NULL in *block and return non-OK.
// On success, stores true in *may_cache if the result may be
// cached, false if it must not be cached.
extern Status ReadBlock(RandomAccessFile* file,
const ReadOptions& options,
const BlockHandle& handle,
Block** block);
Block** block,
bool* may_cache);
// Implementation details follow. Clients should ignore,

@ -49,7 +49,9 @@ Status Table::Open(const Options& options,
// Read the index block
Block* index_block = NULL;
if (s.ok()) {
s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_block);
bool may_cache; // Ignored result
s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_block,
&may_cache);
}
if (s.ok()) {
@ -105,6 +107,7 @@ Iterator* Table::BlockReader(void* arg,
// can add more features in the future.
if (s.ok()) {
bool may_cache;
if (block_cache != NULL) {
char cache_key_buffer[16];
EncodeFixed64(cache_key_buffer, table->rep_->cache_id);
@ -114,14 +117,14 @@ Iterator* Table::BlockReader(void* arg,
if (cache_handle != NULL) {
block = reinterpret_cast<Block*>(block_cache->Value(cache_handle));
} else {
s = ReadBlock(table->rep_->file, options, handle, &block);
if (s.ok() && options.fill_cache) {
s = ReadBlock(table->rep_->file, options, handle, &block, &may_cache);
if (s.ok() && may_cache && options.fill_cache) {
cache_handle = block_cache->Insert(
key, block, block->size(), &DeleteCachedBlock);
}
}
} else {
s = ReadBlock(table->rep_->file, options, handle, &block);
s = ReadBlock(table->rep_->file, options, handle, &block, &may_cache);
}
}

@ -205,7 +205,7 @@ class BlockConstructor: public Constructor {
block_size_ = block_data.size();
char* block_data_copy = new char[block_size_];
memcpy(block_data_copy, block_data.data(), block_size_);
block_ = new Block(block_data_copy, block_size_);
block_ = new Block(block_data_copy, block_size_, true /* take ownership */);
return Status::OK();
}
virtual size_t NumBytes() const { return block_size_; }

@ -66,6 +66,7 @@ class PosixSequentialFile: public SequentialFile {
}
};
// pread() based random-access
class PosixRandomAccessFile: public RandomAccessFile {
private:
std::string filename_;
@ -89,6 +90,32 @@ class PosixRandomAccessFile: public RandomAccessFile {
}
};
// mmap() based random-access
class PosixMmapReadableFile: public RandomAccessFile {
private:
std::string filename_;
void* mmapped_region_;
size_t length_;
public:
// base[0,length-1] contains the mmapped contents of the file.
PosixMmapReadableFile(const std::string& fname, void* base, size_t length)
: filename_(fname), mmapped_region_(base), length_(length) { }
virtual ~PosixMmapReadableFile() { munmap(mmapped_region_, length_); }
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const {
Status s;
if (offset + n > length_) {
*result = Slice();
s = IOError(filename_, EINVAL);
} else {
*result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
}
return s;
}
};
// We preallocate up to an extra megabyte and use memcpy to append new
// data to the file. This is safe since we either properly close the
// file before reading from it, or for log files, the reading code
@ -297,13 +324,28 @@ class PosixEnv : public Env {
virtual Status NewRandomAccessFile(const std::string& fname,
RandomAccessFile** result) {
*result = NULL;
Status s;
int fd = open(fname.c_str(), O_RDONLY);
if (fd < 0) {
*result = NULL;
return IOError(fname, errno);
s = IOError(fname, errno);
} else if (sizeof(void*) >= 8) {
// Use mmap when virtual address-space is plentiful.
uint64_t size;
s = GetFileSize(fname, &size);
if (s.ok()) {
void* base = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
if (base != MAP_FAILED) {
*result = new PosixMmapReadableFile(fname, base, size);
} else {
s = IOError(fname, errno);
}
}
close(fd);
} else {
*result = new PosixRandomAccessFile(fname, fd);
return Status::OK();
}
return s;
}
virtual Status NewWritableFile(const std::string& fname,

Loading…
Cancel
Save