|
|
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under the BSD-style license found in the
|
|
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
#include <set>
|
|
|
|
#include <utility>
|
|
|
|
#include <vector>
|
|
|
|
#include <string>
|
|
|
|
#include "rocksdb/cache.h"
|
|
|
|
#include "db/dbformat.h"
|
|
|
|
#include "util/arena.h"
|
|
|
|
#include "util/autovector.h"
|
|
|
|
|
|
|
|
namespace rocksdb {
|
|
|
|
|
|
|
|
// Forward declaration. This header only names VersionSet (as a friend of
// VersionEdit), so the full class definition is not required here.
class VersionSet;
|
|
|
|
|
|
|
|
// Mask with the low 62 bits set. FileDescriptor applies it to its packed
// value to extract the file number; the path id is taken from the bits above.
const uint64_t kFileNumberMask = (static_cast<uint64_t>(1) << 62) - 1;
|
|
|
|
|
|
|
|
// Combines a file number and a path id into one 64-bit value; FileDescriptor
// stores the result in packed_number_and_path_id. Only declared here -- the
// definition is not part of this header.
// NOTE(review): judging by FileDescriptor::GetNumber()/GetPathId(), the file
// number presumably occupies the bits covered by kFileNumberMask and the path
// id the bits above them -- confirm against the definition.
extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id);
|
|
|
|
|
|
|
|
// A copyable structure contains information needed to read data from an SST
|
|
|
|
// file. It can contains a pointer to a table reader opened for the file, or
|
|
|
|
// file number and size, which can be used to create a new table reader for it.
|
|
|
|
// The behavior is undefined when a copied of the structure is used when the
|
|
|
|
// file is not in any live version any more.
|
|
|
|
struct FileDescriptor {
|
|
|
|
// Table reader in table_reader_handle
|
|
|
|
TableReader* table_reader;
|
|
|
|
uint64_t packed_number_and_path_id;
|
|
|
|
uint64_t file_size; // File size in bytes
|
|
|
|
|
|
|
|
FileDescriptor() : FileDescriptor(0, 0, 0) {}
|
|
|
|
|
|
|
|
FileDescriptor(uint64_t number, uint32_t path_id, uint64_t file_size)
|
|
|
|
: table_reader(nullptr),
|
|
|
|
packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)),
|
|
|
|
file_size(file_size) {}
|
|
|
|
|
|
|
|
FileDescriptor& operator=(const FileDescriptor& fd) {
|
|
|
|
table_reader = fd.table_reader;
|
|
|
|
packed_number_and_path_id = fd.packed_number_and_path_id;
|
|
|
|
file_size = fd.file_size;
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t GetNumber() const {
|
|
|
|
return packed_number_and_path_id & kFileNumberMask;
|
|
|
|
}
|
|
|
|
uint32_t GetPathId() const {
|
|
|
|
return packed_number_and_path_id / (kFileNumberMask + 1);
|
|
|
|
}
|
|
|
|
uint64_t GetFileSize() const { return file_size; }
|
|
|
|
};
|
|
|
|
|
|
|
|
struct FileMetaData {
|
|
|
|
int refs;
|
|
|
|
FileDescriptor fd;
|
|
|
|
InternalKey smallest; // Smallest internal key served by table
|
|
|
|
InternalKey largest; // Largest internal key served by table
|
|
|
|
bool being_compacted; // Is this file undergoing compaction?
|
|
|
|
SequenceNumber smallest_seqno; // The smallest seqno in this file
|
|
|
|
SequenceNumber largest_seqno; // The largest seqno in this file
|
|
|
|
|
|
|
|
// Needs to be disposed when refs becomes 0.
|
|
|
|
Cache::Handle* table_reader_handle;
|
|
|
|
|
|
|
|
// Stats for compensating deletion entries during compaction
|
|
|
|
|
|
|
|
// File size compensated by deletion entry.
|
|
|
|
// This is updated in Version::UpdateTemporaryStats() first time when the
|
|
|
|
// file is created or loaded. After it is updated, it is immutable.
|
|
|
|
uint64_t compensated_file_size;
|
|
|
|
uint64_t num_entries; // the number of entries.
|
|
|
|
uint64_t num_deletions; // the number of deletion entries.
|
|
|
|
uint64_t raw_key_size; // total uncompressed key size.
|
|
|
|
uint64_t raw_value_size; // total uncompressed value size.
|
|
|
|
bool init_stats_from_file; // true if the data-entry stats of this file
|
|
|
|
// has initialized from file.
|
|
|
|
|
|
|
|
FileMetaData()
|
|
|
|
: refs(0),
|
|
|
|
being_compacted(false),
|
|
|
|
table_reader_handle(nullptr),
|
|
|
|
compensated_file_size(0),
|
|
|
|
num_entries(0),
|
|
|
|
num_deletions(0),
|
|
|
|
raw_key_size(0),
|
|
|
|
raw_value_size(0),
|
|
|
|
init_stats_from_file(false) {}
|
|
|
|
};
|
|
|
|
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 --writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
11 years ago
|
|
|
// A compressed copy of file meta data that just contain
|
|
|
|
// smallest and largest key's slice
|
|
|
|
struct FdWithKeyRange {
|
|
|
|
FileDescriptor fd;
|
|
|
|
Slice smallest_key; // slice that contain smallest key
|
|
|
|
Slice largest_key; // slice that contain largest key
|
|
|
|
|
|
|
|
FdWithKeyRange()
|
|
|
|
: fd(),
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
11 years ago
|
|
|
smallest_key(),
|
|
|
|
largest_key() {
|
|
|
|
}
|
|
|
|
|
|
|
|
FdWithKeyRange(FileDescriptor fd,
|
|
|
|
Slice smallest_key, Slice largest_key)
|
|
|
|
: fd(fd),
|
|
|
|
smallest_key(smallest_key),
|
|
|
|
largest_key(largest_key) {
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Data structure to store an array of FdWithKeyRange in one level
|
|
|
|
// Actual data is guaranteed to be stored closely
|
|
|
|
struct FileLevel {
|
|
|
|
size_t num_files;
|
|
|
|
FdWithKeyRange* files;
|
|
|
|
FileLevel() {
|
|
|
|
num_files = 0;
|
|
|
|
files = nullptr;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
class VersionEdit {
|
|
|
|
public:
|
|
|
|
VersionEdit() { Clear(); }
|
|
|
|
~VersionEdit() { }
|
|
|
|
|
|
|
|
void Clear();
|
|
|
|
|
|
|
|
void SetComparatorName(const Slice& name) {
|
|
|
|
has_comparator_ = true;
|
|
|
|
comparator_ = name.ToString();
|
|
|
|
}
|
|
|
|
void SetLogNumber(uint64_t num) {
|
|
|
|
has_log_number_ = true;
|
|
|
|
log_number_ = num;
|
|
|
|
}
|
|
|
|
void SetPrevLogNumber(uint64_t num) {
|
|
|
|
has_prev_log_number_ = true;
|
|
|
|
prev_log_number_ = num;
|
|
|
|
}
|
|
|
|
void SetNextFile(uint64_t num) {
|
|
|
|
has_next_file_number_ = true;
|
|
|
|
next_file_number_ = num;
|
|
|
|
}
|
|
|
|
void SetLastSequence(SequenceNumber seq) {
|
|
|
|
has_last_sequence_ = true;
|
|
|
|
last_sequence_ = seq;
|
|
|
|
}
|
|
|
|
void SetMaxColumnFamily(uint32_t max_column_family) {
|
|
|
|
has_max_column_family_ = true;
|
|
|
|
max_column_family_ = max_column_family;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add the specified file at the specified number.
|
|
|
|
// REQUIRES: This version has not been saved (see VersionSet::SaveTo)
|
|
|
|
// REQUIRES: "smallest" and "largest" are smallest and largest keys in file
|
|
|
|
void AddFile(int level, uint64_t file, uint64_t file_path_id,
|
|
|
|
uint64_t file_size, const InternalKey& smallest,
|
|
|
|
const InternalKey& largest, const SequenceNumber& smallest_seqno,
|
|
|
|
const SequenceNumber& largest_seqno) {
|
|
|
|
assert(smallest_seqno <= largest_seqno);
|
|
|
|
FileMetaData f;
|
|
|
|
f.fd = FileDescriptor(file, file_path_id, file_size);
|
|
|
|
f.smallest = smallest;
|
|
|
|
f.largest = largest;
|
|
|
|
f.smallest_seqno = smallest_seqno;
|
|
|
|
f.largest_seqno = largest_seqno;
|
|
|
|
new_files_.push_back(std::make_pair(level, f));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Delete the specified "file" from the specified "level".
|
|
|
|
void DeleteFile(int level, uint64_t file) {
|
|
|
|
deleted_files_.insert({level, file});
|
|
|
|
}
|
|
|
|
|
|
|
|
// Number of edits
|
|
|
|
int NumEntries() {
|
|
|
|
return new_files_.size() + deleted_files_.size();
|
|
|
|
}
|
|
|
|
|
|
|
|
bool IsColumnFamilyManipulation() {
|
|
|
|
return is_column_family_add_ || is_column_family_drop_;
|
|
|
|
}
|
|
|
|
|
|
|
|
void SetColumnFamily(uint32_t column_family_id) {
|
|
|
|
column_family_ = column_family_id;
|
|
|
|
}
|
|
|
|
|
|
|
|
// set column family ID by calling SetColumnFamily()
|
|
|
|
void AddColumnFamily(const std::string& name) {
|
|
|
|
assert(!is_column_family_drop_);
|
|
|
|
assert(!is_column_family_add_);
|
|
|
|
assert(NumEntries() == 0);
|
|
|
|
is_column_family_add_ = true;
|
|
|
|
column_family_name_ = name;
|
|
|
|
}
|
|
|
|
|
|
|
|
// set column family ID by calling SetColumnFamily()
|
|
|
|
void DropColumnFamily() {
|
|
|
|
assert(!is_column_family_drop_);
|
|
|
|
assert(!is_column_family_add_);
|
|
|
|
assert(NumEntries() == 0);
|
|
|
|
is_column_family_drop_ = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
void EncodeTo(std::string* dst) const;
|
|
|
|
Status DecodeFrom(const Slice& src);
|
|
|
|
|
|
|
|
std::string DebugString(bool hex_key = false) const;
|
|
|
|
|
|
|
|
private:
|
|
|
|
friend class VersionSet;
|
|
|
|
friend class Version;
|
|
|
|
|
|
|
|
typedef std::set< std::pair<int, uint64_t>> DeletedFileSet;
|
|
|
|
|
|
|
|
bool GetLevel(Slice* input, int* level, const char** msg);
|
|
|
|
|
|
|
|
int max_level_;
|
|
|
|
std::string comparator_;
|
|
|
|
uint64_t log_number_;
|
|
|
|
uint64_t prev_log_number_;
|
|
|
|
uint64_t next_file_number_;
|
|
|
|
uint32_t max_column_family_;
|
|
|
|
SequenceNumber last_sequence_;
|
|
|
|
bool has_comparator_;
|
|
|
|
bool has_log_number_;
|
|
|
|
bool has_prev_log_number_;
|
|
|
|
bool has_next_file_number_;
|
|
|
|
bool has_last_sequence_;
|
|
|
|
bool has_max_column_family_;
|
|
|
|
|
|
|
|
DeletedFileSet deleted_files_;
|
|
|
|
std::vector<std::pair<int, FileMetaData>> new_files_;
|
|
|
|
|
|
|
|
// Each version edit record should have column_family_id set
|
|
|
|
// If it's not set, it is default (0)
|
|
|
|
uint32_t column_family_;
|
|
|
|
// a version edit can be either column_family add or
|
|
|
|
// column_family drop. If it's column family add,
|
|
|
|
// it also includes column family name.
|
|
|
|
bool is_column_family_drop_;
|
|
|
|
bool is_column_family_add_;
|
|
|
|
std::string column_family_name_;
|
|
|
|
};
|
|
|
|
|
|
|
|
} // namespace rocksdb
|