Summary: Add a encoding feature of PlainTable to encode PlainTable's keys to save some bytes for the same prefixes. The data format is documented in table/plain_table_factory.h Test Plan: Add unit test coverage in plain_table_db_test Reviewers: yhchiang, igor, dhruba, ljin, haobo Reviewed By: haobo Subscribers: nkg-, leveldb Differential Revision: https://reviews.facebook.net/D18735main
parent
0f0076ed5a
commit
edd47c5104
@ -0,0 +1,323 @@ |
|||||||
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||||
|
// This source code is licensed under the BSD-style license found in the
|
||||||
|
// LICENSE file in the root directory of this source tree. An additional grant
|
||||||
|
// of patent rights can be found in the PATENTS file in the same directory.
|
||||||
|
|
||||||
|
#ifndef ROCKSDB_LITE |
||||||
|
#include "table/plain_table_key_coding.h" |
||||||
|
|
||||||
|
#include "table/plain_table_factory.h" |
||||||
|
#include "db/dbformat.h" |
||||||
|
|
||||||
|
namespace rocksdb { |
||||||
|
|
||||||
|
namespace { |
||||||
|
|
||||||
|
enum EntryType : unsigned char { |
||||||
|
kFullKey = 0, |
||||||
|
kPrefixFromPreviousKey = 1, |
||||||
|
kKeySuffix = 2, |
||||||
|
}; |
||||||
|
|
||||||
|
// Control byte:
|
||||||
|
// First two bits indicate type of entry
|
||||||
|
// Other bytes are inlined sizes. If all bits are 1 (0x03F), overflow bytes
|
||||||
|
// are used. key_size-0x3F will be encoded as a variint32 after this bytes.
|
||||||
|
|
||||||
|
const unsigned char kSizeInlineLimit = 0x3F; |
||||||
|
|
||||||
|
// Return 0 for error
|
||||||
|
size_t EncodeSize(EntryType type, uint32_t key_size, char* out_buffer) { |
||||||
|
out_buffer[0] = type << 6; |
||||||
|
|
||||||
|
if (key_size < 0x3F) { |
||||||
|
// size inlined
|
||||||
|
out_buffer[0] |= static_cast<char>(key_size); |
||||||
|
return 1; |
||||||
|
} else { |
||||||
|
out_buffer[0] |= kSizeInlineLimit; |
||||||
|
char* ptr = EncodeVarint32(out_buffer + 1, key_size - kSizeInlineLimit); |
||||||
|
return ptr - out_buffer; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// Return position after the size byte(s). nullptr means error
|
||||||
|
const char* DecodeSize(const char* offset, const char* limit, |
||||||
|
EntryType* entry_type, size_t* key_size) { |
||||||
|
assert(offset < limit); |
||||||
|
*entry_type = static_cast<EntryType>( |
||||||
|
(static_cast<unsigned char>(offset[0]) & ~kSizeInlineLimit) >> 6); |
||||||
|
char inline_key_size = offset[0] & kSizeInlineLimit; |
||||||
|
if (inline_key_size < kSizeInlineLimit) { |
||||||
|
*key_size = inline_key_size; |
||||||
|
return offset + 1; |
||||||
|
} else { |
||||||
|
uint32_t extra_size; |
||||||
|
const char* ptr = GetVarint32Ptr(offset + 1, limit, &extra_size); |
||||||
|
if (ptr == nullptr) { |
||||||
|
return nullptr; |
||||||
|
} |
||||||
|
*key_size = kSizeInlineLimit + extra_size; |
||||||
|
return ptr; |
||||||
|
} |
||||||
|
} |
||||||
|
} // namespace
|
||||||
|
|
||||||
|
Status PlainTableKeyEncoder::AppendKey(const Slice& key, WritableFile* file, |
||||||
|
uint64_t* offset, char* meta_bytes_buf, |
||||||
|
size_t* meta_bytes_buf_size) { |
||||||
|
ParsedInternalKey parsed_key; |
||||||
|
if (!ParseInternalKey(key, &parsed_key)) { |
||||||
|
return Status::Corruption(Slice()); |
||||||
|
} |
||||||
|
|
||||||
|
Slice key_to_write = key; // Portion of internal key to write out.
|
||||||
|
|
||||||
|
size_t user_key_size = fixed_user_key_len_; |
||||||
|
if (encoding_type_ == kPlain) { |
||||||
|
if (fixed_user_key_len_ == kPlainTableVariableLength) { |
||||||
|
user_key_size = key.size() - 8; |
||||||
|
// Write key length
|
||||||
|
char key_size_buf[5]; // tmp buffer for key size as varint32
|
||||||
|
char* ptr = EncodeVarint32(key_size_buf, user_key_size); |
||||||
|
assert(ptr <= key_size_buf + sizeof(key_size_buf)); |
||||||
|
auto len = ptr - key_size_buf; |
||||||
|
Status s = file->Append(Slice(key_size_buf, len)); |
||||||
|
if (!s.ok()) { |
||||||
|
return s; |
||||||
|
} |
||||||
|
*offset += len; |
||||||
|
} |
||||||
|
} else { |
||||||
|
assert(encoding_type_ == kPrefix); |
||||||
|
char size_bytes[12]; |
||||||
|
size_t size_bytes_pos = 0; |
||||||
|
|
||||||
|
user_key_size = key.size() - 8; |
||||||
|
|
||||||
|
Slice prefix = |
||||||
|
prefix_extractor_->Transform(Slice(key.data(), user_key_size)); |
||||||
|
if (key_count_for_prefix == 0 || prefix != pre_prefix_.GetKey() || |
||||||
|
key_count_for_prefix % index_sparseness_ == 0) { |
||||||
|
key_count_for_prefix = 1; |
||||||
|
pre_prefix_.SetKey(prefix); |
||||||
|
size_bytes_pos += EncodeSize(kFullKey, user_key_size, size_bytes); |
||||||
|
Status s = file->Append(Slice(size_bytes, size_bytes_pos)); |
||||||
|
if (!s.ok()) { |
||||||
|
return s; |
||||||
|
} |
||||||
|
*offset += size_bytes_pos; |
||||||
|
} else { |
||||||
|
key_count_for_prefix++; |
||||||
|
if (key_count_for_prefix == 2) { |
||||||
|
// For second key within a prefix, need to encode prefix length
|
||||||
|
size_bytes_pos += |
||||||
|
EncodeSize(kPrefixFromPreviousKey, pre_prefix_.GetKey().size(), |
||||||
|
size_bytes + size_bytes_pos); |
||||||
|
} |
||||||
|
size_t prefix_len = pre_prefix_.GetKey().size(); |
||||||
|
size_bytes_pos += EncodeSize(kKeySuffix, user_key_size - prefix_len, |
||||||
|
size_bytes + size_bytes_pos); |
||||||
|
Status s = file->Append(Slice(size_bytes, size_bytes_pos)); |
||||||
|
if (!s.ok()) { |
||||||
|
return s; |
||||||
|
} |
||||||
|
*offset += size_bytes_pos; |
||||||
|
key_to_write = Slice(key.data() + prefix_len, key.size() - prefix_len); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// Encode full key
|
||||||
|
// For value size as varint32 (up to 5 bytes).
|
||||||
|
// If the row is of value type with seqId 0, flush the special flag together
|
||||||
|
// in this buffer to safe one file append call, which takes 1 byte.
|
||||||
|
if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) { |
||||||
|
Status s = |
||||||
|
file->Append(Slice(key_to_write.data(), key_to_write.size() - 8)); |
||||||
|
if (!s.ok()) { |
||||||
|
return s; |
||||||
|
} |
||||||
|
*offset += key_to_write.size() - 8; |
||||||
|
meta_bytes_buf[*meta_bytes_buf_size] = PlainTableFactory::kValueTypeSeqId0; |
||||||
|
*meta_bytes_buf_size += 1; |
||||||
|
} else { |
||||||
|
file->Append(key_to_write); |
||||||
|
*offset += key_to_write.size(); |
||||||
|
} |
||||||
|
|
||||||
|
return Status::OK(); |
||||||
|
} |
||||||
|
|
||||||
|
namespace { |
||||||
|
Status ReadInternalKey(const char* key_ptr, const char* limit, |
||||||
|
uint32_t user_key_size, ParsedInternalKey* parsed_key, |
||||||
|
size_t* bytes_read, bool* internal_key_valid, |
||||||
|
Slice* internal_key) { |
||||||
|
if (key_ptr + user_key_size + 1 >= limit) { |
||||||
|
return Status::Corruption("Unexpected EOF when reading the next key"); |
||||||
|
} |
||||||
|
if (*(key_ptr + user_key_size) == PlainTableFactory::kValueTypeSeqId0) { |
||||||
|
// Special encoding for the row with seqID=0
|
||||||
|
parsed_key->user_key = Slice(key_ptr, user_key_size); |
||||||
|
parsed_key->sequence = 0; |
||||||
|
parsed_key->type = kTypeValue; |
||||||
|
*bytes_read += user_key_size + 1; |
||||||
|
*internal_key_valid = false; |
||||||
|
} else { |
||||||
|
if (key_ptr + user_key_size + 8 >= limit) { |
||||||
|
return Status::Corruption( |
||||||
|
"Unexpected EOF when reading internal bytes of the next key"); |
||||||
|
} |
||||||
|
*internal_key_valid = true; |
||||||
|
*internal_key = Slice(key_ptr, user_key_size + 8); |
||||||
|
if (!ParseInternalKey(*internal_key, parsed_key)) { |
||||||
|
return Status::Corruption( |
||||||
|
Slice("Incorrect value type found when reading the next key")); |
||||||
|
} |
||||||
|
*bytes_read += user_key_size + 8; |
||||||
|
} |
||||||
|
return Status::OK(); |
||||||
|
} |
||||||
|
} // namespace
|
||||||
|
|
||||||
|
Status PlainTableKeyDecoder::NextPlainEncodingKey( |
||||||
|
const char* start, const char* limit, ParsedInternalKey* parsed_key, |
||||||
|
Slice* internal_key, size_t* bytes_read, bool* seekable) { |
||||||
|
const char* key_ptr = start; |
||||||
|
size_t user_key_size = 0; |
||||||
|
if (fixed_user_key_len_ != kPlainTableVariableLength) { |
||||||
|
user_key_size = fixed_user_key_len_; |
||||||
|
key_ptr = start; |
||||||
|
} else { |
||||||
|
uint32_t tmp_size = 0; |
||||||
|
key_ptr = GetVarint32Ptr(start, limit, &tmp_size); |
||||||
|
if (key_ptr == nullptr) { |
||||||
|
return Status::Corruption( |
||||||
|
"Unexpected EOF when reading the next key's size"); |
||||||
|
} |
||||||
|
user_key_size = static_cast<size_t>(tmp_size); |
||||||
|
*bytes_read = key_ptr - start; |
||||||
|
} |
||||||
|
bool decoded_internal_key_valid; |
||||||
|
Slice decoded_internal_key; |
||||||
|
Status s = |
||||||
|
ReadInternalKey(key_ptr, limit, user_key_size, parsed_key, bytes_read, |
||||||
|
&decoded_internal_key_valid, &decoded_internal_key); |
||||||
|
if (!s.ok()) { |
||||||
|
return s; |
||||||
|
} |
||||||
|
if (internal_key != nullptr) { |
||||||
|
if (decoded_internal_key_valid) { |
||||||
|
*internal_key = decoded_internal_key; |
||||||
|
} else { |
||||||
|
// Need to copy out the internal key
|
||||||
|
cur_key_.SetInternalKey(*parsed_key); |
||||||
|
*internal_key = cur_key_.GetKey(); |
||||||
|
} |
||||||
|
} |
||||||
|
return Status::OK(); |
||||||
|
} |
||||||
|
|
||||||
|
Status PlainTableKeyDecoder::NextPrefixEncodingKey( |
||||||
|
const char* start, const char* limit, ParsedInternalKey* parsed_key, |
||||||
|
Slice* internal_key, size_t* bytes_read, bool* seekable) { |
||||||
|
const char* key_ptr = start; |
||||||
|
EntryType entry_type; |
||||||
|
|
||||||
|
bool expect_suffix = false; |
||||||
|
do { |
||||||
|
size_t size = 0; |
||||||
|
bool decoded_internal_key_valid; |
||||||
|
const char* pos = DecodeSize(key_ptr, limit, &entry_type, &size); |
||||||
|
if (pos == nullptr) { |
||||||
|
return Status::Corruption("Unexpected EOF when reading size of the key"); |
||||||
|
} |
||||||
|
*bytes_read += pos - key_ptr; |
||||||
|
key_ptr = pos; |
||||||
|
|
||||||
|
switch (entry_type) { |
||||||
|
case kFullKey: { |
||||||
|
expect_suffix = false; |
||||||
|
Slice decoded_internal_key; |
||||||
|
Status s = |
||||||
|
ReadInternalKey(key_ptr, limit, size, parsed_key, bytes_read, |
||||||
|
&decoded_internal_key_valid, &decoded_internal_key); |
||||||
|
if (!s.ok()) { |
||||||
|
return s; |
||||||
|
} |
||||||
|
saved_user_key_ = parsed_key->user_key; |
||||||
|
if (internal_key != nullptr) { |
||||||
|
if (decoded_internal_key_valid) { |
||||||
|
*internal_key = decoded_internal_key; |
||||||
|
} else { |
||||||
|
cur_key_.SetInternalKey(*parsed_key); |
||||||
|
*internal_key = cur_key_.GetKey(); |
||||||
|
} |
||||||
|
} |
||||||
|
break; |
||||||
|
} |
||||||
|
case kPrefixFromPreviousKey: { |
||||||
|
if (seekable != nullptr) { |
||||||
|
*seekable = false; |
||||||
|
} |
||||||
|
prefix_len_ = size; |
||||||
|
assert(prefix_extractor_ == nullptr || |
||||||
|
prefix_extractor_->Transform(saved_user_key_).size() == |
||||||
|
prefix_len_); |
||||||
|
// Need read another size flag for suffix
|
||||||
|
expect_suffix = true; |
||||||
|
break; |
||||||
|
} |
||||||
|
case kKeySuffix: { |
||||||
|
expect_suffix = false; |
||||||
|
if (seekable != nullptr) { |
||||||
|
*seekable = false; |
||||||
|
} |
||||||
|
assert(prefix_len_ >= 0); |
||||||
|
cur_key_.Reserve(prefix_len_ + size); |
||||||
|
|
||||||
|
Slice tmp_slice; |
||||||
|
Status s = ReadInternalKey(key_ptr, limit, size, parsed_key, bytes_read, |
||||||
|
&decoded_internal_key_valid, &tmp_slice); |
||||||
|
if (!s.ok()) { |
||||||
|
return s; |
||||||
|
} |
||||||
|
cur_key_.SetInternalKey(Slice(saved_user_key_.data(), prefix_len_), |
||||||
|
*parsed_key); |
||||||
|
assert( |
||||||
|
prefix_extractor_ == nullptr || |
||||||
|
prefix_extractor_->Transform(ExtractUserKey(cur_key_.GetKey())) == |
||||||
|
Slice(saved_user_key_.data(), prefix_len_)); |
||||||
|
parsed_key->user_key = ExtractUserKey(cur_key_.GetKey()); |
||||||
|
if (internal_key != nullptr) { |
||||||
|
*internal_key = cur_key_.GetKey(); |
||||||
|
} |
||||||
|
break; |
||||||
|
} |
||||||
|
default: |
||||||
|
return Status::Corruption("Identified size flag."); |
||||||
|
} |
||||||
|
} while (expect_suffix); // Another round if suffix is expected.
|
||||||
|
return Status::OK(); |
||||||
|
} |
||||||
|
|
||||||
|
Status PlainTableKeyDecoder::NextKey(const char* start, const char* limit, |
||||||
|
ParsedInternalKey* parsed_key, |
||||||
|
Slice* internal_key, size_t* bytes_read, |
||||||
|
bool* seekable) { |
||||||
|
*bytes_read = 0; |
||||||
|
if (seekable != nullptr) { |
||||||
|
*seekable = true; |
||||||
|
} |
||||||
|
if (encoding_type_ == kPlain) { |
||||||
|
return NextPlainEncodingKey(start, limit, parsed_key, internal_key, |
||||||
|
bytes_read, seekable); |
||||||
|
} else { |
||||||
|
assert(encoding_type_ == kPrefix); |
||||||
|
return NextPrefixEncodingKey(start, limit, parsed_key, internal_key, |
||||||
|
bytes_read, seekable); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
} // namespace rocksdb
|
||||||
|
#endif // ROCKSDB_LITE
|
@ -0,0 +1,97 @@ |
|||||||
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||||
|
// This source code is licensed under the BSD-style license found in the
|
||||||
|
// LICENSE file in the root directory of this source tree. An additional grant
|
||||||
|
// of patent rights can be found in the PATENTS file in the same directory.
|
||||||
|
|
||||||
|
#ifndef ROCKSDB_LITE |
||||||
|
#pragma once |
||||||
|
|
||||||
|
#include "rocksdb/slice.h" |
||||||
|
#include "db/dbformat.h" |
||||||
|
|
||||||
|
namespace rocksdb { |
||||||
|
|
||||||
|
class WritableFile; |
||||||
|
class ParsedInternalKey; |
||||||
|
|
||||||
|
// Helper class to write out a key to an output file
|
||||||
|
// Actual data format of the key is documented in plain_table_factory.h
|
||||||
|
class PlainTableKeyEncoder { |
||||||
|
public: |
||||||
|
explicit PlainTableKeyEncoder(EncodingType encoding_type, |
||||||
|
uint32_t user_key_len, |
||||||
|
const SliceTransform* prefix_extractor, |
||||||
|
size_t index_sparseness) |
||||||
|
: encoding_type_((prefix_extractor != nullptr) ? encoding_type : kPlain), |
||||||
|
fixed_user_key_len_(user_key_len), |
||||||
|
prefix_extractor_(prefix_extractor), |
||||||
|
index_sparseness_((index_sparseness > 1) ? index_sparseness : 1), |
||||||
|
key_count_for_prefix(0) {} |
||||||
|
// key: the key to write out, in the format of internal key.
|
||||||
|
// file: the output file to write out
|
||||||
|
// offset: offset in the file. Needs to be updated after appending bytes
|
||||||
|
// for the key
|
||||||
|
// meta_bytes_buf: buffer for extra meta bytes
|
||||||
|
// meta_bytes_buf_size: offset to append extra meta bytes. Will be updated
|
||||||
|
// if meta_bytes_buf is updated.
|
||||||
|
Status AppendKey(const Slice& key, WritableFile* file, uint64_t* offset, |
||||||
|
char* meta_bytes_buf, size_t* meta_bytes_buf_size); |
||||||
|
|
||||||
|
// Return actual encoding type to be picked
|
||||||
|
EncodingType GetEncodingType() { return encoding_type_; } |
||||||
|
|
||||||
|
private: |
||||||
|
EncodingType encoding_type_; |
||||||
|
uint32_t fixed_user_key_len_; |
||||||
|
const SliceTransform* prefix_extractor_; |
||||||
|
const size_t index_sparseness_; |
||||||
|
size_t key_count_for_prefix; |
||||||
|
IterKey pre_prefix_; |
||||||
|
}; |
||||||
|
|
||||||
|
// A helper class to decode keys from input buffer
|
||||||
|
// Actual data format of the key is documented in plain_table_factory.h
|
||||||
|
class PlainTableKeyDecoder { |
||||||
|
public: |
||||||
|
explicit PlainTableKeyDecoder(EncodingType encoding_type, |
||||||
|
uint32_t user_key_len, |
||||||
|
const SliceTransform* prefix_extractor) |
||||||
|
: encoding_type_(encoding_type), |
||||||
|
prefix_len_(0), |
||||||
|
fixed_user_key_len_(user_key_len), |
||||||
|
prefix_extractor_(prefix_extractor), |
||||||
|
in_prefix_(false) {} |
||||||
|
// Find the next key.
|
||||||
|
// start: char array where the key starts.
|
||||||
|
// limit: boundary of the char array
|
||||||
|
// parsed_key: the output of the result key
|
||||||
|
// internal_key: if not null, fill with the output of the result key in
|
||||||
|
// un-parsed format
|
||||||
|
// bytes_read: how many bytes read from start. Output
|
||||||
|
// seekable: whether key can be read from this place. Used when building
|
||||||
|
// indexes. Output.
|
||||||
|
Status NextKey(const char* start, const char* limit, |
||||||
|
ParsedInternalKey* parsed_key, Slice* internal_key, |
||||||
|
size_t* bytes_read, bool* seekable = nullptr); |
||||||
|
EncodingType encoding_type_; |
||||||
|
uint32_t prefix_len_; |
||||||
|
uint32_t fixed_user_key_len_; |
||||||
|
Slice saved_user_key_; |
||||||
|
IterKey cur_key_; |
||||||
|
const SliceTransform* prefix_extractor_; |
||||||
|
bool in_prefix_; |
||||||
|
|
||||||
|
private: |
||||||
|
Status NextPlainEncodingKey(const char* start, const char* limit, |
||||||
|
ParsedInternalKey* parsed_key, |
||||||
|
Slice* internal_key, size_t* bytes_read, |
||||||
|
bool* seekable = nullptr); |
||||||
|
Status NextPrefixEncodingKey(const char* start, const char* limit, |
||||||
|
ParsedInternalKey* parsed_key, |
||||||
|
Slice* internal_key, size_t* bytes_read, |
||||||
|
bool* seekable = nullptr); |
||||||
|
}; |
||||||
|
|
||||||
|
} // namespace rocksdb
|
||||||
|
|
||||||
|
#endif // ROCKSDB_LITE
|
Loading…
Reference in new issue