Summary: Experiments on column-aware encodings. Supported features: 1) extract data blocks from SST file and encode with specified encodings; 2) Decode encoded data back into row format; 3) Directly extract data blocks and write in row format (without prefix encoding); 4) Get column distribution statistics for column format; 5) Dump data blocks separated by columns in human-readable format. There is still on-going work on this diff. More refactoring is necessary. Test Plan: Wrote tests in `column_aware_encoding_test.cc`. More tests should be added. Reviewers: sdong Reviewed By: sdong Subscribers: arahut, andrewkr, dhruba Differential Revision: https://reviews.facebook.net/D60027main
parent
c116b47804
commit
d51dc96a79
@ -0,0 +1,240 @@ |
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include "utilities/col_buf_decoder.h" |
||||
#include <cstring> |
||||
#include <string> |
||||
#include "port/port.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
ColBufDecoder::~ColBufDecoder() {} |
||||
|
||||
namespace { |
||||
|
||||
inline uint64_t EncodeFixed64WithEndian(uint64_t val, bool big_endian) { |
||||
if (big_endian && port::kLittleEndian) { |
||||
val = le64toh(val); |
||||
val = htobe64(val); |
||||
} else if (!big_endian && !port::kLittleEndian) { |
||||
val = be64toh(val); |
||||
val = htole64(val); |
||||
} |
||||
return val; |
||||
} |
||||
|
||||
} // namespace
|
||||
ColBufDecoder* ColBufDecoder::NewColBufDecoder( |
||||
const ColDeclaration& col_declaration) { |
||||
if (col_declaration.col_type == "FixedLength") { |
||||
return new FixedLengthColBufDecoder( |
||||
col_declaration.size, col_declaration.col_compression_type, |
||||
col_declaration.nullable, col_declaration.big_endian); |
||||
} else if (col_declaration.col_type == "VariableLength") { |
||||
return new VariableLengthColBufDecoder(); |
||||
} else if (col_declaration.col_type == "VariableChunk") { |
||||
return new VariableChunkColBufDecoder(col_declaration.col_compression_type); |
||||
} else if (col_declaration.col_type == "LongFixedLength") { |
||||
return new LongFixedLengthColBufDecoder(col_declaration.size, |
||||
col_declaration.nullable); |
||||
} |
||||
// Unrecognized column type
|
||||
return nullptr; |
||||
} |
||||
|
||||
namespace { |
||||
|
||||
void ReadVarint64(const char** src_ptr, uint64_t* val_ptr) { |
||||
const char* q = GetVarint64Ptr(*src_ptr, *src_ptr + 10, val_ptr); |
||||
assert(q != nullptr); |
||||
*src_ptr = q; |
||||
} |
||||
} // namespace
|
||||
|
||||
size_t FixedLengthColBufDecoder::Init(const char* src) { |
||||
remain_runs_ = 0; |
||||
last_val_ = 0; |
||||
// Dictionary initialization
|
||||
dict_vec_.clear(); |
||||
const char* orig_src = src; |
||||
if (col_compression_type_ == kColDict || |
||||
col_compression_type_ == kColRleDict) { |
||||
const char* q; |
||||
size_t dict_size; |
||||
// Bypass limit
|
||||
q = GetVarint64Ptr(src, src + 10, &dict_size); |
||||
assert(q != nullptr); |
||||
src = q; |
||||
|
||||
uint64_t dict_key; |
||||
for (size_t i = 0; i < dict_size; ++i) { |
||||
// Bypass limit
|
||||
ReadVarint64(&src, &dict_key); |
||||
|
||||
dict_key = EncodeFixed64WithEndian(dict_key, big_endian_); |
||||
dict_vec_.push_back(dict_key); |
||||
} |
||||
} |
||||
return src - orig_src; |
||||
} |
||||
|
||||
size_t FixedLengthColBufDecoder::Decode(const char* src, char** dest) { |
||||
uint64_t read_val = 0; |
||||
const char* orig_src = src; |
||||
const char* src_limit = src + 20; |
||||
if (nullable_) { |
||||
bool not_null; |
||||
not_null = *src; |
||||
src += 1; |
||||
if (!not_null) { |
||||
return 1; |
||||
} |
||||
} |
||||
if (IsRunLength(col_compression_type_)) { |
||||
if (remain_runs_ == 0) { |
||||
const char* q; |
||||
run_val_ = 0; |
||||
if (col_compression_type_ == kColRle) { |
||||
memcpy(&run_val_, src, size_); |
||||
src += size_; |
||||
} else { |
||||
q = GetVarint64Ptr(src, src_limit, &run_val_); |
||||
assert(q != nullptr); |
||||
src = q; |
||||
} |
||||
|
||||
q = GetVarint64Ptr(src, src_limit, &remain_runs_); |
||||
assert(q != nullptr); |
||||
src = q; |
||||
|
||||
if (col_compression_type_ != kColRleDeltaVarint && |
||||
col_compression_type_ != kColRleDict) { |
||||
run_val_ = EncodeFixed64WithEndian(run_val_, big_endian_); |
||||
} |
||||
} |
||||
read_val = run_val_; |
||||
} else { |
||||
if (col_compression_type_ == kColNoCompression) { |
||||
memcpy(&read_val, src, size_); |
||||
src += size_; |
||||
} else { |
||||
// Assume a column does not exceed 8 bytes here
|
||||
const char* q = GetVarint64Ptr(src, src_limit, &read_val); |
||||
assert(q != nullptr); |
||||
src = q; |
||||
} |
||||
if (col_compression_type_ != kColDeltaVarint && |
||||
col_compression_type_ != kColDict) { |
||||
read_val = EncodeFixed64WithEndian(read_val, big_endian_); |
||||
} |
||||
} |
||||
|
||||
uint64_t write_val = read_val; |
||||
if (col_compression_type_ == kColDeltaVarint || |
||||
col_compression_type_ == kColRleDeltaVarint) { |
||||
// does not support 64 bit
|
||||
|
||||
uint64_t mask = (write_val & 1) ? (~0UL) : 0; |
||||
int64_t delta = (write_val >> 1) ^ mask; |
||||
write_val = last_val_ + delta; |
||||
|
||||
uint64_t tmp = write_val; |
||||
write_val = EncodeFixed64WithEndian(write_val, big_endian_); |
||||
last_val_ = tmp; |
||||
} else if (col_compression_type_ == kColRleDict || |
||||
col_compression_type_ == kColDict) { |
||||
size_t dict_val = read_val; |
||||
assert(dict_val < dict_vec_.size()); |
||||
write_val = dict_vec_[dict_val]; |
||||
} |
||||
|
||||
// dest->append(reinterpret_cast<char*>(&write_val), size_);
|
||||
memcpy(*dest, reinterpret_cast<char*>(&write_val), size_); |
||||
*dest += size_; |
||||
if (IsRunLength(col_compression_type_)) { |
||||
--remain_runs_; |
||||
} |
||||
return src - orig_src; |
||||
} |
||||
|
||||
size_t LongFixedLengthColBufDecoder::Decode(const char* src, char** dest) { |
||||
if (nullable_) { |
||||
bool not_null; |
||||
not_null = *src; |
||||
src += 1; |
||||
if (!not_null) { |
||||
return 1; |
||||
} |
||||
} |
||||
memcpy(*dest, src, size_); |
||||
*dest += size_; |
||||
return size_ + 1; |
||||
} |
||||
|
||||
size_t VariableLengthColBufDecoder::Decode(const char* src, char** dest) { |
||||
uint8_t len; |
||||
len = *src; |
||||
memcpy(dest, reinterpret_cast<char*>(&len), 1); |
||||
*dest += 1; |
||||
src += 1; |
||||
memcpy(*dest, src, len); |
||||
*dest += len; |
||||
return len + 1; |
||||
} |
||||
|
||||
size_t VariableChunkColBufDecoder::Init(const char* src) { |
||||
// Dictionary initialization
|
||||
dict_vec_.clear(); |
||||
const char* orig_src = src; |
||||
if (col_compression_type_ == kColDict) { |
||||
const char* q; |
||||
size_t dict_size; |
||||
// Bypass limit
|
||||
q = GetVarint64Ptr(src, src + 10, &dict_size); |
||||
assert(q != nullptr); |
||||
src = q; |
||||
|
||||
uint64_t dict_key; |
||||
for (size_t i = 0; i < dict_size; ++i) { |
||||
// Bypass limit
|
||||
ReadVarint64(&src, &dict_key); |
||||
dict_vec_.push_back(dict_key); |
||||
} |
||||
} |
||||
return src - orig_src; |
||||
} |
||||
|
||||
size_t VariableChunkColBufDecoder::Decode(const char* src, char** dest) { |
||||
const char* orig_src = src; |
||||
uint64_t size = 0; |
||||
ReadVarint64(&src, &size); |
||||
int64_t full_chunks = size / 8; |
||||
uint64_t chunk_buf; |
||||
size_t chunk_size = 8; |
||||
for (int64_t i = 0; i < full_chunks + 1; ++i) { |
||||
chunk_buf = 0; |
||||
if (i == full_chunks) { |
||||
chunk_size = size % 8; |
||||
} |
||||
if (col_compression_type_ == kColDict) { |
||||
size_t dict_val; |
||||
ReadVarint64(&src, &dict_val); |
||||
assert(dict_val < dict_vec_.size()); |
||||
chunk_buf = dict_vec_[dict_val]; |
||||
} else { |
||||
memcpy(&chunk_buf, src, chunk_size); |
||||
src += chunk_size; |
||||
} |
||||
memcpy(*dest, reinterpret_cast<char*>(&chunk_buf), 8); |
||||
*dest += 8; |
||||
uint8_t mask = 0xFF - 8 + chunk_size; |
||||
memcpy(*dest, reinterpret_cast<char*>(&mask), 1); |
||||
*dest += 1; |
||||
} |
||||
|
||||
return src - orig_src; |
||||
} |
||||
|
||||
} // namespace rocksdb
|
@ -0,0 +1,117 @@ |
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once |
||||
#include <endian.h> |
||||
#include <cstdio> |
||||
#include <cstring> |
||||
#include <memory> |
||||
#include <string> |
||||
#include <unordered_map> |
||||
#include <vector> |
||||
#include "util/coding.h" |
||||
#include "utilities/col_buf_encoder.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
struct ColDeclaration; |
||||
|
||||
// ColBufDecoder is a class to decode column buffers. It can be populated from a
|
||||
// ColDeclaration. Before starting decoding, a Init() method should be called.
|
||||
// Each time it takes a column value into Decode() method.
|
||||
class ColBufDecoder { |
||||
public: |
||||
virtual ~ColBufDecoder() = 0; |
||||
virtual size_t Init(const char* src) { return 0; } |
||||
virtual size_t Decode(const char* src, char** dest) = 0; |
||||
static ColBufDecoder* NewColBufDecoder(const ColDeclaration& col_declaration); |
||||
|
||||
protected: |
||||
std::string buffer_; |
||||
static inline bool IsRunLength(ColCompressionType type) { |
||||
return type == kColRle || type == kColRleVarint || |
||||
type == kColRleDeltaVarint || type == kColRleDict; |
||||
} |
||||
}; |
||||
|
||||
class FixedLengthColBufDecoder : public ColBufDecoder { |
||||
public: |
||||
explicit FixedLengthColBufDecoder( |
||||
size_t size, ColCompressionType col_compression_type = kColNoCompression, |
||||
bool nullable = false, bool big_endian = false) |
||||
: size_(size), |
||||
col_compression_type_(col_compression_type), |
||||
nullable_(nullable), |
||||
big_endian_(big_endian) {} |
||||
|
||||
size_t Init(const char* src) override; |
||||
size_t Decode(const char* src, char** dest) override; |
||||
~FixedLengthColBufDecoder() {} |
||||
|
||||
private: |
||||
size_t size_; |
||||
ColCompressionType col_compression_type_; |
||||
bool nullable_; |
||||
bool big_endian_; |
||||
|
||||
// for decoding
|
||||
std::vector<uint64_t> dict_vec_; |
||||
uint64_t remain_runs_; |
||||
uint64_t run_val_; |
||||
uint64_t last_val_; |
||||
}; |
||||
|
||||
class LongFixedLengthColBufDecoder : public ColBufDecoder { |
||||
public: |
||||
LongFixedLengthColBufDecoder(size_t size, bool nullable) |
||||
: size_(size), nullable_(nullable) {} |
||||
|
||||
size_t Decode(const char* src, char** dest) override; |
||||
~LongFixedLengthColBufDecoder() {} |
||||
|
||||
private: |
||||
size_t size_; |
||||
bool nullable_; |
||||
}; |
||||
|
||||
class VariableLengthColBufDecoder : public ColBufDecoder { |
||||
public: |
||||
size_t Decode(const char* src, char** dest) override; |
||||
~VariableLengthColBufDecoder() {} |
||||
}; |
||||
|
||||
class VariableChunkColBufDecoder : public VariableLengthColBufDecoder { |
||||
public: |
||||
size_t Init(const char* src) override; |
||||
size_t Decode(const char* src, char** dest) override; |
||||
explicit VariableChunkColBufDecoder(ColCompressionType col_compression_type) |
||||
: col_compression_type_(col_compression_type) {} |
||||
VariableChunkColBufDecoder() : col_compression_type_(kColNoCompression) {} |
||||
|
||||
private: |
||||
ColCompressionType col_compression_type_; |
||||
std::unordered_map<uint64_t, uint64_t> dictionary_; |
||||
std::vector<uint64_t> dict_vec_; |
||||
}; |
||||
|
||||
struct KVPairColBufDecoders { |
||||
std::vector<std::unique_ptr<ColBufDecoder>> key_col_bufs; |
||||
std::vector<std::unique_ptr<ColBufDecoder>> value_col_bufs; |
||||
std::unique_ptr<ColBufDecoder> value_checksum_buf; |
||||
|
||||
explicit KVPairColBufDecoders(const KVPairColDeclarations& kvp_cd) { |
||||
for (auto kcd : *kvp_cd.key_col_declarations) { |
||||
key_col_bufs.emplace_back( |
||||
std::move(ColBufDecoder::NewColBufDecoder(kcd))); |
||||
} |
||||
for (auto vcd : *kvp_cd.value_col_declarations) { |
||||
value_col_bufs.emplace_back( |
||||
std::move(ColBufDecoder::NewColBufDecoder(vcd))); |
||||
} |
||||
value_checksum_buf.reset( |
||||
ColBufDecoder::NewColBufDecoder(*kvp_cd.value_checksum_declaration)); |
||||
} |
||||
}; |
||||
} // namespace rocksdb
|
@ -0,0 +1,213 @@ |
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include "utilities/col_buf_encoder.h" |
||||
#include <cstring> |
||||
#include <string> |
||||
#include "port/port.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
ColBufEncoder::~ColBufEncoder() {} |
||||
|
||||
namespace { |
||||
|
||||
inline uint64_t DecodeFixed64WithEndian(uint64_t val, bool big_endian) { |
||||
if (big_endian && port::kLittleEndian) { |
||||
val = be64toh(val); |
||||
val = htole64(val); |
||||
} else if (!big_endian && !port::kLittleEndian) { |
||||
val = le64toh(val); |
||||
val = htobe64(val); |
||||
} |
||||
return val; |
||||
} |
||||
|
||||
} // namespace
|
||||
|
||||
const std::string &ColBufEncoder::GetData() { return buffer_; } |
||||
|
||||
ColBufEncoder *ColBufEncoder::NewColBufEncoder( |
||||
const ColDeclaration &col_declaration) { |
||||
if (col_declaration.col_type == "FixedLength") { |
||||
return new FixedLengthColBufEncoder( |
||||
col_declaration.size, col_declaration.col_compression_type, |
||||
col_declaration.nullable, col_declaration.big_endian); |
||||
} else if (col_declaration.col_type == "VariableLength") { |
||||
return new VariableLengthColBufEncoder(); |
||||
} else if (col_declaration.col_type == "VariableChunk") { |
||||
return new VariableChunkColBufEncoder(col_declaration.col_compression_type); |
||||
} else if (col_declaration.col_type == "LongFixedLength") { |
||||
return new LongFixedLengthColBufEncoder(col_declaration.size, |
||||
col_declaration.nullable); |
||||
} |
||||
// Unrecognized column type
|
||||
return nullptr; |
||||
} |
||||
|
||||
size_t FixedLengthColBufEncoder::Append(const char *buf) { |
||||
if (nullable_) { |
||||
if (buf == nullptr) { |
||||
buffer_.append(1, 0); |
||||
return 0; |
||||
} else { |
||||
buffer_.append(1, 1); |
||||
} |
||||
} |
||||
uint64_t read_val = 0; |
||||
memcpy(&read_val, buf, size_); |
||||
if (big_endian_) { |
||||
read_val = DecodeFixed64WithEndian(read_val, big_endian_); |
||||
} |
||||
|
||||
// Determine write value
|
||||
uint64_t write_val = read_val; |
||||
if (col_compression_type_ == kColDeltaVarint || |
||||
col_compression_type_ == kColRleDeltaVarint) { |
||||
int64_t delta = read_val - last_val_; |
||||
// Encode signed delta value
|
||||
delta = (delta << 1) ^ (delta >> 63); |
||||
write_val = delta; |
||||
last_val_ = read_val; |
||||
} else if (col_compression_type_ == kColDict || |
||||
col_compression_type_ == kColRleDict) { |
||||
auto iter = dictionary_.find(read_val); |
||||
uint64_t dict_val; |
||||
if (iter == dictionary_.end()) { |
||||
// Add new entry to dictionary
|
||||
dict_val = dictionary_.size(); |
||||
dictionary_.insert(std::make_pair(read_val, dict_val)); |
||||
dict_vec_.push_back(read_val); |
||||
} else { |
||||
dict_val = iter->second; |
||||
} |
||||
write_val = dict_val; |
||||
} |
||||
|
||||
// Write into buffer
|
||||
if (IsRunLength(col_compression_type_)) { |
||||
if (run_length_ == -1) { |
||||
// First element
|
||||
run_val_ = write_val; |
||||
run_length_ = 1; |
||||
} else if (write_val != run_val_) { |
||||
// End of run
|
||||
// Write run value
|
||||
if (col_compression_type_ == kColRle) { |
||||
buffer_.append(reinterpret_cast<char *>(&run_val_), size_); |
||||
} else { |
||||
PutVarint64(&buffer_, run_val_); |
||||
} |
||||
// Write run length
|
||||
PutVarint64(&buffer_, run_length_); |
||||
run_val_ = write_val; |
||||
run_length_ = 1; |
||||
} else { |
||||
run_length_++; |
||||
} |
||||
} else { // non run-length encodings
|
||||
if (col_compression_type_ == kColNoCompression) { |
||||
buffer_.append(reinterpret_cast<char *>(&write_val), size_); |
||||
} else { |
||||
PutVarint64(&buffer_, write_val); |
||||
} |
||||
} |
||||
return size_; |
||||
} |
||||
|
||||
void FixedLengthColBufEncoder::Finish() { |
||||
if (col_compression_type_ == kColDict || |
||||
col_compression_type_ == kColRleDict) { |
||||
std::string header; |
||||
PutVarint64(&header, dict_vec_.size()); |
||||
// Put dictionary in the header
|
||||
for (auto item : dict_vec_) { |
||||
PutVarint64(&header, item); |
||||
} |
||||
buffer_ = header + buffer_; |
||||
} |
||||
if (IsRunLength(col_compression_type_)) { |
||||
// Finish last run value
|
||||
if (col_compression_type_ == kColRle) { |
||||
buffer_.append(reinterpret_cast<char *>(&run_val_), size_); |
||||
} else { |
||||
PutVarint64(&buffer_, run_val_); |
||||
} |
||||
PutVarint64(&buffer_, run_length_); |
||||
} |
||||
} |
||||
|
||||
size_t LongFixedLengthColBufEncoder::Append(const char *buf) { |
||||
if (nullable_) { |
||||
if (buf == nullptr) { |
||||
buffer_.append(1, 0); |
||||
return 0; |
||||
} else { |
||||
buffer_.append(1, 1); |
||||
} |
||||
} |
||||
buffer_.append(buf, size_); |
||||
return size_; |
||||
} |
||||
|
||||
void LongFixedLengthColBufEncoder::Finish() {} |
||||
|
||||
size_t VariableLengthColBufEncoder::Append(const char *buf) { |
||||
uint8_t length = 0; |
||||
length = *buf; |
||||
buffer_.append(buf, 1); |
||||
buf += 1; |
||||
buffer_.append(buf, length); |
||||
return length + 1; |
||||
} |
||||
|
||||
void VariableLengthColBufEncoder::Finish() {} |
||||
|
||||
size_t VariableChunkColBufEncoder::Append(const char *buf) { |
||||
const char *orig_buf = buf; |
||||
uint8_t mark = 0xFF; |
||||
size_t length = 0; |
||||
std::string tmp_buffer; |
||||
while (mark == 0xFF) { |
||||
uint64_t val; |
||||
memcpy(&val, buf, 8); |
||||
buf += 8; |
||||
mark = *buf; |
||||
buf += 1; |
||||
int8_t chunk_size = 8 - (0xFF - mark); |
||||
if (col_compression_type_ == kColDict) { |
||||
auto iter = dictionary_.find(val); |
||||
size_t dict_val; |
||||
if (iter == dictionary_.end()) { |
||||
dict_val = dictionary_.size(); |
||||
dictionary_.insert(std::make_pair(val, dict_val)); |
||||
dict_vec_.push_back(val); |
||||
} else { |
||||
dict_val = iter->second; |
||||
} |
||||
PutVarint64(&tmp_buffer, dict_val); |
||||
} else { |
||||
tmp_buffer.append(reinterpret_cast<char *>(&val), chunk_size); |
||||
} |
||||
length += chunk_size; |
||||
} |
||||
|
||||
PutVarint64(&buffer_, length); |
||||
buffer_.append(tmp_buffer); |
||||
return buf - orig_buf; |
||||
} |
||||
|
||||
void VariableChunkColBufEncoder::Finish() { |
||||
if (col_compression_type_ == kColDict) { |
||||
std::string header; |
||||
PutVarint64(&header, dict_vec_.size()); |
||||
for (auto item : dict_vec_) { |
||||
PutVarint64(&header, item); |
||||
} |
||||
buffer_ = header + buffer_; |
||||
} |
||||
} |
||||
|
||||
} // namespace rocksdb
|
@ -0,0 +1,220 @@ |
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once |
||||
#include <endian.h> |
||||
#include <cstdio> |
||||
#include <cstring> |
||||
#include <memory> |
||||
#include <string> |
||||
#include <unordered_map> |
||||
#include <vector> |
||||
#include "util/coding.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
enum ColCompressionType { |
||||
kColNoCompression, |
||||
kColRle, |
||||
kColVarint, |
||||
kColRleVarint, |
||||
kColDeltaVarint, |
||||
kColRleDeltaVarint, |
||||
kColDict, |
||||
kColRleDict |
||||
}; |
||||
|
||||
struct ColDeclaration; |
||||
|
||||
// ColBufEncoder is a class to encode column buffers. It can be populated from a
|
||||
// ColDeclaration. Each time it takes a column value into Append() method to
|
||||
// encode the column and store it into an internal buffer. After all rows for
|
||||
// this column are consumed, a Finish() should be called to add header and
|
||||
// remaining data.
|
||||
class ColBufEncoder { |
||||
public: |
||||
// Read a column, encode data and append into internal buffer.
|
||||
virtual size_t Append(const char *buf) = 0; |
||||
virtual ~ColBufEncoder() = 0; |
||||
// Get the internal column buffer. Should only be called after Finish().
|
||||
const std::string &GetData(); |
||||
// Finish encoding. Add header and remaining data.
|
||||
virtual void Finish() = 0; |
||||
// Populate a ColBufEncoder from ColDeclaration.
|
||||
static ColBufEncoder *NewColBufEncoder(const ColDeclaration &col_declaration); |
||||
|
||||
protected: |
||||
std::string buffer_; |
||||
static inline bool IsRunLength(ColCompressionType type) { |
||||
return type == kColRle || type == kColRleVarint || |
||||
type == kColRleDeltaVarint || type == kColRleDict; |
||||
} |
||||
}; |
||||
|
||||
// Encoder for fixed length column buffer. In fixed length column buffer, the
|
||||
// size of the column should not exceed 8 bytes.
|
||||
// The following encodings are supported:
|
||||
// Varint: Variable length integer. See util/coding.h for more details
|
||||
// Rle (Run length encoding): encode a sequence of contiguous value as
|
||||
// [run_value][run_length]. Can be combined with Varint
|
||||
// Delta: Encode value to its delta with its adjacent entry. Use varint to
|
||||
// possibly reduce stored bytes. Can be combined with Rle.
|
||||
// Dictionary: Use a dictionary to record all possible values in the block and
|
||||
// encode them with an ID started from 0. IDs are encoded as varint. A column
|
||||
// with dictionary encoding will have a header to store all actual values,
|
||||
// ordered by their dictionary value, and the data will be replaced by
|
||||
// dictionary value. Can be combined with Rle.
|
||||
class FixedLengthColBufEncoder : public ColBufEncoder { |
||||
public: |
||||
explicit FixedLengthColBufEncoder( |
||||
size_t size, ColCompressionType col_compression_type = kColNoCompression, |
||||
bool nullable = false, bool big_endian = false) |
||||
: size_(size), |
||||
col_compression_type_(col_compression_type), |
||||
nullable_(nullable), |
||||
big_endian_(big_endian), |
||||
last_val_(0), |
||||
run_length_(-1), |
||||
run_val_(0) {} |
||||
|
||||
size_t Append(const char *buf) override; |
||||
void Finish() override; |
||||
~FixedLengthColBufEncoder() {} |
||||
|
||||
private: |
||||
size_t size_; |
||||
ColCompressionType col_compression_type_; |
||||
// If set as true, the input value can be null (represented as nullptr). When
|
||||
// nullable is true, use one more byte before actual value to indicate if the
|
||||
// current value is null.
|
||||
bool nullable_; |
||||
// If set as true, input value will be treated as big endian encoded.
|
||||
bool big_endian_; |
||||
|
||||
// for encoding
|
||||
uint64_t last_val_; |
||||
int16_t run_length_; |
||||
uint64_t run_val_; |
||||
// Map to store dictionary for dictionary encoding
|
||||
std::unordered_map<uint64_t, uint64_t> dictionary_; |
||||
// Vector of dictionary keys.
|
||||
std::vector<uint64_t> dict_vec_; |
||||
}; |
||||
|
||||
// Long fixed length column buffer is a variant of fixed length buffer to hold
|
||||
// fixed length buffer with more than 8 bytes. We do not support any special
|
||||
// encoding schemes in LongFixedLengthColBufEncoder.
|
||||
class LongFixedLengthColBufEncoder : public ColBufEncoder { |
||||
public: |
||||
LongFixedLengthColBufEncoder(size_t size, bool nullable) |
||||
: size_(size), nullable_(nullable) {} |
||||
size_t Append(const char *buf) override; |
||||
void Finish() override; |
||||
|
||||
~LongFixedLengthColBufEncoder() {} |
||||
|
||||
private: |
||||
size_t size_; |
||||
bool nullable_; |
||||
}; |
||||
|
||||
// Variable length column buffer holds a format of variable length column. In
|
||||
// this format, a column is composed of one byte length k, followed by data with
|
||||
// k bytes long data.
|
||||
class VariableLengthColBufEncoder : public ColBufEncoder { |
||||
public: |
||||
size_t Append(const char *buf) override; |
||||
void Finish() override; |
||||
|
||||
~VariableLengthColBufEncoder() {} |
||||
}; |
||||
|
||||
// Variable chunk column buffer holds another format of variable length column.
|
||||
// In this format, a column contains multiple chunks of data, each of which is
|
||||
// composed of 8 bytes long data, and one byte as a mask to indicate whether we
|
||||
// have more data to come. If no more data coming, the mask is set as 0xFF. If
|
||||
// the chunk is the last chunk and has only k valid bytes, the mask is set as
|
||||
// 0xFF - (8 - k).
|
||||
class VariableChunkColBufEncoder : public VariableLengthColBufEncoder { |
||||
public: |
||||
size_t Append(const char *buf) override; |
||||
void Finish() override; |
||||
explicit VariableChunkColBufEncoder(ColCompressionType col_compression_type) |
||||
: col_compression_type_(col_compression_type) {} |
||||
VariableChunkColBufEncoder() : col_compression_type_(kColNoCompression) {} |
||||
|
||||
private: |
||||
ColCompressionType col_compression_type_; |
||||
// Map to store dictionary for dictionary encoding
|
||||
std::unordered_map<uint64_t, uint64_t> dictionary_; |
||||
// Vector of dictionary keys.
|
||||
std::vector<uint64_t> dict_vec_; |
||||
}; |
||||
|
||||
// ColDeclaration declares a column's type, algorithm of column-aware encoding,
|
||||
// and other column data like endian and nullability.
|
||||
struct ColDeclaration { |
||||
explicit ColDeclaration( |
||||
std::string _col_type, |
||||
ColCompressionType _col_compression_type = kColNoCompression, |
||||
size_t _size = 0, bool _nullable = false, bool _big_endian = false) |
||||
: col_type(_col_type), |
||||
col_compression_type(_col_compression_type), |
||||
size(_size), |
||||
nullable(_nullable), |
||||
big_endian(_big_endian) {} |
||||
std::string col_type; |
||||
ColCompressionType col_compression_type; |
||||
size_t size; |
||||
bool nullable; |
||||
bool big_endian; |
||||
}; |
||||
|
||||
// KVPairColDeclarations is a class to hold column declaration of columns in
|
||||
// key and value.
|
||||
struct KVPairColDeclarations { |
||||
std::vector<ColDeclaration> *key_col_declarations; |
||||
std::vector<ColDeclaration> *value_col_declarations; |
||||
ColDeclaration *value_checksum_declaration; |
||||
KVPairColDeclarations(std::vector<ColDeclaration> *_key_col_declarations, |
||||
std::vector<ColDeclaration> *_value_col_declarations, |
||||
ColDeclaration *_value_checksum_declaration) |
||||
: key_col_declarations(_key_col_declarations), |
||||
value_col_declarations(_value_col_declarations), |
||||
value_checksum_declaration(_value_checksum_declaration) {} |
||||
}; |
||||
|
||||
// Similar to KVPairDeclarations, KVPairColBufEncoders is used to hold column
|
||||
// buffer encoders of all columns in key and value.
|
||||
struct KVPairColBufEncoders { |
||||
std::vector<std::unique_ptr<ColBufEncoder>> key_col_bufs; |
||||
std::vector<std::unique_ptr<ColBufEncoder>> value_col_bufs; |
||||
std::unique_ptr<ColBufEncoder> value_checksum_buf; |
||||
|
||||
explicit KVPairColBufEncoders(const KVPairColDeclarations &kvp_cd) { |
||||
for (auto kcd : *kvp_cd.key_col_declarations) { |
||||
key_col_bufs.emplace_back( |
||||
std::move(ColBufEncoder::NewColBufEncoder(kcd))); |
||||
} |
||||
for (auto vcd : *kvp_cd.value_col_declarations) { |
||||
value_col_bufs.emplace_back( |
||||
std::move(ColBufEncoder::NewColBufEncoder(vcd))); |
||||
} |
||||
value_checksum_buf.reset( |
||||
ColBufEncoder::NewColBufEncoder(*kvp_cd.value_checksum_declaration)); |
||||
} |
||||
|
||||
// Helper function to call Finish()
|
||||
void Finish() { |
||||
for (auto &col_buf : key_col_bufs) { |
||||
col_buf->Finish(); |
||||
} |
||||
for (auto &col_buf : value_col_bufs) { |
||||
col_buf->Finish(); |
||||
} |
||||
value_checksum_buf->Finish(); |
||||
} |
||||
}; |
||||
} // namespace rocksdb
|
@ -0,0 +1,171 @@ |
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
#include <cstdio> |
||||
|
||||
#ifndef ROCKSDB_LITE |
||||
#ifdef GFLAGS |
||||
|
||||
#include <gflags/gflags.h> |
||||
#include <vector> |
||||
#include "rocksdb/env.h" |
||||
#include "rocksdb/options.h" |
||||
#include "table/block_based_table_builder.h" |
||||
#include "table/block_based_table_reader.h" |
||||
#include "table/format.h" |
||||
#include "tools/sst_dump_tool_imp.h" |
||||
#include "util/compression.h" |
||||
#include "util/stop_watch.h" |
||||
#include "utilities/col_buf_encoder.h" |
||||
#include "utilities/column_aware_encoding_util.h" |
||||
|
||||
using GFLAGS::ParseCommandLineFlags; |
||||
DEFINE_string(encoded_file, "", "file to store encoded data blocks"); |
||||
DEFINE_string(decoded_file, "", |
||||
"file to store decoded data blocks after encoding"); |
||||
DEFINE_string(format, "col", "Output Format. Can be 'row' or 'col'"); |
||||
// TODO(jhli): option `col` should be removed and replaced by general
|
||||
// column specifications.
|
||||
DEFINE_string(index_type, "col", "Index type. Can be 'primary' or 'secondary'"); |
||||
DEFINE_string(dump_file, "", |
||||
"Dump data blocks separated by columns in human-readable format"); |
||||
DEFINE_bool(decode, false, "Deocde blocks after they are encoded"); |
||||
DEFINE_bool(stat, false, |
||||
"Print column distribution statistics. Cannot decode in this mode"); |
||||
DEFINE_string(compression_type, "kNoCompression", |
||||
"The compression algorithm used to compress data blocks"); |
||||
|
||||
namespace rocksdb { |
||||
|
||||
class ColumnAwareEncodingExp { |
||||
public: |
||||
static void Run(const std::string& sst_file) { |
||||
bool decode = FLAGS_decode; |
||||
if (FLAGS_decoded_file.size() > 0) { |
||||
decode = true; |
||||
} |
||||
if (FLAGS_stat) { |
||||
decode = false; |
||||
} |
||||
|
||||
ColumnAwareEncodingReader reader(sst_file); |
||||
std::vector<ColDeclaration>* key_col_declarations; |
||||
std::vector<ColDeclaration>* value_col_declarations; |
||||
ColDeclaration* value_checksum_declaration; |
||||
if (FLAGS_index_type == "primary") { |
||||
ColumnAwareEncodingReader::GetColDeclarationsPrimary( |
||||
&key_col_declarations, &value_col_declarations, |
||||
&value_checksum_declaration); |
||||
} else { |
||||
ColumnAwareEncodingReader::GetColDeclarationsSecondary( |
||||
&key_col_declarations, &value_col_declarations, |
||||
&value_checksum_declaration); |
||||
} |
||||
KVPairColDeclarations kvp_cd(key_col_declarations, value_col_declarations, |
||||
value_checksum_declaration); |
||||
|
||||
if (!FLAGS_dump_file.empty()) { |
||||
std::vector<KVPairBlock> kv_pair_blocks; |
||||
reader.GetKVPairsFromDataBlocks(&kv_pair_blocks); |
||||
reader.DumpDataColumns(FLAGS_dump_file, kvp_cd, kv_pair_blocks); |
||||
return; |
||||
} |
||||
std::unordered_map<std::string, CompressionType> compressions = { |
||||
{"kNoCompression", CompressionType::kNoCompression}, |
||||
{"kZlibCompression", CompressionType::kZlibCompression}, |
||||
{"kZSTDNotFinalCompression", |
||||
CompressionType::kZSTDNotFinalCompression}}; |
||||
|
||||
// Find Compression
|
||||
CompressionType compression_type = compressions[FLAGS_compression_type]; |
||||
EnvOptions env_options; |
||||
if (CompressionTypeSupported(compression_type)) { |
||||
fprintf(stdout, "[%s]\n", FLAGS_compression_type.c_str()); |
||||
unique_ptr<WritableFile> encoded_out_file; |
||||
|
||||
std::unique_ptr<Env> env(NewMemEnv(Env::Default())); |
||||
if (!FLAGS_encoded_file.empty()) { |
||||
env->NewWritableFile(FLAGS_encoded_file, &encoded_out_file, |
||||
env_options); |
||||
} |
||||
|
||||
std::vector<KVPairBlock> kv_pair_blocks; |
||||
reader.GetKVPairsFromDataBlocks(&kv_pair_blocks); |
||||
|
||||
std::vector<std::string> encoded_blocks; |
||||
StopWatchNano sw(env.get(), true); |
||||
if (FLAGS_format == "col") { |
||||
reader.EncodeBlocks(kvp_cd, encoded_out_file.get(), compression_type, |
||||
kv_pair_blocks, &encoded_blocks, FLAGS_stat); |
||||
} else { // row format
|
||||
reader.EncodeBlocksToRowFormat(encoded_out_file.get(), compression_type, |
||||
kv_pair_blocks, &encoded_blocks); |
||||
} |
||||
if (encoded_out_file != nullptr) { |
||||
uint64_t size = 0; |
||||
env->GetFileSize(FLAGS_encoded_file, &size); |
||||
fprintf(stdout, "File size: %lu\n", size); |
||||
} |
||||
uint64_t encode_time = sw.ElapsedNanosSafe(false /* reset */); |
||||
fprintf(stdout, "Encode time:%lu\n", encode_time); |
||||
if (decode) { |
||||
unique_ptr<WritableFile> decoded_out_file; |
||||
if (!FLAGS_decoded_file.empty()) { |
||||
env->NewWritableFile(FLAGS_decoded_file, &decoded_out_file, |
||||
env_options); |
||||
} |
||||
sw.Start(); |
||||
if (FLAGS_format == "col") { |
||||
reader.DecodeBlocks(kvp_cd, decoded_out_file.get(), &encoded_blocks); |
||||
} else { |
||||
reader.DecodeBlocksFromRowFormat(decoded_out_file.get(), |
||||
&encoded_blocks); |
||||
} |
||||
uint64_t decode_time = sw.ElapsedNanosSafe(true /* reset */); |
||||
fprintf(stdout, "Decode time:%lu\n", decode_time); |
||||
} |
||||
} else { |
||||
fprintf(stdout, "Unsupported compression type: %s.\n", |
||||
FLAGS_compression_type.c_str()); |
||||
} |
||||
delete key_col_declarations; |
||||
delete value_col_declarations; |
||||
delete value_checksum_declaration; |
||||
} |
||||
}; |
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) { |
||||
int arg_idx = ParseCommandLineFlags(&argc, &argv, true); |
||||
if (arg_idx >= argc) { |
||||
fprintf(stdout, "SST filename required.\n"); |
||||
exit(1); |
||||
} |
||||
std::string sst_file(argv[arg_idx]); |
||||
if (FLAGS_format != "row" && FLAGS_format != "col") { |
||||
fprintf(stderr, "Format must be 'row' or 'col'\n"); |
||||
exit(1); |
||||
} |
||||
if (FLAGS_index_type != "primary" && FLAGS_index_type != "secondary") { |
||||
fprintf(stderr, "Format must be 'primary' or 'secondary'\n"); |
||||
exit(1); |
||||
} |
||||
rocksdb::ColumnAwareEncodingExp::Run(sst_file); |
||||
return 0; |
||||
} |
||||
|
||||
#else |
||||
int main() { |
||||
fprintf(stderr, "Please install gflags to run rocksdb tools\n"); |
||||
return 1; |
||||
} |
||||
#endif // GFLAGS
|
||||
#else |
||||
int main(int argc, char** argv) { |
||||
fprintf(stderr, "Not supported in lite mode.\n"); |
||||
return 1; |
||||
} |
||||
#endif // ROCKSDB_LITE
|
@ -0,0 +1,193 @@ |
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
#ifndef ROCKSDB_LITE |
||||
|
||||
#include <vector> |
||||
#include "util/testharness.h" |
||||
#include "util/testutil.h" |
||||
#include "utilities/col_buf_decoder.h" |
||||
#include "utilities/col_buf_encoder.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
class ColumnAwareEncodingTest : public testing::Test { |
||||
public: |
||||
ColumnAwareEncodingTest() {} |
||||
|
||||
~ColumnAwareEncodingTest() {} |
||||
}; |
||||
|
||||
TEST_F(ColumnAwareEncodingTest, NoCompressionEncodeDecode) { |
||||
std::unique_ptr<ColBufEncoder> col_buf_encoder( |
||||
new FixedLengthColBufEncoder(8, kColNoCompression, false, true)); |
||||
std::string str_buf; |
||||
uint64_t val = 0x0102030405060708; |
||||
for (int i = 0; i < 4; ++i) { |
||||
str_buf.append(reinterpret_cast<char*>(&val), 8); |
||||
} |
||||
const char* str_buf_ptr = str_buf.c_str(); |
||||
for (int i = 0; i < 4; ++i) { |
||||
col_buf_encoder->Append(str_buf_ptr); |
||||
} |
||||
col_buf_encoder->Finish(); |
||||
const std::string& encoded_data = col_buf_encoder->GetData(); |
||||
ASSERT_EQ(encoded_data.size(), 32); |
||||
|
||||
const char* encoded_data_ptr = encoded_data.c_str(); |
||||
uint64_t encoded_val; |
||||
for (int i = 0; i < 4; ++i) { |
||||
memcpy(&encoded_val, encoded_data_ptr, 8); |
||||
ASSERT_EQ(encoded_val, 0x0807060504030201); |
||||
encoded_data_ptr += 8; |
||||
} |
||||
|
||||
std::unique_ptr<ColBufDecoder> col_buf_decoder( |
||||
new FixedLengthColBufDecoder(8, kColNoCompression, false, true)); |
||||
encoded_data_ptr = encoded_data.c_str(); |
||||
encoded_data_ptr += col_buf_decoder->Init(encoded_data_ptr); |
||||
char* decoded_data = new char[100]; |
||||
char* decoded_data_base = decoded_data; |
||||
for (int i = 0; i < 4; ++i) { |
||||
encoded_data_ptr += |
||||
col_buf_decoder->Decode(encoded_data_ptr, &decoded_data); |
||||
} |
||||
|
||||
ASSERT_EQ(4 * 8, decoded_data - decoded_data_base); |
||||
decoded_data = decoded_data_base; |
||||
for (int i = 0; i < 4; ++i) { |
||||
uint64_t decoded_val; |
||||
decoded_val = 0; |
||||
memcpy(&decoded_val, decoded_data, 8); |
||||
ASSERT_EQ(decoded_val, val); |
||||
decoded_data += 8; |
||||
} |
||||
delete[] decoded_data_base; |
||||
} |
||||
|
||||
TEST_F(ColumnAwareEncodingTest, RleEncodeDecode) { |
||||
std::unique_ptr<ColBufEncoder> col_buf_encoder( |
||||
new FixedLengthColBufEncoder(8, kColRle, false, true)); |
||||
std::string str_buf; |
||||
uint64_t val = 0x0102030405060708; |
||||
for (int i = 0; i < 4; ++i) { |
||||
str_buf.append(reinterpret_cast<char*>(&val), 8); |
||||
} |
||||
const char* str_buf_ptr = str_buf.c_str(); |
||||
for (int i = 0; i < 4; ++i) { |
||||
str_buf_ptr += col_buf_encoder->Append(str_buf_ptr); |
||||
} |
||||
col_buf_encoder->Finish(); |
||||
const std::string& encoded_data = col_buf_encoder->GetData(); |
||||
ASSERT_EQ(encoded_data.size(), 9); |
||||
|
||||
const char* encoded_data_ptr = encoded_data.c_str(); |
||||
uint64_t encoded_val; |
||||
memcpy(&encoded_val, encoded_data_ptr, 8); |
||||
ASSERT_EQ(encoded_val, 0x0807060504030201); |
||||
|
||||
std::unique_ptr<ColBufDecoder> col_buf_decoder( |
||||
new FixedLengthColBufDecoder(8, kColRle, false, true)); |
||||
char* decoded_data = new char[100]; |
||||
char* decoded_data_base = decoded_data; |
||||
encoded_data_ptr += col_buf_decoder->Init(encoded_data_ptr); |
||||
for (int i = 0; i < 4; ++i) { |
||||
encoded_data_ptr += |
||||
col_buf_decoder->Decode(encoded_data_ptr, &decoded_data); |
||||
} |
||||
ASSERT_EQ(4 * 8, decoded_data - decoded_data_base); |
||||
decoded_data = decoded_data_base; |
||||
for (int i = 0; i < 4; ++i) { |
||||
uint64_t decoded_val; |
||||
decoded_val = 0; |
||||
memcpy(&decoded_val, decoded_data, 8); |
||||
ASSERT_EQ(decoded_val, val); |
||||
decoded_data += 8; |
||||
} |
||||
delete[] decoded_data_base; |
||||
} |
||||
|
||||
TEST_F(ColumnAwareEncodingTest, DeltaEncodeDecode) { |
||||
std::unique_ptr<ColBufEncoder> col_buf_encoder( |
||||
new FixedLengthColBufEncoder(8, kColDeltaVarint, false, true)); |
||||
std::string str_buf; |
||||
uint64_t val = 0x0102030405060708; |
||||
uint64_t val2 = 0x0202030405060708; |
||||
const char* str_buf_ptr; |
||||
for (int i = 0; i < 2; ++i) { |
||||
str_buf = std::string(reinterpret_cast<char*>(&val), 8); |
||||
str_buf_ptr = str_buf.c_str(); |
||||
col_buf_encoder->Append(str_buf_ptr); |
||||
|
||||
str_buf = std::string(reinterpret_cast<char*>(&val2), 8); |
||||
str_buf_ptr = str_buf.c_str(); |
||||
col_buf_encoder->Append(str_buf_ptr); |
||||
} |
||||
col_buf_encoder->Finish(); |
||||
const std::string& encoded_data = col_buf_encoder->GetData(); |
||||
ASSERT_EQ(encoded_data.size(), 9 + 3); |
||||
|
||||
std::unique_ptr<ColBufDecoder> col_buf_decoder( |
||||
new FixedLengthColBufDecoder(8, kColDeltaVarint, false, true)); |
||||
char* decoded_data = new char[100]; |
||||
char* decoded_data_base = decoded_data; |
||||
const char* encoded_data_ptr = encoded_data.c_str(); |
||||
encoded_data_ptr += col_buf_decoder->Init(encoded_data_ptr); |
||||
for (int i = 0; i < 4; ++i) { |
||||
encoded_data_ptr += |
||||
col_buf_decoder->Decode(encoded_data_ptr, &decoded_data); |
||||
} |
||||
ASSERT_EQ(4 * 8, decoded_data - decoded_data_base); |
||||
decoded_data = decoded_data_base; |
||||
for (int i = 0; i < 2; ++i) { |
||||
uint64_t decoded_val; |
||||
memcpy(&decoded_val, decoded_data, 8); |
||||
ASSERT_EQ(decoded_val, val); |
||||
decoded_data += 8; |
||||
memcpy(&decoded_val, decoded_data, 8); |
||||
ASSERT_EQ(decoded_val, val2); |
||||
decoded_data += 8; |
||||
} |
||||
delete[] decoded_data_base; |
||||
} |
||||
|
||||
TEST_F(ColumnAwareEncodingTest, ChunkBufEncodeDecode) { |
||||
std::unique_ptr<ColBufEncoder> col_buf_encoder( |
||||
new VariableChunkColBufEncoder(kColDict)); |
||||
std::string buf("12345678\377\1\0\0\0\0\0\0\0\376", 18); |
||||
col_buf_encoder->Append(buf.c_str()); |
||||
col_buf_encoder->Finish(); |
||||
const std::string& encoded_data = col_buf_encoder->GetData(); |
||||
const char* str_ptr = encoded_data.c_str(); |
||||
|
||||
std::unique_ptr<ColBufDecoder> col_buf_decoder( |
||||
new VariableChunkColBufDecoder(kColDict)); |
||||
str_ptr += col_buf_decoder->Init(str_ptr); |
||||
char* decoded_data = new char[100]; |
||||
char* decoded_data_base = decoded_data; |
||||
col_buf_decoder->Decode(str_ptr, &decoded_data); |
||||
for (size_t i = 0; i < buf.size(); ++i) { |
||||
ASSERT_EQ(buf[i], decoded_data_base[i]); |
||||
} |
||||
delete[] decoded_data_base; |
||||
} |
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) { |
||||
::testing::InitGoogleTest(&argc, argv); |
||||
return RUN_ALL_TESTS(); |
||||
} |
||||
|
||||
#else |
||||
|
||||
#include <cstdio> |
||||
|
||||
int main() { |
||||
fprintf(stderr, |
||||
"SKIPPED as column aware encoding experiment is not enabled in " |
||||
"ROCKSDB_LITE\n"); |
||||
} |
||||
#endif // ROCKSDB_LITE
|
@ -0,0 +1,490 @@ |
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
#ifndef ROCKSDB_LITE |
||||
|
||||
#include "utilities/column_aware_encoding_util.h" |
||||
|
||||
#ifndef __STDC_FORMAT_MACROS |
||||
#define __STDC_FORMAT_MACROS |
||||
#endif |
||||
|
||||
#include <stddef.h> |
||||
#include <stdint.h> |
||||
#include <stdio.h> |
||||
#include <algorithm> |
||||
#include <utility> |
||||
#include <vector> |
||||
#include "include/rocksdb/comparator.h" |
||||
#include "include/rocksdb/slice.h" |
||||
#include "rocksdb/env.h" |
||||
#include "rocksdb/status.h" |
||||
#include "table/block_based_table_builder.h" |
||||
#include "table/block_based_table_factory.h" |
||||
#include "table/format.h" |
||||
#include "table/table_reader.h" |
||||
#include "util/coding.h" |
||||
#include "utilities/col_buf_decoder.h" |
||||
#include "utilities/col_buf_encoder.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
ColumnAwareEncodingReader::ColumnAwareEncodingReader( |
||||
const std::string& file_path) |
||||
: file_name_(file_path), |
||||
ioptions_(options_), |
||||
internal_comparator_(BytewiseComparator()) { |
||||
InitTableReader(file_name_); |
||||
} |
||||
|
||||
void ColumnAwareEncodingReader::InitTableReader(const std::string& file_path) { |
||||
std::unique_ptr<RandomAccessFile> file; |
||||
uint64_t file_size; |
||||
options_.env->NewRandomAccessFile(file_path, &file, soptions_); |
||||
options_.env->GetFileSize(file_path, &file_size); |
||||
|
||||
file_.reset(new RandomAccessFileReader(std::move(file))); |
||||
|
||||
options_.comparator = &internal_comparator_; |
||||
options_.table_factory = std::make_shared<BlockBasedTableFactory>(); |
||||
shared_ptr<BlockBasedTableFactory> block_table_factory = |
||||
std::dynamic_pointer_cast<BlockBasedTableFactory>(options_.table_factory); |
||||
|
||||
std::unique_ptr<TableReader> table_reader; |
||||
block_table_factory->NewTableReader( |
||||
TableReaderOptions(ioptions_, soptions_, internal_comparator_, |
||||
/*skip_filters=*/false), |
||||
std::move(file_), file_size, &table_reader, /*enable_prefetch=*/false); |
||||
|
||||
table_reader_.reset(dynamic_cast<BlockBasedTable*>(table_reader.release())); |
||||
} |
||||
|
||||
void ColumnAwareEncodingReader::GetKVPairsFromDataBlocks( |
||||
std::vector<KVPairBlock>* kv_pair_blocks) { |
||||
table_reader_->GetKVPairsFromDataBlocks(kv_pair_blocks); |
||||
} |
||||
|
||||
void ColumnAwareEncodingReader::DecodeBlocks( |
||||
const KVPairColDeclarations& kvp_col_declarations, WritableFile* out_file, |
||||
const std::vector<std::string>* blocks) { |
||||
char* decoded_content_base = new char[16384]; |
||||
Options options; |
||||
ImmutableCFOptions ioptions(options); |
||||
for (auto& block : *blocks) { |
||||
KVPairColBufDecoders kvp_col_bufs(kvp_col_declarations); |
||||
auto& key_col_bufs = kvp_col_bufs.key_col_bufs; |
||||
auto& value_col_bufs = kvp_col_bufs.value_col_bufs; |
||||
auto& value_checksum_buf = kvp_col_bufs.value_checksum_buf; |
||||
|
||||
auto& slice_final_with_bit = block; |
||||
uint32_t format_version = 2; |
||||
Slice compression_dict; |
||||
BlockContents contents; |
||||
const char* content_ptr; |
||||
|
||||
CompressionType type = |
||||
(CompressionType)slice_final_with_bit[slice_final_with_bit.size() - 1]; |
||||
if (type != kNoCompression) { |
||||
UncompressBlockContents(slice_final_with_bit.c_str(), |
||||
slice_final_with_bit.size() - 1, &contents, |
||||
format_version, compression_dict, ioptions); |
||||
content_ptr = contents.data.data(); |
||||
} else { |
||||
content_ptr = slice_final_with_bit.data(); |
||||
} |
||||
|
||||
size_t num_kv_pairs; |
||||
const char* header_content_ptr = content_ptr; |
||||
num_kv_pairs = DecodeFixed64(header_content_ptr); |
||||
|
||||
header_content_ptr += sizeof(size_t); |
||||
size_t num_key_columns = key_col_bufs.size(); |
||||
size_t num_value_columns = value_col_bufs.size(); |
||||
std::vector<const char*> key_content_ptr(num_key_columns); |
||||
std::vector<const char*> value_content_ptr(num_value_columns); |
||||
const char* checksum_content_ptr; |
||||
|
||||
size_t num_columns = num_key_columns + num_value_columns; |
||||
const char* col_content_ptr = |
||||
header_content_ptr + sizeof(size_t) * num_columns; |
||||
|
||||
// Read headers
|
||||
for (size_t i = 0; i < num_key_columns; ++i) { |
||||
key_content_ptr[i] = col_content_ptr; |
||||
key_content_ptr[i] += key_col_bufs[i]->Init(key_content_ptr[i]); |
||||
size_t offset; |
||||
offset = DecodeFixed64(header_content_ptr); |
||||
header_content_ptr += sizeof(size_t); |
||||
col_content_ptr += offset; |
||||
} |
||||
for (size_t i = 0; i < num_value_columns; ++i) { |
||||
value_content_ptr[i] = col_content_ptr; |
||||
value_content_ptr[i] += value_col_bufs[i]->Init(value_content_ptr[i]); |
||||
size_t offset; |
||||
offset = DecodeFixed64(header_content_ptr); |
||||
header_content_ptr += sizeof(size_t); |
||||
col_content_ptr += offset; |
||||
} |
||||
checksum_content_ptr = col_content_ptr; |
||||
checksum_content_ptr += value_checksum_buf->Init(checksum_content_ptr); |
||||
|
||||
// Decode block
|
||||
char* decoded_content = decoded_content_base; |
||||
for (size_t j = 0; j < num_kv_pairs; ++j) { |
||||
for (size_t i = 0; i < num_key_columns; ++i) { |
||||
key_content_ptr[i] += |
||||
key_col_bufs[i]->Decode(key_content_ptr[i], &decoded_content); |
||||
} |
||||
for (size_t i = 0; i < num_value_columns; ++i) { |
||||
value_content_ptr[i] += |
||||
value_col_bufs[i]->Decode(value_content_ptr[i], &decoded_content); |
||||
} |
||||
checksum_content_ptr += |
||||
value_checksum_buf->Decode(checksum_content_ptr, &decoded_content); |
||||
} |
||||
|
||||
size_t offset = decoded_content - decoded_content_base; |
||||
Slice output_content(decoded_content, offset); |
||||
|
||||
if (out_file != nullptr) { |
||||
out_file->Append(output_content); |
||||
} |
||||
} |
||||
delete[] decoded_content_base; |
||||
} |
||||
|
||||
void ColumnAwareEncodingReader::DecodeBlocksFromRowFormat( |
||||
WritableFile* out_file, const std::vector<std::string>* blocks) { |
||||
Options options; |
||||
ImmutableCFOptions ioptions(options); |
||||
for (auto& block : *blocks) { |
||||
auto& slice_final_with_bit = block; |
||||
uint32_t format_version = 2; |
||||
Slice compression_dict; |
||||
BlockContents contents; |
||||
std::string decoded_content; |
||||
|
||||
CompressionType type = |
||||
(CompressionType)slice_final_with_bit[slice_final_with_bit.size() - 1]; |
||||
if (type != kNoCompression) { |
||||
UncompressBlockContents(slice_final_with_bit.c_str(), |
||||
slice_final_with_bit.size() - 1, &contents, |
||||
format_version, compression_dict, ioptions); |
||||
decoded_content = std::string(contents.data.data(), contents.data.size()); |
||||
} else { |
||||
decoded_content = std::move(slice_final_with_bit); |
||||
} |
||||
|
||||
if (out_file != nullptr) { |
||||
out_file->Append(decoded_content); |
||||
} |
||||
} |
||||
} |
||||
|
||||
void ColumnAwareEncodingReader::DumpDataColumns( |
||||
const std::string& filename, |
||||
const KVPairColDeclarations& kvp_col_declarations, |
||||
const std::vector<KVPairBlock>& kv_pair_blocks) { |
||||
KVPairColBufEncoders kvp_col_bufs(kvp_col_declarations); |
||||
auto& key_col_bufs = kvp_col_bufs.key_col_bufs; |
||||
auto& value_col_bufs = kvp_col_bufs.value_col_bufs; |
||||
auto& value_checksum_buf = kvp_col_bufs.value_checksum_buf; |
||||
|
||||
FILE* fp = fopen(filename.c_str(), "w"); |
||||
size_t block_id = 1; |
||||
for (auto& kv_pairs : kv_pair_blocks) { |
||||
fprintf(fp, "---------------- Block: %-4lu ----------------\n", block_id); |
||||
for (auto& kv_pair : kv_pairs) { |
||||
const auto& key = kv_pair.first; |
||||
const auto& value = kv_pair.second; |
||||
size_t value_offset = 0; |
||||
|
||||
const char* key_ptr = key.data(); |
||||
for (auto& buf : key_col_bufs) { |
||||
size_t col_size = buf->Append(key_ptr); |
||||
std::string tmp_buf(key_ptr, col_size); |
||||
Slice col(tmp_buf); |
||||
fprintf(fp, "%s ", col.ToString(true).c_str()); |
||||
key_ptr += col_size; |
||||
} |
||||
fprintf(fp, "|"); |
||||
|
||||
const char* value_ptr = value.data(); |
||||
for (auto& buf : value_col_bufs) { |
||||
size_t col_size = buf->Append(value_ptr); |
||||
std::string tmp_buf(value_ptr, col_size); |
||||
Slice col(tmp_buf); |
||||
fprintf(fp, " %s", col.ToString(true).c_str()); |
||||
value_ptr += col_size; |
||||
value_offset += col_size; |
||||
} |
||||
|
||||
if (value_offset < value.size()) { |
||||
size_t col_size = value_checksum_buf->Append(value_ptr); |
||||
std::string tmp_buf(value_ptr, col_size); |
||||
Slice col(tmp_buf); |
||||
fprintf(fp, "|%s", col.ToString(true).c_str()); |
||||
} else { |
||||
value_checksum_buf->Append(nullptr); |
||||
} |
||||
fprintf(fp, "\n"); |
||||
} |
||||
block_id++; |
||||
} |
||||
fclose(fp); |
||||
} |
||||
|
||||
namespace { |
||||
|
||||
void CompressDataBlock(const std::string& output_content, Slice* slice_final, |
||||
CompressionType* type, std::string* compressed_output) { |
||||
CompressionOptions compression_opts; |
||||
uint32_t format_version = 2; // hard-coded version
|
||||
Slice compression_dict; |
||||
*slice_final = |
||||
CompressBlock(output_content, compression_opts, type, format_version, |
||||
compression_dict, compressed_output); |
||||
} |
||||
|
||||
} // namespace
|
||||
|
||||
void ColumnAwareEncodingReader::EncodeBlocksToRowFormat( |
||||
WritableFile* out_file, CompressionType compression_type, |
||||
const std::vector<KVPairBlock>& kv_pair_blocks, |
||||
std::vector<std::string>* blocks) { |
||||
std::string output_content; |
||||
for (auto& kv_pairs : kv_pair_blocks) { |
||||
output_content.clear(); |
||||
std::string last_key; |
||||
size_t counter = 0; |
||||
const size_t block_restart_interval = 16; |
||||
for (auto& kv_pair : kv_pairs) { |
||||
const auto& key = kv_pair.first; |
||||
const auto& value = kv_pair.second; |
||||
|
||||
Slice last_key_piece(last_key); |
||||
size_t shared = 0; |
||||
if (counter >= block_restart_interval) { |
||||
counter = 0; |
||||
} else { |
||||
const size_t min_length = std::min(last_key_piece.size(), key.size()); |
||||
while ((shared < min_length) && last_key_piece[shared] == key[shared]) { |
||||
shared++; |
||||
} |
||||
} |
||||
const size_t non_shared = key.size() - shared; |
||||
output_content.append(key.c_str() + shared, non_shared); |
||||
output_content.append(value); |
||||
|
||||
last_key.resize(shared); |
||||
last_key.append(key.data() + shared, non_shared); |
||||
counter++; |
||||
} |
||||
Slice slice_final; |
||||
auto type = compression_type; |
||||
std::string compressed_output; |
||||
CompressDataBlock(output_content, &slice_final, &type, &compressed_output); |
||||
|
||||
if (out_file != nullptr) { |
||||
out_file->Append(slice_final); |
||||
} |
||||
|
||||
// Add a bit in the end for decoding
|
||||
std::string slice_final_with_bit(slice_final.data(), slice_final.size()); |
||||
slice_final_with_bit.append(reinterpret_cast<char*>(&type), 1); |
||||
blocks->push_back( |
||||
std::string(slice_final_with_bit.data(), slice_final_with_bit.size())); |
||||
} |
||||
} |
||||
|
||||
Status ColumnAwareEncodingReader::EncodeBlocks( |
||||
const KVPairColDeclarations& kvp_col_declarations, WritableFile* out_file, |
||||
CompressionType compression_type, |
||||
const std::vector<KVPairBlock>& kv_pair_blocks, |
||||
std::vector<std::string>* blocks, bool print_column_stat) { |
||||
std::vector<size_t> key_col_sizes( |
||||
kvp_col_declarations.key_col_declarations->size(), 0); |
||||
std::vector<size_t> value_col_sizes( |
||||
kvp_col_declarations.value_col_declarations->size(), 0); |
||||
size_t value_checksum_size = 0; |
||||
|
||||
for (auto& kv_pairs : kv_pair_blocks) { |
||||
KVPairColBufEncoders kvp_col_bufs(kvp_col_declarations); |
||||
auto& key_col_bufs = kvp_col_bufs.key_col_bufs; |
||||
auto& value_col_bufs = kvp_col_bufs.value_col_bufs; |
||||
auto& value_checksum_buf = kvp_col_bufs.value_checksum_buf; |
||||
|
||||
size_t num_kv_pairs = 0; |
||||
for (auto& kv_pair : kv_pairs) { |
||||
const auto& key = kv_pair.first; |
||||
const auto& value = kv_pair.second; |
||||
size_t value_offset = 0; |
||||
num_kv_pairs++; |
||||
|
||||
const char* key_ptr = key.data(); |
||||
for (auto& buf : key_col_bufs) { |
||||
size_t col_size = buf->Append(key_ptr); |
||||
key_ptr += col_size; |
||||
} |
||||
|
||||
const char* value_ptr = value.data(); |
||||
for (auto& buf : value_col_bufs) { |
||||
size_t col_size = buf->Append(value_ptr); |
||||
value_ptr += col_size; |
||||
value_offset += col_size; |
||||
} |
||||
|
||||
if (value_offset < value.size()) { |
||||
value_checksum_buf->Append(value_ptr); |
||||
} else { |
||||
value_checksum_buf->Append(nullptr); |
||||
} |
||||
} |
||||
|
||||
kvp_col_bufs.Finish(); |
||||
// Get stats
|
||||
// Compress and write a block
|
||||
if (print_column_stat) { |
||||
for (size_t i = 0; i < key_col_bufs.size(); ++i) { |
||||
Slice slice_final; |
||||
auto type = compression_type; |
||||
std::string compressed_output; |
||||
CompressDataBlock(key_col_bufs[i]->GetData(), &slice_final, &type, |
||||
&compressed_output); |
||||
out_file->Append(slice_final); |
||||
key_col_sizes[i] += slice_final.size(); |
||||
} |
||||
for (size_t i = 0; i < value_col_bufs.size(); ++i) { |
||||
Slice slice_final; |
||||
auto type = compression_type; |
||||
std::string compressed_output; |
||||
CompressDataBlock(value_col_bufs[i]->GetData(), &slice_final, &type, |
||||
&compressed_output); |
||||
out_file->Append(slice_final); |
||||
value_col_sizes[i] += slice_final.size(); |
||||
} |
||||
Slice slice_final; |
||||
auto type = compression_type; |
||||
std::string compressed_output; |
||||
CompressDataBlock(value_checksum_buf->GetData(), &slice_final, &type, |
||||
&compressed_output); |
||||
out_file->Append(slice_final); |
||||
value_checksum_size += slice_final.size(); |
||||
} else { |
||||
std::string output_content; |
||||
// Write column sizes
|
||||
PutFixed64(&output_content, num_kv_pairs); |
||||
for (auto& buf : key_col_bufs) { |
||||
size_t size = buf->GetData().size(); |
||||
PutFixed64(&output_content, size); |
||||
} |
||||
for (auto& buf : value_col_bufs) { |
||||
size_t size = buf->GetData().size(); |
||||
PutFixed64(&output_content, size); |
||||
} |
||||
// Write data
|
||||
for (auto& buf : key_col_bufs) { |
||||
output_content.append(buf->GetData()); |
||||
} |
||||
for (auto& buf : value_col_bufs) { |
||||
output_content.append(buf->GetData()); |
||||
} |
||||
output_content.append(value_checksum_buf->GetData()); |
||||
|
||||
Slice slice_final; |
||||
auto type = compression_type; |
||||
std::string compressed_output; |
||||
CompressDataBlock(output_content, &slice_final, &type, |
||||
&compressed_output); |
||||
|
||||
if (out_file != nullptr) { |
||||
out_file->Append(slice_final); |
||||
} |
||||
|
||||
// Add a bit in the end for decoding
|
||||
std::string slice_final_with_bit(slice_final.data(), |
||||
slice_final.size() + 1); |
||||
slice_final_with_bit[slice_final.size()] = static_cast<char>(type); |
||||
blocks->push_back(std::string(slice_final_with_bit.data(), |
||||
slice_final_with_bit.size())); |
||||
} |
||||
} |
||||
|
||||
if (print_column_stat) { |
||||
size_t total_size = 0; |
||||
for (size_t i = 0; i < key_col_sizes.size(); ++i) |
||||
total_size += key_col_sizes[i]; |
||||
for (size_t i = 0; i < value_col_sizes.size(); ++i) |
||||
total_size += value_col_sizes[i]; |
||||
total_size += value_checksum_size; |
||||
|
||||
for (size_t i = 0; i < key_col_sizes.size(); ++i) |
||||
printf("Key col %lu size: %lu percentage %lf%%\n", i, key_col_sizes[i], |
||||
100.0 * key_col_sizes[i] / total_size); |
||||
for (size_t i = 0; i < value_col_sizes.size(); ++i) |
||||
printf("Value col %lu size: %lu percentage %lf%%\n", i, |
||||
value_col_sizes[i], 100.0 * value_col_sizes[i] / total_size); |
||||
printf("Value checksum size: %lu percentage %lf%%\n", value_checksum_size, |
||||
100.0 * value_checksum_size / total_size); |
||||
} |
||||
return Status::OK(); |
||||
} |
||||
|
||||
void ColumnAwareEncodingReader::GetColDeclarationsPrimary( |
||||
std::vector<ColDeclaration>** key_col_declarations, |
||||
std::vector<ColDeclaration>** value_col_declarations, |
||||
ColDeclaration** value_checksum_declaration) { |
||||
*key_col_declarations = new std::vector<ColDeclaration>{ |
||||
ColDeclaration("FixedLength", ColCompressionType::kColRleVarint, 4, false, |
||||
true), |
||||
ColDeclaration("FixedLength", ColCompressionType::kColRleDeltaVarint, 8, |
||||
false, true), |
||||
ColDeclaration("FixedLength", ColCompressionType::kColDeltaVarint, 8, |
||||
false, true), |
||||
ColDeclaration("FixedLength", ColCompressionType::kColDeltaVarint, 8, |
||||
false, true), |
||||
ColDeclaration("FixedLength", ColCompressionType::kColRleVarint, 8)}; |
||||
|
||||
*value_col_declarations = new std::vector<ColDeclaration>{ |
||||
ColDeclaration("FixedLength", ColCompressionType::kColRleVarint, 4), |
||||
ColDeclaration("FixedLength", ColCompressionType::kColRleVarint, 4), |
||||
ColDeclaration("FixedLength", ColCompressionType::kColRle, 1), |
||||
ColDeclaration("VariableLength"), |
||||
ColDeclaration("FixedLength", ColCompressionType::kColDeltaVarint, 4), |
||||
ColDeclaration("FixedLength", ColCompressionType::kColRleVarint, 8)}; |
||||
*value_checksum_declaration = new ColDeclaration( |
||||
"LongFixedLength", ColCompressionType::kColNoCompression, 9, |
||||
true /* nullable */); |
||||
} |
||||
|
||||
void ColumnAwareEncodingReader::GetColDeclarationsSecondary( |
||||
std::vector<ColDeclaration>** key_col_declarations, |
||||
std::vector<ColDeclaration>** value_col_declarations, |
||||
ColDeclaration** value_checksum_declaration) { |
||||
*key_col_declarations = new std::vector<ColDeclaration>{ |
||||
ColDeclaration("FixedLength", ColCompressionType::kColRleVarint, 4, false, |
||||
true), |
||||
ColDeclaration("FixedLength", ColCompressionType::kColDeltaVarint, 8, |
||||
false, true), |
||||
ColDeclaration("FixedLength", ColCompressionType::kColRleDeltaVarint, 8, |
||||
false, true), |
||||
ColDeclaration("FixedLength", ColCompressionType::kColRle, 1), |
||||
ColDeclaration("FixedLength", ColCompressionType::kColDeltaVarint, 4, |
||||
false, true), |
||||
ColDeclaration("FixedLength", ColCompressionType::kColDeltaVarint, 8, |
||||
false, true), |
||||
ColDeclaration("FixedLength", ColCompressionType::kColRleVarint, 8, false, |
||||
true), |
||||
ColDeclaration("VariableChunk", ColCompressionType::kColNoCompression), |
||||
ColDeclaration("FixedLength", ColCompressionType::kColRleVarint, 8)}; |
||||
*value_col_declarations = new std::vector<ColDeclaration>(); |
||||
*value_checksum_declaration = new ColDeclaration( |
||||
"LongFixedLength", ColCompressionType::kColNoCompression, 9, |
||||
true /* nullable */); |
||||
} |
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // ROCKSDB_LITE
|
@ -0,0 +1,80 @@ |
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
#pragma once |
||||
#ifndef ROCKSDB_LITE |
||||
|
||||
#include <string> |
||||
#include <vector> |
||||
#include "db/dbformat.h" |
||||
#include "include/rocksdb/env.h" |
||||
#include "include/rocksdb/immutable_options.h" |
||||
#include "include/rocksdb/listener.h" |
||||
#include "include/rocksdb/options.h" |
||||
#include "include/rocksdb/status.h" |
||||
#include "table/block_based_table_reader.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
struct ColDeclaration; |
||||
struct KVPairColDeclarations; |
||||
|
||||
class ColumnAwareEncodingReader { |
||||
public: |
||||
explicit ColumnAwareEncodingReader(const std::string& file_name); |
||||
|
||||
void GetKVPairsFromDataBlocks(std::vector<KVPairBlock>* kv_pair_blocks); |
||||
|
||||
void EncodeBlocksToRowFormat(WritableFile* out_file, |
||||
CompressionType compression_type, |
||||
const std::vector<KVPairBlock>& kv_pair_blocks, |
||||
std::vector<std::string>* blocks); |
||||
|
||||
void DecodeBlocksFromRowFormat(WritableFile* out_file, |
||||
const std::vector<std::string>* blocks); |
||||
|
||||
void DumpDataColumns(const std::string& filename, |
||||
const KVPairColDeclarations& kvp_col_declarations, |
||||
const std::vector<KVPairBlock>& kv_pair_blocks); |
||||
|
||||
Status EncodeBlocks(const KVPairColDeclarations& kvp_col_declarations, |
||||
WritableFile* out_file, CompressionType compression_type, |
||||
const std::vector<KVPairBlock>& kv_pair_blocks, |
||||
std::vector<std::string>* blocks, bool print_column_stat); |
||||
|
||||
void DecodeBlocks(const KVPairColDeclarations& kvp_col_declarations, |
||||
WritableFile* out_file, |
||||
const std::vector<std::string>* blocks); |
||||
|
||||
static void GetColDeclarationsPrimary( |
||||
std::vector<ColDeclaration>** key_col_declarations, |
||||
std::vector<ColDeclaration>** value_col_declarations, |
||||
ColDeclaration** value_checksum_declaration); |
||||
|
||||
static void GetColDeclarationsSecondary( |
||||
std::vector<ColDeclaration>** key_col_declarations, |
||||
std::vector<ColDeclaration>** value_col_declarations, |
||||
ColDeclaration** value_checksum_declaration); |
||||
|
||||
private: |
||||
// Init the TableReader for the sst file
|
||||
void InitTableReader(const std::string& file_path); |
||||
|
||||
std::string file_name_; |
||||
EnvOptions soptions_; |
||||
|
||||
Options options_; |
||||
|
||||
Status init_result_; |
||||
std::unique_ptr<BlockBasedTable> table_reader_; |
||||
std::unique_ptr<RandomAccessFileReader> file_; |
||||
|
||||
const ImmutableCFOptions ioptions_; |
||||
InternalKeyComparator internal_comparator_; |
||||
std::unique_ptr<TableProperties> table_properties_; |
||||
}; |
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // ROCKSDB_LITE
|
Loading…
Reference in new issue