switch to use RocksDB UnorderedMap (#11507)

Summary:
Switch from std::unordered_map to RocksDB's UnorderedMap in all the places that handle logging of user-defined timestamp sizes in the WAL.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11507

Test Plan:
```
make all check
```

Reviewed By: ltamasi

Differential Revision: D46448975

Pulled By: jowlyzhang

fbshipit-source-id: bdb4d56a723b697a33daaf0f856a61d49a367a99
oxigraph-main
Yu Zhang 2 years ago committed by Facebook GitHub Bot
parent 4aa52d89cf
commit 4dafa5b220
  1. 10
      db/column_family.h
  2. 4
      db/db_impl/db_impl_open.cc
  3. 4
      db/db_impl/db_impl_secondary.cc
  4. 5
      db/log_reader.h
  5. 36
      db/log_test.cc
  6. 2
      db/log_writer.cc
  7. 5
      db/log_writer.h
  8. 4
      db/repair.cc
  9. 6
      db/version_set.h
  10. 16
      util/udt_util.cc
  11. 14
      util/udt_util.h
  12. 32
      util/udt_util_test.cc

@ -705,12 +705,12 @@ class ColumnFamilySet {
Version* dummy_version,
const ColumnFamilyOptions& options);
const std::unordered_map<uint32_t, size_t>&
GetRunningColumnFamiliesTimestampSize() const {
const UnorderedMap<uint32_t, size_t>& GetRunningColumnFamiliesTimestampSize()
const {
return running_ts_sz_;
}
const std::unordered_map<uint32_t, size_t>&
const UnorderedMap<uint32_t, size_t>&
GetColumnFamiliesTimestampSizeForRecord() const {
return ts_sz_for_record_;
}
@ -744,10 +744,10 @@ class ColumnFamilySet {
// the same requirements as `column_families_` and `column_family_data_`.
// Mapping from column family id to user-defined timestamp size for all
// running column families.
std::unordered_map<uint32_t, size_t> running_ts_sz_;
UnorderedMap<uint32_t, size_t> running_ts_sz_;
// Mapping from column family id to user-defined timestamp size for
// column families with non-zero user-defined timestamp size.
std::unordered_map<uint32_t, size_t> ts_sz_for_record_;
UnorderedMap<uint32_t, size_t> ts_sz_for_record_;
uint32_t max_column_family_;
const FileOptions file_options_;

@ -1187,7 +1187,7 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
std::string scratch;
Slice record;
const std::unordered_map<uint32_t, size_t>& running_ts_sz =
const UnorderedMap<uint32_t, size_t>& running_ts_sz =
versions_->GetRunningColumnFamiliesTimestampSize();
TEST_SYNC_POINT_CALLBACK("DBImpl::RecoverLogFiles:BeforeReadWal",
@ -1213,7 +1213,7 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
return status;
}
const std::unordered_map<uint32_t, size_t>& record_ts_sz =
const UnorderedMap<uint32_t, size_t>& record_ts_sz =
reader.GetRecordedTimestampSize();
// TODO(yuzhangyu): update mode to kReconcileInconsistency when user
// comparator can be changed.

@ -199,7 +199,7 @@ Status DBImplSecondary::RecoverLogFiles(
assert(reader != nullptr);
}
const std::unordered_map<uint32_t, size_t>& running_ts_sz =
const UnorderedMap<uint32_t, size_t>& running_ts_sz =
versions_->GetRunningColumnFamiliesTimestampSize();
for (auto log_number : log_numbers) {
auto it = log_readers_.find(log_number);
@ -228,7 +228,7 @@ Status DBImplSecondary::RecoverLogFiles(
if (!status.ok()) {
break;
}
const std::unordered_map<uint32_t, size_t>& record_ts_sz =
const UnorderedMap<uint32_t, size_t>& record_ts_sz =
reader->GetRecordedTimestampSize();
status = HandleWriteBatchTimestampSizeDifference(
&batch, running_ts_sz, record_ts_sz,

@ -20,6 +20,7 @@
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "util/compression.h"
#include "util/hash_containers.h"
#include "util/udt_util.h"
#include "util/xxhash.h"
@ -79,7 +80,7 @@ class Reader {
// Return the recorded user-defined timestamp sizes that have been read so
// far. This only applies to WAL logs.
const std::unordered_map<uint32_t, size_t>& GetRecordedTimestampSize() const {
const UnorderedMap<uint32_t, size_t>& GetRecordedTimestampSize() const {
return recorded_cf_to_ts_sz_;
}
@ -165,7 +166,7 @@ class Reader {
// The recorded user-defined timestamp sizes that have been read so far. This
// is only for WAL logs.
std::unordered_map<uint32_t, size_t> recorded_cf_to_ts_sz_;
UnorderedMap<uint32_t, size_t> recorded_cf_to_ts_sz_;
// Extend record types with the following special values
enum {

@ -182,9 +182,8 @@ class LogTest
Slice* get_reader_contents() { return &reader_contents_; }
void Write(
const std::string& msg,
const std::unordered_map<uint32_t, size_t>* cf_to_ts_sz = nullptr) {
void Write(const std::string& msg,
const UnorderedMap<uint32_t, size_t>* cf_to_ts_sz = nullptr) {
if (cf_to_ts_sz != nullptr && !cf_to_ts_sz->empty()) {
ASSERT_OK(writer_->MaybeAddUserDefinedTimestampSizeRecord(*cf_to_ts_sz));
}
@ -193,10 +192,9 @@ class LogTest
size_t WrittenBytes() const { return dest_contents().size(); }
std::string Read(
const WALRecoveryMode wal_recovery_mode =
std::string Read(const WALRecoveryMode wal_recovery_mode =
WALRecoveryMode::kTolerateCorruptedTailRecords,
std::unordered_map<uint32_t, size_t>* cf_to_ts_sz = nullptr) {
UnorderedMap<uint32_t, size_t>* cf_to_ts_sz = nullptr) {
std::string scratch;
Slice record;
bool ret = false;
@ -270,9 +268,8 @@ class LogTest
}
void CheckRecordAndTimestampSize(
std::string record,
std::unordered_map<uint32_t, size_t>& expected_ts_sz) {
std::unordered_map<uint32_t, size_t> recorded_ts_sz;
std::string record, UnorderedMap<uint32_t, size_t>& expected_ts_sz) {
UnorderedMap<uint32_t, size_t> recorded_ts_sz;
ASSERT_EQ(record,
Read(WALRecoveryMode::
kTolerateCorruptedTailRecords /* wal_recovery_mode */,
@ -297,18 +294,18 @@ TEST_P(LogTest, ReadWrite) {
}
TEST_P(LogTest, ReadWriteWithTimestampSize) {
std::unordered_map<uint32_t, size_t> ts_sz_one = {
UnorderedMap<uint32_t, size_t> ts_sz_one = {
{1, sizeof(uint64_t)},
};
Write("foo", &ts_sz_one);
Write("bar");
std::unordered_map<uint32_t, size_t> ts_sz_two = {{2, sizeof(char)}};
UnorderedMap<uint32_t, size_t> ts_sz_two = {{2, sizeof(char)}};
Write("", &ts_sz_two);
Write("xxxx");
CheckRecordAndTimestampSize("foo", ts_sz_one);
CheckRecordAndTimestampSize("bar", ts_sz_one);
std::unordered_map<uint32_t, size_t> expected_ts_sz_two;
UnorderedMap<uint32_t, size_t> expected_ts_sz_two;
// User-defined timestamp size records are accumulated and applied to
// subsequent records.
expected_ts_sz_two.insert(ts_sz_one.begin(), ts_sz_one.end());
@ -320,10 +317,9 @@ TEST_P(LogTest, ReadWriteWithTimestampSize) {
}
TEST_P(LogTest, ReadWriteWithTimestampSizeZeroTimestampIgnored) {
std::unordered_map<uint32_t, size_t> ts_sz_one = {{1, sizeof(uint64_t)}};
UnorderedMap<uint32_t, size_t> ts_sz_one = {{1, sizeof(uint64_t)}};
Write("foo", &ts_sz_one);
std::unordered_map<uint32_t, size_t> ts_sz_two(ts_sz_one.begin(),
ts_sz_one.end());
UnorderedMap<uint32_t, size_t> ts_sz_two(ts_sz_one.begin(), ts_sz_one.end());
ts_sz_two.insert(std::make_pair(2, 0));
Write("bar", &ts_sz_two);
@ -749,7 +745,7 @@ TEST_P(LogTest, RecycleWithTimestampSize) {
if (!recyclable_log) {
return; // test is only valid for recycled logs
}
std::unordered_map<uint32_t, size_t> ts_sz_one = {
UnorderedMap<uint32_t, size_t> ts_sz_one = {
{1, sizeof(uint32_t)},
};
Write("foo", &ts_sz_one);
@ -765,7 +761,7 @@ TEST_P(LogTest, RecycleWithTimestampSize) {
std::unique_ptr<WritableFileWriter> dest_holder(new WritableFileWriter(
std::move(sink), "" /* don't care */, FileOptions()));
Writer recycle_writer(std::move(dest_holder), 123, true);
std::unordered_map<uint32_t, size_t> ts_sz_two = {
UnorderedMap<uint32_t, size_t> ts_sz_two = {
{2, sizeof(uint64_t)},
};
ASSERT_OK(recycle_writer.MaybeAddUserDefinedTimestampSizeRecord(ts_sz_two));
@ -1039,18 +1035,18 @@ TEST_P(CompressionLogTest, ReadWriteWithTimestampSize) {
return;
}
ASSERT_OK(SetupTestEnv());
std::unordered_map<uint32_t, size_t> ts_sz_one = {
UnorderedMap<uint32_t, size_t> ts_sz_one = {
{1, sizeof(uint64_t)},
};
Write("foo", &ts_sz_one);
Write("bar");
std::unordered_map<uint32_t, size_t> ts_sz_two = {{2, sizeof(char)}};
UnorderedMap<uint32_t, size_t> ts_sz_two = {{2, sizeof(char)}};
Write("", &ts_sz_two);
Write("xxxx");
CheckRecordAndTimestampSize("foo", ts_sz_one);
CheckRecordAndTimestampSize("bar", ts_sz_one);
std::unordered_map<uint32_t, size_t> expected_ts_sz_two;
UnorderedMap<uint32_t, size_t> expected_ts_sz_two;
// User-defined timestamp size records are accumulated and applied to
// subsequent records.
expected_ts_sz_two.insert(ts_sz_one.begin(), ts_sz_one.end());

@ -197,7 +197,7 @@ IOStatus Writer::AddCompressionTypeRecord() {
}
IOStatus Writer::MaybeAddUserDefinedTimestampSizeRecord(
const std::unordered_map<uint32_t, size_t>& cf_to_ts_sz,
const UnorderedMap<uint32_t, size_t>& cf_to_ts_sz,
Env::IOPriority rate_limiter_priority) {
std::vector<std::pair<uint32_t, size_t>> ts_sz_to_record;
for (const auto& [cf_id, ts_sz] : cf_to_ts_sz) {

@ -20,6 +20,7 @@
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "util/compression.h"
#include "util/hash_containers.h"
namespace ROCKSDB_NAMESPACE {
@ -95,7 +96,7 @@ class Writer {
// kRecyclableUserDefinedTimestampSizeType for these column families.
// This timestamp size record applies to all subsequent records.
IOStatus MaybeAddUserDefinedTimestampSizeRecord(
const std::unordered_map<uint32_t, size_t>& cf_to_ts_sz,
const UnorderedMap<uint32_t, size_t>& cf_to_ts_sz,
Env::IOPriority rate_limiter_priority = Env::IO_TOTAL);
WritableFileWriter* file() { return dest_.get(); }
@ -137,7 +138,7 @@ class Writer {
// The recorded user-defined timestamp sizes that have been written so far.
// Since the user-defined timestamp size cannot be changed while the DB is
// running, an existing entry in this map cannot be updated.
std::unordered_map<uint32_t, size_t> recorded_cf_to_ts_sz_;
UnorderedMap<uint32_t, size_t> recorded_cf_to_ts_sz_;
};
} // namespace log

@ -394,7 +394,7 @@ class Repairer {
auto cf_mems = new ColumnFamilyMemTablesImpl(vset_.GetColumnFamilySet());
// Read all the records and add to a memtable
const std::unordered_map<uint32_t, size_t>& running_ts_sz =
const UnorderedMap<uint32_t, size_t>& running_ts_sz =
vset_.GetRunningColumnFamiliesTimestampSize();
std::string scratch;
Slice record;
@ -409,7 +409,7 @@ class Repairer {
}
Status record_status = WriteBatchInternal::SetContents(&batch, record);
if (record_status.ok()) {
const std::unordered_map<uint32_t, size_t>& record_ts_sz =
const UnorderedMap<uint32_t, size_t>& record_ts_sz =
reader.GetRecordedTimestampSize();
record_status = HandleWriteBatchTimestampSizeDifference(
&batch, running_ts_sz, record_ts_sz,

@ -1468,12 +1468,12 @@ class VersionSet {
ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
const std::unordered_map<uint32_t, size_t>&
GetRunningColumnFamiliesTimestampSize() const {
const UnorderedMap<uint32_t, size_t>& GetRunningColumnFamiliesTimestampSize()
const {
return column_family_set_->GetRunningColumnFamiliesTimestampSize();
}
const std::unordered_map<uint32_t, size_t>&
const UnorderedMap<uint32_t, size_t>&
GetColumnFamiliesTimestampSizeForRecord() const {
return column_family_set_->GetColumnFamiliesTimestampSizeForRecord();
}

@ -44,8 +44,8 @@ RecoveryType GetRecoveryType(const size_t running_ts_sz,
}
bool AllRunningColumnFamiliesConsistent(
const std::unordered_map<uint32_t, size_t>& running_ts_sz,
const std::unordered_map<uint32_t, size_t>& record_ts_sz) {
const UnorderedMap<uint32_t, size_t>& running_ts_sz,
const UnorderedMap<uint32_t, size_t>& record_ts_sz) {
for (const auto& [cf_id, ts_sz] : running_ts_sz) {
auto record_it = record_ts_sz.find(cf_id);
RecoveryType recovery_type =
@ -61,8 +61,8 @@ bool AllRunningColumnFamiliesConsistent(
Status CheckWriteBatchTimestampSizeConsistency(
const WriteBatch* batch,
const std::unordered_map<uint32_t, size_t>& running_ts_sz,
const std::unordered_map<uint32_t, size_t>& record_ts_sz,
const UnorderedMap<uint32_t, size_t>& running_ts_sz,
const UnorderedMap<uint32_t, size_t>& record_ts_sz,
TimestampSizeConsistencyMode check_mode, bool* ts_need_recovery) {
std::vector<uint32_t> column_family_ids;
Status status =
@ -103,8 +103,8 @@ Status CheckWriteBatchTimestampSizeConsistency(
} // namespace
TimestampRecoveryHandler::TimestampRecoveryHandler(
const std::unordered_map<uint32_t, size_t>& running_ts_sz,
const std::unordered_map<uint32_t, size_t>& record_ts_sz)
const UnorderedMap<uint32_t, size_t>& running_ts_sz,
const UnorderedMap<uint32_t, size_t>& record_ts_sz)
: running_ts_sz_(running_ts_sz),
record_ts_sz_(record_ts_sz),
new_batch_(new WriteBatch()),
@ -234,8 +234,8 @@ Status TimestampRecoveryHandler::ReconcileTimestampDiscrepancy(
Status HandleWriteBatchTimestampSizeDifference(
const WriteBatch* batch,
const std::unordered_map<uint32_t, size_t>& running_ts_sz,
const std::unordered_map<uint32_t, size_t>& record_ts_sz,
const UnorderedMap<uint32_t, size_t>& running_ts_sz,
const UnorderedMap<uint32_t, size_t>& record_ts_sz,
TimestampSizeConsistencyMode check_mode,
std::unique_ptr<WriteBatch>* new_batch) {
// Quick path to bypass checking the WriteBatch.

@ -16,6 +16,7 @@
#include "rocksdb/status.h"
#include "rocksdb/write_batch.h"
#include "util/coding.h"
#include "util/hash_containers.h"
namespace ROCKSDB_NAMESPACE {
@ -102,9 +103,8 @@ class UserDefinedTimestampSizeRecord {
// but not equal, return Status::InvalidArgument.
class TimestampRecoveryHandler : public WriteBatch::Handler {
public:
TimestampRecoveryHandler(
const std::unordered_map<uint32_t, size_t>& running_ts_sz,
const std::unordered_map<uint32_t, size_t>& record_ts_sz);
TimestampRecoveryHandler(const UnorderedMap<uint32_t, size_t>& running_ts_sz,
const UnorderedMap<uint32_t, size_t>& record_ts_sz);
~TimestampRecoveryHandler() override {}
@ -155,11 +155,11 @@ class TimestampRecoveryHandler : public WriteBatch::Handler {
// Mapping from column family id to user-defined timestamp size for all
// running column families including the ones with zero timestamp size.
const std::unordered_map<uint32_t, size_t>& running_ts_sz_;
const UnorderedMap<uint32_t, size_t>& running_ts_sz_;
// Mapping from column family id to user-defined timestamp size as recorded
// in the WAL. This only contains non-zero user-defined timestamp size.
const std::unordered_map<uint32_t, size_t>& record_ts_sz_;
const UnorderedMap<uint32_t, size_t>& record_ts_sz_;
std::unique_ptr<WriteBatch> new_batch_;
// Handler is valid upon creation and becomes invalid after its `new_batch_`
@ -211,8 +211,8 @@ enum class TimestampSizeConsistencyMode {
// families including the ones with zero timestamp size.
Status HandleWriteBatchTimestampSizeDifference(
const WriteBatch* batch,
const std::unordered_map<uint32_t, size_t>& running_ts_sz,
const std::unordered_map<uint32_t, size_t>& record_ts_sz,
const UnorderedMap<uint32_t, size_t>& running_ts_sz,
const UnorderedMap<uint32_t, size_t>& record_ts_sz,
TimestampSizeConsistencyMode check_mode,
std::unique_ptr<WriteBatch>* new_batch = nullptr);
} // namespace ROCKSDB_NAMESPACE

@ -104,8 +104,7 @@ class HandleTimestampSizeDifferenceTest : public testing::Test {
}
}
void CreateWriteBatch(
const std::unordered_map<uint32_t, size_t>& ts_sz_for_batch,
void CreateWriteBatch(const UnorderedMap<uint32_t, size_t>& ts_sz_for_batch,
WriteBatch* batch) {
for (const auto& [cf_id, ts_sz] : ts_sz_for_batch) {
std::string key;
@ -185,9 +184,9 @@ class HandleTimestampSizeDifferenceTest : public testing::Test {
};
TEST_F(HandleTimestampSizeDifferenceTest, AllColumnFamiliesConsistent) {
std::unordered_map<uint32_t, size_t> running_ts_sz = {{1, sizeof(uint64_t)},
UnorderedMap<uint32_t, size_t> running_ts_sz = {{1, sizeof(uint64_t)},
{2, 0}};
std::unordered_map<uint32_t, size_t> record_ts_sz = {{1, sizeof(uint64_t)}};
UnorderedMap<uint32_t, size_t> record_ts_sz = {{1, sizeof(uint64_t)}};
WriteBatch batch;
CreateWriteBatch(running_ts_sz, &batch);
@ -204,8 +203,8 @@ TEST_F(HandleTimestampSizeDifferenceTest, AllColumnFamiliesConsistent) {
TEST_F(HandleTimestampSizeDifferenceTest,
AllInconsistentColumnFamiliesDropped) {
std::unordered_map<uint32_t, size_t> running_ts_sz = {{2, 0}};
std::unordered_map<uint32_t, size_t> record_ts_sz = {{1, sizeof(uint64_t)},
UnorderedMap<uint32_t, size_t> running_ts_sz = {{2, 0}};
UnorderedMap<uint32_t, size_t> record_ts_sz = {{1, sizeof(uint64_t)},
{3, sizeof(char)}};
WriteBatch batch;
CreateWriteBatch(record_ts_sz, &batch);
@ -222,9 +221,9 @@ TEST_F(HandleTimestampSizeDifferenceTest,
}
TEST_F(HandleTimestampSizeDifferenceTest, InvolvedColumnFamiliesConsistent) {
std::unordered_map<uint32_t, size_t> running_ts_sz = {{1, sizeof(uint64_t)},
UnorderedMap<uint32_t, size_t> running_ts_sz = {{1, sizeof(uint64_t)},
{2, sizeof(char)}};
std::unordered_map<uint32_t, size_t> record_ts_sz = {{1, sizeof(uint64_t)}};
UnorderedMap<uint32_t, size_t> record_ts_sz = {{1, sizeof(uint64_t)}};
WriteBatch batch;
CreateWriteBatch(record_ts_sz, &batch);
@ -241,9 +240,8 @@ TEST_F(HandleTimestampSizeDifferenceTest, InvolvedColumnFamiliesConsistent) {
TEST_F(HandleTimestampSizeDifferenceTest,
InconsistentColumnFamilyNeedsTimestampStripping) {
std::unordered_map<uint32_t, size_t> running_ts_sz = {{1, 0},
{2, sizeof(char)}};
std::unordered_map<uint32_t, size_t> record_ts_sz = {{1, sizeof(uint64_t)}};
UnorderedMap<uint32_t, size_t> running_ts_sz = {{1, 0}, {2, sizeof(char)}};
UnorderedMap<uint32_t, size_t> record_ts_sz = {{1, sizeof(uint64_t)}};
WriteBatch batch;
CreateWriteBatch(record_ts_sz, &batch);
@ -265,10 +263,10 @@ TEST_F(HandleTimestampSizeDifferenceTest,
TEST_F(HandleTimestampSizeDifferenceTest,
InconsistentColumnFamilyNeedsTimestampPadding) {
std::unordered_map<uint32_t, size_t> running_ts_sz = {{1, sizeof(uint64_t)}};
UnorderedMap<uint32_t, size_t> running_ts_sz = {{1, sizeof(uint64_t)}};
// Make `record_ts_sz` not contain zero timestamp size entries to follow the
// behavior of actual WAL log timestamp size record.
std::unordered_map<uint32_t, size_t> record_ts_sz;
UnorderedMap<uint32_t, size_t> record_ts_sz;
WriteBatch batch;
CreateWriteBatch({{1, 0}}, &batch);
@ -289,8 +287,8 @@ TEST_F(HandleTimestampSizeDifferenceTest,
TEST_F(HandleTimestampSizeDifferenceTest,
InconsistencyReconcileCopyOverDroppedColumnFamily) {
std::unordered_map<uint32_t, size_t> running_ts_sz = {{1, 0}};
std::unordered_map<uint32_t, size_t> record_ts_sz = {{1, sizeof(uint64_t)},
UnorderedMap<uint32_t, size_t> running_ts_sz = {{1, 0}};
UnorderedMap<uint32_t, size_t> record_ts_sz = {{1, sizeof(uint64_t)},
{2, sizeof(char)}};
WriteBatch batch;
CreateWriteBatch(record_ts_sz, &batch);
@ -308,8 +306,8 @@ TEST_F(HandleTimestampSizeDifferenceTest,
}
TEST_F(HandleTimestampSizeDifferenceTest, UnrecoverableInconsistency) {
std::unordered_map<uint32_t, size_t> running_ts_sz = {{1, sizeof(char)}};
std::unordered_map<uint32_t, size_t> record_ts_sz = {{1, sizeof(uint64_t)}};
UnorderedMap<uint32_t, size_t> running_ts_sz = {{1, sizeof(char)}};
UnorderedMap<uint32_t, size_t> record_ts_sz = {{1, sizeof(uint64_t)}};
WriteBatch batch;
CreateWriteBatch(record_ts_sz, &batch);

Loading…
Cancel
Save