Experimental support for SST unique IDs (#8990)

Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).

Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990

Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.

Reviewed By: zhichao-cao, mrambacher

Differential Revision: D31582865

Pulled By: pdillinger

fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
main
Peter Dillinger 3 years ago committed by Facebook GitHub Bot
parent aa21896880
commit ad5325a736
  1. 3
      CMakeLists.txt
  2. 1
      HISTORY.md
  3. 7
      TARGETS
  4. 5
      db/cuckoo_table_db_test.cc
  5. 32
      db/db_impl/db_impl.cc
  6. 2
      db/db_table_properties_test.cc
  7. 7
      db/db_test.cc
  8. 11
      db/db_test_util.cc
  9. 4
      db/db_test_util.h
  10. 8
      db_stress_tool/CMakeLists.txt
  11. 136
      db_stress_tool/db_stress_listener.cc
  12. 50
      db_stress_tool/db_stress_listener.h
  13. 2
      db_stress_tool/db_stress_test_base.cc
  14. 2
      env/env.cc
  15. 19
      env/env_test.cc
  16. 40
      env/unique_id.h
  17. 21
      env/unique_id_gen.cc
  18. 69
      env/unique_id_gen.h
  19. 46
      include/rocksdb/unique_id.h
  20. 6
      src.mk
  21. 32
      table/table_properties.cc
  22. 4
      table/table_properties_internal.h
  23. 256
      table/table_test.cc
  24. 166
      table/unique_id.cc
  25. 59
      table/unique_id_impl.h
  26. 80
      util/hash.cc
  27. 15
      util/hash.h
  28. 49
      util/hash_test.cc
  29. 3
      util/random.h
  30. 31
      util/string_util.h

@ -713,7 +713,7 @@ set(SOURCES
env/file_system_tracer.cc
env/fs_remap.cc
env/mock_env.cc
env/unique_id.cc
env/unique_id_gen.cc
file/delete_scheduler.cc
file/file_prefetch_buffer.cc
file/file_util.cc
@ -807,6 +807,7 @@ set(SOURCES
table/table_factory.cc
table/table_properties.cc
table/two_level_iterator.cc
table/unique_id.cc
test_util/sync_point.cc
test_util/sync_point_impl.cc
test_util/testutil.cc

@ -17,6 +17,7 @@
* Add remote compaction read/write bytes statistics: `REMOTE_COMPACT_READ_BYTES`, `REMOTE_COMPACT_WRITE_BYTES`.
* Introduce an experimental feature to dump out the blocks from block cache and insert them to the secondary cache to reduce the cache warmup time (e.g., used while migrating DB instance). More information is in `class CacheDumper` and `CacheDumpedLoader` at `rocksdb/utilities/cache_dump_load.h`. Note that this feature is subject to potential change in the future; it is still experimental.
* Introduced a new BlobDB configuration option `blob_garbage_collection_force_threshold`, which can be used to trigger compactions targeting the SST files which reference the oldest blob files when the ratio of garbage in those blob files meets or exceeds the specified threshold. This can reduce space amplification with skewed workloads where the affected SST files might not otherwise get picked up for compaction.
* Added EXPERIMENTAL support for table file (SST) unique identifiers that are stable and universally unique, available with new function `GetUniqueIdFromTableProperties`. Only SST files from RocksDB >= 6.24 support unique IDs.
* [JAVA] `keyMayExist()` supports ByteBuffer.
### Public API change

@ -225,7 +225,7 @@ cpp_library(
"env/fs_remap.cc",
"env/io_posix.cc",
"env/mock_env.cc",
"env/unique_id.cc",
"env/unique_id_gen.cc",
"file/delete_scheduler.cc",
"file/file_prefetch_buffer.cc",
"file/file_util.cc",
@ -327,6 +327,7 @@ cpp_library(
"table/table_factory.cc",
"table/table_properties.cc",
"table/two_level_iterator.cc",
"table/unique_id.cc",
"test_util/sync_point.cc",
"test_util/sync_point_impl.cc",
"test_util/transaction_test_util.cc",
@ -550,7 +551,7 @@ cpp_library(
"env/fs_remap.cc",
"env/io_posix.cc",
"env/mock_env.cc",
"env/unique_id.cc",
"env/unique_id_gen.cc",
"file/delete_scheduler.cc",
"file/file_prefetch_buffer.cc",
"file/file_util.cc",
@ -652,6 +653,7 @@ cpp_library(
"table/table_factory.cc",
"table/table_properties.cc",
"table/two_level_iterator.cc",
"table/unique_id.cc",
"test_util/sync_point.cc",
"test_util/sync_point_impl.cc",
"test_util/transaction_test_util.cc",
@ -848,6 +850,7 @@ cpp_library(
"db_stress_tool/db_stress_common.cc",
"db_stress_tool/db_stress_driver.cc",
"db_stress_tool/db_stress_gflags.cc",
"db_stress_tool/db_stress_listener.cc",
"db_stress_tool/db_stress_shared_state.cc",
"db_stress_tool/db_stress_test_base.cc",
"db_stress_tool/db_stress_tool.cc",

@ -6,6 +6,7 @@
#ifndef ROCKSDB_LITE
#include "db/db_impl/db_impl.h"
#include "db/db_test_util.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "table/cuckoo/cuckoo_table_factory.h"
@ -133,6 +134,7 @@ TEST_F(CuckooTableDBTest, Flush) {
TablePropertiesCollection ptc;
ASSERT_OK(reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
VerifySstUniqueIds(ptc);
ASSERT_EQ(1U, ptc.size());
ASSERT_EQ(3U, ptc.begin()->second->num_entries);
ASSERT_EQ("1", FilesPerLevel());
@ -149,6 +151,7 @@ TEST_F(CuckooTableDBTest, Flush) {
ASSERT_OK(dbfull()->TEST_FlushMemTable());
ASSERT_OK(reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
VerifySstUniqueIds(ptc);
ASSERT_EQ(2U, ptc.size());
auto row = ptc.begin();
ASSERT_EQ(3U, row->second->num_entries);
@ -166,6 +169,7 @@ TEST_F(CuckooTableDBTest, Flush) {
ASSERT_OK(Delete("key4"));
ASSERT_OK(dbfull()->TEST_FlushMemTable());
ASSERT_OK(reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
VerifySstUniqueIds(ptc);
ASSERT_EQ(3U, ptc.size());
row = ptc.begin();
ASSERT_EQ(3U, row->second->num_entries);
@ -190,6 +194,7 @@ TEST_F(CuckooTableDBTest, FlushWithDuplicateKeys) {
TablePropertiesCollection ptc;
ASSERT_OK(reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
VerifySstUniqueIds(ptc);
ASSERT_EQ(1U, ptc.size());
ASSERT_EQ(2U, ptc.begin()->second->num_entries);
ASSERT_EQ("1", FilesPerLevel());

@ -53,7 +53,7 @@
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "db/write_callback.h"
#include "env/unique_id.h"
#include "env/unique_id_gen.h"
#include "file/file_util.h"
#include "file/filename.h"
#include "file/random_access_file_reader.h"
@ -92,6 +92,7 @@
#include "table/sst_file_dumper.h"
#include "table/table_builder.h"
#include "table/two_level_iterator.h"
#include "table/unique_id_impl.h"
#include "test_util/sync_point.h"
#include "trace_replay/trace_replay.h"
#include "util/autovector.h"
@ -3947,23 +3948,18 @@ Status DBImpl::GetDbSessionId(std::string& session_id) const {
}
std::string DBImpl::GenerateDbSessionId(Env*) {
// GenerateRawUniqueId() generates an identifier that has a negligible
// probability of being duplicated. It should have full 128 bits of entropy.
uint64_t a, b;
GenerateRawUniqueId(&a, &b);
// Hash and reformat that down to a more compact format, 20 characters
// in base-36 ([0-9A-Z]), which is ~103 bits of entropy, which is enough
// to expect no collisions across a billion servers each opening DBs
// a million times (~2^50). Benefits vs. raw unique id:
// * Save ~ dozen bytes per SST file
// * Shorter shared backup file names (some platforms have low limits)
// * Visually distinct from DB id format
std::string db_session_id(20U, '\0');
char* buf = &db_session_id[0];
PutBaseChars<36>(&buf, 10, a, /*uppercase*/ true);
PutBaseChars<36>(&buf, 10, b, /*uppercase*/ true);
return db_session_id;
// See SemiStructuredUniqueIdGen for its desirable properties.
static SemiStructuredUniqueIdGen gen;
uint64_t lo, hi;
gen.GenerateNext(&hi, &lo);
if (lo == 0) {
// Avoid emitting session ID with lo==0, so that SST unique
// IDs can be more easily ensured non-zero
gen.GenerateNext(&hi, &lo);
assert(lo != 0);
}
return EncodeSessionId(hi, lo);
}
void DBImpl::SetDbSessionId() {

@ -45,6 +45,8 @@ void VerifyTableProperties(DB* db, uint64_t expected_entries_size) {
ASSERT_EQ(props.size(), unique_entries.size());
ASSERT_EQ(expected_entries_size, sum);
VerifySstUniqueIds(props);
}
} // namespace

@ -2077,6 +2077,13 @@ TEST_F(DBTest, OverlapInLevel0) {
Flush(1);
ASSERT_EQ("2,1,1", FilesPerLevel(1));
// BEGIN addition to existing test
// Take this opportunity to verify SST unique ids (including Plain table)
TablePropertiesCollection tbc;
ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[1], &tbc));
VerifySstUniqueIds(tbc);
// END addition to existing test
// Compact away the placeholder files we created initially
dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1]);

@ -13,6 +13,7 @@
#include "env/mock_env.h"
#include "rocksdb/convenience.h"
#include "rocksdb/env_encryption.h"
#include "rocksdb/unique_id.h"
#include "rocksdb/utilities/object_registry.h"
#include "util/random.h"
@ -1654,4 +1655,14 @@ uint64_t DBTestBase::GetNumberOfSstFilesForColumnFamily(
}
#endif // ROCKSDB_LITE
// Checks that a unique ID can be computed for every table file in `props`
// and that no two files in the collection share the same ID.
void VerifySstUniqueIds(const TablePropertiesCollection& props) {
ASSERT_FALSE(props.empty());  // suspicious test if empty
std::unordered_set<std::string> ids_seen;
for (const auto& name_and_props : props) {
std::string unique_id;
ASSERT_OK(GetUniqueIdFromTableProperties(*name_and_props.second, &unique_id));
ASSERT_TRUE(ids_seen.insert(unique_id).second);
}
}
} // namespace ROCKSDB_NAMESPACE

@ -1195,4 +1195,8 @@ class DBTestBase : public testing::Test {
bool time_elapse_only_sleep_on_reopen_ = false;
};
// For verifying that all files generated by current version have SST
// unique ids.
void VerifySstUniqueIds(const TablePropertiesCollection& props);
} // namespace ROCKSDB_NAMESPACE

@ -1,13 +1,13 @@
add_executable(db_stress${ARTIFACT_SUFFIX}
db_stress.cc
db_stress_tool.cc
batched_ops_stress.cc
cf_consistency_stress.cc
db_stress.cc
db_stress_common.cc
db_stress_driver.cc
db_stress_test_base.cc
db_stress_shared_state.cc
db_stress_gflags.cc
db_stress_listener.cc
db_stress_shared_state.cc
db_stress_test_base.cc
db_stress_tool.cc
expected_state.cc
no_batched_ops_stress.cc)

@ -0,0 +1,136 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "db_stress_tool/db_stress_listener.h"
#include <cstdint>
#include "rocksdb/file_system.h"
#include "util/coding_lean.h"
namespace ROCKSDB_NAMESPACE {
#ifdef GFLAGS
#ifndef ROCKSDB_LITE
// TODO: consider using expected_values_dir instead, but this is more
// convenient for now.
// Loads previously recorded 24-byte IDs from <db_name>/.unique_ids (if the
// file exists), verifies them for uniqueness as they are read, and then opens
// the file for appending IDs observed in this execution.
UniqueIdVerifier::UniqueIdVerifier(const std::string& db_name)
: path_(db_name + "/.unique_ids") {
// We expect such a small number of files generated during this test
// (thousands?), checking full 192-bit IDs for uniqueness is a very
// weak check. For a stronger check, we pick a specific 64-bit
// subsequence from the ID to check for uniqueness. All bits of the
// ID should be high quality, and 64 bits should be unique with
// very good probability for the quantities in this test.
offset_ = Random::GetTLSInstance()->Uniform(17); // 0 to 16
// Use default FileSystem to avoid fault injection, etc.
FileSystem& fs = *FileSystem::Default();
IOOptions opts;
{
std::unique_ptr<FSSequentialFile> reader;
Status s =
fs.NewSequentialFile(path_, FileOptions(), &reader, /*dbg*/ nullptr);
if (s.ok()) {
// Load from file
std::string id(24U, '\0');
Slice result;
// Read fixed-size 24-byte records until EOF (a short read).
for (;;) {
s = reader->Read(id.size(), opts, &result, &id[0], /*dbg*/ nullptr);
if (!s.ok()) {
fprintf(stderr, "Error reading unique id file: %s\n",
s.ToString().c_str());
assert(false);
}
if (result.size() < id.size()) {
// EOF
if (result.size() != 0) {
// Corrupt file. Not a DB bug but could happen if OS doesn't provide
// good guarantees on process crash.
// Discard everything loaded so far and start the file fresh.
fprintf(stdout, "Warning: clearing corrupt unique id file\n");
id_set_.clear();
reader.reset();
s = fs.DeleteFile(path_, opts, /*dbg*/ nullptr);
assert(s.ok());
}
break;
}
// Full record read: check against the in-memory set without
// re-persisting (the ID is already in the file).
VerifyNoWrite(id);
}
} else {
// Newly created is ok.
// But FileSystem doesn't tell us whether non-existence was the cause of
// the failure. (Issue #9021)
Status s2 = fs.FileExists(path_, opts, /*dbg*/ nullptr);
if (!s2.IsNotFound()) {
fprintf(stderr, "Error opening unique id file: %s\n",
s.ToString().c_str());
assert(false);
}
}
}
fprintf(stdout, "(Re-)verified %zu unique IDs\n", id_set_.size());
// Open (or create) the file so newly observed IDs can be appended to it.
Status s = fs.ReopenWritableFile(path_, FileOptions(), &data_file_writer_,
/*dbg*/ nullptr);
if (!s.ok()) {
fprintf(stderr, "Error opening unique id file for append: %s\n",
s.ToString().c_str());
assert(false);
}
}
// Closes the append log of observed IDs. The returned status is discarded
// (best-effort persistence for a test tool).
UniqueIdVerifier::~UniqueIdVerifier() {
data_file_writer_->Close(IOOptions(), /*dbg*/ nullptr);
}
// Checks `id` for uniqueness against all previously seen IDs without
// persisting it. Only the 8-byte slice starting at offset_ is tracked
// (see constructor for rationale).
void UniqueIdVerifier::VerifyNoWrite(const std::string& id) {
assert(id.size() == 24);
const uint64_t piece = DecodeFixed64(&id[offset_]);
if (id_set_.insert(piece).second) {
return;
}
fprintf(stderr,
"Duplicate partial unique ID found (offset=%zu, count=%zu)\n",
offset_, id_set_.size());
assert(false);
}
// Records `id` (24-byte binary unique ID) in the on-disk log and asserts it
// has not been seen before, in this or any previous execution. Thread-safe.
void UniqueIdVerifier::Verify(const std::string& id) {
assert(id.size() == 24);
std::lock_guard<std::mutex> lock(mutex_);
// If we accumulate more than ~4 million IDs, there would be > 1 in 1M
// natural chance of collision. Thus, simply stop checking at that point.
if (id_set_.size() >= 4294967) {
return;
}
// Append+flush before the in-memory check, presumably so the ID is durably
// recorded even if the process crashes right after -- TODO confirm intent.
IOStatus s =
data_file_writer_->Append(Slice(id), IOOptions(), /*dbg*/ nullptr);
if (!s.ok()) {
fprintf(stderr, "Error writing to unique id file: %s\n",
s.ToString().c_str());
assert(false);
}
s = data_file_writer_->Flush(IOOptions(), /*dbg*/ nullptr);
if (!s.ok()) {
fprintf(stderr, "Error flushing unique id file: %s\n",
s.ToString().c_str());
assert(false);
}
VerifyNoWrite(id);
}
void DbStressListener::VerifyTableFileUniqueId(
const TableProperties& new_file_properties) {
// Verify unique ID
std::string id;
GetUniqueIdFromTableProperties(new_file_properties, &id);
unique_ids_.Verify(id);
}
#endif // !ROCKSDB_LITE
#endif // GFLAGS
} // namespace ROCKSDB_NAMESPACE

@ -3,18 +3,48 @@
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "file/filename.h"
#ifdef GFLAGS
#pragma once
#include <mutex>
#include <unordered_set>
#include "file/filename.h"
#include "rocksdb/db.h"
#include "rocksdb/file_system.h"
#include "rocksdb/listener.h"
#include "rocksdb/table_properties.h"
#include "rocksdb/unique_id.h"
#include "util/gflags_compat.h"
#include "util/random.h"
DECLARE_int32(compact_files_one_in);
namespace ROCKSDB_NAMESPACE {
#ifndef ROCKSDB_LITE
// Verify across process executions that all seen IDs are unique.
// Thread-safe. IDs are persisted to a hidden file inside the DB dir so that
// uniqueness can be re-checked across restarts/crashes of the stress test.
class UniqueIdVerifier {
public:
// Loads previously recorded IDs from <db_name>/.unique_ids and opens the
// file for appending newly observed IDs.
explicit UniqueIdVerifier(const std::string& db_name);
~UniqueIdVerifier();
// Records `id` (24-byte binary unique ID) and asserts it was not seen
// before, in this or a previous execution.
void Verify(const std::string& id);
private:
// Uniqueness check only; does not persist `id` to the file.
// (Merged the redundant duplicated `private:` section here.)
void VerifyNoWrite(const std::string& id);
std::mutex mutex_;
// IDs persisted to a hidden file inside DB dir
std::string path_;
std::unique_ptr<FSWritableFile> data_file_writer_;
// Starting byte for which 8 bytes to check in memory within 24 byte ID
size_t offset_;
// Working copy of the set of 8 byte pieces
std::unordered_set<uint64_t> id_set_;
};
class DbStressListener : public EventListener {
public:
DbStressListener(const std::string& db_name,
@ -23,9 +53,9 @@ class DbStressListener : public EventListener {
: db_name_(db_name),
db_paths_(db_paths),
column_families_(column_families),
num_pending_file_creations_(0) {}
num_pending_file_creations_(0),
unique_ids_(db_name) {}
#ifndef ROCKSDB_LITE
const char* Name() const override { return kClassName(); }
static const char* kClassName() { return "DBStressListener"; }
@ -82,6 +112,8 @@ class DbStressListener : public EventListener {
assert(info.table_properties.num_entries > 0);
}
--num_pending_file_creations_;
VerifyTableFileUniqueId(info.table_properties);
}
void OnMemTableSealed(const MemTableInfo& /*info*/) override {
@ -93,9 +125,12 @@ class DbStressListener : public EventListener {
RandomSleep();
}
void OnExternalFileIngested(
DB* /*db*/, const ExternalFileIngestionInfo& /*info*/) override {
void OnExternalFileIngested(DB* /*db*/,
const ExternalFileIngestionInfo& info) override {
RandomSleep();
// Here we assume that each generated external file is ingested
// exactly once (or thrown away in case of crash)
VerifyTableFileUniqueId(info.table_properties);
}
void OnBackgroundError(BackgroundErrorReason /* reason */,
@ -213,17 +248,20 @@ class DbStressListener : public EventListener {
#endif // !NDEBUG
}
void VerifyTableFileUniqueId(const TableProperties& new_file_properties);
void RandomSleep() {
std::this_thread::sleep_for(
std::chrono::microseconds(Random::GetTLSInstance()->Uniform(5000)));
}
#endif // !ROCKSDB_LITE
private:
std::string db_name_;
std::vector<DbPath> db_paths_;
std::vector<ColumnFamilyDescriptor> column_families_;
std::atomic<int> num_pending_file_creations_;
UniqueIdVerifier unique_ids_;
};
#endif // !ROCKSDB_LITE
} // namespace ROCKSDB_NAMESPACE
#endif // GFLAGS

@ -2486,8 +2486,10 @@ void StressTest::Open() {
column_family_names_.push_back(name);
}
options_.listeners.clear();
#ifndef ROCKSDB_LITE
options_.listeners.emplace_back(
new DbStressListener(FLAGS_db, options_.db_paths, cf_descriptors));
#endif // !ROCKSDB_LITE
options_.create_missing_column_families = true;
if (!FLAGS_use_txn) {
#ifndef NDEBUG

2
env/env.cc vendored

@ -13,7 +13,7 @@
#include "env/composite_env_wrapper.h"
#include "env/emulated_clock.h"
#include "env/unique_id.h"
#include "env/unique_id_gen.h"
#include "logging/env_logger.h"
#include "memory/arena.h"
#include "options/db_options.h"

19
env/env_test.cc vendored

@ -39,7 +39,7 @@
#include "env/emulated_clock.h"
#include "env/env_chroot.h"
#include "env/env_encryption_ctr.h"
#include "env/unique_id.h"
#include "env/unique_id_gen.h"
#include "logging/log_buffer.h"
#include "logging/logging.h"
#include "port/malloc.h"
@ -2704,6 +2704,23 @@ TEST_F(EnvTest, GenerateRawUniqueIdTrackRandomDeviceOnly) {
t.Run();
}
// Stress-checks that SemiStructuredUniqueIdGen produces no duplicate
// (upper, lower) pairs under the NoDuplicateMiniStressTest harness,
// including when used as a function-local static.
TEST_F(EnvTest, SemiStructuredUniqueIdGenTest) {
// Must be thread safe and usable as a static
static SemiStructuredUniqueIdGen gen;
struct MyStressTest
: public NoDuplicateMiniStressTest<uint64_pair_t, HashUint64Pair> {
// Each generated pair is checked for uniqueness by the harness.
uint64_pair_t Generate() override {
uint64_pair_t p;
gen.GenerateNext(&p.first, &p.second);
return p;
}
};
MyStressTest t;
t.Run();
}
TEST_F(EnvTest, FailureToCreateLockFile) {
auto env = Env::Default();
auto fs = env->GetFileSystem();

40
env/unique_id.h vendored

@ -1,40 +0,0 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
// This file is for functions that extract novel entropy or sources of
// uniqueness from the execution environment. (By contrast, random.h is
// for algorithmic pseudorandomness.)
//
// These functions could eventually migrate to public APIs, such as in Env.
#pragma once
#include <cstdint>
#include "rocksdb/rocksdb_namespace.h"
namespace ROCKSDB_NAMESPACE {
// Generates a new 128-bit identifier that is universally unique
// (with high probability) for each call. The result is split into
// two 64-bit pieces. This function has NOT been validated for use in
// cryptography.
//
// This is used in generating DB session IDs and by Env::GenerateUniqueId
// (used for DB IDENTITY) if the platform does not provide a generator of
// RFC 4122 UUIDs or fails somehow. (Set exclude_port_uuid=true if this
// function is used as a fallback for GenerateRfcUuid, because no need
// trying it again.)
void GenerateRawUniqueId(uint64_t* a, uint64_t* b,
bool exclude_port_uuid = false);
#ifndef NDEBUG
// A version of above with options for challenge testing
void TEST_GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid,
bool exclude_env_details,
bool exclude_random_device);
#endif
} // namespace ROCKSDB_NAMESPACE

@ -3,7 +3,7 @@
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "env/unique_id.h"
#include "env/unique_id_gen.h"
#include <algorithm>
#include <array>
@ -141,4 +141,23 @@ void TEST_GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid,
}
#endif
// Seeds the generator: remembers the current process ID (used by
// GenerateNext to detect fork()) and draws a random 128-bit base value.
// counter_ starts at zero via value-initialization.
SemiStructuredUniqueIdGen::SemiStructuredUniqueIdGen() : counter_{} {
saved_process_id_ = port::GetProcessID();
GenerateRawUniqueId(&base_upper_, &base_lower_);
}
// Produces the next unique 128-bit ID as (upper, lower). Normal path:
// `upper` is the fixed random base and `lower` is base XOR an incrementing
// atomic counter, guaranteeing per-process uniqueness of `lower`.
void SemiStructuredUniqueIdGen::GenerateNext(uint64_t* upper, uint64_t* lower) {
if (port::GetProcessID() == saved_process_id_) {
// Safe to increment the atomic for guaranteed uniqueness within this
// process lifetime. Xor slightly better than +. See
// https://github.com/pdillinger/unique_id
*lower = base_lower_ ^ counter_.fetch_add(1);
*upper = base_upper_;
} else {
// There must have been a fork() or something. Rather than attempting to
// update in a thread-safe way, simply fall back on GenerateRawUniqueId.
GenerateRawUniqueId(upper, lower);
}
}
} // namespace ROCKSDB_NAMESPACE

@ -0,0 +1,69 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
// This file is for functions that generate unique identifiers by
// (at least in part) by extracting novel entropy or sources of uniqueness
// from the execution environment. (By contrast, random.h is for algorithmic
// pseudorandomness.)
//
// These functions could eventually migrate to public APIs, such as in Env.
#pragma once
#include <atomic>
#include <cstdint>
#include "rocksdb/rocksdb_namespace.h"
namespace ROCKSDB_NAMESPACE {
// Generates a new 128-bit identifier that is universally unique
// (with high probability) for each call. The result is split into
// two 64-bit pieces. This function has NOT been validated for use in
// cryptography.
//
// This is used in generating DB session IDs and by Env::GenerateUniqueId
// (used for DB IDENTITY) if the platform does not provide a generator of
// RFC 4122 UUIDs or fails somehow. (Set exclude_port_uuid=true if this
// function is used as a fallback for GenerateRfcUuid, because no need
// trying it again.)
void GenerateRawUniqueId(uint64_t* a, uint64_t* b,
bool exclude_port_uuid = false);
#ifndef NDEBUG
// A version of above with options for challenge testing
void TEST_GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid,
bool exclude_env_details,
bool exclude_random_device);
#endif
// Generates globally unique ids with lower probability of any collisions
// vs. each unique id being independently random (GenerateRawUniqueId).
// We call this "semi-structured" because between different
// SemiStructuredUniqueIdGen objects, the IDs are separated by random
// intervals (unstructured), but within a single SemiStructuredUniqueIdGen
// object, the generated IDs are trivially related (structured). See
// https://github.com/pdillinger/unique_id for how this improves probability
// of no collision. In short, if we have n SemiStructuredUniqueIdGen
// objects each generating m IDs, the first collision is expected at
// around n = sqrt(2^128 / m), equivalently n * sqrt(m) = 2^64,
// rather than n * m = 2^64 for fully random IDs.
class SemiStructuredUniqueIdGen {
public:
// Initializes with random starting state (from GenerateRawUniqueId)
SemiStructuredUniqueIdGen();
// Assuming no fork(), `lower` is guaranteed unique from one call
// to the next (thread safe).
void GenerateNext(uint64_t* upper, uint64_t* lower);
private:
// Random 128-bit base (upper/lower halves) drawn at construction.
uint64_t base_upper_;
uint64_t base_lower_;
// Incremented (atomically) per call; XORed into the lower half.
std::atomic<uint64_t> counter_;
// Process ID at construction; a mismatch later indicates fork().
int64_t saved_process_id_;
};
} // namespace ROCKSDB_NAMESPACE

@ -0,0 +1,46 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include "rocksdb/table_properties.h"
namespace ROCKSDB_NAMESPACE {
// EXPERIMENTAL: This API is subject to change
//
// Computes a stable, universally unique 192-bit (24 binary char) identifier
// for an SST file from TableProperties. This is supported for table (SST)
// files created with RocksDB 6.24 and later. NotSupported will be returned
// for other cases. The first 16 bytes (128 bits) is of sufficient quality
// for almost all applications, and shorter prefixes are usable as a
// hash of the full unique id.
//
// Note: .c_str() is not compatible with binary char strings, so using
// .c_str() on the result will often result in information loss and very
// poor uniqueness probability.
//
// More detail: the first 128 bits are *guaranteed* unique for SST files
// generated in the same process (even different DBs, RocksDB >= 6.26),
// and first 128 bits are guaranteed not "all zeros" (RocksDB >= 6.26)
// so that the "all zeros" value can be used reliably for a null ID.
// Assuming one generates many SST files in the lifetime of each process,
// the probability of collision between processes is "better than
// random": if processes generate n SST files on average, we expect to
// generate roughly 2^64 * sqrt(n) files before first collision in the
// first 128 bits. See https://github.com/pdillinger/unique_id
// Using the full 192 bits, we expect to generate roughly 2^96 * sqrt(n)
// files before first collision.
Status GetUniqueIdFromTableProperties(const TableProperties &props,
std::string *out_id);
// EXPERIMENTAL: This API is subject to change
//
// Converts a binary string (unique id) to hexadecimal, with each 64 bits
// separated by '-', e.g. 6474DF650323BDF0-B48E64F3039308CA-17284B32E7F7444B
// Also works on unique id prefix.
std::string UniqueIdToHumanString(const std::string &id);
} // namespace ROCKSDB_NAMESPACE

@ -94,7 +94,7 @@ LIB_SOURCES = \
env/file_system_tracer.cc \
env/io_posix.cc \
env/mock_env.cc \
env/unique_id.cc \
env/unique_id_gen.cc \
file/delete_scheduler.cc \
file/file_prefetch_buffer.cc \
file/file_util.cc \
@ -196,6 +196,7 @@ LIB_SOURCES = \
table/table_factory.cc \
table/table_properties.cc \
table/two_level_iterator.cc \
table/unique_id.cc \
test_util/sync_point.cc \
test_util/sync_point_impl.cc \
test_util/transaction_test_util.cc \
@ -343,9 +344,10 @@ STRESS_LIB_SOURCES = \
db_stress_tool/cf_consistency_stress.cc \
db_stress_tool/db_stress_common.cc \
db_stress_tool/db_stress_driver.cc \
db_stress_tool/db_stress_test_base.cc \
db_stress_tool/db_stress_gflags.cc \
db_stress_tool/db_stress_listener.cc \
db_stress_tool/db_stress_shared_state.cc \
db_stress_tool/db_stress_test_base.cc \
db_stress_tool/db_stress_tool.cc \
db_stress_tool/expected_state.cc \
db_stress_tool/no_batched_ops_stress.cc \

@ -8,9 +8,11 @@
#include "port/port.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/unique_id.h"
#include "table/block_based/block.h"
#include "table/internal_iterator.h"
#include "table/table_properties_internal.h"
#include "table/unique_id_impl.h"
#include "util/string_util.h"
namespace ROCKSDB_NAMESPACE {
@ -183,6 +185,13 @@ std::string TableProperties::ToString(
AppendProperty(result, "original file number", orig_file_number, prop_delim,
kv_delim);
// Unique ID, when available
std::string id;
Status s = GetUniqueIdFromTableProperties(*this, &id);
AppendProperty(result, "unique ID",
s.ok() ? UniqueIdToHumanString(id) : "N/A", prop_delim,
kv_delim);
return result;
}
@ -303,6 +312,29 @@ extern const std::string kPropertiesBlockOldName = "rocksdb.stats";
extern const std::string kCompressionDictBlock = "rocksdb.compression_dict";
extern const std::string kRangeDelBlock = "rocksdb.range_del";
#ifndef NDEBUG
// Fills every leading uint64_t field and every std::string field of `props`
// with random data, so tests can verify a computation depends only on the
// fields it explicitly sets afterwards.
void TEST_SetRandomTableProperties(TableProperties* props) {
Random* r = Random::GetTLSInstance();
// For now, TableProperties is composed of a number of uint64_t followed by
// a number of std::string, followed by some extras starting with
// user_collected_properties.
// NOTE(review): this walks raw member memory based on the assumed layout
// above; the asserts below only partially validate it, and any reordering
// of TableProperties members silently changes which fields get randomized.
uint64_t* pu = &props->orig_file_number;
assert(static_cast<void*>(pu) == static_cast<void*>(props));
std::string* ps = &props->db_id;
const uint64_t* const pu_end = reinterpret_cast<const uint64_t*>(ps);
const std::string* const ps_end =
reinterpret_cast<const std::string*>(&props->user_collected_properties);
// Randomize all uint64_t fields before db_id.
for (; pu < pu_end; ++pu) {
*pu = r->Next64();
}
assert(static_cast<void*>(pu) == static_cast<void*>(ps));
// Randomize all string fields up to user_collected_properties.
for (; ps < ps_end; ++ps) {
*ps = r->RandomBinaryString(13);
}
}
#endif
// Seek to the properties block.
// Return true if it successfully seeks to the properties block.
Status SeekToPropertiesBlock(InternalIterator* meta_iter, bool* is_found) {

@ -6,6 +6,7 @@
#pragma once
#include "rocksdb/status.h"
#include "rocksdb/table_properties.h"
#include "table/internal_iterator.h"
namespace ROCKSDB_NAMESPACE {
@ -27,4 +28,7 @@ Status SeekToCompressionDictBlock(InternalIterator* meta_iter, bool* is_found,
Status SeekToRangeDelBlock(InternalIterator* meta_iter, bool* is_found,
BlockHandle* block_handle);
#ifndef NDEBUG
void TEST_SetRandomTableProperties(TableProperties* props);
#endif
} // namespace ROCKSDB_NAMESPACE

@ -15,6 +15,7 @@
#include <map>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "block_fetcher.h"
@ -37,7 +38,9 @@
#include "rocksdb/perf_context.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table_properties.h"
#include "rocksdb/trace_record.h"
#include "rocksdb/unique_id.h"
#include "rocksdb/write_buffer_manager.h"
#include "table/block_based/block.h"
#include "table/block_based/block_based_table_builder.h"
@ -51,9 +54,11 @@
#include "table/plain/plain_table_factory.h"
#include "table/scoped_arena_iterator.h"
#include "table/sst_file_writer_collectors.h"
#include "table/unique_id_impl.h"
#include "test_util/sync_point.h"
#include "test_util/testharness.h"
#include "test_util/testutil.h"
#include "util/coding_lean.h"
#include "util/compression.h"
#include "util/file_checksum_helper.h"
#include "util/random.h"
@ -1388,6 +1393,257 @@ TEST_F(TablePropertyTest, PrefixScanTest) {
}
}
namespace {
// Pair of corresponding IDs for one table file: the internal form and the
// external (public API) form derived from it.
struct TestIds {
UniqueId64x3 internal_id;
UniqueId64x3 external_id;
};
// Equality over both forms, for EXPECT_EQ on whole TestIds values.
inline bool operator==(const TestIds& lhs, const TestIds& rhs) {
return lhs.internal_id == rhs.internal_id &&
lhs.external_id == rhs.external_id;
}
// Prints a TestIds as a braced C++ initializer (hex), convenient for
// pasting expected values into test code when a schema check fails.
std::ostream& operator<<(std::ostream& os, const TestIds& ids) {
return os << std::hex << "{{{ 0x" << ids.internal_id[0] << "U, 0x"
<< ids.internal_id[1] << "U, 0x" << ids.internal_id[2]
<< "U }}, {{ 0x" << ids.external_id[0] << "U, 0x"
<< ids.external_id[1] << "U, 0x" << ids.external_id[2] << "U }}}";
}
// Computes both the external (public API) and internal unique IDs for the
// given (db_id, db_session_id, file_number), checks session-id round-trip
// encoding, checks the internal<->external conversion is bijective, and
// records the external ID's three words in `seen` to verify they are
// distinct from all previously generated words.
TestIds GetUniqueId(TableProperties* tp, std::unordered_set<uint64_t>* seen,
const std::string& db_id, const std::string& db_session_id,
uint64_t file_number) {
// First test session id logic
if (db_session_id.size() == 20) {
uint64_t upper;
uint64_t lower;
EXPECT_OK(DecodeSessionId(db_session_id, &upper, &lower));
EXPECT_EQ(EncodeSessionId(upper, lower), db_session_id);
}
// Get external using public API
tp->db_id = db_id;
tp->db_session_id = db_session_id;
tp->orig_file_number = file_number;
TestIds t;
{
std::string uid;
EXPECT_OK(GetUniqueIdFromTableProperties(*tp, &uid));
EXPECT_EQ(uid.size(), 24U);
// Unpack the 24-byte binary ID into three little-endian 64-bit words.
t.external_id[0] = DecodeFixed64(&uid[0]);
t.external_id[1] = DecodeFixed64(&uid[8]);
t.external_id[2] = DecodeFixed64(&uid[16]);
}
// All these should be effectively random
EXPECT_TRUE(seen->insert(t.external_id[0]).second);
EXPECT_TRUE(seen->insert(t.external_id[1]).second);
EXPECT_TRUE(seen->insert(t.external_id[2]).second);
// Get internal with internal API
EXPECT_OK(GetSstInternalUniqueId(db_id, db_session_id, file_number,
&t.internal_id));
// Verify relationship
// internal -> external -> internal must round-trip to the same values.
UniqueId64x3 tmp = t.internal_id;
InternalUniqueIdToExternal(&tmp);
EXPECT_EQ(tmp, t.external_id);
ExternalUniqueIdToInternal(&tmp);
EXPECT_EQ(tmp, t.internal_id);
return t;
}
} // namespace
TEST_F(TablePropertyTest, UniqueIdsSchemaAndQuality) {
  // To ensure the computation only depends on the expected entries, we set
  // the rest randomly
  TableProperties tp;
  TEST_SetRandomTableProperties(&tp);

  // DB id is normally RFC-4122
  const std::string db_id1 = "7265b6eb-4e42-4aec-86a4-0dc5e73a228d";
  // Allow other forms of DB id
  const std::string db_id2 = "1728000184588763620";
  const std::string db_id3 = "x";

  // DB session id is normally 20 chars in base-36, but 13 to 24 chars
  // is ok, roughly 64 to 128 bits.
  const std::string ses_id1 = "ABCDEFGHIJ0123456789";
  // Same trailing 13 digits
  const std::string ses_id2 = "HIJ0123456789";
  const std::string ses_id3 = "0123ABCDEFGHIJ0123456789";
  // Different trailing 12 digits
  const std::string ses_id4 = "ABCDEFGH888888888888";
  // And change length
  const std::string ses_id5 = "ABCDEFGHIJ012";
  const std::string ses_id6 = "ABCDEFGHIJ0123456789ABCD";

  using T = TestIds;
  // Collects every 64-bit piece of every external id produced below, so
  // GetUniqueId can check they are all distinct.
  std::unordered_set<uint64_t> seen;
  // Establish a stable schema for the unique IDs. These values must not
  // change for existing table files.
  // (Note: parens needed for macro parsing, extra braces needed for some
  // compilers.)
  EXPECT_EQ(
      GetUniqueId(&tp, &seen, db_id1, ses_id1, 1),
      T({{{0x61d7dcf415d9cf19U, 0x160d77aae90757fdU, 0x907f41dfd90724ffU}},
         {{0xf0bd230365df7464U, 0xca089303f3648eb4U, 0x4b44f7e7324b2817U}}}));
  // Only change internal_id[1] with file number
  EXPECT_EQ(
      GetUniqueId(&tp, &seen, db_id1, ses_id1, 2),
      T({{{0x61d7dcf415d9cf19U, 0x160d77aae90757feU, 0x907f41dfd90724ffU}},
         {{0xf13fdf7adcfebb6dU, 0x97cd2226cc033ea2U, 0x198c438182091f0eU}}}));
  EXPECT_EQ(
      GetUniqueId(&tp, &seen, db_id1, ses_id1, 123456789),
      T({{{0x61d7dcf415d9cf19U, 0x160d77aaee5c9ae9U, 0x907f41dfd90724ffU}},
         {{0x81fbcebe1ac6c4f0U, 0x6b14a64cfdc0f1c4U, 0x7d8fb6eaf18edbb3U}}}));
  // Change internal_id[1] and internal_id[2] with db_id
  EXPECT_EQ(
      GetUniqueId(&tp, &seen, db_id2, ses_id1, 1),
      T({{{0x61d7dcf415d9cf19U, 0xf89c471f572f0d25U, 0x1f0f2a5eb0e6257eU}},
         {{0x7f1d01d453616991U, 0x32ddf2afec804ab2U, 0xd10a1ee2f0c7d9c1U}}}));
  EXPECT_EQ(
      GetUniqueId(&tp, &seen, db_id3, ses_id1, 1),
      T({{{0x61d7dcf415d9cf19U, 0xfed297a8154a57d0U, 0x8b931b9cdebd9e8U}},
         {{0x62b2f43183f6894bU, 0x897ff2b460eefad1U, 0xf4ec189fb2d15e04U}}}));
  // Keeping same last 13 digits of ses_id keeps same internal_id[0]
  EXPECT_EQ(
      GetUniqueId(&tp, &seen, db_id1, ses_id2, 1),
      T({{{0x61d7dcf415d9cf19U, 0x5f6cc4fa2d528c8U, 0x7b70845d5bfb5446U}},
         {{0x96d1c83ffcc94266U, 0x82663eac0ec6e14aU, 0x94a88b49678b77f6U}}}));
  EXPECT_EQ(
      GetUniqueId(&tp, &seen, db_id1, ses_id3, 1),
      T({{{0x61d7dcf415d9cf19U, 0xfc7232879db37ea2U, 0xc0378d74ea4c89cdU}},
         {{0xdf2ef57e98776905U, 0xda5b31c987da833bU, 0x79c1b4bd0a9e760dU}}}));
  // Changing last 12 digits of ses_id only changes internal_id[0]
  // (vs. db_id1, ses_id1, 1)
  EXPECT_EQ(
      GetUniqueId(&tp, &seen, db_id1, ses_id4, 1),
      T({{{0x4f07cc0d003a83a8U, 0x160d77aae90757fdU, 0x907f41dfd90724ffU}},
         {{0xbcf85336a9f71f04U, 0x4f2949e2f3adb60dU, 0x9ca0def976abfa10U}}}));
  // ses_id can change everything.
  EXPECT_EQ(
      GetUniqueId(&tp, &seen, db_id1, ses_id5, 1),
      T({{{0x94b8768e43f87ce6U, 0xc2559653ac4e7c93U, 0xde6dff6bbb1223U}},
         {{0x5a9537af681817fbU, 0x1afcd1fecaead5eaU, 0x767077ad9ebe0008U}}}));
  EXPECT_EQ(
      GetUniqueId(&tp, &seen, db_id1, ses_id6, 1),
      T({{{0x43cfb0ffa3b710edU, 0x263c580426406a1bU, 0xfacc91379a80d29dU}},
         {{0xfa90547d84cb1cdbU, 0x2afe99c641992d4aU, 0x205b7f7b60e51cc2U}}}));

  // Now verify more thoroughly that any small change in inputs completely
  // changes external unique id.
  // (Relying on 'seen' checks etc. in GetUniqueId)
  std::string db_id = "00000000-0000-0000-0000-000000000000";
  std::string ses_id = "000000000000000000000000";
  uint64_t file_num = 1;
  // change db_id: flip each hex digit position to each alternative value
  for (size_t i = 0; i < db_id.size(); ++i) {
    if (db_id[i] == '-') {
      continue;
    }
    for (char alt : std::string("123456789abcdef")) {
      db_id[i] = alt;
      GetUniqueId(&tp, &seen, db_id, ses_id, file_num);
    }
    db_id[i] = '0';
  }
  // change ses_id: flip each base-36 digit position to each alternative
  for (size_t i = 0; i < ses_id.size(); ++i) {
    for (char alt : std::string("123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")) {
      ses_id[i] = alt;
      GetUniqueId(&tp, &seen, db_id, ses_id, file_num);
    }
    ses_id[i] = '0';
  }
  // change file_num: single set bit in each of the 63 other positions
  for (int i = 1; i < 64; ++i) {
    GetUniqueId(&tp, &seen, db_id, ses_id, file_num << i);
  }

  // Verify that "all zeros" in first 128 bits is equivalent for internal and
  // external IDs. This way, as long as we avoid "all zeros" in internal IDs,
  // we avoid it in external IDs.
  {
    UniqueId64x3 id1{{0, 0, Random::GetTLSInstance()->Next64()}};
    UniqueId64x3 id2 = id1;
    InternalUniqueIdToExternal(&id1);
    EXPECT_EQ(id1, id2);
    ExternalUniqueIdToInternal(&id2);
    EXPECT_EQ(id1, id2);
  }
}
namespace {

// Fills *tp with the fixed (db_id, db_session_id, orig_file_number) triple
// used by the unique-id tests. All other properties are randomized, so the
// tests demonstrate the id computation depends only on these three fields.
void SetGoodTableProperties(TableProperties* tp) {
  TEST_SetRandomTableProperties(tp);
  tp->orig_file_number = 1;
  tp->db_session_id = "ABCDEFGHIJ0123456789";
  tp->db_id = "7265b6eb-4e42-4aec-86a4-0dc5e73a228d";
}

}  // namespace
TEST_F(TablePropertyTest, UniqueIdHumanStrings) {
  TableProperties tp;
  SetGoodTableProperties(&tp);

  std::string tmp;
  EXPECT_OK(GetUniqueIdFromTableProperties(tp, &tmp));
  // Pinned raw bytes of the external unique id for the fixed "good"
  // properties (schema must not change for existing files).
  EXPECT_EQ(tmp,
            (std::string{{'\x64', '\x74', '\xdf', '\x65', '\x03', '\x23',
                          '\xbd', '\xf0', '\xb4', '\x8e', '\x64', '\xf3',
                          '\x03', '\x93', '\x08', '\xca', '\x17', '\x28',
                          '\x4b', '\x32', '\xe7', '\xf7', '\x44', '\x4b'}}));
  // Human string: uppercase hex with '-' between 8-byte (16 digit) groups.
  EXPECT_EQ(UniqueIdToHumanString(tmp),
            "6474DF650323BDF0-B48E64F3039308CA-17284B32E7F7444B");

  // including zero padding
  tmp = std::string(24U, '\0');
  tmp[15] = '\x12';
  tmp[23] = '\xAB';
  EXPECT_EQ(UniqueIdToHumanString(tmp),
            "0000000000000000-0000000000000012-00000000000000AB");

  // And shortened (the human string tolerates any input length)
  tmp = std::string(20U, '\0');
  tmp[5] = '\x12';
  tmp[10] = '\xAB';
  tmp[17] = '\xEF';
  EXPECT_EQ(UniqueIdToHumanString(tmp),
            "0000000000120000-0000AB0000000000-00EF0000");
  tmp.resize(16);
  EXPECT_EQ(UniqueIdToHumanString(tmp), "0000000000120000-0000AB0000000000");
  tmp.resize(11);
  EXPECT_EQ(UniqueIdToHumanString(tmp), "0000000000120000-0000AB");
  tmp.resize(6);
  EXPECT_EQ(UniqueIdToHumanString(tmp), "000000000012");
}
TEST_F(TablePropertyTest, UniqueIdsFailure) {
  TableProperties tp;
  std::string tmp;

  // Re-establish valid properties, apply one mutation, and verify that
  // unique id computation reports NotSupported for the missing ingredient.
  const auto expect_not_supported = [&](void (*mutate)(TableProperties*)) {
    SetGoodTableProperties(&tp);
    mutate(&tp);
    EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &tmp).IsNotSupported());
  };

  // Missing DB id
  expect_not_supported([](TableProperties* p) { p->db_id = ""; });
  // Missing session id
  expect_not_supported([](TableProperties* p) { p->db_session_id = ""; });
  // Missing file number
  expect_not_supported([](TableProperties* p) { p->orig_file_number = 0; });
}
// This test include all the basic checks except those for index size and block
// size, which will be conducted in separated unit tests.
TEST_P(BlockBasedTableTest, BasicBlockBasedTableProperties) {

@ -0,0 +1,166 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include <cstdint>
#include "table/unique_id_impl.h"
#include "util/coding_lean.h"
#include "util/hash.h"
#include "util/string_util.h"
namespace ROCKSDB_NAMESPACE {
// See unique_id_impl.h: packs (upper, lower) into 20 base-36 chars,
// preserving `lower` exactly and truncating entropy only from `upper`.
std::string EncodeSessionId(uint64_t upper, uint64_t lower) {
  // Preserving `lower` is slightly tricky. 36^12 is slightly more than
  // 62 bits, so 12 chars hold the low 62 bits of `lower`, and its top two
  // bits are carried into the leading 8-char group along with `upper`.
  // (A tiny fraction of 20 digit strings go unused.)
  const uint64_t leading = (upper << 2) | (lower >> 62);
  const uint64_t trailing = lower & (UINT64_MAX >> 2);
  std::string result(20U, '\0');
  char *pos = &result[0];
  PutBaseChars<36>(&pos, 8, leading, /*uppercase*/ true);
  PutBaseChars<36>(&pos, 12, trailing, /*uppercase*/ true);
  assert(pos == &result.back() + 1);
  return result;
}
// Reverse of EncodeSessionId. NotSupported (not Corruption) on any
// malformed input, so non-standard session ids degrade gracefully.
Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper,
                       uint64_t *lower) {
  const size_t len = db_session_id.size();
  // Anything from 13 to 24 chars is reasonable. We don't have to limit to
  // exactly 20.
  if (len == 0) {
    return Status::NotSupported("Missing db_session_id");
  } else if (len < 13) {
    return Status::NotSupported("Too short db_session_id");
  } else if (len > 24) {
    return Status::NotSupported("Too long db_session_id");
  }
  // Leading (len - 12) chars -> high part; trailing 12 chars -> low part.
  const char *cursor = &db_session_id.front();
  uint64_t hi_part = 0;
  uint64_t lo_part = 0;
  if (!ParseBaseChars<36>(&cursor, len - 12U, &hi_part) ||
      !ParseBaseChars<36>(&cursor, 12U, &lo_part)) {
    return Status::NotSupported("Bad digit in db_session_id");
  }
  assert(cursor == &db_session_id.back() + 1);
  // Undo the bit packing from EncodeSessionId: the bottom two bits of the
  // leading group are the top two bits of `lower`.
  *upper = hi_part >> 2;
  *lower = (lo_part & (UINT64_MAX >> 2)) | (hi_part << 62);
  return Status::OK();
}
// Computes the 3 x 64-bit "internal" unique id for an SST file from db_id,
// db_session_id, and original file number. Returns NotSupported (rather
// than e.g. Corruption) when an ingredient is missing, so callers can
// treat the unique id as unavailable for old/non-standard files.
// NOTE: this schema must remain long term stable (see unique_id_impl.h).
Status GetSstInternalUniqueId(const std::string &db_id,
                              const std::string &db_session_id,
                              uint64_t file_number, UniqueId64x3 *out) {
  if (db_id.empty()) {
    return Status::NotSupported("Missing db_id");
  }
  if (file_number == 0) {
    return Status::NotSupported("Missing or bad file number");
  }
  if (db_session_id.empty()) {
    return Status::NotSupported("Missing db_session_id");
  }
  uint64_t session_upper = 0;  // Assignment to appease clang-analyze
  uint64_t session_lower = 0;  // Assignment to appease clang-analyze
  {
    Status s = DecodeSessionId(db_session_id, &session_upper, &session_lower);
    if (!s.ok()) {
      return s;
    }
  }

  // Exactly preserve session lower to ensure that session ids generated
  // during the same process lifetime are guaranteed unique.
  // DBImpl also guarantees (in recent versions) that this is not zero,
  // so that we can guarantee unique ID is never all zeros. (Can't assert
  // that here because of testing and old versions.)
  // We put this first in anticipation of matching a small-ish set of cache
  // key prefixes to cover entries relevant to any DB.
  (*out)[0] = session_lower;

  // Hash the session upper (~39 bits entropy) and DB id (120+ bits entropy)
  // for very high global uniqueness entropy.
  // (It is possible that many DBs descended from one common DB id are copied
  // around and proliferate, in which case session id is critical, but it is
  // more common for different DBs to have different DB ids.)
  uint64_t db_a, db_b;
  Hash2x64(db_id.data(), db_id.size(), session_upper, &db_a, &db_b);

  // Xor in file number for guaranteed uniqueness by file number for a given
  // session and DB id. (Xor slightly better than + here. See
  // https://github.com/pdillinger/unique_id )
  (*out)[1] = db_a ^ file_number;

  // Extra (optional) global uniqueness
  (*out)[2] = db_b;

  return Status::OK();
}
namespace {
// For InternalUniqueIdToExternal / ExternalUniqueIdToInternal we want all
// zeros in first 128 bits to map to itself, so that excluding zero in
// internal IDs (session_lower != 0 above) does the same for external IDs.
// These values are meaningless except for making that work: adding them
// before BijectiveHash2x64 puts the all-zero input on a preimage of zero,
// and subtracting them after BijectiveUnhash2x64 undoes it. (The
// zero-maps-to-zero property is verified by the UniqueIdsSchemaAndQuality
// unit test.)
constexpr uint64_t kHiOffsetForZero = 17391078804906429400U;
constexpr uint64_t kLoOffsetForZero = 6417269962128484497U;
}  // namespace
// Converts an internal unique id to external form, in place. Bijective on
// the first 128 bits and on the full 192 bits (see unique_id_impl.h), and
// this transformation must remain long term stable.
void InternalUniqueIdToExternal(UniqueId64x3 *in_out) {
  uint64_t hi, lo;
  BijectiveHash2x64((*in_out)[1] + kHiOffsetForZero,
                    (*in_out)[0] + kLoOffsetForZero, &hi, &lo);
  (*in_out)[0] = lo;
  (*in_out)[1] = hi;
  // Fold the hashed first 128 bits into the third word; invertible given
  // [0] and [1] (see ExternalUniqueIdToInternal).
  (*in_out)[2] += lo + hi;
}
// Exact inverse of InternalUniqueIdToExternal, in place: undo the fold of
// the first 128 bits into [2], unhash, then remove the zero-preserving
// offsets.
void ExternalUniqueIdToInternal(UniqueId64x3 *in_out) {
  uint64_t lo = (*in_out)[0];
  uint64_t hi = (*in_out)[1];
  (*in_out)[2] -= lo + hi;
  BijectiveUnhash2x64(hi, lo, &hi, &lo);
  (*in_out)[0] = lo - kLoOffsetForZero;
  (*in_out)[1] = hi - kHiOffsetForZero;
}
std::string EncodeUniqueIdBytes(const UniqueId64x3 &in) {
std::string ret(24U, '\0');
EncodeFixed64(&ret[0], in[0]);
EncodeFixed64(&ret[8], in[1]);
EncodeFixed64(&ret[16], in[2]);
return ret;
}
// Public entry point: compute the internal id from table properties, apply
// the external hashing layer, and serialize. On failure *out_id is cleared
// and the (NotSupported) status is returned.
Status GetUniqueIdFromTableProperties(const TableProperties &props,
                                      std::string *out_id) {
  UniqueId64x3 tmp{};
  const Status s = GetSstInternalUniqueId(props.db_id, props.db_session_id,
                                          props.orig_file_number, &tmp);
  if (!s.ok()) {
    out_id->clear();
    return s;
  }
  InternalUniqueIdToExternal(&tmp);
  *out_id = EncodeUniqueIdBytes(tmp);
  return s;
}
// Formats id bytes as uppercase hex with a '-' between each 8-byte
// (16 hex digit) group, e.g. "6474DF650323BDF0-B48E...". Tolerates any
// input length, including lengths that leave a short final group.
std::string UniqueIdToHumanString(const std::string &id) {
  static const char kHexDigits[] = "0123456789ABCDEF";
  std::string str;
  str.reserve(id.size() * 2 + id.size() / 8);
  for (size_t i = 0; i < id.size(); ++i) {
    if (i > 0 && i % 8 == 0) {
      str.push_back('-');
    }
    const unsigned char b = static_cast<unsigned char>(id[i]);
    str.push_back(kHexDigits[b >> 4]);
    str.push_back(kHexDigits[b & 0xf]);
  }
  return str;
}
} // namespace ROCKSDB_NAMESPACE

@ -0,0 +1,59 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include <array>
#include "rocksdb/unique_id.h"
namespace ROCKSDB_NAMESPACE {
using UniqueId64x3 = std::array<uint64_t, 3>;
// Helper for GetUniqueIdFromTableProperties. This function can also be used
// for temporary ids for files without sufficient information in table
// properties. The internal unique id is more structured than the public
// unique id, so can be manipulated in more ways but very carefully.
// These must be long term stable to ensure GetUniqueIdFromTableProperties
// is long term stable.
Status GetSstInternalUniqueId(const std::string &db_id,
const std::string &db_session_id,
uint64_t file_number, UniqueId64x3 *out);
// Helper for GetUniqueIdFromTableProperties. External unique ids go through
// this extra hashing layer so that prefixes of the unique id have predictable
// "full" entropy. This hashing layer is 1-to-1 on the first 128 bits and on
// the full 192 bits.
// This transformation must be long term stable to ensure
// GetUniqueIdFromTableProperties is long term stable.
void InternalUniqueIdToExternal(UniqueId64x3 *in_out);
// Reverse of InternalUniqueIdToExternal mostly for testing purposes
// (demonstrably 1-to-1 on the first 128 bits and on the full 192 bits).
void ExternalUniqueIdToInternal(UniqueId64x3 *in_out);
// Convert numerical format to byte format for public API
std::string EncodeUniqueIdBytes(const UniqueId64x3 &in);
// Reformat a random value down to our "DB session id" format,
// which is intended to be compact and friendly for use in file names.
// `lower` is fully preserved and data is lost from `upper`.
//
// Detail: Encoded into 20 chars in base-36 ([0-9A-Z]), which is ~103 bits of
// entropy, which is enough to expect no collisions across a billion servers
// each opening DBs a million times (~2^50). Benefits vs. RFC-4122 unique id:
// * Save ~ dozen bytes per SST file
// * Shorter shared backup file names (some platforms have low limits)
// * Visually distinct from DB id format (usually RFC-4122)
std::string EncodeSessionId(uint64_t upper, uint64_t lower);
// Reverse of EncodeSessionId. Returns NotSupported on error rather than
// Corruption because non-standard session IDs should be allowed with degraded
// functionality.
Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper,
uint64_t *lower);
} // namespace ROCKSDB_NAMESPACE

@ -9,7 +9,7 @@
#include "util/hash.h"
#include <string.h>
#include <string>
#include "port/lang.h"
#include "util/coding.h"
@ -120,4 +120,82 @@ void Hash2x64(const char* data, size_t n, uint64_t* high64, uint64_t* low64) {
*low64 = h.low64;
}
// Seeded variant of Hash2x64: the 128-bit XXH3 hash of `data` with `seed`,
// split into two 64-bit outputs.
void Hash2x64(const char* data, size_t n, uint64_t seed, uint64_t* high64,
              uint64_t* low64) {
  auto h = XXH3_128bits_withSeed(data, n, seed);
  *high64 = h.high64;
  *low64 = h.low64;
}
namespace {
inline uint64_t XXH3_avalanche(uint64_t h64) {
h64 ^= h64 >> 37;
h64 *= 0x165667919E3779F9U;
h64 ^= h64 >> 32;
return h64;
}
inline uint64_t XXH3_unavalanche(uint64_t h64) {
h64 ^= h64 >> 32;
h64 *= 0x8da8ee41d6df849U; // inverse of 0x165667919E3779F9U
h64 ^= h64 >> 37;
return h64;
}
} // namespace
// Hashes 128 bits to 128 bits without losing data (bijective); equivalent
// to the seeded Hash2x64 on the 16-byte little-endian encoding of the two
// inputs (verified in hash_test.cc). Each step below is individually
// invertible — xor with a constant, add, multiply by an odd constant mod
// 2^64, and a 64x64->128 multiply whose halves BijectiveUnhash2x64 can
// reconstruct — which is what makes the whole function bijective.
void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed,
                       uint64_t* out_high64, uint64_t* out_low64) {
  // Adapted from XXH3_len_9to16_128b
  const uint64_t bitflipl = /*secret part*/ 0x59973f0033362349U - seed;
  const uint64_t bitfliph = /*secret part*/ 0xc202797692d63d58U + seed;
  Unsigned128 tmp128 =
      Multiply64to128(in_low64 ^ in_high64 ^ bitflipl, 0x9E3779B185EBCA87U);
  uint64_t lo = Lower64of128(tmp128);
  uint64_t hi = Upper64of128(tmp128);
  lo += 0x3c0000000000000U;  // (len - 1) << 54, with len fixed at 16
  in_high64 ^= bitfliph;
  hi += in_high64 + (Lower32of64(in_high64) * uint64_t{0x85EBCA76});
  lo ^= EndianSwapValue(hi);
  tmp128 = Multiply64to128(lo, 0xC2B2AE3D27D4EB4FU);
  lo = Lower64of128(tmp128);
  hi = Upper64of128(tmp128) + (hi * 0xC2B2AE3D27D4EB4FU);
  *out_low64 = XXH3_avalanche(lo);
  *out_high64 = XXH3_avalanche(hi);
}
// Exact inverse of the seeded BijectiveHash2x64 (mostly for testing):
// undoes each step in reverse order, using modular multiplicative inverses
// for the multiplies and the self-inverse property of xor-shifts.
void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed,
                         uint64_t* out_high64, uint64_t* out_low64) {
  // Inverted above (also consulting XXH3_len_9to16_128b)
  const uint64_t bitflipl = /*secret part*/ 0x59973f0033362349U - seed;
  const uint64_t bitfliph = /*secret part*/ 0xc202797692d63d58U + seed;
  uint64_t lo = XXH3_unavalanche(in_low64);
  uint64_t hi = XXH3_unavalanche(in_high64);
  lo *= 0xba79078168d4baf;  // inverse of 0xC2B2AE3D27D4EB4FU
  hi -= Upper64of128(Multiply64to128(lo, 0xC2B2AE3D27D4EB4FU));
  hi *= 0xba79078168d4baf;  // inverse of 0xC2B2AE3D27D4EB4FU
  lo ^= EndianSwapValue(hi);
  lo -= 0x3c0000000000000U;
  lo *= 0x887493432badb37U;  // inverse of 0x9E3779B185EBCA87U
  hi -= Upper64of128(Multiply64to128(lo, 0x9E3779B185EBCA87U));
  // Invert `hi += in + low32(in) * 0x85EBCA76`: on the low 32 bits that
  // was a multiply by 0x85EBCA77 (== 1 + 0x85EBCA76 mod 2^32), so recover
  // the low 32 bits of the input first, then fix up the high half.
  uint32_t tmp32 = Lower32of64(hi) * 0xb6c92f47;  // inverse of 0x85EBCA77
  hi -= tmp32;
  hi = (hi & 0xFFFFFFFF00000000U) -
       ((tmp32 * uint64_t{0x85EBCA76}) & 0xFFFFFFFF00000000U) + tmp32;
  hi ^= bitfliph;
  lo ^= hi ^ bitflipl;
  *out_high64 = hi;
  *out_low64 = lo;
}
// Unseeded convenience overload (equivalent to seed == 0).
void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64,
                       uint64_t* out_high64, uint64_t* out_low64) {
  BijectiveHash2x64(in_high64, in_low64, /*seed*/ 0, out_high64, out_low64);
}

// Unseeded convenience overload (equivalent to seed == 0).
void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64,
                         uint64_t* out_high64, uint64_t* out_low64) {
  BijectiveUnhash2x64(in_high64, in_low64, /*seed*/ 0, out_high64, out_low64);
}
} // namespace ROCKSDB_NAMESPACE

@ -66,6 +66,21 @@ inline uint64_t NPHash64(const char* data, size_t n) {
// Convenient and equivalent version of Hash128 without depending on 128-bit
// scalars
void Hash2x64(const char* data, size_t n, uint64_t* high64, uint64_t* low64);
void Hash2x64(const char* data, size_t n, uint64_t seed, uint64_t* high64,
uint64_t* low64);
// Hash 128 bits to 128 bits, guaranteed not to lose data (equivalent to
// Hash2x64 on 16 bytes little endian)
void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64,
uint64_t* out_high64, uint64_t* out_low64);
void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed,
uint64_t* out_high64, uint64_t* out_low64);
// Inverse of above (mostly for testing)
void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64,
uint64_t* out_high64, uint64_t* out_low64);
void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed,
uint64_t* out_high64, uint64_t* out_low64);
// Stable/persistent 32-bit hash. Moderate quality and high speed on
// small inputs.

@ -14,9 +14,13 @@
#include "test_util/testharness.h"
#include "util/coding.h"
#include "util/coding_lean.h"
#include "util/hash128.h"
#include "util/math128.h"
using ROCKSDB_NAMESPACE::BijectiveHash2x64;
using ROCKSDB_NAMESPACE::BijectiveUnhash2x64;
using ROCKSDB_NAMESPACE::DecodeFixed64;
using ROCKSDB_NAMESPACE::EncodeFixed32;
using ROCKSDB_NAMESPACE::GetSliceHash64;
using ROCKSDB_NAMESPACE::Hash;
@ -277,9 +281,16 @@ TEST(HashTest, Hash64LargeValueSchema) {
TEST(HashTest, Hash128Misc) {
constexpr uint32_t kSeed = 0; // Same as GetSliceHash128
for (char fill : {'\0', 'a', '1', '\xff'}) {
for (char fill : {'\0', 'a', '1', '\xff', 'e'}) {
const size_t max_size = 1000;
const std::string str(max_size, fill);
std::string str(max_size, fill);
if (fill == 'e') {
// Use different characters to check endianness handling
for (size_t i = 0; i < str.size(); ++i) {
str[i] += static_cast<char>(i);
}
}
for (size_t size = 0; size <= max_size; ++size) {
Unsigned128 here = Hash128(str.data(), size, kSeed);
@ -293,6 +304,18 @@ TEST(HashTest, Hash128Misc) {
EXPECT_EQ(Lower64of128(here), lo);
EXPECT_EQ(Upper64of128(here), hi);
}
if (size == 16) {
const uint64_t in_hi = DecodeFixed64(str.data() + 8);
const uint64_t in_lo = DecodeFixed64(str.data());
uint64_t hi, lo;
BijectiveHash2x64(in_hi, in_lo, &hi, &lo);
EXPECT_EQ(Lower64of128(here), lo);
EXPECT_EQ(Upper64of128(here), hi);
uint64_t un_hi, un_lo;
BijectiveUnhash2x64(hi, lo, &un_hi, &un_lo);
EXPECT_EQ(in_lo, un_lo);
EXPECT_EQ(in_hi, un_hi);
}
// Upper and Lower must reconstruct hash
EXPECT_EQ(here,
@ -302,7 +325,27 @@ TEST(HashTest, Hash128Misc) {
// Seed changes hash value (with high probability)
for (uint64_t var_seed = 1; var_seed != 0; var_seed <<= 1) {
EXPECT_NE(here, Hash128(str.data(), size, var_seed));
Unsigned128 seeded = Hash128(str.data(), size, var_seed);
EXPECT_NE(here, seeded);
// Must match seeded Hash2x64
{
uint64_t hi, lo;
Hash2x64(str.data(), size, var_seed, &hi, &lo);
EXPECT_EQ(Lower64of128(seeded), lo);
EXPECT_EQ(Upper64of128(seeded), hi);
}
if (size == 16) {
const uint64_t in_hi = DecodeFixed64(str.data() + 8);
const uint64_t in_lo = DecodeFixed64(str.data());
uint64_t hi, lo;
BijectiveHash2x64(in_hi, in_lo, var_seed, &hi, &lo);
EXPECT_EQ(Lower64of128(seeded), lo);
EXPECT_EQ(Upper64of128(seeded), hi);
uint64_t un_hi, un_lo;
BijectiveUnhash2x64(hi, lo, var_seed, &un_hi, &un_lo);
EXPECT_EQ(in_lo, un_lo);
EXPECT_EQ(in_hi, un_hi);
}
}
// Size changes hash value (with high probability)

@ -9,6 +9,7 @@
#pragma once
#include <stdint.h>
#include <algorithm>
#include <random>
@ -60,6 +61,8 @@ class Random {
return seed_;
}
uint64_t Next64() { return (uint64_t{Next()} << 32) | Next(); }
  // Returns a uniformly distributed value in the range [0..n-1]
  // REQUIRES: n > 0
  // NOTE: uses modulo reduction, so the distribution carries the usual
  // slight modulo bias unless n evenly divides 2^32.
  uint32_t Uniform(int n) { return Next() % n; }

@ -46,12 +46,41 @@ inline void PutBaseChars(char** buf, size_t n, uint64_t v, bool uppercase) {
const char* digitChars = uppercase ? "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
: "0123456789abcdefghijklmnopqrstuvwxyz";
for (size_t i = n; i > 0; --i) {
(*buf)[i - 1] = digitChars[v % kBase];
(*buf)[i - 1] = digitChars[static_cast<size_t>(v % kBase)];
v /= kBase;
}
*buf += n;
}
// Parse n digits from *buf in base kBase to *v and advance *buf to the
// position after what was read. On success, true is returned. On failure,
// false is returned, *buf is placed at the first bad character, and *v
// contains the partial parsed data. Overflow is not checked but the
// result is accurate mod 2^64. Requires the starting value of *v to be
// zero or previously accumulated parsed digits, i.e.
//   ParseBaseChars(&b, n, &v);
// is equivalent to n calls to
//   ParseBaseChars(&b, 1, &v);
template <int kBase>
inline bool ParseBaseChars(const char** buf, size_t n, uint64_t* v) {
  for (; n > 0; --n, ++*buf) {
    const char c = **buf;
    // Shift in the next digit position before validating, so that on
    // failure *v holds the documented "partial parsed data".
    *v *= static_cast<uint64_t>(kBase);
    if (c >= '0' && (kBase >= 10 ? c <= '9' : c < '0' + kBase)) {
      *v += static_cast<uint64_t>(c - '0');
    } else if (kBase > 10 && c >= 'A' && c < 'A' + kBase - 10) {
      *v += static_cast<uint64_t>(c - 'A' + 10);
    } else if (kBase > 10 && c >= 'a' && c < 'a' + kBase - 10) {
      *v += static_cast<uint64_t>(c - 'a' + 10);
    } else {
      return false;
    }
  }
  return true;
}
// Return a human-readable version of num.
// for num >= 10.000, prints "xxK"
// for num >= 10.000.000, prints "xxM"

Loading…
Cancel
Save