diff --git a/CMakeLists.txt b/CMakeLists.txt index b6fa5ee34..1a6411653 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -713,7 +713,7 @@ set(SOURCES env/file_system_tracer.cc env/fs_remap.cc env/mock_env.cc - env/unique_id.cc + env/unique_id_gen.cc file/delete_scheduler.cc file/file_prefetch_buffer.cc file/file_util.cc @@ -807,6 +807,7 @@ set(SOURCES table/table_factory.cc table/table_properties.cc table/two_level_iterator.cc + table/unique_id.cc test_util/sync_point.cc test_util/sync_point_impl.cc test_util/testutil.cc diff --git a/HISTORY.md b/HISTORY.md index 59b543be3..e6b548088 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -17,6 +17,7 @@ * Add remote compaction read/write bytes statistics: `REMOTE_COMPACT_READ_BYTES`, `REMOTE_COMPACT_WRITE_BYTES`. * Introduce an experimental feature to dump out the blocks from block cache and insert them to the secondary cache to reduce the cache warmup time (e.g., used while migrating DB instance). More information are in `class CacheDumper` and `CacheDumpedLoader` at `rocksdb/utilities/cache_dump_load.h` Note that, this feature is subject to the potential change in the future, it is still experimental. * Introduced a new BlobDB configuration option `blob_garbage_collection_force_threshold`, which can be used to trigger compactions targeting the SST files which reference the oldest blob files when the ratio of garbage in those blob files meets or exceeds the specified threshold. This can reduce space amplification with skewed workloads where the affected SST files might not otherwise get picked up for compaction. +* Added EXPERIMENTAL support for table file (SST) unique identifiers that are stable and universally unique, available with new function `GetUniqueIdFromTableProperties`. Only SST files from RocksDB >= 6.24 support unique IDs. * [JAVA] `keyMayExist()` supports ByteBuffer. 
### Public API change diff --git a/TARGETS b/TARGETS index 75837ce88..200298bff 100644 --- a/TARGETS +++ b/TARGETS @@ -225,7 +225,7 @@ cpp_library( "env/fs_remap.cc", "env/io_posix.cc", "env/mock_env.cc", - "env/unique_id.cc", + "env/unique_id_gen.cc", "file/delete_scheduler.cc", "file/file_prefetch_buffer.cc", "file/file_util.cc", @@ -327,6 +327,7 @@ cpp_library( "table/table_factory.cc", "table/table_properties.cc", "table/two_level_iterator.cc", + "table/unique_id.cc", "test_util/sync_point.cc", "test_util/sync_point_impl.cc", "test_util/transaction_test_util.cc", @@ -550,7 +551,7 @@ cpp_library( "env/fs_remap.cc", "env/io_posix.cc", "env/mock_env.cc", - "env/unique_id.cc", + "env/unique_id_gen.cc", "file/delete_scheduler.cc", "file/file_prefetch_buffer.cc", "file/file_util.cc", @@ -652,6 +653,7 @@ cpp_library( "table/table_factory.cc", "table/table_properties.cc", "table/two_level_iterator.cc", + "table/unique_id.cc", "test_util/sync_point.cc", "test_util/sync_point_impl.cc", "test_util/transaction_test_util.cc", @@ -848,6 +850,7 @@ cpp_library( "db_stress_tool/db_stress_common.cc", "db_stress_tool/db_stress_driver.cc", "db_stress_tool/db_stress_gflags.cc", + "db_stress_tool/db_stress_listener.cc", "db_stress_tool/db_stress_shared_state.cc", "db_stress_tool/db_stress_test_base.cc", "db_stress_tool/db_stress_tool.cc", diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index de59df79e..a68f7ea59 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -6,6 +6,7 @@ #ifndef ROCKSDB_LITE #include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "table/cuckoo/cuckoo_table_factory.h" @@ -133,6 +134,7 @@ TEST_F(CuckooTableDBTest, Flush) { TablePropertiesCollection ptc; ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); + VerifySstUniqueIds(ptc); ASSERT_EQ(1U, ptc.size()); ASSERT_EQ(3U, ptc.begin()->second->num_entries); ASSERT_EQ("1", FilesPerLevel()); 
@@ -149,6 +151,7 @@ TEST_F(CuckooTableDBTest, Flush) { ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); + VerifySstUniqueIds(ptc); ASSERT_EQ(2U, ptc.size()); auto row = ptc.begin(); ASSERT_EQ(3U, row->second->num_entries); @@ -166,6 +169,7 @@ TEST_F(CuckooTableDBTest, Flush) { ASSERT_OK(Delete("key4")); ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); + VerifySstUniqueIds(ptc); ASSERT_EQ(3U, ptc.size()); row = ptc.begin(); ASSERT_EQ(3U, row->second->num_entries); @@ -190,6 +194,7 @@ TEST_F(CuckooTableDBTest, FlushWithDuplicateKeys) { TablePropertiesCollection ptc; ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); + VerifySstUniqueIds(ptc); ASSERT_EQ(1U, ptc.size()); ASSERT_EQ(2U, ptc.begin()->second->num_entries); ASSERT_EQ("1", FilesPerLevel()); diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 11b30e2df..ce4ad580f 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -53,7 +53,7 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "db/write_callback.h" -#include "env/unique_id.h" +#include "env/unique_id_gen.h" #include "file/file_util.h" #include "file/filename.h" #include "file/random_access_file_reader.h" @@ -92,6 +92,7 @@ #include "table/sst_file_dumper.h" #include "table/table_builder.h" #include "table/two_level_iterator.h" +#include "table/unique_id_impl.h" #include "test_util/sync_point.h" #include "trace_replay/trace_replay.h" #include "util/autovector.h" @@ -3947,23 +3948,18 @@ Status DBImpl::GetDbSessionId(std::string& session_id) const { } std::string DBImpl::GenerateDbSessionId(Env*) { - // GenerateRawUniqueId() generates an identifier that has a negligible - // probability of being duplicated. It should have full 128 bits of entropy. 
- uint64_t a, b; - GenerateRawUniqueId(&a, &b); - - // Hash and reformat that down to a more compact format, 20 characters - // in base-36 ([0-9A-Z]), which is ~103 bits of entropy, which is enough - // to expect no collisions across a billion servers each opening DBs - // a million times (~2^50). Benefits vs. raw unique id: - // * Save ~ dozen bytes per SST file - // * Shorter shared backup file names (some platforms have low limits) - // * Visually distinct from DB id format - std::string db_session_id(20U, '\0'); - char* buf = &db_session_id[0]; - PutBaseChars<36>(&buf, 10, a, /*uppercase*/ true); - PutBaseChars<36>(&buf, 10, b, /*uppercase*/ true); - return db_session_id; + // See SemiStructuredUniqueIdGen for its desirable properties. + static SemiStructuredUniqueIdGen gen; + + uint64_t lo, hi; + gen.GenerateNext(&hi, &lo); + if (lo == 0) { + // Avoid emitting session ID with lo==0, so that SST unique + // IDs can be more easily ensured non-zero + gen.GenerateNext(&hi, &lo); + assert(lo != 0); + } + return EncodeSessionId(hi, lo); } void DBImpl::SetDbSessionId() { diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc index c7618562d..788e2a9d7 100644 --- a/db/db_table_properties_test.cc +++ b/db/db_table_properties_test.cc @@ -45,6 +45,8 @@ void VerifyTableProperties(DB* db, uint64_t expected_entries_size) { ASSERT_EQ(props.size(), unique_entries.size()); ASSERT_EQ(expected_entries_size, sum); + + VerifySstUniqueIds(props); } } // namespace diff --git a/db/db_test.cc b/db/db_test.cc index a3c0937ce..55bc7e815 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2077,6 +2077,13 @@ TEST_F(DBTest, OverlapInLevel0) { Flush(1); ASSERT_EQ("2,1,1", FilesPerLevel(1)); + // BEGIN addition to existing test + // Take this opportunity to verify SST unique ids (including Plain table) + TablePropertiesCollection tbc; + ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[1], &tbc)); + VerifySstUniqueIds(tbc); + // END addition to existing test + // Compact 
away the placeholder files we created initially
   dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
   dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1]);
diff --git a/db/db_test_util.cc b/db/db_test_util.cc
index ea07516b8..1cc3c8737 100644
--- a/db/db_test_util.cc
+++ b/db/db_test_util.cc
@@ -13,6 +13,7 @@
 #include "env/mock_env.h"
 #include "rocksdb/convenience.h"
 #include "rocksdb/env_encryption.h"
+#include "rocksdb/unique_id.h"
 #include "rocksdb/utilities/object_registry.h"
 #include "util/random.h"
 
@@ -1654,4 +1655,16 @@ uint64_t DBTestBase::GetNumberOfSstFilesForColumnFamily(
 }
 #endif  // ROCKSDB_LITE
 
+// Asserts that `props` is non-empty and that each table's unique ID can be
+// computed and differs from that of every other table in `props`.
+void VerifySstUniqueIds(const TablePropertiesCollection& props) {
+  ASSERT_FALSE(props.empty());  // suspicious test if empty
+  std::unordered_set<std::string> seen;
+  for (auto& pair : props) {
+    std::string id;
+    ASSERT_OK(GetUniqueIdFromTableProperties(*pair.second, &id));
+    ASSERT_TRUE(seen.insert(id).second);
+  }
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/db_test_util.h b/db/db_test_util.h
index 2dfb65327..6693fa2f8 100644
--- a/db/db_test_util.h
+++ b/db/db_test_util.h
@@ -1195,4 +1195,8 @@ class DBTestBase : public testing::Test {
   bool time_elapse_only_sleep_on_reopen_ = false;
 };
 
+// For verifying that all files generated by current version have SST
+// unique ids.
+void VerifySstUniqueIds(const TablePropertiesCollection& props);
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db_stress_tool/CMakeLists.txt b/db_stress_tool/CMakeLists.txt
index 6ab0e9a58..68001967e 100644
--- a/db_stress_tool/CMakeLists.txt
+++ b/db_stress_tool/CMakeLists.txt
@@ -1,13 +1,13 @@
 add_executable(db_stress${ARTIFACT_SUFFIX}
-  db_stress.cc
-  db_stress_tool.cc
   batched_ops_stress.cc
   cf_consistency_stress.cc
+  db_stress.cc
   db_stress_common.cc
   db_stress_driver.cc
-  db_stress_test_base.cc
-  db_stress_shared_state.cc
   db_stress_gflags.cc
+  db_stress_listener.cc
+  db_stress_shared_state.cc
+  db_stress_test_base.cc
   db_stress_tool.cc
   expected_state.cc
   no_batched_ops_stress.cc)
diff --git a/db_stress_tool/db_stress_listener.cc b/db_stress_tool/db_stress_listener.cc
new file mode 100644
index 000000000..b6fd5071c
--- /dev/null
+++ b/db_stress_tool/db_stress_listener.cc
@@ -0,0 +1,160 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db_stress_tool/db_stress_listener.h"
+
+#include <cstdint>
+
+#include "rocksdb/file_system.h"
+#include "util/coding_lean.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef GFLAGS
+#ifndef ROCKSDB_LITE
+
+// Re-verifies the IDs recorded by earlier runs in <db_name>/.unique_ids (if
+// that file exists), then opens the file for appending newly seen IDs.
+//
+// TODO: consider using expected_values_dir instead, but this is more
+// convenient for now.
+UniqueIdVerifier::UniqueIdVerifier(const std::string& db_name)
+    : path_(db_name + "/.unique_ids") {
+  // We expect such a small number of files generated during this test
+  // (thousands?), checking full 192-bit IDs for uniqueness is a very
+  // weak check. For a stronger check, we pick a specific 64-bit
+  // subsequence from the ID to check for uniqueness. All bits of the
+  // ID should be high quality, and 64 bits should be unique with
+  // very good probability for the quantities in this test.
+  offset_ = Random::GetTLSInstance()->Uniform(17);  // 0 to 16
+
+  // Use default FileSystem to avoid fault injection, etc.
+  FileSystem& fs = *FileSystem::Default();
+  IOOptions opts;
+
+  {
+    std::unique_ptr<FSSequentialFile> reader;
+    Status s =
+        fs.NewSequentialFile(path_, FileOptions(), &reader, /*dbg*/ nullptr);
+    if (s.ok()) {
+      // Load from file
+      std::string id(24U, '\0');
+      Slice result;
+      for (;;) {
+        s = reader->Read(id.size(), opts, &result, &id[0], /*dbg*/ nullptr);
+        if (!s.ok()) {
+          fprintf(stderr, "Error reading unique id file: %s\n",
+                  s.ToString().c_str());
+          assert(false);
+        }
+        if (result.size() < id.size()) {
+          // EOF
+          if (result.size() != 0) {
+            // Corrupt file. Not a DB bug but could happen if OS doesn't provide
+            // good guarantees on process crash.
+            fprintf(stdout, "Warning: clearing corrupt unique id file\n");
+            id_set_.clear();
+            reader.reset();
+            s = fs.DeleteFile(path_, opts, /*dbg*/ nullptr);
+            assert(s.ok());
+          }
+          break;
+        }
+        VerifyNoWrite(id);
+      }
+    } else {
+      // Newly created is ok.
+      // But FileSystem doesn't tell us whether non-existence was the cause of
+      // the failure. (Issue #9021)
+      Status s2 = fs.FileExists(path_, opts, /*dbg*/ nullptr);
+      if (!s2.IsNotFound()) {
+        fprintf(stderr, "Error opening unique id file: %s\n",
+                s.ToString().c_str());
+        assert(false);
+      }
+    }
+  }
+  fprintf(stdout, "(Re-)verified %zu unique IDs\n", id_set_.size());
+  Status s = fs.ReopenWritableFile(path_, FileOptions(), &data_file_writer_,
+                                   /*dbg*/ nullptr);
+  if (!s.ok()) {
+    fprintf(stderr, "Error opening unique id file for append: %s\n",
+            s.ToString().c_str());
+    assert(false);
+  }
+}
+
+// Closes the ID file. The close status is checked rather than dropped so
+// that a failed close is reported like every other I/O error here.
+UniqueIdVerifier::~UniqueIdVerifier() {
+  IOStatus s = data_file_writer_->Close(IOOptions(), /*dbg*/ nullptr);
+  if (!s.ok()) {
+    fprintf(stderr, "Error closing unique id file: %s\n",
+            s.ToString().c_str());
+    assert(false);
+  }
+}
+
+// Inserts the 8 bytes of `id` starting at offset_ into id_set_ and reports
+// a duplicate if they were already present. Does not write to the ID file.
+void UniqueIdVerifier::VerifyNoWrite(const std::string& id) {
+  assert(id.size() == 24);
+  bool is_new = id_set_.insert(DecodeFixed64(&id[offset_])).second;
+  if (!is_new) {
+    fprintf(stderr,
+            "Duplicate partial unique ID found (offset=%zu, count=%zu)\n",
+            offset_, id_set_.size());
+    assert(false);
+  }
+}
+
+// Appends `id` to the ID file and flushes it, then checks `id` against the
+// in-memory set. Calls are serialized by mutex_.
+void UniqueIdVerifier::Verify(const std::string& id) {
+  assert(id.size() == 24);
+  std::lock_guard<std::mutex> lock(mutex_);
+  // If we accumulate more than ~4 million IDs, there would be > 1 in 1M
+  // natural chance of collision. Thus, simply stop checking at that point.
+  if (id_set_.size() >= 4294967) {
+    return;
+  }
+  IOStatus s =
+      data_file_writer_->Append(Slice(id), IOOptions(), /*dbg*/ nullptr);
+  if (!s.ok()) {
+    fprintf(stderr, "Error writing to unique id file: %s\n",
+            s.ToString().c_str());
+    assert(false);
+  }
+  s = data_file_writer_->Flush(IOOptions(), /*dbg*/ nullptr);
+  if (!s.ok()) {
+    fprintf(stderr, "Error flushing unique id file: %s\n",
+            s.ToString().c_str());
+    assert(false);
+  }
+  VerifyNoWrite(id);
+}
+
+// Computes the unique ID of a new table file from its properties and checks
+// it against every ID seen so far.
+void DbStressListener::VerifyTableFileUniqueId(
+    const TableProperties& new_file_properties) {
+  // Verify unique ID
+  std::string id;
+  Status s = GetUniqueIdFromTableProperties(new_file_properties, &id);
+  if (!s.ok()) {
+    // Report the failure here; `id` cannot be trusted to hold a full
+    // 24-byte ID, which Verify() requires.
+    fprintf(stderr, "Error getting SST unique id: %s\n",
+            s.ToString().c_str());
+    assert(false);
+    return;
+  }
+  unique_ids_.Verify(id);
+}
+
+#endif  // !ROCKSDB_LITE
+#endif  // GFLAGS
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/db_stress_tool/db_stress_listener.h b/db_stress_tool/db_stress_listener.h
index 7fbab57e7..931e78afb 100644
--- a/db_stress_tool/db_stress_listener.h
+++ b/db_stress_tool/db_stress_listener.h
@@ -3,18 +3,48 @@
 // COPYING file in the root directory) and Apache 2.0 License
 // (found in the LICENSE.Apache file in the root directory).
 
-#include "file/filename.h" #ifdef GFLAGS #pragma once +#include +#include + +#include "file/filename.h" #include "rocksdb/db.h" +#include "rocksdb/file_system.h" #include "rocksdb/listener.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/unique_id.h" #include "util/gflags_compat.h" #include "util/random.h" DECLARE_int32(compact_files_one_in); namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE +// Verify across process executions that all seen IDs are unique +class UniqueIdVerifier { + public: + explicit UniqueIdVerifier(const std::string& db_name); + ~UniqueIdVerifier(); + + void Verify(const std::string& id); + + private: + void VerifyNoWrite(const std::string& id); + + private: + std::mutex mutex_; + // IDs persisted to a hidden file inside DB dir + std::string path_; + std::unique_ptr data_file_writer_; + // Starting byte for which 8 bytes to check in memory within 24 byte ID + size_t offset_; + // Working copy of the set of 8 byte pieces + std::unordered_set id_set_; +}; + class DbStressListener : public EventListener { public: DbStressListener(const std::string& db_name, @@ -23,9 +53,9 @@ class DbStressListener : public EventListener { : db_name_(db_name), db_paths_(db_paths), column_families_(column_families), - num_pending_file_creations_(0) {} + num_pending_file_creations_(0), + unique_ids_(db_name) {} -#ifndef ROCKSDB_LITE const char* Name() const override { return kClassName(); } static const char* kClassName() { return "DBStressListener"; } @@ -82,6 +112,8 @@ class DbStressListener : public EventListener { assert(info.table_properties.num_entries > 0); } --num_pending_file_creations_; + + VerifyTableFileUniqueId(info.table_properties); } void OnMemTableSealed(const MemTableInfo& /*info*/) override { @@ -93,9 +125,12 @@ class DbStressListener : public EventListener { RandomSleep(); } - void OnExternalFileIngested( - DB* /*db*/, const ExternalFileIngestionInfo& /*info*/) override { + void OnExternalFileIngested(DB* /*db*/, + const 
ExternalFileIngestionInfo& info) override { RandomSleep(); + // Here we assume that each generated external file is ingested + // exactly once (or thrown away in case of crash) + VerifyTableFileUniqueId(info.table_properties); } void OnBackgroundError(BackgroundErrorReason /* reason */, @@ -213,17 +248,20 @@ class DbStressListener : public EventListener { #endif // !NDEBUG } + void VerifyTableFileUniqueId(const TableProperties& new_file_properties); + void RandomSleep() { std::this_thread::sleep_for( std::chrono::microseconds(Random::GetTLSInstance()->Uniform(5000))); } -#endif // !ROCKSDB_LITE private: std::string db_name_; std::vector db_paths_; std::vector column_families_; std::atomic num_pending_file_creations_; + UniqueIdVerifier unique_ids_; }; +#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 5d4c414fd..fdb1f2edc 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -2486,8 +2486,10 @@ void StressTest::Open() { column_family_names_.push_back(name); } options_.listeners.clear(); +#ifndef ROCKSDB_LITE options_.listeners.emplace_back( new DbStressListener(FLAGS_db, options_.db_paths, cf_descriptors)); +#endif // !ROCKSDB_LITE options_.create_missing_column_families = true; if (!FLAGS_use_txn) { #ifndef NDEBUG diff --git a/env/env.cc b/env/env.cc index c940b36c9..144c898a9 100644 --- a/env/env.cc +++ b/env/env.cc @@ -13,7 +13,7 @@ #include "env/composite_env_wrapper.h" #include "env/emulated_clock.h" -#include "env/unique_id.h" +#include "env/unique_id_gen.h" #include "logging/env_logger.h" #include "memory/arena.h" #include "options/db_options.h" diff --git a/env/env_test.cc b/env/env_test.cc index 48c3b9e3d..9fd088dad 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -39,7 +39,7 @@ #include "env/emulated_clock.h" #include "env/env_chroot.h" #include "env/env_encryption_ctr.h" -#include 
"env/unique_id.h" +#include "env/unique_id_gen.h" #include "logging/log_buffer.h" #include "logging/logging.h" #include "port/malloc.h" @@ -2704,6 +2704,23 @@ TEST_F(EnvTest, GenerateRawUniqueIdTrackRandomDeviceOnly) { t.Run(); } +TEST_F(EnvTest, SemiStructuredUniqueIdGenTest) { + // Must be thread safe and usable as a static + static SemiStructuredUniqueIdGen gen; + + struct MyStressTest + : public NoDuplicateMiniStressTest { + uint64_pair_t Generate() override { + uint64_pair_t p; + gen.GenerateNext(&p.first, &p.second); + return p; + } + }; + + MyStressTest t; + t.Run(); +} + TEST_F(EnvTest, FailureToCreateLockFile) { auto env = Env::Default(); auto fs = env->GetFileSystem(); diff --git a/env/unique_id.h b/env/unique_id.h deleted file mode 100644 index 890d4d776..000000000 --- a/env/unique_id.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -// This file is for functions that extract novel entropy or sources of -// uniqueness from the execution environment. (By contrast, random.h is -// for algorithmic pseudorandomness.) -// -// These functions could eventually migrate to public APIs, such as in Env. - -#pragma once - -#include - -#include "rocksdb/rocksdb_namespace.h" - -namespace ROCKSDB_NAMESPACE { - -// Generates a new 128-bit identifier that is universally unique -// (with high probability) for each call. The result is split into -// two 64-bit pieces. This function has NOT been validated for use in -// cryptography. -// -// This is used in generating DB session IDs and by Env::GenerateUniqueId -// (used for DB IDENTITY) if the platform does not provide a generator of -// RFC 4122 UUIDs or fails somehow. 
(Set exclude_port_uuid=true if this -// function is used as a fallback for GenerateRfcUuid, because no need -// trying it again.) -void GenerateRawUniqueId(uint64_t* a, uint64_t* b, - bool exclude_port_uuid = false); - -#ifndef NDEBUG -// A version of above with options for challenge testing -void TEST_GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid, - bool exclude_env_details, - bool exclude_random_device); -#endif - -} // namespace ROCKSDB_NAMESPACE diff --git a/env/unique_id.cc b/env/unique_id_gen.cc similarity index 84% rename from env/unique_id.cc rename to env/unique_id_gen.cc index ab8216a2b..73777c171 100644 --- a/env/unique_id.cc +++ b/env/unique_id_gen.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "env/unique_id.h" +#include "env/unique_id_gen.h" #include #include @@ -141,4 +141,23 @@ void TEST_GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid, } #endif +SemiStructuredUniqueIdGen::SemiStructuredUniqueIdGen() : counter_{} { + saved_process_id_ = port::GetProcessID(); + GenerateRawUniqueId(&base_upper_, &base_lower_); +} + +void SemiStructuredUniqueIdGen::GenerateNext(uint64_t* upper, uint64_t* lower) { + if (port::GetProcessID() == saved_process_id_) { + // Safe to increment the atomic for guaranteed uniqueness within this + // process lifetime. Xor slightly better than +. See + // https://github.com/pdillinger/unique_id + *lower = base_lower_ ^ counter_.fetch_add(1); + *upper = base_upper_; + } else { + // There must have been a fork() or something. Rather than attempting to + // update in a thread-safe way, simply fall back on GenerateRawUniqueId. 
+    GenerateRawUniqueId(upper, lower);
+  }
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/env/unique_id_gen.h b/env/unique_id_gen.h
new file mode 100644
index 000000000..f48c3b5e7
--- /dev/null
+++ b/env/unique_id_gen.h
@@ -0,0 +1,72 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+// This file is for functions that generate unique identifiers
+// (at least in part) by extracting novel entropy or sources of uniqueness
+// from the execution environment. (By contrast, random.h is for algorithmic
+// pseudorandomness.)
+//
+// These functions could eventually migrate to public APIs, such as in Env.
+
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Generates a new 128-bit identifier that is universally unique
+// (with high probability) for each call. The result is split into
+// two 64-bit pieces. This function has NOT been validated for use in
+// cryptography.
+//
+// This is used in generating DB session IDs and by Env::GenerateUniqueId
+// (used for DB IDENTITY) if the platform does not provide a generator of
+// RFC 4122 UUIDs or fails somehow. (Set exclude_port_uuid=true if this
+// function is used as a fallback for GenerateRfcUuid, because no need
+// trying it again.)
+void GenerateRawUniqueId(uint64_t* a, uint64_t* b,
+                         bool exclude_port_uuid = false);
+
+#ifndef NDEBUG
+// A version of above with options for challenge testing
+void TEST_GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid,
+                              bool exclude_env_details,
+                              bool exclude_random_device);
+#endif
+
+// Generates globally unique ids with lower probability of any collisions
+// vs. each unique id being independently random (GenerateRawUniqueId).
+// We call this "semi-structured" because between different
+// SemiStructuredUniqueIdGen objects, the IDs are separated by random
+// intervals (unstructured), but within a single SemiStructuredUniqueIdGen
+// object, the generated IDs are trivially related (structured). See
+// https://github.com/pdillinger/unique_id for how this improves probability
+// of no collision. In short, if we have n SemiStructuredUniqueIdGen
+// objects each generating m IDs, the first collision is expected at
+// around n = sqrt(2^128 / m), equivalently n * sqrt(m) = 2^64,
+// rather than n * m = 2^64 for fully random IDs.
+class SemiStructuredUniqueIdGen {
+ public:
+  // Initializes with random starting state (from GenerateRawUniqueId)
+  SemiStructuredUniqueIdGen();
+
+  // Assuming no fork(), `lower` is guaranteed unique from one call
+  // to the next (thread safe).
+  void GenerateNext(uint64_t* upper, uint64_t* lower);
+
+ private:
+  // Random 128-bit starting point, fixed at construction
+  uint64_t base_upper_;
+  uint64_t base_lower_;
+  // Incremented on each GenerateNext() and xor-ed into base_lower_
+  std::atomic<uint64_t> counter_;
+  // Process ID at construction; a mismatch means the process has changed
+  int64_t saved_process_id_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/unique_id.h b/include/rocksdb/unique_id.h
new file mode 100644
index 000000000..030b2a724
--- /dev/null
+++ b/include/rocksdb/unique_id.h
@@ -0,0 +1,46 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// EXPERIMENTAL: This API is subject to change
+//
+// Computes a stable, universally unique 192-bit (24 binary char) identifier
+// for an SST file from TableProperties. This is supported for table (SST)
+// files created with RocksDB 6.24 and later. NotSupported will be returned
+// for other cases.
The first 16 bytes (128 bits) is of sufficient quality +// for almost all applications, and shorter prefixes are usable as a +// hash of the full unique id. +// +// Note: .c_str() is not compatible with binary char strings, so using +// .c_str() on the result will often result in information loss and very +// poor uniqueness probability. +// +// More detail: the first 128 bits are *guaranteed* unique for SST files +// generated in the same process (even different DBs, RocksDB >= 6.26), +// and first 128 bits are guaranteed not "all zeros" (RocksDB >= 6.26) +// so that the "all zeros" value can be used reliably for a null ID. +// Assuming one generates many SST files in the lifetime of each process, +// the probability of collision between processes is "better than +// random": if processes generate n SST files on average, we expect to +// generate roughly 2^64 * sqrt(n) files before first collision in the +// first 128 bits. See https://github.com/pdillinger/unique_id +// Using the full 192 bits, we expect to generate roughly 2^96 * sqrt(n) +// files before first collision. +Status GetUniqueIdFromTableProperties(const TableProperties &props, + std::string *out_id); + +// EXPERIMENTAL: This API is subject to change +// +// Converts a binary string (unique id) to hexadecimal, with each 64 bits +// separated by '-', e.g. 6474DF650323BDF0-B48E64F3039308CA-17284B32E7F7444B +// Also works on unique id prefix. 
+std::string UniqueIdToHumanString(const std::string &id); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src.mk b/src.mk index 28d53ce5a..c59f5b7e5 100644 --- a/src.mk +++ b/src.mk @@ -94,7 +94,7 @@ LIB_SOURCES = \ env/file_system_tracer.cc \ env/io_posix.cc \ env/mock_env.cc \ - env/unique_id.cc \ + env/unique_id_gen.cc \ file/delete_scheduler.cc \ file/file_prefetch_buffer.cc \ file/file_util.cc \ @@ -196,6 +196,7 @@ LIB_SOURCES = \ table/table_factory.cc \ table/table_properties.cc \ table/two_level_iterator.cc \ + table/unique_id.cc \ test_util/sync_point.cc \ test_util/sync_point_impl.cc \ test_util/transaction_test_util.cc \ @@ -343,9 +344,10 @@ STRESS_LIB_SOURCES = \ db_stress_tool/cf_consistency_stress.cc \ db_stress_tool/db_stress_common.cc \ db_stress_tool/db_stress_driver.cc \ - db_stress_tool/db_stress_test_base.cc \ db_stress_tool/db_stress_gflags.cc \ + db_stress_tool/db_stress_listener.cc \ db_stress_tool/db_stress_shared_state.cc \ + db_stress_tool/db_stress_test_base.cc \ db_stress_tool/db_stress_tool.cc \ db_stress_tool/expected_state.cc \ db_stress_tool/no_batched_ops_stress.cc \ diff --git a/table/table_properties.cc b/table/table_properties.cc index d0aa45026..1938f3342 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -8,9 +8,11 @@ #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" +#include "rocksdb/unique_id.h" #include "table/block_based/block.h" #include "table/internal_iterator.h" #include "table/table_properties_internal.h" +#include "table/unique_id_impl.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -183,6 +185,13 @@ std::string TableProperties::ToString( AppendProperty(result, "original file number", orig_file_number, prop_delim, kv_delim); + // Unique ID, when available + std::string id; + Status s = GetUniqueIdFromTableProperties(*this, &id); + AppendProperty(result, "unique ID", + s.ok() ? 
UniqueIdToHumanString(id) : "N/A", prop_delim, + kv_delim); + return result; } @@ -303,6 +312,29 @@ extern const std::string kPropertiesBlockOldName = "rocksdb.stats"; extern const std::string kCompressionDictBlock = "rocksdb.compression_dict"; extern const std::string kRangeDelBlock = "rocksdb.range_del"; +#ifndef NDEBUG +void TEST_SetRandomTableProperties(TableProperties* props) { + Random* r = Random::GetTLSInstance(); + // For now, TableProperties is composed of a number of uint64_t followed by + // a number of std::string, followed by some extras starting with + // user_collected_properties. + uint64_t* pu = &props->orig_file_number; + assert(static_cast(pu) == static_cast(props)); + std::string* ps = &props->db_id; + const uint64_t* const pu_end = reinterpret_cast(ps); + const std::string* const ps_end = + reinterpret_cast(&props->user_collected_properties); + + for (; pu < pu_end; ++pu) { + *pu = r->Next64(); + } + assert(static_cast(pu) == static_cast(ps)); + for (; ps < ps_end; ++ps) { + *ps = r->RandomBinaryString(13); + } +} +#endif + // Seek to the properties block. // Return true if it successfully seeks to the properties block. 
Status SeekToPropertiesBlock(InternalIterator* meta_iter, bool* is_found) { diff --git a/table/table_properties_internal.h b/table/table_properties_internal.h index 171192434..25bc75f66 100644 --- a/table/table_properties_internal.h +++ b/table/table_properties_internal.h @@ -6,6 +6,7 @@ #pragma once #include "rocksdb/status.h" +#include "rocksdb/table_properties.h" #include "table/internal_iterator.h" namespace ROCKSDB_NAMESPACE { @@ -27,4 +28,7 @@ Status SeekToCompressionDictBlock(InternalIterator* meta_iter, bool* is_found, Status SeekToRangeDelBlock(InternalIterator* meta_iter, bool* is_found, BlockHandle* block_handle); +#ifndef NDEBUG +void TEST_SetRandomTableProperties(TableProperties* props); +#endif } // namespace ROCKSDB_NAMESPACE diff --git a/table/table_test.cc b/table/table_test.cc index cd33c7d49..9827c95d5 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "block_fetcher.h" @@ -37,7 +38,9 @@ #include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" #include "rocksdb/statistics.h" +#include "rocksdb/table_properties.h" #include "rocksdb/trace_record.h" +#include "rocksdb/unique_id.h" #include "rocksdb/write_buffer_manager.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_builder.h" @@ -51,9 +54,11 @@ #include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" +#include "table/unique_id_impl.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/coding_lean.h" #include "util/compression.h" #include "util/file_checksum_helper.h" #include "util/random.h" @@ -1388,6 +1393,257 @@ TEST_F(TablePropertyTest, PrefixScanTest) { } } +namespace { +struct TestIds { + UniqueId64x3 internal_id; + UniqueId64x3 external_id; +}; + +inline bool operator==(const TestIds& lhs, const TestIds& rhs) { + return 
lhs.internal_id == rhs.internal_id && + lhs.external_id == rhs.external_id; +} + +std::ostream& operator<<(std::ostream& os, const TestIds& ids) { + return os << std::hex << "{{{ 0x" << ids.internal_id[0] << "U, 0x" + << ids.internal_id[1] << "U, 0x" << ids.internal_id[2] + << "U }}, {{ 0x" << ids.external_id[0] << "U, 0x" + << ids.external_id[1] << "U, 0x" << ids.external_id[2] << "U }}}"; +} + +TestIds GetUniqueId(TableProperties* tp, std::unordered_set* seen, + const std::string& db_id, const std::string& db_session_id, + uint64_t file_number) { + // First test session id logic + if (db_session_id.size() == 20) { + uint64_t upper; + uint64_t lower; + EXPECT_OK(DecodeSessionId(db_session_id, &upper, &lower)); + EXPECT_EQ(EncodeSessionId(upper, lower), db_session_id); + } + + // Get external using public API + tp->db_id = db_id; + tp->db_session_id = db_session_id; + tp->orig_file_number = file_number; + TestIds t; + { + std::string uid; + EXPECT_OK(GetUniqueIdFromTableProperties(*tp, &uid)); + EXPECT_EQ(uid.size(), 24U); + t.external_id[0] = DecodeFixed64(&uid[0]); + t.external_id[1] = DecodeFixed64(&uid[8]); + t.external_id[2] = DecodeFixed64(&uid[16]); + } + // All these should be effectively random + EXPECT_TRUE(seen->insert(t.external_id[0]).second); + EXPECT_TRUE(seen->insert(t.external_id[1]).second); + EXPECT_TRUE(seen->insert(t.external_id[2]).second); + + // Get internal with internal API + EXPECT_OK(GetSstInternalUniqueId(db_id, db_session_id, file_number, + &t.internal_id)); + + // Verify relationship + UniqueId64x3 tmp = t.internal_id; + InternalUniqueIdToExternal(&tmp); + EXPECT_EQ(tmp, t.external_id); + ExternalUniqueIdToInternal(&tmp); + EXPECT_EQ(tmp, t.internal_id); + return t; +} +} // namespace + +TEST_F(TablePropertyTest, UniqueIdsSchemaAndQuality) { + // To ensure the computation only depends on the expected entries, we set + // the rest randomly + TableProperties tp; + TEST_SetRandomTableProperties(&tp); + + // DB id is normally RFC-4122 + 
const std::string db_id1 = "7265b6eb-4e42-4aec-86a4-0dc5e73a228d"; + // Allow other forms of DB id + const std::string db_id2 = "1728000184588763620"; + const std::string db_id3 = "x"; + + // DB session id is normally 20 chars in base-36, but 13 to 24 chars + // is ok, roughly 64 to 128 bits. + const std::string ses_id1 = "ABCDEFGHIJ0123456789"; + // Same trailing 13 digits + const std::string ses_id2 = "HIJ0123456789"; + const std::string ses_id3 = "0123ABCDEFGHIJ0123456789"; + // Different trailing 12 digits + const std::string ses_id4 = "ABCDEFGH888888888888"; + // And change length + const std::string ses_id5 = "ABCDEFGHIJ012"; + const std::string ses_id6 = "ABCDEFGHIJ0123456789ABCD"; + + using T = TestIds; + std::unordered_set seen; + // Establish a stable schema for the unique IDs. These values must not + // change for existing table files. + // (Note: parens needed for macro parsing, extra braces needed for some + // compilers.) + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id1, 1), + T({{{0x61d7dcf415d9cf19U, 0x160d77aae90757fdU, 0x907f41dfd90724ffU}}, + {{0xf0bd230365df7464U, 0xca089303f3648eb4U, 0x4b44f7e7324b2817U}}})); + // Only change internal_id[1] with file number + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id1, 2), + T({{{0x61d7dcf415d9cf19U, 0x160d77aae90757feU, 0x907f41dfd90724ffU}}, + {{0xf13fdf7adcfebb6dU, 0x97cd2226cc033ea2U, 0x198c438182091f0eU}}})); + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id1, 123456789), + T({{{0x61d7dcf415d9cf19U, 0x160d77aaee5c9ae9U, 0x907f41dfd90724ffU}}, + {{0x81fbcebe1ac6c4f0U, 0x6b14a64cfdc0f1c4U, 0x7d8fb6eaf18edbb3U}}})); + // Change internal_id[1] and internal_id[2] with db_id + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id2, ses_id1, 1), + T({{{0x61d7dcf415d9cf19U, 0xf89c471f572f0d25U, 0x1f0f2a5eb0e6257eU}}, + {{0x7f1d01d453616991U, 0x32ddf2afec804ab2U, 0xd10a1ee2f0c7d9c1U}}})); + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id3, ses_id1, 1), + T({{{0x61d7dcf415d9cf19U, 0xfed297a8154a57d0U, 
0x8b931b9cdebd9e8U}}, + {{0x62b2f43183f6894bU, 0x897ff2b460eefad1U, 0xf4ec189fb2d15e04U}}})); + // Keeping same last 13 digits of ses_id keeps same internal_id[0] + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id2, 1), + T({{{0x61d7dcf415d9cf19U, 0x5f6cc4fa2d528c8U, 0x7b70845d5bfb5446U}}, + {{0x96d1c83ffcc94266U, 0x82663eac0ec6e14aU, 0x94a88b49678b77f6U}}})); + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id3, 1), + T({{{0x61d7dcf415d9cf19U, 0xfc7232879db37ea2U, 0xc0378d74ea4c89cdU}}, + {{0xdf2ef57e98776905U, 0xda5b31c987da833bU, 0x79c1b4bd0a9e760dU}}})); + // Changing last 12 digits of ses_id only changes internal_id[0] + // (vs. db_id1, ses_id1, 1) + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id4, 1), + T({{{0x4f07cc0d003a83a8U, 0x160d77aae90757fdU, 0x907f41dfd90724ffU}}, + {{0xbcf85336a9f71f04U, 0x4f2949e2f3adb60dU, 0x9ca0def976abfa10U}}})); + // ses_id can change everything. + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id5, 1), + T({{{0x94b8768e43f87ce6U, 0xc2559653ac4e7c93U, 0xde6dff6bbb1223U}}, + {{0x5a9537af681817fbU, 0x1afcd1fecaead5eaU, 0x767077ad9ebe0008U}}})); + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id6, 1), + T({{{0x43cfb0ffa3b710edU, 0x263c580426406a1bU, 0xfacc91379a80d29dU}}, + {{0xfa90547d84cb1cdbU, 0x2afe99c641992d4aU, 0x205b7f7b60e51cc2U}}})); + + // Now verify more thoroughly that any small change in inputs completely + // changes external unique id. + // (Relying on 'seen' checks etc. 
in GetUniqueId) + std::string db_id = "00000000-0000-0000-0000-000000000000"; + std::string ses_id = "000000000000000000000000"; + uint64_t file_num = 1; + // change db_id + for (size_t i = 0; i < db_id.size(); ++i) { + if (db_id[i] == '-') { + continue; + } + for (char alt : std::string("123456789abcdef")) { + db_id[i] = alt; + GetUniqueId(&tp, &seen, db_id, ses_id, file_num); + } + db_id[i] = '0'; + } + // change ses_id + for (size_t i = 0; i < ses_id.size(); ++i) { + for (char alt : std::string("123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")) { + ses_id[i] = alt; + GetUniqueId(&tp, &seen, db_id, ses_id, file_num); + } + ses_id[i] = '0'; + } + // change file_num + for (int i = 1; i < 64; ++i) { + GetUniqueId(&tp, &seen, db_id, ses_id, file_num << i); + } + + // Verify that "all zeros" in first 128 bits is equivalent for internal and + // external IDs. This way, as long as we avoid "all zeros" in internal IDs, + // we avoid it in external IDs. + { + UniqueId64x3 id1{{0, 0, Random::GetTLSInstance()->Next64()}}; + UniqueId64x3 id2 = id1; + InternalUniqueIdToExternal(&id1); + EXPECT_EQ(id1, id2); + ExternalUniqueIdToInternal(&id2); + EXPECT_EQ(id1, id2); + } +} + +namespace { +void SetGoodTableProperties(TableProperties* tp) { + // To ensure the computation only depends on the expected entries, we set + // the rest randomly + TEST_SetRandomTableProperties(tp); + tp->db_id = "7265b6eb-4e42-4aec-86a4-0dc5e73a228d"; + tp->db_session_id = "ABCDEFGHIJ0123456789"; + tp->orig_file_number = 1; +} +} // namespace + +TEST_F(TablePropertyTest, UniqueIdHumanStrings) { + TableProperties tp; + SetGoodTableProperties(&tp); + + std::string tmp; + EXPECT_OK(GetUniqueIdFromTableProperties(tp, &tmp)); + EXPECT_EQ(tmp, + (std::string{{'\x64', '\x74', '\xdf', '\x65', '\x03', '\x23', + '\xbd', '\xf0', '\xb4', '\x8e', '\x64', '\xf3', + '\x03', '\x93', '\x08', '\xca', '\x17', '\x28', + '\x4b', '\x32', '\xe7', '\xf7', '\x44', '\x4b'}})); + EXPECT_EQ(UniqueIdToHumanString(tmp), + 
"6474DF650323BDF0-B48E64F3039308CA-17284B32E7F7444B"); + + // including zero padding + tmp = std::string(24U, '\0'); + tmp[15] = '\x12'; + tmp[23] = '\xAB'; + EXPECT_EQ(UniqueIdToHumanString(tmp), + "0000000000000000-0000000000000012-00000000000000AB"); + + // And shortened + tmp = std::string(20U, '\0'); + tmp[5] = '\x12'; + tmp[10] = '\xAB'; + tmp[17] = '\xEF'; + EXPECT_EQ(UniqueIdToHumanString(tmp), + "0000000000120000-0000AB0000000000-00EF0000"); + + tmp.resize(16); + EXPECT_EQ(UniqueIdToHumanString(tmp), "0000000000120000-0000AB0000000000"); + + tmp.resize(11); + EXPECT_EQ(UniqueIdToHumanString(tmp), "0000000000120000-0000AB"); + + tmp.resize(6); + EXPECT_EQ(UniqueIdToHumanString(tmp), "000000000012"); +} + +TEST_F(TablePropertyTest, UniqueIdsFailure) { + TableProperties tp; + std::string tmp; + + // Missing DB id + SetGoodTableProperties(&tp); + tp.db_id = ""; + EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &tmp).IsNotSupported()); + + // Missing session id + SetGoodTableProperties(&tp); + tp.db_session_id = ""; + EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &tmp).IsNotSupported()); + + // Missing file number + SetGoodTableProperties(&tp); + tp.orig_file_number = 0; + EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &tmp).IsNotSupported()); +} + // This test include all the basic checks except those for index size and block // size, which will be conducted in separated unit tests. TEST_P(BlockBasedTableTest, BasicBlockBasedTableProperties) { diff --git a/table/unique_id.cc b/table/unique_id.cc new file mode 100644 index 000000000..95e9ded29 --- /dev/null +++ b/table/unique_id.cc @@ -0,0 +1,166 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include + +#include "table/unique_id_impl.h" +#include "util/coding_lean.h" +#include "util/hash.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +std::string EncodeSessionId(uint64_t upper, uint64_t lower) { + std::string db_session_id(20U, '\0'); + char *buf = &db_session_id[0]; + // Preserving `lower` is slightly tricky. 36^12 is slightly more than + // 62 bits, so we use 12 chars plus the bottom two bits of one more. + // (A tiny fraction of 20 digit strings go unused.) + uint64_t a = (upper << 2) | (lower >> 62); + uint64_t b = lower & (UINT64_MAX >> 2); + PutBaseChars<36>(&buf, 8, a, /*uppercase*/ true); + PutBaseChars<36>(&buf, 12, b, /*uppercase*/ true); + assert(buf == &db_session_id.back() + 1); + return db_session_id; +} + +Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper, + uint64_t *lower) { + const size_t len = db_session_id.size(); + if (len == 0) { + return Status::NotSupported("Missing db_session_id"); + } + // Anything from 13 to 24 chars is reasonable. We don't have to limit to + // exactly 20. 
+ if (len < 13) { + return Status::NotSupported("Too short db_session_id"); + } + if (len > 24) { + return Status::NotSupported("Too long db_session_id"); + } + uint64_t a = 0, b = 0; + const char *buf = &db_session_id.front(); + bool success = ParseBaseChars<36>(&buf, len - 12U, &a); + if (!success) { + return Status::NotSupported("Bad digit in db_session_id"); + } + success = ParseBaseChars<36>(&buf, 12U, &b); + if (!success) { + return Status::NotSupported("Bad digit in db_session_id"); + } + assert(buf == &db_session_id.back() + 1); + *upper = a >> 2; + *lower = (b & (UINT64_MAX >> 2)) | (a << 62); + return Status::OK(); +} + +Status GetSstInternalUniqueId(const std::string &db_id, + const std::string &db_session_id, + uint64_t file_number, UniqueId64x3 *out) { + if (db_id.empty()) { + return Status::NotSupported("Missing db_id"); + } + if (file_number == 0) { + return Status::NotSupported("Missing or bad file number"); + } + if (db_session_id.empty()) { + return Status::NotSupported("Missing db_session_id"); + } + uint64_t session_upper = 0; // Assignment to appease clang-analyze + uint64_t session_lower = 0; // Assignment to appease clang-analyze + { + Status s = DecodeSessionId(db_session_id, &session_upper, &session_lower); + if (!s.ok()) { + return s; + } + } + + // Exactly preserve session lower to ensure that session ids generated + // during the same process lifetime are guaranteed unique. + // DBImpl also guarantees (in recent versions) that this is not zero, + // so that we can guarantee unique ID is never all zeros. (Can't assert + // that here because of testing and old versions.) + // We put this first in anticipation of matching a small-ish set of cache + // key prefixes to cover entries relevant to any DB. + (*out)[0] = session_lower; + + // Hash the session upper (~39 bits entropy) and DB id (120+ bits entropy) + // for very high global uniqueness entropy. 
+ // (It is possible that many DBs descended from one common DB id are copied + // around and proliferate, in which case session id is critical, but it is + // more common for different DBs to have different DB ids.) + uint64_t db_a, db_b; + Hash2x64(db_id.data(), db_id.size(), session_upper, &db_a, &db_b); + + // Xor in file number for guaranteed uniqueness by file number for a given + // session and DB id. (Xor slightly better than + here. See + // https://github.com/pdillinger/unique_id ) + (*out)[1] = db_a ^ file_number; + + // Extra (optional) global uniqueness + (*out)[2] = db_b; + + return Status::OK(); +} + +namespace { +// For InternalUniqueIdToExternal / ExternalUniqueIdToInternal we want all +// zeros in first 128 bits to map to itself, so that excluding zero in +// internal IDs (session_lower != 0 above) does the same for external IDs. +// These values are meaningless except for making that work. +constexpr uint64_t kHiOffsetForZero = 17391078804906429400U; +constexpr uint64_t kLoOffsetForZero = 6417269962128484497U; +} // namespace + +void InternalUniqueIdToExternal(UniqueId64x3 *in_out) { + uint64_t hi, lo; + BijectiveHash2x64((*in_out)[1] + kHiOffsetForZero, + (*in_out)[0] + kLoOffsetForZero, &hi, &lo); + (*in_out)[0] = lo; + (*in_out)[1] = hi; + (*in_out)[2] += lo + hi; +} + +void ExternalUniqueIdToInternal(UniqueId64x3 *in_out) { + uint64_t lo = (*in_out)[0]; + uint64_t hi = (*in_out)[1]; + (*in_out)[2] -= lo + hi; + BijectiveUnhash2x64(hi, lo, &hi, &lo); + (*in_out)[0] = lo - kLoOffsetForZero; + (*in_out)[1] = hi - kHiOffsetForZero; +} + +std::string EncodeUniqueIdBytes(const UniqueId64x3 &in) { + std::string ret(24U, '\0'); + EncodeFixed64(&ret[0], in[0]); + EncodeFixed64(&ret[8], in[1]); + EncodeFixed64(&ret[16], in[2]); + return ret; +} + +Status GetUniqueIdFromTableProperties(const TableProperties &props, + std::string *out_id) { + UniqueId64x3 tmp{}; + Status s = GetSstInternalUniqueId(props.db_id, props.db_session_id, + 
props.orig_file_number, &tmp); + if (s.ok()) { + InternalUniqueIdToExternal(&tmp); + *out_id = EncodeUniqueIdBytes(tmp); + } else { + out_id->clear(); + } + return s; +} + +std::string UniqueIdToHumanString(const std::string &id) { + // Not so efficient, but that's OK + std::string str = Slice(id).ToString(/*hex*/ true); + for (size_t i = 16; i < str.size(); i += 17) { + str.insert(i, "-"); + } + return str; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/table/unique_id_impl.h b/table/unique_id_impl.h new file mode 100644 index 000000000..8f414f7d6 --- /dev/null +++ b/table/unique_id_impl.h @@ -0,0 +1,59 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/unique_id.h" + +namespace ROCKSDB_NAMESPACE { + +using UniqueId64x3 = std::array; + +// Helper for GetUniqueIdFromTableProperties. This function can also be used +// for temporary ids for files without sufficient information in table +// properties. The internal unique id is more structured than the public +// unique id, so can be manipulated in more ways but very carefully. +// These must be long term stable to ensure GetUniqueIdFromTableProperties +// is long term stable. +Status GetSstInternalUniqueId(const std::string &db_id, + const std::string &db_session_id, + uint64_t file_number, UniqueId64x3 *out); + +// Helper for GetUniqueIdFromTableProperties. External unique ids go through +// this extra hashing layer so that prefixes of the unique id have predictable +// "full" entropy. This hashing layer is 1-to-1 on the first 128 bits and on +// the full 192 bits. +// This transformation must be long term stable to ensure +// GetUniqueIdFromTableProperties is long term stable. 
+void InternalUniqueIdToExternal(UniqueId64x3 *in_out); + +// Reverse of InternalUniqueIdToExternal mostly for testing purposes +// (demonstrably 1-to-1 on the first 128 bits and on the full 192 bits). +void ExternalUniqueIdToInternal(UniqueId64x3 *in_out); + +// Convert numerical format to byte format for public API +std::string EncodeUniqueIdBytes(const UniqueId64x3 &in); + +// Reformat a random value down to our "DB session id" format, +// which is intended to be compact and friendly for use in file names. +// `lower` is fully preserved and data is lost from `upper`. +// +// Detail: Encoded into 20 chars in base-36 ([0-9A-Z]), which is ~103 bits of +// entropy, which is enough to expect no collisions across a billion servers +// each opening DBs a million times (~2^50). Benefits vs. RFC-4122 unique id: +// * Save ~ dozen bytes per SST file +// * Shorter shared backup file names (some platforms have low limits) +// * Visually distinct from DB id format (usually RFC-4122) +std::string EncodeSessionId(uint64_t upper, uint64_t lower); + +// Reverse of EncodeSessionId. Returns NotSupported on error rather than +// Corruption because non-standard session IDs should be allowed with degraded +// functionality. 
+Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper, + uint64_t *lower); + +} // namespace ROCKSDB_NAMESPACE diff --git a/util/hash.cc b/util/hash.cc index f53aa8ff1..0f7f2edc1 100644 --- a/util/hash.cc +++ b/util/hash.cc @@ -9,7 +9,7 @@ #include "util/hash.h" -#include +#include #include "port/lang.h" #include "util/coding.h" @@ -120,4 +120,82 @@ void Hash2x64(const char* data, size_t n, uint64_t* high64, uint64_t* low64) { *low64 = h.low64; } +void Hash2x64(const char* data, size_t n, uint64_t seed, uint64_t* high64, + uint64_t* low64) { + auto h = XXH3_128bits_withSeed(data, n, seed); + *high64 = h.high64; + *low64 = h.low64; +} + +namespace { + +inline uint64_t XXH3_avalanche(uint64_t h64) { + h64 ^= h64 >> 37; + h64 *= 0x165667919E3779F9U; + h64 ^= h64 >> 32; + return h64; +} + +inline uint64_t XXH3_unavalanche(uint64_t h64) { + h64 ^= h64 >> 32; + h64 *= 0x8da8ee41d6df849U; // inverse of 0x165667919E3779F9U + h64 ^= h64 >> 37; + return h64; +} + +} // namespace + +void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed, + uint64_t* out_high64, uint64_t* out_low64) { + // Adapted from XXH3_len_9to16_128b + const uint64_t bitflipl = /*secret part*/ 0x59973f0033362349U - seed; + const uint64_t bitfliph = /*secret part*/ 0xc202797692d63d58U + seed; + Unsigned128 tmp128 = + Multiply64to128(in_low64 ^ in_high64 ^ bitflipl, 0x9E3779B185EBCA87U); + uint64_t lo = Lower64of128(tmp128); + uint64_t hi = Upper64of128(tmp128); + lo += 0x3c0000000000000U; // (len - 1) << 54 + in_high64 ^= bitfliph; + hi += in_high64 + (Lower32of64(in_high64) * uint64_t{0x85EBCA76}); + lo ^= EndianSwapValue(hi); + tmp128 = Multiply64to128(lo, 0xC2B2AE3D27D4EB4FU); + lo = Lower64of128(tmp128); + hi = Upper64of128(tmp128) + (hi * 0xC2B2AE3D27D4EB4FU); + *out_low64 = XXH3_avalanche(lo); + *out_high64 = XXH3_avalanche(hi); +} + +void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed, + uint64_t* out_high64, uint64_t* out_low64) { + 
// Inverted above (also consulting XXH3_len_9to16_128b) + const uint64_t bitflipl = /*secret part*/ 0x59973f0033362349U - seed; + const uint64_t bitfliph = /*secret part*/ 0xc202797692d63d58U + seed; + uint64_t lo = XXH3_unavalanche(in_low64); + uint64_t hi = XXH3_unavalanche(in_high64); + lo *= 0xba79078168d4baf; // inverse of 0xC2B2AE3D27D4EB4FU + hi -= Upper64of128(Multiply64to128(lo, 0xC2B2AE3D27D4EB4FU)); + hi *= 0xba79078168d4baf; // inverse of 0xC2B2AE3D27D4EB4FU + lo ^= EndianSwapValue(hi); + lo -= 0x3c0000000000000U; + lo *= 0x887493432badb37U; // inverse of 0x9E3779B185EBCA87U + hi -= Upper64of128(Multiply64to128(lo, 0x9E3779B185EBCA87U)); + uint32_t tmp32 = Lower32of64(hi) * 0xb6c92f47; // inverse of 0x85EBCA77 + hi -= tmp32; + hi = (hi & 0xFFFFFFFF00000000U) - + ((tmp32 * uint64_t{0x85EBCA76}) & 0xFFFFFFFF00000000U) + tmp32; + hi ^= bitfliph; + lo ^= hi ^ bitflipl; + *out_high64 = hi; + *out_low64 = lo; +} + +void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64, + uint64_t* out_high64, uint64_t* out_low64) { + BijectiveHash2x64(in_high64, in_low64, /*seed*/ 0, out_high64, out_low64); +} + +void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64, + uint64_t* out_high64, uint64_t* out_low64) { + BijectiveUnhash2x64(in_high64, in_low64, /*seed*/ 0, out_high64, out_low64); +} } // namespace ROCKSDB_NAMESPACE diff --git a/util/hash.h b/util/hash.h index cf1fda3f6..eafa47f34 100644 --- a/util/hash.h +++ b/util/hash.h @@ -66,6 +66,21 @@ inline uint64_t NPHash64(const char* data, size_t n) { // Convenient and equivalent version of Hash128 without depending on 128-bit // scalars void Hash2x64(const char* data, size_t n, uint64_t* high64, uint64_t* low64); +void Hash2x64(const char* data, size_t n, uint64_t seed, uint64_t* high64, + uint64_t* low64); + +// Hash 128 bits to 128 bits, guaranteed not to lose data (equivalent to +// Hash2x64 on 16 bytes little endian) +void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64, + uint64_t* 
out_high64, uint64_t* out_low64); +void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed, + uint64_t* out_high64, uint64_t* out_low64); + +// Inverse of above (mostly for testing) +void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64, + uint64_t* out_high64, uint64_t* out_low64); +void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed, + uint64_t* out_high64, uint64_t* out_low64); // Stable/persistent 32-bit hash. Moderate quality and high speed on // small inputs. diff --git a/util/hash_test.cc b/util/hash_test.cc index 231d06fce..d7e9ae781 100644 --- a/util/hash_test.cc +++ b/util/hash_test.cc @@ -14,9 +14,13 @@ #include "test_util/testharness.h" #include "util/coding.h" +#include "util/coding_lean.h" #include "util/hash128.h" #include "util/math128.h" +using ROCKSDB_NAMESPACE::BijectiveHash2x64; +using ROCKSDB_NAMESPACE::BijectiveUnhash2x64; +using ROCKSDB_NAMESPACE::DecodeFixed64; using ROCKSDB_NAMESPACE::EncodeFixed32; using ROCKSDB_NAMESPACE::GetSliceHash64; using ROCKSDB_NAMESPACE::Hash; @@ -277,9 +281,16 @@ TEST(HashTest, Hash64LargeValueSchema) { TEST(HashTest, Hash128Misc) { constexpr uint32_t kSeed = 0; // Same as GetSliceHash128 - for (char fill : {'\0', 'a', '1', '\xff'}) { + for (char fill : {'\0', 'a', '1', '\xff', 'e'}) { const size_t max_size = 1000; - const std::string str(max_size, fill); + std::string str(max_size, fill); + + if (fill == 'e') { + // Use different characters to check endianness handling + for (size_t i = 0; i < str.size(); ++i) { + str[i] += static_cast(i); + } + } for (size_t size = 0; size <= max_size; ++size) { Unsigned128 here = Hash128(str.data(), size, kSeed); @@ -293,6 +304,18 @@ TEST(HashTest, Hash128Misc) { EXPECT_EQ(Lower64of128(here), lo); EXPECT_EQ(Upper64of128(here), hi); } + if (size == 16) { + const uint64_t in_hi = DecodeFixed64(str.data() + 8); + const uint64_t in_lo = DecodeFixed64(str.data()); + uint64_t hi, lo; + BijectiveHash2x64(in_hi, in_lo, &hi, &lo); 
+ EXPECT_EQ(Lower64of128(here), lo); + EXPECT_EQ(Upper64of128(here), hi); + uint64_t un_hi, un_lo; + BijectiveUnhash2x64(hi, lo, &un_hi, &un_lo); + EXPECT_EQ(in_lo, un_lo); + EXPECT_EQ(in_hi, un_hi); + } // Upper and Lower must reconstruct hash EXPECT_EQ(here, @@ -302,7 +325,27 @@ TEST(HashTest, Hash128Misc) { // Seed changes hash value (with high probability) for (uint64_t var_seed = 1; var_seed != 0; var_seed <<= 1) { - EXPECT_NE(here, Hash128(str.data(), size, var_seed)); + Unsigned128 seeded = Hash128(str.data(), size, var_seed); + EXPECT_NE(here, seeded); + // Must match seeded Hash2x64 + { + uint64_t hi, lo; + Hash2x64(str.data(), size, var_seed, &hi, &lo); + EXPECT_EQ(Lower64of128(seeded), lo); + EXPECT_EQ(Upper64of128(seeded), hi); + } + if (size == 16) { + const uint64_t in_hi = DecodeFixed64(str.data() + 8); + const uint64_t in_lo = DecodeFixed64(str.data()); + uint64_t hi, lo; + BijectiveHash2x64(in_hi, in_lo, var_seed, &hi, &lo); + EXPECT_EQ(Lower64of128(seeded), lo); + EXPECT_EQ(Upper64of128(seeded), hi); + uint64_t un_hi, un_lo; + BijectiveUnhash2x64(hi, lo, var_seed, &un_hi, &un_lo); + EXPECT_EQ(in_lo, un_lo); + EXPECT_EQ(in_hi, un_hi); + } } // Size changes hash value (with high probability) diff --git a/util/random.h b/util/random.h index 7e1350f06..16162f67b 100644 --- a/util/random.h +++ b/util/random.h @@ -9,6 +9,7 @@ #pragma once #include + #include #include @@ -60,6 +61,8 @@ class Random { return seed_; } + uint64_t Next64() { return (uint64_t{Next()} << 32) | Next(); } + // Returns a uniformly distributed value in the range [0..n-1] // REQUIRES: n > 0 uint32_t Uniform(int n) { return Next() % n; } diff --git a/util/string_util.h b/util/string_util.h index bb4a88514..7794dbb06 100644 --- a/util/string_util.h +++ b/util/string_util.h @@ -46,12 +46,41 @@ inline void PutBaseChars(char** buf, size_t n, uint64_t v, bool uppercase) { const char* digitChars = uppercase ? 
// Parse n digits from *buf in base kBase to *v and advance *buf to the
// position after what was read. Digits above 9 may be upper or lower case.
//
// On success, true is returned. On failure, false is returned, *buf is left
// at the first bad character, and *v holds exactly the value of the digits
// accepted before it (it is not scaled for the rejected character).
// Overflow is not checked but the result is accurate mod 2^64.
//
// Requires the starting value of *v to be zero or previously accumulated
// parsed digits, i.e.
//   ParseBaseChars(&b, n, &v);
// is equivalent to n calls to
//   ParseBaseChars(&b, 1, &v);
template <int kBase>
inline bool ParseBaseChars(const char** buf, size_t n, uint64_t* v) {
  while (n) {
    const char c = **buf;
    // Validate the character first so a bad digit leaves *v untouched.
    int digit;
    if (c >= '0' && (kBase >= 10 ? c <= '9' : c < '0' + kBase)) {
      digit = c - '0';
    } else if (kBase > 10 && c >= 'A' && c < 'A' + kBase - 10) {
      digit = c - 'A' + 10;
    } else if (kBase > 10 && c >= 'a' && c < 'a' + kBase - 10) {
      digit = c - 'a' + 10;
    } else {
      return false;
    }
    *v = *v * static_cast<uint64_t>(kBase) + static_cast<uint64_t>(digit);
    --n;
    ++*buf;
  }
  return true;
}