diff --git a/HISTORY.md b/HISTORY.md index 4912d6480..0ce45fc65 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,6 +2,7 @@ ## Unreleased ### Bug Fixes * Fixed a bug in handling file rename error in distributed/network file systems when the server succeeds but client returns error. The bug can cause CURRENT file to point to non-existing MANIFEST file, thus DB cannot be opened. +* Fixed a bug where ingested files were written with incorrect boundary key metadata. In rare cases this could have led to a level's files being wrongly ordered and queries for the boundary keys returning wrong results. * Fixed a data race between insertion into memtables and the retrieval of the DB properties `rocksdb.cur-size-active-mem-table`, `rocksdb.cur-size-all-mem-tables`, and `rocksdb.size-all-mem-tables`. ## 6.20.0 (04/16/2021) diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc index f61f78df0..a11a44b99 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -1542,6 +1542,44 @@ TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) { ASSERT_EQ(2, NumTableFilesAtLevel(0)); } +TEST_F(ExternalSSTFileBasicTest, IngestFileAfterDBPut) { + // Repro https://github.com/facebook/rocksdb/issues/6245. + // Flush three files to L0. Ingest one more file to trigger L0->L1 compaction + // via trivial move. The bug happened when L1 files were incorrectly sorted + // resulting in an old value for "k" returned by `Get()`. + Options options = CurrentOptions(); + + ASSERT_OK(Put("k", "a")); + Flush(); + ASSERT_OK(Put("k", "a")); + Flush(); + ASSERT_OK(Put("k", "a")); + Flush(); + SstFileWriter sst_file_writer(EnvOptions(), options); + + // Current file size should be 0 after sst_file_writer init and before open a + // file. + ASSERT_EQ(sst_file_writer.FileSize(), 0); + + std::string file1 = sst_files_dir_ + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + ASSERT_OK(sst_file_writer.Put("k", "b")); + + ExternalSstFileInfo file1_info; + Status s = sst_file_writer.Finish(&file1_info); + ASSERT_OK(s) << s.ToString(); + + // Current file size should be non-zero after success write. + ASSERT_GT(sst_file_writer.FileSize(), 0); + + IngestExternalFileOptions ifo; + s = db_->IngestExternalFile({file1}, ifo); + ASSERT_OK(s); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ(Get("k"), "b"); +} + INSTANTIATE_TEST_CASE_P(ExternalSSTFileBasicTest, ExternalSSTFileBasicTest, testing::Values(std::make_tuple(true, true), std::make_tuple(true, false), diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 761b2419f..ff5450138 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -367,9 +367,32 @@ Status ExternalSstFileIngestionJob::Run() { super_version, force_global_seqno, cfd_->ioptions()->compaction_style, last_seqno, &f, &assigned_seqno); } + + // Modify the smallest/largest internal key to include the sequence number + // that we just learned. Only overwrite sequence number zero. There could + // be a nonzero sequence number already to indicate a range tombstone's + // exclusive endpoint. + ParsedInternalKey smallest_parsed, largest_parsed; + if (status.ok()) { + status = ParseInternalKey(*f.smallest_internal_key.rep(), + &smallest_parsed, false /* log_err_key */); + } + if (status.ok()) { + status = ParseInternalKey(*f.largest_internal_key.rep(), &largest_parsed, + false /* log_err_key */); + } if (!status.ok()) { return status; } + if (smallest_parsed.sequence == 0) { + UpdateInternalKey(f.smallest_internal_key.rep(), assigned_seqno, + smallest_parsed.type); + } + if (largest_parsed.sequence == 0) { + UpdateInternalKey(f.largest_internal_key.rep(), assigned_seqno, + largest_parsed.type); + } + status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno); TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run", &assigned_seqno);