diff --git a/db/db_impl.cc b/db/db_impl.cc
index 846122f60..ce74d62a9 100644
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -1508,6 +1508,16 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
     earliest_snapshot = compact->existing_snapshots[0];
   }
 
+  // Is this compaction producing files at the bottommost level?
+  bool bottommost_level = true;
+  for (int i = compact->compaction->level() + 2;
+       i < versions_->NumberLevels(); i++) {
+    if (versions_->NumLevelFiles(i) > 0) {
+      bottommost_level = false;
+      break;
+    }
+  }
+
   // Allocate the output file numbers before we release the lock
   AllocateCompactionOutputFileNumbers(compact);
 
@@ -1621,14 +1631,27 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
 #if 0
     Log(options_.info_log,
         " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
-        "%d smallest_snapshot: %d",
+        "%d smallest_snapshot: %d level: %d bottommost %d",
         ikey.user_key.ToString().c_str(),
         (int)ikey.sequence, ikey.type, kTypeValue, drop,
         compact->compaction->IsBaseLevelForKey(ikey.user_key),
-        (int)last_sequence_for_key, (int)compact->smallest_snapshot);
+        (int)last_sequence_for_key, (int)earliest_snapshot,
+        compact->compaction->level(), bottommost_level);
 #endif
 
     if (!drop) {
+
+      // Zeroing out the sequence number leads to better compression.
+      // If this is the bottommost level (no files in lower levels)
+      // and the earliest snapshot is larger than this seqno
+      // then we can squash the seqno to zero.
+      if (bottommost_level && ikey.sequence < earliest_snapshot) {
+        // NOTE(review): presumably a deletion marker this old at the
+        // bottommost level has already been dropped above -- confirm.
+        assert(ikey.type != kTypeDeletion);
+        UpdateInternalKey(key, (uint64_t)0, ikey.type);
+      }
+
       // Open output file if necessary
       if (compact->builder == NULL) {
         status = OpenCompactionOutputFile(compact);
diff --git a/db/db_test.cc b/db/db_test.cc
index 37a9d2be8..f74b91110 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -1309,7 +1309,6 @@ TEST(DBTest, RepeatedWritesToSameKey) {
   for (int i = 0; i < 5 * kMaxFiles; i++) {
     Put("key", value);
     ASSERT_LE(TotalTableFiles(), kMaxFiles);
-    fprintf(stderr, "after %d: %d files\n", int(i+1), TotalTableFiles());
   }
 }
 
@@ -1372,6 +1371,30 @@ TEST(DBTest, CompactionFilter) {
   ASSERT_NE(NumTableFilesAtLevel(2), 0);
   cfilter_count = 0;
 
+  // All the files are in the lowest level.
+  // Verify that all but the 100001st record
+  // has sequence number zero. The 100001st record
+  // is at the tip of this snapshot and cannot
+  // be zeroed out.
+  int count = 0;
+  int total = 0;
+  Iterator* iter = dbfull()->TEST_NewInternalIterator();
+  iter->SeekToFirst();
+  ASSERT_EQ(iter->status().ok(), true);
+  while (iter->Valid()) {
+    ParsedInternalKey ikey;
+    ikey.sequence = -1;
+    ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+    total++;
+    if (ikey.sequence != 0) {
+      count++;
+    }
+    iter->Next();
+  }
+  ASSERT_EQ(total, 100001);
+  ASSERT_EQ(count, 1);
+  delete iter;
+
   // overwrite all the 100K+1 keys once again.
   for (int i = 0; i < 100001; i++) {
     char key[100];
@@ -1427,10 +1450,27 @@ TEST(DBTest, CompactionFilter) {
   // 100001th key is left in the db. The 100001th key
   // is part of the default-most-current snapshot and
   // cannot be deleted.
-  Iterator* iter = db_->NewIterator(ReadOptions());
+  iter = db_->NewIterator(ReadOptions());
   iter->SeekToFirst();
-  int count = 0;
+  count = 0;
+  while (iter->Valid()) {
+    count++;
+    iter->Next();
+  }
+  ASSERT_EQ(count, 1);
+  delete iter;
+
+  // The sequence number of the remaining record
+  // is not zeroed out even though it is at the
+  // level Lmax because this record is at the tip
+  count = 0;
+  iter = dbfull()->TEST_NewInternalIterator();
+  iter->SeekToFirst();
+  ASSERT_EQ(iter->status().ok(), true);
   while (iter->Valid()) {
+    ParsedInternalKey ikey;
+    ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+    ASSERT_NE(ikey.sequence, 0);
     count++;
     iter->Next();
   }
@@ -2144,7 +2184,6 @@ TEST(DBTest, NonWritableFileSystem)
   std::string big(100000, 'x');
   int errors = 0;
   for (int i = 0; i < 20; i++) {
-    fprintf(stderr, "iter %d; errors %d\n", i, errors);
     if (!Put("foo", big).ok()) {
       errors++;
       env_->SleepForMicroseconds(100000);
diff --git a/db/dbformat.h b/db/dbformat.h
index 66f0e76fa..ceedc0438 100644
--- a/db/dbformat.h
+++ b/db/dbformat.h
@@ -157,6 +157,24 @@ inline bool ParseInternalKey(const Slice& internal_key,
   return (c <= static_cast<unsigned char>(kTypeValue));
 }
 
+// Overwrite, in place, the trailing 8 bytes of internal_key with the
+// fixed64 encoding of (seq << 8) | t.
+//
+// REQUIRES: internal_key.size() >= 8 and seq fits in 56 bits.
+// NOTE: this writes through internal_key.data() even though the Slice is
+// passed by const reference, so the caller must own a writable buffer
+// behind the Slice.
+inline void UpdateInternalKey(const Slice& internal_key,
+                              uint64_t seq, ValueType t) {
+  const size_t n = internal_key.size();
+  assert(n >= 8);
+  // The left shift below would silently discard the top 8 bits of seq.
+  assert((seq >> 56) == 0);
+  char* seqtype = const_cast<char*>(internal_key.data()) + n - 8;
+  uint64_t newval = (seq << 8) | t;
+  EncodeFixed64(seqtype, newval);
+}
+
 // A helper class useful for DBImpl::Get()
 class LookupKey {
  public: