// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/version_edit.h" #include "db/blob_index.h" #include "db/version_set.h" #include "logging/event_logger.h" #include "rocksdb/slice.h" #include "test_util/sync_point.h" #include "util/coding.h" #include "util/string_util.h" namespace rocksdb { // Mask for an identified tag from the future which can be safely ignored. const uint32_t kTagSafeIgnoreMask = 1 << 13; // Tag numbers for serialized VersionEdit. These numbers are written to // disk and should not be changed. The number should be forward compatible so // users can down-grade RocksDB safely. A future Tag is ignored by doing '&' // between Tag and kTagSafeIgnoreMask field. enum Tag : uint32_t { kComparator = 1, kLogNumber = 2, kNextFileNumber = 3, kLastSequence = 4, kCompactPointer = 5, kDeletedFile = 6, kNewFile = 7, // 8 was used for large value refs kPrevLogNumber = 9, kMinLogNumberToKeep = 10, // Ignore-able field kDbId = kTagSafeIgnoreMask + 1, // these are new formats divergent from open source leveldb kNewFile2 = 100, kNewFile3 = 102, kNewFile4 = 103, // 4th (the latest) format version of adding files kColumnFamily = 200, // specify column family for version edit kColumnFamilyAdd = 201, kColumnFamilyDrop = 202, kMaxColumnFamily = 203, kInAtomicGroup = 300, }; enum CustomTag : uint32_t { kTerminate = 1, // The end of customized fields kNeedCompaction = 2, // Since Manifest is not entirely currently forward-compatible, and the only // forward-compatible part is the CutsomtTag of kNewFile, we currently encode // kMinLogNumberToKeep as part of a CustomTag as a hack. This should be // removed when manifest becomes forward-comptabile. kMinLogNumberToKeepHack = 3, kOldestBlobFileNumber = 4, kOldestAncesterTime = 5, kPathId = 65, }; // If this bit for the custom tag is set, opening DB should fail if // we don't know this field. uint32_t kCustomTagNonSafeIgnoreMask = 1 << 6; uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) { assert(number <= kFileNumberMask); return number | (path_id * (kFileNumberMask + 1)); } void FileMetaData::UpdateBoundaries(const Slice& key, const Slice& value, SequenceNumber seqno, ValueType value_type) { if (smallest.size() == 0) { smallest.DecodeFrom(key); } largest.DecodeFrom(key); fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); fd.largest_seqno = std::max(fd.largest_seqno, seqno); #ifndef ROCKSDB_LITE if (value_type == kTypeBlobIndex) { BlobIndex blob_index; const Status s = blob_index.DecodeFrom(value); if (!s.ok()) { return; } if (blob_index.IsInlined()) { return; } if (blob_index.HasTTL()) { return; } // Paranoid check: this should not happen because BlobDB numbers the blob // files starting from 1. if (blob_index.file_number() == kInvalidBlobFileNumber) { return; } if (oldest_blob_file_number == kInvalidBlobFileNumber || oldest_blob_file_number > blob_index.file_number()) { oldest_blob_file_number = blob_index.file_number(); } } #else (void)value; (void)value_type; #endif } void VersionEdit::Clear() { db_id_.clear(); comparator_.clear(); max_level_ = 0; log_number_ = 0; prev_log_number_ = 0; last_sequence_ = 0; next_file_number_ = 0; max_column_family_ = 0; min_log_number_to_keep_ = 0; has_db_id_ = false; has_comparator_ = false; has_log_number_ = false; has_prev_log_number_ = false; has_next_file_number_ = false; has_last_sequence_ = false; has_max_column_family_ = false; has_min_log_number_to_keep_ = false; deleted_files_.clear(); new_files_.clear(); column_family_ = 0; is_column_family_add_ = 0; is_column_family_drop_ = 0; column_family_name_.clear(); is_in_atomic_group_ = false; remaining_entries_ = 0; } bool VersionEdit::EncodeTo(std::string* dst) const { if (has_db_id_) { PutVarint32(dst, kDbId); PutLengthPrefixedSlice(dst, db_id_); } if (has_comparator_) { PutVarint32(dst, kComparator); PutLengthPrefixedSlice(dst, comparator_); } if (has_log_number_) { PutVarint32Varint64(dst, kLogNumber, log_number_); } if (has_prev_log_number_) { PutVarint32Varint64(dst, kPrevLogNumber, prev_log_number_); } if (has_next_file_number_) { PutVarint32Varint64(dst, kNextFileNumber, next_file_number_); } if (has_last_sequence_) { PutVarint32Varint64(dst, kLastSequence, last_sequence_); } if (has_max_column_family_) { PutVarint32Varint32(dst, kMaxColumnFamily, max_column_family_); } for (const auto& deleted : deleted_files_) { PutVarint32Varint32Varint64(dst, kDeletedFile, deleted.first /* level */, deleted.second /* file number */); } bool min_log_num_written = false; for (size_t i = 0; i < new_files_.size(); i++) { const FileMetaData& f = new_files_[i].second; if (!f.smallest.Valid() || !f.largest.Valid()) { return false; } PutVarint32(dst, kNewFile4); PutVarint32Varint64(dst, new_files_[i].first /* level */, f.fd.GetNumber()); PutVarint64(dst, f.fd.GetFileSize()); PutLengthPrefixedSlice(dst, f.smallest.Encode()); PutLengthPrefixedSlice(dst, f.largest.Encode()); PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno); // Customized fields' format: // +-----------------------------+ // | 1st field's tag (varint32) | // +-----------------------------+ // | 1st field's size (varint32) | // +-----------------------------+ // | bytes for 1st field | // | (based on size decoded) | // +-----------------------------+ // | | // | ...... | // | | // +-----------------------------+ // | last field's size (varint32)| // +-----------------------------+ // | bytes for last field | // | (based on size decoded) | // +-----------------------------+ // | terminating tag (varint32) | // +-----------------------------+ // // Customized encoding for fields: // tag kPathId: 1 byte as path_id // tag kNeedCompaction: // now only can take one char value 1 indicating need-compaction // PutVarint32(dst, CustomTag::kOldestAncesterTime); std::string varint_oldest_ancester_time; PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time); TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime", &varint_oldest_ancester_time); PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time)); if (f.fd.GetPathId() != 0) { PutVarint32(dst, CustomTag::kPathId); char p = static_cast(f.fd.GetPathId()); PutLengthPrefixedSlice(dst, Slice(&p, 1)); } if (f.marked_for_compaction) { PutVarint32(dst, CustomTag::kNeedCompaction); char p = static_cast(1); PutLengthPrefixedSlice(dst, Slice(&p, 1)); } if (has_min_log_number_to_keep_ && !min_log_num_written) { PutVarint32(dst, CustomTag::kMinLogNumberToKeepHack); std::string varint_log_number; PutFixed64(&varint_log_number, min_log_number_to_keep_); PutLengthPrefixedSlice(dst, Slice(varint_log_number)); min_log_num_written = true; } if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { PutVarint32(dst, CustomTag::kOldestBlobFileNumber); std::string oldest_blob_file_number; PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number); PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number)); } TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields", dst); PutVarint32(dst, CustomTag::kTerminate); } // 0 is default and does not need to be explicitly written if (column_family_ != 0) { PutVarint32Varint32(dst, kColumnFamily, column_family_); } if (is_column_family_add_) { PutVarint32(dst, kColumnFamilyAdd); PutLengthPrefixedSlice(dst, Slice(column_family_name_)); } if (is_column_family_drop_) { PutVarint32(dst, kColumnFamilyDrop); } if (is_in_atomic_group_) { PutVarint32(dst, kInAtomicGroup); PutVarint32(dst, remaining_entries_); } return true; } static bool GetInternalKey(Slice* input, InternalKey* dst) { Slice str; if (GetLengthPrefixedSlice(input, &str)) { dst->DecodeFrom(str); return dst->Valid(); } else { return false; } } bool VersionEdit::GetLevel(Slice* input, int* level, const char** /*msg*/) { uint32_t v; if (GetVarint32(input, &v)) { *level = v; if (max_level_ < *level) { max_level_ = *level; } return true; } else { return false; } } const char* VersionEdit::DecodeNewFile4From(Slice* input) { const char* msg = nullptr; int level; FileMetaData f; uint64_t number; uint32_t path_id = 0; uint64_t file_size; SequenceNumber smallest_seqno; SequenceNumber largest_seqno; // Since this is the only forward-compatible part of the code, we hack new // extension into this record. When we do, we set this boolean to distinguish // the record from the normal NewFile records. if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) && GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) && GetInternalKey(input, &f.largest) && GetVarint64(input, &smallest_seqno) && GetVarint64(input, &largest_seqno)) { // See comments in VersionEdit::EncodeTo() for format of customized fields while (true) { uint32_t custom_tag; Slice field; if (!GetVarint32(input, &custom_tag)) { return "new-file4 custom field"; } if (custom_tag == kTerminate) { break; } if (!GetLengthPrefixedSlice(input, &field)) { return "new-file4 custom field length prefixed slice error"; } switch (custom_tag) { case kPathId: if (field.size() != 1) { return "path_id field wrong size"; } path_id = field[0]; if (path_id > 3) { return "path_id wrong vaue"; } break; case kOldestAncesterTime: if (!GetVarint64(&field, &f.oldest_ancester_time)) { return "invalid oldest ancester time"; } break; case kNeedCompaction: if (field.size() != 1) { return "need_compaction field wrong size"; } f.marked_for_compaction = (field[0] == 1); break; case kMinLogNumberToKeepHack: // This is a hack to encode kMinLogNumberToKeep in a // forward-compatible fashion. if (!GetFixed64(&field, &min_log_number_to_keep_)) { return "deleted log number malformatted"; } has_min_log_number_to_keep_ = true; break; case kOldestBlobFileNumber: if (!GetVarint64(&field, &f.oldest_blob_file_number)) { return "invalid oldest blob file number"; } break; default: if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) { // Should not proceed if cannot understand it return "new-file4 custom field not supported"; } break; } } } else { return "new-file4 entry"; } f.fd = FileDescriptor(number, path_id, file_size, smallest_seqno, largest_seqno); new_files_.push_back(std::make_pair(level, f)); return nullptr; } Status VersionEdit::DecodeFrom(const Slice& src) { Clear(); Slice input = src; const char* msg = nullptr; uint32_t tag; // Temporary storage for parsing int level; FileMetaData f; Slice str; InternalKey key; while (msg == nullptr && GetVarint32(&input, &tag)) { switch (tag) { case kDbId: if (GetLengthPrefixedSlice(&input, &str)) { db_id_ = str.ToString(); has_db_id_ = true; } else { msg = "db id"; } break; case kComparator: if (GetLengthPrefixedSlice(&input, &str)) { comparator_ = str.ToString(); has_comparator_ = true; } else { msg = "comparator name"; } break; case kLogNumber: if (GetVarint64(&input, &log_number_)) { has_log_number_ = true; } else { msg = "log number"; } break; case kPrevLogNumber: if (GetVarint64(&input, &prev_log_number_)) { has_prev_log_number_ = true; } else { msg = "previous log number"; } break; case kNextFileNumber: if (GetVarint64(&input, &next_file_number_)) { has_next_file_number_ = true; } else { msg = "next file number"; } break; case kLastSequence: if (GetVarint64(&input, &last_sequence_)) { has_last_sequence_ = true; } else { msg = "last sequence number"; } break; case kMaxColumnFamily: if (GetVarint32(&input, &max_column_family_)) { has_max_column_family_ = true; } else { msg = "max column family"; } break; case kMinLogNumberToKeep: if (GetVarint64(&input, &min_log_number_to_keep_)) { has_min_log_number_to_keep_ = true; } else { msg = "min log number to kee"; } break; case kCompactPointer: if (GetLevel(&input, &level, &msg) && GetInternalKey(&input, &key)) { // we don't use compact pointers anymore, // but we should not fail if they are still // in manifest } else { if (!msg) { msg = "compaction pointer"; } } break; case kDeletedFile: { uint64_t number; if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number)) { deleted_files_.insert(std::make_pair(level, number)); } else { if (!msg) { msg = "deleted file"; } } break; } case kNewFile: { uint64_t number; uint64_t file_size; if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) && GetVarint64(&input, &file_size) && GetInternalKey(&input, &f.smallest) && GetInternalKey(&input, &f.largest)) { f.fd = FileDescriptor(number, 0, file_size); new_files_.push_back(std::make_pair(level, f)); } else { if (!msg) { msg = "new-file entry"; } } break; } case kNewFile2: { uint64_t number; uint64_t file_size; SequenceNumber smallest_seqno; SequenceNumber largest_seqno; if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) && GetVarint64(&input, &file_size) && GetInternalKey(&input, &f.smallest) && GetInternalKey(&input, &f.largest) && GetVarint64(&input, &smallest_seqno) && GetVarint64(&input, &largest_seqno)) { f.fd = FileDescriptor(number, 0, file_size, smallest_seqno, largest_seqno); new_files_.push_back(std::make_pair(level, f)); } else { if (!msg) { msg = "new-file2 entry"; } } break; } case kNewFile3: { uint64_t number; uint32_t path_id; uint64_t file_size; SequenceNumber smallest_seqno; SequenceNumber largest_seqno; if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) && GetVarint32(&input, &path_id) && GetVarint64(&input, &file_size) && GetInternalKey(&input, &f.smallest) && GetInternalKey(&input, &f.largest) && GetVarint64(&input, &smallest_seqno) && GetVarint64(&input, &largest_seqno)) { f.fd = FileDescriptor(number, path_id, file_size, smallest_seqno, largest_seqno); new_files_.push_back(std::make_pair(level, f)); } else { if (!msg) { msg = "new-file3 entry"; } } break; } case kNewFile4: { msg = DecodeNewFile4From(&input); break; } case kColumnFamily: if (!GetVarint32(&input, &column_family_)) { if (!msg) { msg = "set column family id"; } } break; case kColumnFamilyAdd: if (GetLengthPrefixedSlice(&input, &str)) { is_column_family_add_ = true; column_family_name_ = str.ToString(); } else { if (!msg) { msg = "column family add"; } } break; case kColumnFamilyDrop: is_column_family_drop_ = true; break; case kInAtomicGroup: is_in_atomic_group_ = true; if (!GetVarint32(&input, &remaining_entries_)) { if (!msg) { msg = "remaining entries"; } } break; default: if (tag & kTagSafeIgnoreMask) { // Tag from future which can be safely ignored. // The next field must be the length of the entry. uint32_t field_len; if (!GetVarint32(&input, &field_len) || static_cast(field_len) > input.size()) { if (!msg) { msg = "safely ignoreable tag length error"; } } else { input.remove_prefix(static_cast(field_len)); } } else { msg = "unknown tag"; } break; } } if (msg == nullptr && !input.empty()) { msg = "invalid tag"; } Status result; if (msg != nullptr) { result = Status::Corruption("VersionEdit", msg); } return result; } std::string VersionEdit::DebugString(bool hex_key) const { std::string r; r.append("VersionEdit {"); if (has_db_id_) { r.append("\n DB ID: "); r.append(db_id_); } if (has_comparator_) { r.append("\n Comparator: "); r.append(comparator_); } if (has_log_number_) { r.append("\n LogNumber: "); AppendNumberTo(&r, log_number_); } if (has_prev_log_number_) { r.append("\n PrevLogNumber: "); AppendNumberTo(&r, prev_log_number_); } if (has_next_file_number_) { r.append("\n NextFileNumber: "); AppendNumberTo(&r, next_file_number_); } if (has_min_log_number_to_keep_) { r.append("\n MinLogNumberToKeep: "); AppendNumberTo(&r, min_log_number_to_keep_); } if (has_last_sequence_) { r.append("\n LastSeq: "); AppendNumberTo(&r, last_sequence_); } for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); iter != deleted_files_.end(); ++iter) { r.append("\n DeleteFile: "); AppendNumberTo(&r, iter->first); r.append(" "); AppendNumberTo(&r, iter->second); } for (size_t i = 0; i < new_files_.size(); i++) { const FileMetaData& f = new_files_[i].second; r.append("\n AddFile: "); AppendNumberTo(&r, new_files_[i].first); r.append(" "); AppendNumberTo(&r, f.fd.GetNumber()); r.append(" "); AppendNumberTo(&r, f.fd.GetFileSize()); r.append(" "); r.append(f.smallest.DebugString(hex_key)); r.append(" .. "); r.append(f.largest.DebugString(hex_key)); if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { r.append(" blob_file:"); AppendNumberTo(&r, f.oldest_blob_file_number); } r.append(" oldest_ancester_time:"); AppendNumberTo(&r, f.oldest_ancester_time); } r.append("\n ColumnFamily: "); AppendNumberTo(&r, column_family_); if (is_column_family_add_) { r.append("\n ColumnFamilyAdd: "); r.append(column_family_name_); } if (is_column_family_drop_) { r.append("\n ColumnFamilyDrop"); } if (has_max_column_family_) { r.append("\n MaxColumnFamily: "); AppendNumberTo(&r, max_column_family_); } if (is_in_atomic_group_) { r.append("\n AtomicGroup: "); AppendNumberTo(&r, remaining_entries_); r.append(" entries remains"); } r.append("\n}\n"); return r; } std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { JSONWriter jw; jw << "EditNumber" << edit_num; if (has_db_id_) { jw << "DB ID" << db_id_; } if (has_comparator_) { jw << "Comparator" << comparator_; } if (has_log_number_) { jw << "LogNumber" << log_number_; } if (has_prev_log_number_) { jw << "PrevLogNumber" << prev_log_number_; } if (has_next_file_number_) { jw << "NextFileNumber" << next_file_number_; } if (has_last_sequence_) { jw << "LastSeq" << last_sequence_; } if (!deleted_files_.empty()) { jw << "DeletedFiles"; jw.StartArray(); for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); iter != deleted_files_.end(); ++iter) { jw.StartArrayedObject(); jw << "Level" << iter->first; jw << "FileNumber" << iter->second; jw.EndArrayedObject(); } jw.EndArray(); } if (!new_files_.empty()) { jw << "AddedFiles"; jw.StartArray(); for (size_t i = 0; i < new_files_.size(); i++) { jw.StartArrayedObject(); jw << "Level" << new_files_[i].first; const FileMetaData& f = new_files_[i].second; jw << "FileNumber" << f.fd.GetNumber(); jw << "FileSize" << f.fd.GetFileSize(); jw << "SmallestIKey" << f.smallest.DebugString(hex_key); jw << "LargestIKey" << f.largest.DebugString(hex_key); if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { jw << "OldestBlobFile" << f.oldest_blob_file_number; } jw.EndArrayedObject(); } jw.EndArray(); } jw << "ColumnFamily" << column_family_; if (is_column_family_add_) { jw << "ColumnFamilyAdd" << column_family_name_; } if (is_column_family_drop_) { jw << "ColumnFamilyDrop" << column_family_name_; } if (has_max_column_family_) { jw << "MaxColumnFamily" << max_column_family_; } if (has_min_log_number_to_keep_) { jw << "MinLogNumberToKeep" << min_log_number_to_keep_; } if (is_in_atomic_group_) { jw << "AtomicGroup" << remaining_entries_; } jw.EndObject(); return jw.Get(); } } // namespace rocksdb