rocksdb/db/db_iter.cc

// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/db_iter.h"

#include "db/filename.h"
#include "db/dbformat.h"
#include "include/env.h"
#include "include/iterator.h"
#include "port/port.h"
#include "util/logging.h"
#include "util/mutexlock.h"

namespace leveldb {

#if 0
static void DumpInternalIter(Iterator* iter) {
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    ParsedInternalKey k;
    if (!ParseInternalKey(iter->key(), &k)) {
      fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str());
    } else {
      fprintf(stderr, "@ '%s'\n", k.DebugString().c_str());
    }
  }
}
#endif

namespace {

// Memtables and sstables that make the DB representation contain
// (userkey,seq,type) => uservalue entries.  DBIter
// combines multiple entries for the same userkey found in the DB
// representation into a single entry while accounting for sequence
// numbers, deletion markers, overwrites, etc.
class DBIter: public Iterator {
 public:
  DBIter(const std::string* dbname, Env* env,
         const Comparator* cmp, Iterator* iter, SequenceNumber s)
      : dbname_(dbname),
        env_(env),
        user_comparator_(cmp),
        iter_(iter),
        sequence_(s),
        large_(NULL),
        valid_(false) {
  }
  virtual ~DBIter() {
    delete iter_;
    delete large_;
  }
  virtual bool Valid() const { return valid_; }
  virtual Slice key() const {
    assert(valid_);
    return key_;
  }
  virtual Slice value() const {
    assert(valid_);
    if (large_ == NULL) {
      return value_;
    } else {
      MutexLock l(&large_->mutex);
      if (!large_->produced) {
        ReadIndirectValue();
      }
      return large_->value;
    }
  }

  virtual void Next() {
    assert(valid_);
    // iter_ is already positioned past DBIter::key()
    FindNextUserEntry();
  }

  virtual void Prev() {
    assert(valid_);
    bool ignored;
    ScanUntilBeforeCurrentKey(&ignored);
    FindPrevUserEntry();
  }

  virtual void Seek(const Slice& target) {
    ParsedInternalKey ikey(target, sequence_, kValueTypeForSeek);
    std::string tmp;
    AppendInternalKey(&tmp, ikey);
    iter_->Seek(tmp);
    FindNextUserEntry();
  }
  virtual void SeekToFirst() {
    iter_->SeekToFirst();
    FindNextUserEntry();
  }

  virtual void SeekToLast();

  virtual Status status() const {
    if (status_.ok()) {
      if (large_ != NULL && !large_->status.ok()) return large_->status;
      return iter_->status();
    } else {
      return status_;
    }
  }

 private:
  void FindNextUserEntry();
  void FindPrevUserEntry();
  void SaveKey(const Slice& k) { key_.assign(k.data(), k.size()); }
  void SaveValue(const Slice& v) {
    if (value_.capacity() > v.size() + 1048576) {
      std::string empty;
      swap(empty, value_);
    }
    value_.assign(v.data(), v.size());
  }
  bool ParseKey(ParsedInternalKey* key);
  void SkipPast(const Slice& k);
  void ScanUntilBeforeCurrentKey(bool* found_live);

  void ReadIndirectValue() const;

  struct Large {
    port::Mutex mutex;
    std::string value;
    bool produced;
    Status status;
  };

  const std::string* const dbname_;
  Env* const env_;

  const Comparator* const user_comparator_;

  // iter_ is positioned just past current entry for DBIter if valid_
  Iterator* const iter_;

  SequenceNumber const sequence_;
  Status status_;
  std::string key_;                  // Always a user key
  std::string value_;
  Large* large_;      // Non-NULL if value is an indirect reference
  bool valid_;

  // No copying allowed
  DBIter(const DBIter&);
  void operator=(const DBIter&);
};

inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
  if (!ParseInternalKey(iter_->key(), ikey)) {
    status_ = Status::Corruption("corrupted internal key in DBIter");
    return false;
  } else {
    return true;
  }
}

void DBIter::FindNextUserEntry() {
  if (large_ != NULL) {
    if (status_.ok() && !large_->status.ok()) {
      status_ = large_->status;
    }
    delete large_;
    large_ = NULL;
  }
  while (iter_->Valid()) {
    ParsedInternalKey ikey;
    if (!ParseKey(&ikey)) {
      // Skip past corrupted entry
      iter_->Next();
      continue;
    }
    if (ikey.sequence > sequence_) {
      // Ignore entries newer than the snapshot
      iter_->Next();
      continue;
    }

    switch (ikey.type) {
      case kTypeDeletion:
        SaveKey(ikey.user_key);  // Make local copy for use by SkipPast()
        iter_->Next();
        SkipPast(key_);
        // Do not return deleted entries.  Instead keep looping.
        break;

      case kTypeValue:
        SaveKey(ikey.user_key);
        SaveValue(iter_->value());
        iter_->Next();
        SkipPast(key_);
        // Yield the value we just found.
        valid_ = true;
        return;

      case kTypeLargeValueRef:
        SaveKey(ikey.user_key);
        // Save the large value ref as value_, and read it lazily on a call
        // to value()
        SaveValue(iter_->value());
        large_ = new Large;
        large_->produced = false;
        iter_->Next();
        SkipPast(key_);
        // Yield the value we just found.
        valid_ = true;
        return;
    }
  }
  valid_ = false;
  key_.clear();
  value_.clear();
  assert(large_ == NULL);
}

void DBIter::SkipPast(const Slice& k) {
  while (iter_->Valid()) {
    ParsedInternalKey ikey;
    // Note that if we cannot parse an internal key, we keep looping
    // so that if we have a run like the following:
    //     <x,100,v> => value100
    //     <corrupted entry for user key x>
    //     <x,50,v> => value50
    // we will skip over the corrupted entry as well as value50.
    if (ParseKey(&ikey) && user_comparator_->Compare(ikey.user_key, k) != 0) {
      break;
    }
    iter_->Next();
  }
}

void DBIter::SeekToLast() {
  // Position iter_ at the last uncorrupted user key and then
  // let FindPrevUserEntry() do the heavy lifting to find
  // a user key that is live.
  iter_->SeekToLast();
  ParsedInternalKey current;
  while (iter_->Valid() && !ParseKey(&current)) {
    iter_->Prev();
  }
  if (iter_->Valid()) {
    SaveKey(current.user_key);
  }
  FindPrevUserEntry();
}

// Let X be the user key at which iter_ is currently positioned.
// Adjust DBIter to point at the last entry with a key <= X that
// has a live value.
void DBIter::FindPrevUserEntry() {
  // Consider the following example:
  //
  //     A@540
  //     A@400
  //
  //     B@300
  //     B@200
  //     B@100        <- iter_
  //
  //     C@301
  //     C@201
  //
  // The comments marked "(first iteration)" below relate what happens
  // for the preceding example in the first iteration of the while loop
  // below.  There may be more than one iteration either if there are
  // no live values for B, or if there is a corruption.
  while (iter_->Valid()) {
    std::string saved = key_;
    bool found_live;
    ScanUntilBeforeCurrentKey(&found_live);
    // (first iteration) iter_ at A@400
    if (found_live) {
      // Step forward into range of entries with user key >= saved
      if (!iter_->Valid()) {
        iter_->SeekToFirst();
      } else {
        iter_->Next();
      }
      // (first iteration) iter_ at B@300

      FindNextUserEntry();  // Sets key_ to the key of the next value it found
      if (valid_ && user_comparator_->Compare(key_, saved) == 0) {
        // (first iteration) iter_ at C@301
        return;
      }

      // FindNextUserEntry() could not find any entries under the
      // user key "saved".  This is probably a corruption since
      // ScanUntilBefore(saved) found a live value.  So we skip
      // backwards to an earlier key and ignore the corrupted
      // entries for "saved".
      //
      // (first iteration) iter_ at C@301 and saved == "B"
      key_ = saved;
      bool ignored;
      ScanUntilBeforeCurrentKey(&ignored);
      // (first iteration) iter_ at A@400
    }
  }
  valid_ = false;
  key_.clear();
  value_.clear();
}

void DBIter::ScanUntilBeforeCurrentKey(bool* found_live) {
  *found_live = false;
  if (!iter_->Valid()) {
    iter_->SeekToLast();
  }

  while (iter_->Valid()) {
    ParsedInternalKey current;
    if (!ParseKey(&current)) {
      iter_->Prev();
      continue;
    }

    if (current.sequence > sequence_) {
      // Ignore entries that are serialized after this read
      iter_->Prev();
      continue;
    }

    const int cmp = user_comparator_->Compare(current.user_key, key_);
    if (cmp < 0) {
      SaveKey(current.user_key);
      return;
    } else if (cmp == 0) {
      switch (current.type) {
        case kTypeDeletion:
          *found_live = false;
          break;

        case kTypeValue:
        case kTypeLargeValueRef:
          *found_live = true;
          break;
      }
    } else {  // cmp > 0
      *found_live = false;
    }

    iter_->Prev();
  }
}

void DBIter::ReadIndirectValue() const {
  assert(!large_->produced);
  large_->produced = true;
  LargeValueRef large_ref;
  if (value_.size() != LargeValueRef::ByteSize()) {
    large_->status = Status::Corruption("malformed large value reference");
    return;
  }
  memcpy(large_ref.data, value_.data(), LargeValueRef::ByteSize());
  std::string fname = LargeValueFileName(*dbname_, large_ref);
  RandomAccessFile* file;
  Status s = env_->NewRandomAccessFile(fname, &file);
  if (s.ok()) {
    uint64_t file_size = file->Size();
    uint64_t value_size = large_ref.ValueSize();
    large_->value.resize(value_size);
    Slice result;
    s = file->Read(0, file_size, &result,
                   const_cast<char*>(large_->value.data()));
    if (s.ok()) {
      if (result.size() == file_size) {
        switch (large_ref.compression_type()) {
          case kNoCompression: {
            if (result.data() != large_->value.data()) {
              large_->value.assign(result.data(), result.size());
            }
            break;
          }
          case kLightweightCompression: {
            std::string uncompressed;
            if (port::Lightweight_Uncompress(result.data(), result.size(),
                                       &uncompressed) &&
                uncompressed.size() == large_ref.ValueSize()) {
              swap(uncompressed, large_->value);
            } else {
              s = Status::Corruption(
                  "Unable to read entire compressed large value file");
            }
          }
        }
      } else {
        s = Status::Corruption("Unable to read entire large value file");
      }
    }
    delete file;        // Ignore errors on closing
  }
  if (!s.ok()) {
    large_->value.clear();
    large_->status = s;
  }
}

}  // anonymous namespace

Iterator* NewDBIterator(
    const std::string* dbname,
    Env* env,
    const Comparator* user_key_comparator,
    Iterator* internal_iter,
    const SequenceNumber& sequence) {
  return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence);
}

}