From 5e3d5c5f6efe52cf0bc1d60be75237f9c3a81dad Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 1 Aug 2014 16:50:40 -0400 Subject: [PATCH] Simplify SpatialIndexCursor Summary: Since we have enough memory to hold all primary keys loaded from spatial index, it is better if we first load all of them (store them in unordered_set for deduplication) and then query on primary key column family one by one. We need to dedup all IDs, so we'll end up storing all of them in memory even with the current approach. Test Plan: ./spatial_db_test is happy Reviewers: yinwang Reviewed By: yinwang Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D20949 --- utilities/spatialdb/spatial_db.cc | 183 +++++++++++++----------------- 1 file changed, 79 insertions(+), 104 deletions(-) diff --git a/utilities/spatialdb/spatial_db.cc b/utilities/spatialdb/spatial_db.cc index 2ea2a6092..a0dff5120 100644 --- a/utilities/spatialdb/spatial_db.cc +++ b/utilities/spatialdb/spatial_db.cc @@ -13,6 +13,7 @@ #include #include #include +#include #include "rocksdb/cache.h" #include "rocksdb/db.h" @@ -244,23 +245,62 @@ std::string FeatureSet::DebugString() const { class SpatialIndexCursor : public Cursor { public: + // tile_box is inclusive SpatialIndexCursor(Iterator* spatial_iterator, Iterator* data_iterator, const BoundingBox& tile_bbox, uint32_t tile_bits) - : spatial_iterator_(spatial_iterator), - data_iterator_(data_iterator), - tile_bbox_(tile_bbox), - tile_bits_(tile_bits), + : data_iterator_(data_iterator), valid_(true) { - current_x_ = tile_bbox.min_x; - current_y_ = tile_bbox.min_y; - UpdateQuadKey(); - ReSeek(); - if (valid_) { - // this is the first ID returned, so I don't care about return value of - // Dedup - Dedup(); + // calculate quad keys we'll need to query + std::vector quad_keys; + quad_keys.reserve((tile_bbox.max_x - tile_bbox.min_x + 1) * + (tile_bbox.max_y - tile_bbox.min_y + 1)); + for (uint64_t x = tile_bbox.min_x; x <= tile_bbox.max_x; ++x) { + for (uint64_t y = tile_bbox.min_y; y <= tile_bbox.max_y; ++y) { + quad_keys.push_back(GetQuadKeyFromTile(x, y, tile_bits)); + } } + std::sort(quad_keys.begin(), quad_keys.end()); + + // load primary key ids for all quad keys + for (auto quad_key : quad_keys) { + std::string encoded_quad_key; + PutFixed64BigEndian(&encoded_quad_key, quad_key); + Slice slice_quad_key(encoded_quad_key); + + // If CheckQuadKey is true, there is no need to reseek, since + // spatial_iterator is already pointing at the correct quad key. This is + // an optimization. + if (!CheckQuadKey(spatial_iterator, slice_quad_key)) { + spatial_iterator->Seek(slice_quad_key); + } + + while (CheckQuadKey(spatial_iterator, slice_quad_key)) { + // extract ID from spatial_iterator + uint64_t id; + bool ok = GetFixed64BigEndian( + Slice(spatial_iterator->key().data() + sizeof(uint64_t), + sizeof(uint64_t)), + &id); + if (!ok) { + valid_ = false; + status_ = Status::Corruption("Spatial index corruption"); + break; + } + primary_key_ids_.insert(id); + spatial_iterator->Next(); + } + } + + if (!spatial_iterator->status().ok()) { + status_ = spatial_iterator->status(); + valid_ = false; + } + delete spatial_iterator; + + valid_ = valid_ && primary_key_ids_.size() > 0; + if (valid_) { + primary_keys_iterator_ = primary_key_ids_.begin(); ExtractData(); } } @@ -270,28 +310,13 @@ class SpatialIndexCursor : public Cursor { virtual void Next() override { assert(valid_); - // this do-while loop deals only with deduplication - do { - spatial_iterator_->Next(); - if (ExtractID()) { - // OK, found what we needed - continue; - } - - // move to the next tile - Increment(); - - if (ExtractID()) { - // no need to reseek, found what we needed - continue; - } - // reseek, find next good tile - ReSeek(); - } while (valid_ && !Dedup() && valid_); - - if (valid_) { - ExtractData(); + ++primary_keys_iterator_; + if (primary_keys_iterator_ == primary_key_ids_.end()) { + valid_ = false; + return; } + + ExtractData(); } virtual const Slice blob() override { return current_blob_; } @@ -303,88 +328,44 @@ class SpatialIndexCursor : public Cursor { if (!status_.ok()) { return status_; } - if (!spatial_iterator_->status().ok()) { - return spatial_iterator_->status(); - } return data_iterator_->status(); } private: - // returns true if OK, false if already returned (duplicate) - bool Dedup() { - assert(valid_); - uint64_t id; - bool ok = GetFixed64BigEndian(current_id_, &id); - if (!ok) { - valid_ = false; - status_ = Status::Corruption("Spatial index corruption"); - return false; - } - if (returned_ids_.find(id) != returned_ids_.end()) { - return false; - } - returned_ids_.insert(id); - return true; - } - void ReSeek() { - while (valid_) { - spatial_iterator_->Seek(current_quad_key_); - if (ExtractID()) { - // found what we're looking for! - break; - } - Increment(); - } - } - - void Increment() { - ++current_x_; - if (current_x_ > tile_bbox_.max_x) { - current_x_ = tile_bbox_.min_x; - ++current_y_; - } - if (current_y_ > tile_bbox_.max_y) { - valid_ = false; - } else { - UpdateQuadKey(); - } - } - void UpdateQuadKey() { - current_quad_key_.clear(); - PutFixed64BigEndian(¤t_quad_key_, - GetQuadKeyFromTile(current_x_, current_y_, tile_bits_)); - } // * returns true if spatial iterator is on the current quad key and all is - // well. Caller will call Next() to get new data - // * returns false if spatial iterator is not on current, or invalid or status - // bad. Caller will need to reseek to get new data - bool ExtractID() { - if (!spatial_iterator_->Valid()) { - // caller needs to reseek + // well + // * returns false if spatial iterator is not on current, or iterator is + // invalid or corruption + bool CheckQuadKey(Iterator* spatial_iterator, const Slice& quad_key) { + if (!spatial_iterator->Valid()) { return false; } - if (spatial_iterator_->key().size() != 2 * sizeof(uint64_t)) { + if (spatial_iterator->key().size() != 2 * sizeof(uint64_t)) { status_ = Status::Corruption("Invalid spatial index key"); valid_ = false; return false; } - Slice quad_key(spatial_iterator_->key().data(), sizeof(uint64_t)); - if (quad_key != current_quad_key_) { + Slice spatial_iterator_quad_key(spatial_iterator->key().data(), + sizeof(uint64_t)); + if (spatial_iterator_quad_key != quad_key) { // caller needs to reseek return false; } // if we come to here, we have found the quad key - current_id_ = Slice(spatial_iterator_->key().data() + sizeof(uint64_t), - sizeof(uint64_t)); return true; } + // doesn't return anything, but sets valid_ and status_ on corruption void ExtractData() { assert(valid_); - data_iterator_->Seek(current_id_); + std::string encoded_id; + PutFixed64BigEndian(&encoded_id, *primary_keys_iterator_); + + data_iterator_->Seek(encoded_id); - if (!data_iterator_->Valid() || data_iterator_->key() != current_id_) { - status_ = Status::Corruption("Inconsistency in data column family"); + if (!data_iterator_->Valid() || + data_iterator_->key() != Slice(encoded_id)) { + status_ = Status::Corruption("Index inconsistency"); valid_ = false; return; } @@ -393,28 +374,22 @@ class SpatialIndexCursor : public Cursor { current_feature_set_.Clear(); if (!GetLengthPrefixedSlice(&data, ¤t_blob_) || !current_feature_set_.Deserialize(data)) { - status_ = Status::Corruption("Data column family corruption"); + status_ = Status::Corruption("Primary key column family corruption"); valid_ = false; return; } } - unique_ptr spatial_iterator_; unique_ptr data_iterator_; - BoundingBox tile_bbox_; - uint32_t tile_bits_; - uint64_t current_x_; - uint64_t current_y_; - std::string current_quad_key_; - Slice current_id_; bool valid_; Status status_; FeatureSet current_feature_set_; Slice current_blob_; - // used for deduplicating results - std::set returned_ids_; + // This is loaded from spatial iterator. + std::unordered_set primary_key_ids_; + std::unordered_set::iterator primary_keys_iterator_; }; class ErrorCursor : public Cursor {