From ff6ec0eb17b9c5c7eeca3bc78928da225ca2c3b2 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 25 Aug 2014 17:50:18 -0700 Subject: [PATCH] Optimize SpatialDB Summary: Two things: 1. Use hash-based index for data column family 2. Use Get() instead of Iterator Seek() when DB is opened read-only Test Plan: added read-only test in unit test Reviewers: yinwang Reviewed By: yinwang Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22323 --- utilities/spatialdb/spatial_db.cc | 244 ++++++++++++++++++------- utilities/spatialdb/spatial_db_test.cc | 80 ++++---- 2 files changed, 219 insertions(+), 105 deletions(-) diff --git a/utilities/spatialdb/spatial_db.cc b/utilities/spatialdb/spatial_db.cc index c8d013fed..3dff346bb 100644 --- a/utilities/spatialdb/spatial_db.cc +++ b/utilities/spatialdb/spatial_db.cc @@ -16,6 +16,8 @@ #include #include "rocksdb/cache.h" +#include "rocksdb/options.h" +#include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "rocksdb/db.h" #include "rocksdb/utilities/stackable_db.h" @@ -244,13 +246,76 @@ std::string FeatureSet::DebugString() const { return out + "}"; } +class ValueGetter { + public: + ValueGetter() {} + virtual ~ValueGetter() {} + + virtual bool Get(uint64_t id) = 0; + virtual const Slice value() const = 0; + + virtual Status status() const = 0; +}; + +class ValueGetterFromDB : public ValueGetter { + public: + ValueGetterFromDB(DB* db, ColumnFamilyHandle* cf) : db_(db), cf_(cf) {} + + virtual bool Get(uint64_t id) override { + std::string encoded_id; + PutFixed64BigEndian(&encoded_id, id); + status_ = db_->Get(ReadOptions(), cf_, encoded_id, &value_); + if (status_.IsNotFound()) { + status_ = Status::Corruption("Index inconsistency"); + return false; + } + + return true; + } + + virtual const Slice value() const override { return value_; } + + virtual Status status() const override { return status_; } + + private: + std::string value_; + DB* db_; + ColumnFamilyHandle* cf_; + Status status_; +}; + +class ValueGetterFromIterator : public ValueGetter { + public: + explicit ValueGetterFromIterator(Iterator* iterator) : iterator_(iterator) {} + + virtual bool Get(uint64_t id) override { + std::string encoded_id; + PutFixed64BigEndian(&encoded_id, id); + iterator_->Seek(encoded_id); + + if (!iterator_->Valid() || iterator_->key() != Slice(encoded_id)) { + status_ = Status::Corruption("Index inconsistency"); + return false; + } + + return true; + } + + virtual const Slice value() const override { return iterator_->value(); } + + virtual Status status() const override { return status_; } + + private: + std::unique_ptr iterator_; + Status status_; +}; + class SpatialIndexCursor : public Cursor { public: // tile_box is inclusive - SpatialIndexCursor(Iterator* spatial_iterator, Iterator* data_iterator, + SpatialIndexCursor(Iterator* spatial_iterator, ValueGetter* value_getter, const BoundingBox& tile_bbox, uint32_t tile_bits) - : data_iterator_(data_iterator), - valid_(true) { + : value_getter_(value_getter), valid_(true) { // calculate quad keys we'll need to query std::vector quad_keys; quad_keys.reserve((tile_bbox.max_x - tile_bbox.min_x + 1) * @@ -329,7 +394,7 @@ class SpatialIndexCursor : public Cursor { if (!status_.ok()) { return status_; } - return data_iterator_->status(); + return value_getter_->status(); } private: @@ -356,32 +421,23 @@ class SpatialIndexCursor : public Cursor { return true; } - // doesn't return anything, but sets valid_ and status_ on corruption void ExtractData() { assert(valid_); - std::string encoded_id; - PutFixed64BigEndian(&encoded_id, *primary_keys_iterator_); + valid_ = value_getter_->Get(*primary_keys_iterator_); - data_iterator_->Seek(encoded_id); - - if (!data_iterator_->Valid() || - data_iterator_->key() != Slice(encoded_id)) { - status_ = Status::Corruption("Index inconsistency"); - valid_ = false; - return; + if (valid_) { + Slice data = value_getter_->value(); + current_feature_set_.Clear(); + if (!GetLengthPrefixedSlice(&data, ¤t_blob_) || + !current_feature_set_.Deserialize(data)) { + status_ = Status::Corruption("Primary key column family corruption"); + valid_ = false; + } } - Slice data = data_iterator_->value(); - current_feature_set_.Clear(); - if (!GetLengthPrefixedSlice(&data, ¤t_blob_) || - !current_feature_set_.Deserialize(data)) { - status_ = Status::Corruption("Primary key column family corruption"); - valid_ = false; - return; - } } - unique_ptr data_iterator_; + unique_ptr value_getter_; bool valid_; Status status_; @@ -427,10 +483,11 @@ class SpatialDBImpl : public SpatialDB { DB* db, ColumnFamilyHandle* data_column_family, const std::vector>& spatial_indexes, - uint64_t next_id) + uint64_t next_id, bool read_only) : SpatialDB(db), data_column_family_(data_column_family), - next_id_(next_id) { + next_id_(next_id), + read_only_(read_only) { for (const auto& index : spatial_indexes) { name_to_index_.insert( {index.first.name, IndexColumnFamily(index.first, index.second)}); @@ -521,17 +578,26 @@ class SpatialDBImpl : public SpatialDB { return new ErrorCursor(Status::InvalidArgument( "Spatial index " + spatial_index + " not found")); } + const auto& si = itr->second.index; + Iterator* spatial_iterator; + ValueGetter* value_getter; - std::vector iterators; - Status s = NewIterators(read_options, - {data_column_family_, itr->second.column_family}, - &iterators); - if (!s.ok()) { - return new ErrorCursor(s); - } + if (read_only_) { + spatial_iterator = NewIterator(read_options, itr->second.column_family); + value_getter = new ValueGetterFromDB(this, data_column_family_); + } else { + std::vector iterators; + Status s = NewIterators(read_options, + {data_column_family_, itr->second.column_family}, + &iterators); + if (!s.ok()) { + return new ErrorCursor(s); + } - const auto& si = itr->second.index; - return new SpatialIndexCursor(iterators[1], iterators[0], + spatial_iterator = iterators[1]; + value_getter = new ValueGetterFromIterator(iterators[0]); + } + return new SpatialIndexCursor(spatial_iterator, value_getter, GetTileBoundingBox(si, bbox), si.tile_bits); } @@ -548,31 +614,61 @@ class SpatialDBImpl : public SpatialDB { std::unordered_map name_to_index_; std::atomic next_id_; + bool read_only_; }; namespace { -Options GetRocksDBOptionsFromOptions(const SpatialDBOptions& options) { - Options rocksdb_options; - rocksdb_options.IncreaseParallelism(options.num_threads); - rocksdb_options.write_buffer_size = 256 * 1024 * 1024; // 256MB - rocksdb_options.max_bytes_for_level_base = 1024 * 1024 * 1024; // 1 GB +DBOptions GetDBOptions(const SpatialDBOptions& options) { + DBOptions db_options; + db_options.IncreaseParallelism(options.num_threads); + if (options.bulk_load) { + db_options.disableDataSync = true; + } + return db_options; +} + +ColumnFamilyOptions GetColumnFamilyOptions(const SpatialDBOptions& options, + std::shared_ptr block_cache) { + ColumnFamilyOptions column_family_options; + column_family_options.write_buffer_size = 256 * 1024 * 1024; // 256MB + column_family_options.max_bytes_for_level_base = 1024 * 1024 * 1024; // 1 GB // only compress levels >= 1 - rocksdb_options.compression_per_level.resize(rocksdb_options.num_levels); - for (int i = 0; i < rocksdb_options.num_levels; ++i) { + column_family_options.compression_per_level.resize( + column_family_options.num_levels); + for (int i = 0; i < column_family_options.num_levels; ++i) { if (i == 0) { - rocksdb_options.compression_per_level[i] = kNoCompression; + column_family_options.compression_per_level[i] = kNoCompression; } else { - rocksdb_options.compression_per_level[i] = kLZ4Compression; + column_family_options.compression_per_level[i] = kLZ4Compression; } } BlockBasedTableOptions table_options; - table_options.block_cache = NewLRUCache(options.cache_size); - rocksdb_options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + table_options.block_cache = block_cache; + column_family_options.table_factory.reset( + NewBlockBasedTableFactory(table_options)); if (options.bulk_load) { - rocksdb_options.PrepareForBulkLoad(); - } - return rocksdb_options; + column_family_options.level0_file_num_compaction_trigger = (1 << 30); + column_family_options.level0_slowdown_writes_trigger = (1 << 30); + column_family_options.level0_stop_writes_trigger = (1 << 30); + column_family_options.disable_auto_compactions = true; + column_family_options.source_compaction_factor = (1 << 30); + column_family_options.num_levels = 2; + column_family_options.target_file_size_base = 256 * 1024 * 1024; + column_family_options.max_mem_compaction_level = 0; + } + return column_family_options; +} + +ColumnFamilyOptions OptimizeOptionsForDataColumnFamily( + ColumnFamilyOptions options, std::shared_ptr block_cache) { + options.prefix_extractor.reset(NewNoopTransform()); + BlockBasedTableOptions block_based_options; + block_based_options.index_type = BlockBasedTableOptions::kHashSearch; + block_based_options.block_cache = block_cache; + options.table_factory.reset(NewBlockBasedTableFactory(block_based_options)); + return options; } + } // namespace class MetadataStorage { @@ -618,26 +714,30 @@ class MetadataStorage { Status SpatialDB::Create( const SpatialDBOptions& options, const std::string& name, const std::vector& spatial_indexes) { - Options rocksdb_options = GetRocksDBOptionsFromOptions(options); - rocksdb_options.create_if_missing = true; - rocksdb_options.create_missing_column_families = true; - rocksdb_options.error_if_exists = true; + DBOptions db_options = GetDBOptions(options); + db_options.create_if_missing = true; + db_options.create_missing_column_families = true; + db_options.error_if_exists = true; + + auto block_cache = NewLRUCache(options.cache_size); + ColumnFamilyOptions column_family_options = + GetColumnFamilyOptions(options, block_cache); std::vector column_families; column_families.push_back(ColumnFamilyDescriptor( - kDefaultColumnFamilyName, ColumnFamilyOptions(rocksdb_options))); - column_families.push_back(ColumnFamilyDescriptor( - kMetadataColumnFamilyName, ColumnFamilyOptions(rocksdb_options))); + kDefaultColumnFamilyName, + OptimizeOptionsForDataColumnFamily(column_family_options, block_cache))); + column_families.push_back( + ColumnFamilyDescriptor(kMetadataColumnFamilyName, column_family_options)); for (const auto& index : spatial_indexes) { column_families.emplace_back(GetSpatialIndexColumnFamilyName(index.name), - ColumnFamilyOptions(rocksdb_options)); + column_family_options); } std::vector handles; DB* base_db; - Status s = DB::Open(DBOptions(rocksdb_options), name, column_families, - &handles, &base_db); + Status s = DB::Open(db_options, name, column_families, &handles, &base_db); if (!s.ok()) { return s; } @@ -659,13 +759,15 @@ Status SpatialDB::Create( Status SpatialDB::Open(const SpatialDBOptions& options, const std::string& name, SpatialDB** db, bool read_only) { - Options rocksdb_options = GetRocksDBOptionsFromOptions(options); + DBOptions db_options = GetDBOptions(options); + auto block_cache = NewLRUCache(options.cache_size); + ColumnFamilyOptions column_family_options = + GetColumnFamilyOptions(options, block_cache); Status s; std::vector existing_column_families; std::vector spatial_indexes; - s = DB::ListColumnFamilies(DBOptions(rocksdb_options), name, - &existing_column_families); + s = DB::ListColumnFamilies(db_options, name, &existing_column_families); if (!s.ok()) { return s; } @@ -678,22 +780,22 @@ Status SpatialDB::Open(const SpatialDBOptions& options, const std::string& name, std::vector column_families; column_families.push_back(ColumnFamilyDescriptor( - kDefaultColumnFamilyName, ColumnFamilyOptions(rocksdb_options))); - column_families.push_back(ColumnFamilyDescriptor( - kMetadataColumnFamilyName, ColumnFamilyOptions(rocksdb_options))); + kDefaultColumnFamilyName, + OptimizeOptionsForDataColumnFamily(column_family_options, block_cache))); + column_families.push_back( + ColumnFamilyDescriptor(kMetadataColumnFamilyName, column_family_options)); for (const auto& index : spatial_indexes) { column_families.emplace_back(GetSpatialIndexColumnFamilyName(index), - ColumnFamilyOptions(rocksdb_options)); + column_family_options); } std::vector handles; DB* base_db; if (read_only) { - s = DB::OpenForReadOnly(DBOptions(rocksdb_options), name, column_families, - &handles, &base_db); + s = DB::OpenForReadOnly(db_options, name, column_families, &handles, + &base_db); } else { - s = DB::Open(DBOptions(rocksdb_options), name, column_families, &handles, - &base_db); + s = DB::Open(db_options, name, column_families, &handles, &base_db); } if (!s.ok()) { return s; @@ -730,13 +832,13 @@ Status SpatialDB::Open(const SpatialDBOptions& options, const std::string& name, for (auto h : handles) { delete h; } - delete db; + delete base_db; return s; } // I don't need metadata column family any more, so delete it delete handles[1]; - *db = new SpatialDBImpl(base_db, handles[0], index_cf, next_id); + *db = new SpatialDBImpl(base_db, handles[0], index_cf, next_id, read_only); return Status::OK(); } diff --git a/utilities/spatialdb/spatial_db_test.cc b/utilities/spatialdb/spatial_db_test.cc index 4cd2c8eed..166920b57 100644 --- a/utilities/spatialdb/spatial_db_test.cc +++ b/utilities/spatialdb/spatial_db_test.cc @@ -151,41 +151,53 @@ TEST(SpatialDBTest, FeatureSetTest) { } TEST(SpatialDBTest, SimpleTest) { - ASSERT_OK(SpatialDB::Create( - SpatialDBOptions(), dbname_, - {SpatialIndexOptions("index", BoundingBox(0, 0, 128, 128), 3)})); - ASSERT_OK(SpatialDB::Open(SpatialDBOptions(), dbname_, &db_)); - - ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(33, 17, 63, 79), - "one", FeatureSet(), {"index"})); - ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(65, 65, 111, 111), - "two", FeatureSet(), {"index"})); - ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(1, 49, 127, 63), - "three", FeatureSet(), {"index"})); - ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(20, 100, 21, 101), - "four", FeatureSet(), {"index"})); - ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(81, 33, 127, 63), - "five", FeatureSet(), {"index"})); - ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(1, 65, 47, 95), - "six", FeatureSet(), {"index"})); - - AssertCursorResults(BoundingBox(33, 17, 47, 31), "index", {"one"}); - AssertCursorResults(BoundingBox(17, 33, 79, 63), "index", - {"one", "three"}); - AssertCursorResults(BoundingBox(17, 81, 63, 111), "index", - {"four", "six"}); - AssertCursorResults(BoundingBox(85, 86, 85, 86), "index", {"two"}); - AssertCursorResults(BoundingBox(33, 1, 127, 111), "index", - {"one", "two", "three", "five", "six"}); - // even though the bounding box doesn't intersect, we got "four" back because - // it's in the same tile - AssertCursorResults(BoundingBox(18, 98, 19, 99), "index", {"four"}); - AssertCursorResults(BoundingBox(130, 130, 131, 131), "index", {}); - AssertCursorResults(BoundingBox(81, 17, 127, 31), "index", {}); - AssertCursorResults(BoundingBox(90, 50, 91, 51), "index", - {"three", "five"}); + // iter 0 -- not read only + // iter 1 -- read only + for (int iter = 0; iter < 2; ++iter) { + DestroyDB(dbname_, Options()); + ASSERT_OK(SpatialDB::Create( + SpatialDBOptions(), dbname_, + {SpatialIndexOptions("index", BoundingBox(0, 0, 128, 128), + 3)})); + ASSERT_OK(SpatialDB::Open(SpatialDBOptions(), dbname_, &db_)); + + ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(33, 17, 63, 79), + "one", FeatureSet(), {"index"})); + ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(65, 65, 111, 111), + "two", FeatureSet(), {"index"})); + ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(1, 49, 127, 63), + "three", FeatureSet(), {"index"})); + ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(20, 100, 21, 101), + "four", FeatureSet(), {"index"})); + ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(81, 33, 127, 63), + "five", FeatureSet(), {"index"})); + ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(1, 65, 47, 95), + "six", FeatureSet(), {"index"})); + + if (iter == 1) { + delete db_; + ASSERT_OK(SpatialDB::Open(SpatialDBOptions(), dbname_, &db_, true)); + } - delete db_; + AssertCursorResults(BoundingBox(33, 17, 47, 31), "index", {"one"}); + AssertCursorResults(BoundingBox(17, 33, 79, 63), "index", + {"one", "three"}); + AssertCursorResults(BoundingBox(17, 81, 63, 111), "index", + {"four", "six"}); + AssertCursorResults(BoundingBox(85, 86, 85, 86), "index", {"two"}); + AssertCursorResults(BoundingBox(33, 1, 127, 111), "index", + {"one", "two", "three", "five", "six"}); + // even though the bounding box doesn't intersect, we got "four" back + // because + // it's in the same tile + AssertCursorResults(BoundingBox(18, 98, 19, 99), "index", {"four"}); + AssertCursorResults(BoundingBox(130, 130, 131, 131), "index", {}); + AssertCursorResults(BoundingBox(81, 17, 127, 31), "index", {}); + AssertCursorResults(BoundingBox(90, 50, 91, 51), "index", + {"three", "five"}); + + delete db_; + } } namespace {