Optimize SpatialDB

Summary:
Two things:
1. Use hash-based index for data column family
2. Use Get() instead of Iterator Seek() when DB is opened read-only

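Test Plan: extended SimpleTest in spatial_db_test.cc to run twice; the second iteration reopens the DB read-only before querying, so the Get()-based path is covered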

Reviewers: yinwang

Reviewed By: yinwang

Subscribers: leveldb

Differential Revision: https://reviews.facebook.net/D22323
main
Igor Canadi 10 years ago
parent 23861857c4
commit ff6ec0eb17
  1. 220
      utilities/spatialdb/spatial_db.cc
  2. 16
      utilities/spatialdb/spatial_db_test.cc

@ -16,6 +16,8 @@
#include <unordered_set> #include <unordered_set>
#include "rocksdb/cache.h" #include "rocksdb/cache.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h" #include "rocksdb/table.h"
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/utilities/stackable_db.h" #include "rocksdb/utilities/stackable_db.h"
@ -244,13 +246,76 @@ std::string FeatureSet::DebugString() const {
return out + "}"; return out + "}";
} }
// Abstract source of the serialized value stored for a primary-key id.
// The implementations in this file return false from Get() and record a
// non-OK status() when the id cannot be resolved.
class ValueGetter {
 public:
  ValueGetter() = default;
  virtual ~ValueGetter() = default;

  // Looks up the value for `id`; returns whether the lookup succeeded.
  virtual bool Get(uint64_t id) = 0;

  // The value produced by the lookup. Presumably only meaningful after a
  // Get() that returned true -- confirm against each implementation.
  virtual const Slice value() const = 0;

  // Status recorded by the getter; callers consult it after Get() fails.
  virtual Status status() const = 0;
};
// ValueGetter that resolves an id with a point lookup (DB::Get) on the given
// column family. SpatialDBImpl selects it when the DB is opened read-only.
// `db` and `cf` are borrowed: this class never deletes them.
class ValueGetterFromDB : public ValueGetter {
 public:
  ValueGetterFromDB(DB* db, ColumnFamilyHandle* cf) : db_(db), cf_(cf) {}

  // Reads the value stored under the big-endian fixed64 encoding of `id`.
  // Returns true only when the read succeeded. A missing key means the
  // spatial index points at data that does not exist, so it is reported as
  // Corruption; any other read error is kept as-is in status().
  virtual bool Get(uint64_t id) override {
    std::string encoded_id;
    PutFixed64BigEndian(&encoded_id, id);
    status_ = db_->Get(ReadOptions(), cf_, encoded_id, &value_);
    if (status_.IsNotFound()) {
      status_ = Status::Corruption("Index inconsistency");
    }
    // Failures other than NotFound (e.g. I/O errors) must also yield false;
    // otherwise the caller would go on to parse a stale or empty value_.
    return status_.ok();
  }

  // Value fetched by the most recent Get(); views the internal buffer, so it
  // is overwritten by the next Get().
  virtual const Slice value() const override { return value_; }

  // Outcome of the most recent Get().
  virtual Status status() const override { return status_; }

 private:
  std::string value_;
  DB* db_;
  ColumnFamilyHandle* cf_;
  Status status_;
};
class ValueGetterFromIterator : public ValueGetter {
public:
explicit ValueGetterFromIterator(Iterator* iterator) : iterator_(iterator) {}
virtual bool Get(uint64_t id) override {
std::string encoded_id;
PutFixed64BigEndian(&encoded_id, id);
iterator_->Seek(encoded_id);
if (!iterator_->Valid() || iterator_->key() != Slice(encoded_id)) {
status_ = Status::Corruption("Index inconsistency");
return false;
}
return true;
}
virtual const Slice value() const override { return iterator_->value(); }
virtual Status status() const override { return status_; }
private:
std::unique_ptr<Iterator> iterator_;
Status status_;
};
class SpatialIndexCursor : public Cursor { class SpatialIndexCursor : public Cursor {
public: public:
// tile_box is inclusive // tile_box is inclusive
SpatialIndexCursor(Iterator* spatial_iterator, Iterator* data_iterator, SpatialIndexCursor(Iterator* spatial_iterator, ValueGetter* value_getter,
const BoundingBox<uint64_t>& tile_bbox, uint32_t tile_bits) const BoundingBox<uint64_t>& tile_bbox, uint32_t tile_bits)
: data_iterator_(data_iterator), : value_getter_(value_getter), valid_(true) {
valid_(true) {
// calculate quad keys we'll need to query // calculate quad keys we'll need to query
std::vector<uint64_t> quad_keys; std::vector<uint64_t> quad_keys;
quad_keys.reserve((tile_bbox.max_x - tile_bbox.min_x + 1) * quad_keys.reserve((tile_bbox.max_x - tile_bbox.min_x + 1) *
@ -329,7 +394,7 @@ class SpatialIndexCursor : public Cursor {
if (!status_.ok()) { if (!status_.ok()) {
return status_; return status_;
} }
return data_iterator_->status(); return value_getter_->status();
} }
private: private:
@ -356,32 +421,23 @@ class SpatialIndexCursor : public Cursor {
return true; return true;
} }
// doesn't return anything, but sets valid_ and status_ on corruption
void ExtractData() { void ExtractData() {
assert(valid_); assert(valid_);
std::string encoded_id; valid_ = value_getter_->Get(*primary_keys_iterator_);
PutFixed64BigEndian(&encoded_id, *primary_keys_iterator_);
data_iterator_->Seek(encoded_id); if (valid_) {
Slice data = value_getter_->value();
if (!data_iterator_->Valid() ||
data_iterator_->key() != Slice(encoded_id)) {
status_ = Status::Corruption("Index inconsistency");
valid_ = false;
return;
}
Slice data = data_iterator_->value();
current_feature_set_.Clear(); current_feature_set_.Clear();
if (!GetLengthPrefixedSlice(&data, &current_blob_) || if (!GetLengthPrefixedSlice(&data, &current_blob_) ||
!current_feature_set_.Deserialize(data)) { !current_feature_set_.Deserialize(data)) {
status_ = Status::Corruption("Primary key column family corruption"); status_ = Status::Corruption("Primary key column family corruption");
valid_ = false; valid_ = false;
return;
} }
} }
unique_ptr<Iterator> data_iterator_; }
unique_ptr<ValueGetter> value_getter_;
bool valid_; bool valid_;
Status status_; Status status_;
@ -427,10 +483,11 @@ class SpatialDBImpl : public SpatialDB {
DB* db, ColumnFamilyHandle* data_column_family, DB* db, ColumnFamilyHandle* data_column_family,
const std::vector<std::pair<SpatialIndexOptions, ColumnFamilyHandle*>>& const std::vector<std::pair<SpatialIndexOptions, ColumnFamilyHandle*>>&
spatial_indexes, spatial_indexes,
uint64_t next_id) uint64_t next_id, bool read_only)
: SpatialDB(db), : SpatialDB(db),
data_column_family_(data_column_family), data_column_family_(data_column_family),
next_id_(next_id) { next_id_(next_id),
read_only_(read_only) {
for (const auto& index : spatial_indexes) { for (const auto& index : spatial_indexes) {
name_to_index_.insert( name_to_index_.insert(
{index.first.name, IndexColumnFamily(index.first, index.second)}); {index.first.name, IndexColumnFamily(index.first, index.second)});
@ -521,7 +578,14 @@ class SpatialDBImpl : public SpatialDB {
return new ErrorCursor(Status::InvalidArgument( return new ErrorCursor(Status::InvalidArgument(
"Spatial index " + spatial_index + " not found")); "Spatial index " + spatial_index + " not found"));
} }
const auto& si = itr->second.index;
Iterator* spatial_iterator;
ValueGetter* value_getter;
if (read_only_) {
spatial_iterator = NewIterator(read_options, itr->second.column_family);
value_getter = new ValueGetterFromDB(this, data_column_family_);
} else {
std::vector<Iterator*> iterators; std::vector<Iterator*> iterators;
Status s = NewIterators(read_options, Status s = NewIterators(read_options,
{data_column_family_, itr->second.column_family}, {data_column_family_, itr->second.column_family},
@ -530,8 +594,10 @@ class SpatialDBImpl : public SpatialDB {
return new ErrorCursor(s); return new ErrorCursor(s);
} }
const auto& si = itr->second.index; spatial_iterator = iterators[1];
return new SpatialIndexCursor(iterators[1], iterators[0], value_getter = new ValueGetterFromIterator(iterators[0]);
}
return new SpatialIndexCursor(spatial_iterator, value_getter,
GetTileBoundingBox(si, bbox), si.tile_bits); GetTileBoundingBox(si, bbox), si.tile_bits);
} }
@ -548,31 +614,61 @@ class SpatialDBImpl : public SpatialDB {
std::unordered_map<std::string, IndexColumnFamily> name_to_index_; std::unordered_map<std::string, IndexColumnFamily> name_to_index_;
std::atomic<uint64_t> next_id_; std::atomic<uint64_t> next_id_;
bool read_only_;
}; };
namespace { namespace {
Options GetRocksDBOptionsFromOptions(const SpatialDBOptions& options) { DBOptions GetDBOptions(const SpatialDBOptions& options) {
Options rocksdb_options; DBOptions db_options;
rocksdb_options.IncreaseParallelism(options.num_threads); db_options.IncreaseParallelism(options.num_threads);
rocksdb_options.write_buffer_size = 256 * 1024 * 1024; // 256MB if (options.bulk_load) {
rocksdb_options.max_bytes_for_level_base = 1024 * 1024 * 1024; // 1 GB db_options.disableDataSync = true;
}
return db_options;
}
ColumnFamilyOptions GetColumnFamilyOptions(const SpatialDBOptions& options,
std::shared_ptr<Cache> block_cache) {
ColumnFamilyOptions column_family_options;
column_family_options.write_buffer_size = 256 * 1024 * 1024; // 256MB
column_family_options.max_bytes_for_level_base = 1024 * 1024 * 1024; // 1 GB
// only compress levels >= 1 // only compress levels >= 1
rocksdb_options.compression_per_level.resize(rocksdb_options.num_levels); column_family_options.compression_per_level.resize(
for (int i = 0; i < rocksdb_options.num_levels; ++i) { column_family_options.num_levels);
for (int i = 0; i < column_family_options.num_levels; ++i) {
if (i == 0) { if (i == 0) {
rocksdb_options.compression_per_level[i] = kNoCompression; column_family_options.compression_per_level[i] = kNoCompression;
} else { } else {
rocksdb_options.compression_per_level[i] = kLZ4Compression; column_family_options.compression_per_level[i] = kLZ4Compression;
} }
} }
BlockBasedTableOptions table_options; BlockBasedTableOptions table_options;
table_options.block_cache = NewLRUCache(options.cache_size); table_options.block_cache = block_cache;
rocksdb_options.table_factory.reset(NewBlockBasedTableFactory(table_options)); column_family_options.table_factory.reset(
NewBlockBasedTableFactory(table_options));
if (options.bulk_load) { if (options.bulk_load) {
rocksdb_options.PrepareForBulkLoad(); column_family_options.level0_file_num_compaction_trigger = (1 << 30);
column_family_options.level0_slowdown_writes_trigger = (1 << 30);
column_family_options.level0_stop_writes_trigger = (1 << 30);
column_family_options.disable_auto_compactions = true;
column_family_options.source_compaction_factor = (1 << 30);
column_family_options.num_levels = 2;
column_family_options.target_file_size_base = 256 * 1024 * 1024;
column_family_options.max_mem_compaction_level = 0;
} }
return rocksdb_options; return column_family_options;
} }
// Returns a copy of `options` adjusted for the data column family: the table
// factory is replaced by a block-based one that uses the hash-search index
// type and the supplied `block_cache`, and a no-op prefix extractor is
// installed. Any table factory already set on `options` is discarded.
// NOTE(review): the prefix extractor is presumably installed because the
// hash-search index needs one -- confirm against the RocksDB table options
// docs.
ColumnFamilyOptions OptimizeOptionsForDataColumnFamily(
    ColumnFamilyOptions options, std::shared_ptr<Cache> block_cache) {
  BlockBasedTableOptions hash_index_table_options;
  hash_index_table_options.block_cache = block_cache;
  hash_index_table_options.index_type = BlockBasedTableOptions::kHashSearch;

  options.table_factory.reset(
      NewBlockBasedTableFactory(hash_index_table_options));
  options.prefix_extractor.reset(NewNoopTransform());
  return options;
}
} // namespace } // namespace
class MetadataStorage { class MetadataStorage {
@ -618,26 +714,30 @@ class MetadataStorage {
Status SpatialDB::Create( Status SpatialDB::Create(
const SpatialDBOptions& options, const std::string& name, const SpatialDBOptions& options, const std::string& name,
const std::vector<SpatialIndexOptions>& spatial_indexes) { const std::vector<SpatialIndexOptions>& spatial_indexes) {
Options rocksdb_options = GetRocksDBOptionsFromOptions(options); DBOptions db_options = GetDBOptions(options);
rocksdb_options.create_if_missing = true; db_options.create_if_missing = true;
rocksdb_options.create_missing_column_families = true; db_options.create_missing_column_families = true;
rocksdb_options.error_if_exists = true; db_options.error_if_exists = true;
auto block_cache = NewLRUCache(options.cache_size);
ColumnFamilyOptions column_family_options =
GetColumnFamilyOptions(options, block_cache);
std::vector<ColumnFamilyDescriptor> column_families; std::vector<ColumnFamilyDescriptor> column_families;
column_families.push_back(ColumnFamilyDescriptor( column_families.push_back(ColumnFamilyDescriptor(
kDefaultColumnFamilyName, ColumnFamilyOptions(rocksdb_options))); kDefaultColumnFamilyName,
column_families.push_back(ColumnFamilyDescriptor( OptimizeOptionsForDataColumnFamily(column_family_options, block_cache)));
kMetadataColumnFamilyName, ColumnFamilyOptions(rocksdb_options))); column_families.push_back(
ColumnFamilyDescriptor(kMetadataColumnFamilyName, column_family_options));
for (const auto& index : spatial_indexes) { for (const auto& index : spatial_indexes) {
column_families.emplace_back(GetSpatialIndexColumnFamilyName(index.name), column_families.emplace_back(GetSpatialIndexColumnFamilyName(index.name),
ColumnFamilyOptions(rocksdb_options)); column_family_options);
} }
std::vector<ColumnFamilyHandle*> handles; std::vector<ColumnFamilyHandle*> handles;
DB* base_db; DB* base_db;
Status s = DB::Open(DBOptions(rocksdb_options), name, column_families, Status s = DB::Open(db_options, name, column_families, &handles, &base_db);
&handles, &base_db);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
@ -659,13 +759,15 @@ Status SpatialDB::Create(
Status SpatialDB::Open(const SpatialDBOptions& options, const std::string& name, Status SpatialDB::Open(const SpatialDBOptions& options, const std::string& name,
SpatialDB** db, bool read_only) { SpatialDB** db, bool read_only) {
Options rocksdb_options = GetRocksDBOptionsFromOptions(options); DBOptions db_options = GetDBOptions(options);
auto block_cache = NewLRUCache(options.cache_size);
ColumnFamilyOptions column_family_options =
GetColumnFamilyOptions(options, block_cache);
Status s; Status s;
std::vector<std::string> existing_column_families; std::vector<std::string> existing_column_families;
std::vector<std::string> spatial_indexes; std::vector<std::string> spatial_indexes;
s = DB::ListColumnFamilies(DBOptions(rocksdb_options), name, s = DB::ListColumnFamilies(db_options, name, &existing_column_families);
&existing_column_families);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
@ -678,22 +780,22 @@ Status SpatialDB::Open(const SpatialDBOptions& options, const std::string& name,
std::vector<ColumnFamilyDescriptor> column_families; std::vector<ColumnFamilyDescriptor> column_families;
column_families.push_back(ColumnFamilyDescriptor( column_families.push_back(ColumnFamilyDescriptor(
kDefaultColumnFamilyName, ColumnFamilyOptions(rocksdb_options))); kDefaultColumnFamilyName,
column_families.push_back(ColumnFamilyDescriptor( OptimizeOptionsForDataColumnFamily(column_family_options, block_cache)));
kMetadataColumnFamilyName, ColumnFamilyOptions(rocksdb_options))); column_families.push_back(
ColumnFamilyDescriptor(kMetadataColumnFamilyName, column_family_options));
for (const auto& index : spatial_indexes) { for (const auto& index : spatial_indexes) {
column_families.emplace_back(GetSpatialIndexColumnFamilyName(index), column_families.emplace_back(GetSpatialIndexColumnFamilyName(index),
ColumnFamilyOptions(rocksdb_options)); column_family_options);
} }
std::vector<ColumnFamilyHandle*> handles; std::vector<ColumnFamilyHandle*> handles;
DB* base_db; DB* base_db;
if (read_only) { if (read_only) {
s = DB::OpenForReadOnly(DBOptions(rocksdb_options), name, column_families, s = DB::OpenForReadOnly(db_options, name, column_families, &handles,
&handles, &base_db);
} else {
s = DB::Open(DBOptions(rocksdb_options), name, column_families, &handles,
&base_db); &base_db);
} else {
s = DB::Open(db_options, name, column_families, &handles, &base_db);
} }
if (!s.ok()) { if (!s.ok()) {
return s; return s;
@ -730,13 +832,13 @@ Status SpatialDB::Open(const SpatialDBOptions& options, const std::string& name,
for (auto h : handles) { for (auto h : handles) {
delete h; delete h;
} }
delete db; delete base_db;
return s; return s;
} }
// I don't need metadata column family any more, so delete it // I don't need metadata column family any more, so delete it
delete handles[1]; delete handles[1];
*db = new SpatialDBImpl(base_db, handles[0], index_cf, next_id); *db = new SpatialDBImpl(base_db, handles[0], index_cf, next_id, read_only);
return Status::OK(); return Status::OK();
} }

@ -151,9 +151,14 @@ TEST(SpatialDBTest, FeatureSetTest) {
} }
TEST(SpatialDBTest, SimpleTest) { TEST(SpatialDBTest, SimpleTest) {
// iter 0 -- not read only
// iter 1 -- read only
for (int iter = 0; iter < 2; ++iter) {
DestroyDB(dbname_, Options());
ASSERT_OK(SpatialDB::Create( ASSERT_OK(SpatialDB::Create(
SpatialDBOptions(), dbname_, SpatialDBOptions(), dbname_,
{SpatialIndexOptions("index", BoundingBox<double>(0, 0, 128, 128), 3)})); {SpatialIndexOptions("index", BoundingBox<double>(0, 0, 128, 128),
3)}));
ASSERT_OK(SpatialDB::Open(SpatialDBOptions(), dbname_, &db_)); ASSERT_OK(SpatialDB::Open(SpatialDBOptions(), dbname_, &db_));
ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox<double>(33, 17, 63, 79), ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox<double>(33, 17, 63, 79),
@ -169,6 +174,11 @@ TEST(SpatialDBTest, SimpleTest) {
ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox<double>(1, 65, 47, 95), ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox<double>(1, 65, 47, 95),
"six", FeatureSet(), {"index"})); "six", FeatureSet(), {"index"}));
if (iter == 1) {
delete db_;
ASSERT_OK(SpatialDB::Open(SpatialDBOptions(), dbname_, &db_, true));
}
AssertCursorResults(BoundingBox<double>(33, 17, 47, 31), "index", {"one"}); AssertCursorResults(BoundingBox<double>(33, 17, 47, 31), "index", {"one"});
AssertCursorResults(BoundingBox<double>(17, 33, 79, 63), "index", AssertCursorResults(BoundingBox<double>(17, 33, 79, 63), "index",
{"one", "three"}); {"one", "three"});
@ -177,7 +187,8 @@ TEST(SpatialDBTest, SimpleTest) {
AssertCursorResults(BoundingBox<double>(85, 86, 85, 86), "index", {"two"}); AssertCursorResults(BoundingBox<double>(85, 86, 85, 86), "index", {"two"});
AssertCursorResults(BoundingBox<double>(33, 1, 127, 111), "index", AssertCursorResults(BoundingBox<double>(33, 1, 127, 111), "index",
{"one", "two", "three", "five", "six"}); {"one", "two", "three", "five", "six"});
// even though the bounding box doesn't intersect, we got "four" back because // even though the bounding box doesn't intersect, we got "four" back
// because
// it's in the same tile // it's in the same tile
AssertCursorResults(BoundingBox<double>(18, 98, 19, 99), "index", {"four"}); AssertCursorResults(BoundingBox<double>(18, 98, 19, 99), "index", {"four"});
AssertCursorResults(BoundingBox<double>(130, 130, 131, 131), "index", {}); AssertCursorResults(BoundingBox<double>(130, 130, 131, 131), "index", {});
@ -187,6 +198,7 @@ TEST(SpatialDBTest, SimpleTest) {
delete db_; delete db_;
} }
}
namespace { namespace {
std::string RandomStr(Random* rnd) { std::string RandomStr(Random* rnd) {

Loading…
Cancel
Save