// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once

#include <stdint.h>
#include <stdio.h>

#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "rocksdb/iterator.h"
#include "rocksdb/listener.h"
#include "rocksdb/metadata.h"
#include "rocksdb/options.h"
#include "rocksdb/snapshot.h"
#include "rocksdb/sst_file_writer.h"
#include "rocksdb/thread_status.h"
#include "rocksdb/transaction_log.h"
#include "rocksdb/types.h"
#include "rocksdb/version.h"

#ifdef _WIN32
// Windows API macro interference
#undef DeleteFile
#endif

#if defined(__GNUC__) || defined(__clang__)
#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__))
#elif _WIN32
#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated)
#endif

namespace rocksdb {

struct Options;
struct DBOptions;
struct ColumnFamilyOptions;
struct ReadOptions;
struct WriteOptions;
struct FlushOptions;
struct CompactionOptions;
struct CompactRangeOptions;
struct TableProperties;
struct ExternalSstFileInfo;
class WriteBatch;
class Env;
class EventListener;
class StatsHistoryIterator;
class TraceWriter;
#ifdef ROCKSDB_LITE
class CompactionJobInfo;
#endif

extern const std::string kDefaultColumnFamilyName;
extern const std::string kPersistentStatsColumnFamilyName;

struct ColumnFamilyDescriptor {
  std::string name;
  ColumnFamilyOptions options;
  ColumnFamilyDescriptor()
      : name(kDefaultColumnFamilyName), options(ColumnFamilyOptions()) {}
  ColumnFamilyDescriptor(const std::string& _name,
                         const ColumnFamilyOptions& _options)
      : name(_name), options(_options) {}
};

class ColumnFamilyHandle {
 public:
  virtual ~ColumnFamilyHandle() {}
  // Returns the name of the column family associated with the current handle.
  virtual const std::string& GetName() const = 0;
  // Returns the ID of the column family associated with the current handle.
  virtual uint32_t GetID() const = 0;
  // Fills "*desc" with the up-to-date descriptor of the column family
  // associated with this handle. Since it fills "*desc" with the up-to-date
  // information, this call might internally lock and release DB mutex to
  // access the up-to-date CF options. In addition, all the pointer-typed
  // options cannot be referenced any longer than the original options exist.
  //
  // Note that this function is not supported in RocksDBLite.
  virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) = 0;
  // Returns the comparator of the column family associated with the
  // current handle.
  virtual const Comparator* GetComparator() const = 0;
};

static const int kMajorVersion = __ROCKSDB_MAJOR__;
static const int kMinorVersion = __ROCKSDB_MINOR__;

// A range of keys
struct Range {
  Slice start;
  Slice limit;

  Range() {}
  Range(const Slice& s, const Slice& l) : start(s), limit(l) {}
};

struct RangePtr {
  const Slice* start;
  const Slice* limit;

  RangePtr() : start(nullptr), limit(nullptr) {}
  RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {}
};

struct IngestExternalFileArg {
  ColumnFamilyHandle* column_family = nullptr;
  std::vector<std::string> external_files;
  IngestExternalFileOptions options;
};

struct GetMergeOperandsOptions {
  int expected_max_number_of_operands = 0;
};

// A collection of table properties objects, where
// key: the table's file name.
// value: the table properties object of the given table.
typedef std::unordered_map<std::string, std::shared_ptr<const TableProperties>>
    TablePropertiesCollection;

// A DB is a persistent ordered map from keys to values.
// A DB is safe for concurrent access from multiple threads without
// any external synchronization.
class DB {
 public:
  // Open the database with the specified "name".
  // Stores a pointer to a heap-allocated database in *dbptr and returns
  // OK on success.
  // Stores nullptr in *dbptr and returns a non-OK status on error.
  // Caller should delete *dbptr when it is no longer needed.
  static Status Open(const Options& options, const std::string& name,
                     DB** dbptr);

  // Open the database for read only. All DB interfaces
  // that modify data, like put/delete, will return error.
  // If the db is opened in read only mode, then no compactions
  // will happen.
  //
  // Not supported in ROCKSDB_LITE, in which case the function will
  // return Status::NotSupported.
  static Status OpenForReadOnly(const Options& options, const std::string& name,
                                DB** dbptr,
                                bool error_if_log_file_exist = false);

  // Open the database for read only with column families. When opening DB with
  // read only, you can specify only a subset of column families in the
  // database that should be opened. However, you always need to specify the
  // default column family. The default column family name is 'default' and
  // it's stored in rocksdb::kDefaultColumnFamilyName.
  //
  // Not supported in ROCKSDB_LITE, in which case the function will
  // return Status::NotSupported.
  static Status OpenForReadOnly(
      const DBOptions& db_options, const std::string& name,
      const std::vector<ColumnFamilyDescriptor>& column_families,
      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
      bool error_if_log_file_exist = false);

  // The following OpenAsSecondary functions create a secondary instance that
  // can dynamically tail the MANIFEST of a primary that must have already been
  // created. User can call TryCatchUpWithPrimary to make the secondary
  // instance catch up with primary (WAL tailing is NOT supported now) whenever
  // the user feels necessary. Column families created by the primary after the
  // secondary instance starts are currently ignored by the secondary instance.
  // Column families opened by secondary and dropped by the primary will be
  // dropped by secondary as well. However the user of the secondary instance
  // can still access the data of such dropped column family as long as they
  // do not destroy the corresponding column family handle.
  // WAL tailing is not supported at present, but will arrive soon.
  //
  // The options argument specifies the options to open the secondary instance.
  // The name argument specifies the name of the primary db that you have used
  // to open the primary instance.
  // The secondary_path argument points to a directory where the secondary
  // instance stores its info log.
  // The dbptr is an out-arg corresponding to the opened secondary instance.
  // The pointer points to a heap-allocated database, and the user should
  // delete it after use.
  // Open DB as secondary instance with only the default column family.
  // Return OK on success, non-OK on failures.
  static Status OpenAsSecondary(const Options& options, const std::string& name,
                                const std::string& secondary_path, DB** dbptr);

  // Open DB as secondary instance with column families. You can open a subset
  // of column families in secondary mode.
  // The db_options specify the database specific options.
  // The name argument specifies the name of the primary db that you have used
  // to open the primary instance.
  // The secondary_path argument points to a directory where the secondary
  // instance stores its info log.
  // The column_families argument specifies a list of column families to open.
  // If any of the column families does not exist, the function returns non-OK
  // status.
  // The handles is an out-arg corresponding to the opened database column
  // family handles.
  // The dbptr is an out-arg corresponding to the opened secondary instance.
  // The pointer points to a heap-allocated database, and the caller should
  // delete it after use. Before deleting the dbptr, the user should also
  // delete the pointers stored in the handles vector.
  // Return OK on success, non-OK on failures.
  static Status OpenAsSecondary(
      const DBOptions& db_options, const std::string& name,
      const std::string& secondary_path,
      const std::vector<ColumnFamilyDescriptor>& column_families,
      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
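
  // A minimal usage sketch (not part of the original header): it assumes a
  // primary instance already exists at "/tmp/primary"; the paths and the
  // catch-up timing are illustrative only.
  //
  //   Options options;
  //   DB* secondary = nullptr;
  //   Status s = DB::OpenAsSecondary(options, "/tmp/primary",
  //                                  "/tmp/secondary", &secondary);
  //   assert(s.ok());
  //   // Replay new MANIFEST state from the primary whenever needed.
  //   s = secondary->TryCatchUpWithPrimary();
  //   assert(s.ok());
  //   delete secondary;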

  // Open DB with column families.
  // db_options specify database specific options
  // column_families is the vector of all column families in the database,
  // containing column family name and options. You need to open ALL column
  // families in the database. To get the list of column families, you can use
  // ListColumnFamilies(). Also, you can open only a subset of column families
  // for read-only access.
  // The default column family name is 'default' and it's stored
  // in rocksdb::kDefaultColumnFamilyName.
  // If everything is OK, handles will on return be the same size
  // as column_families --- handles[i] will be a handle that you
  // will use to operate on column family column_families[i].
  // Before deleting the DB, you have to close ALL column families by calling
  // DestroyColumnFamilyHandle() with all the handles.
  static Status Open(const DBOptions& db_options, const std::string& name,
                     const std::vector<ColumnFamilyDescriptor>& column_families,
                     std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
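
  // A minimal open-with-column-families sketch (illustrative, not part of
  // the original header); the "new_cf" name and the path are hypothetical:
  //
  //   std::vector<ColumnFamilyDescriptor> column_families;
  //   column_families.push_back(ColumnFamilyDescriptor(
  //       kDefaultColumnFamilyName, ColumnFamilyOptions()));
  //   column_families.push_back(ColumnFamilyDescriptor(
  //       "new_cf", ColumnFamilyOptions()));
  //   std::vector<ColumnFamilyHandle*> handles;
  //   DB* db = nullptr;
  //   Status s = DB::Open(DBOptions(), "/tmp/db", column_families,
  //                       &handles, &db);
  //   assert(s.ok());
  //   // ... use the db and handles ...
  //   for (auto* handle : handles) {
  //     s = db->DestroyColumnFamilyHandle(handle);
  //     assert(s.ok());
  //   }
  //   delete db;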

  virtual Status Resume() { return Status::NotSupported(); }

  // Close the DB by releasing resources, closing files etc. This should be
  // called before calling the destructor so that the caller can get back a
  // status in case there are any errors. This will not fsync the WAL files.
  // If syncing is required, the caller must first call SyncWAL(), or Write()
  // using an empty write batch with WriteOptions.sync=true.
  // Regardless of the return status, the DB must be freed.
  // If the return status is Aborted(), closing fails because there is
  // an unreleased snapshot in the system. In this case, users can release
  // the unreleased snapshots and try again and expect it to succeed. For
  // any other status, re-calling Close() will be a no-op.
  // If the return status is NotSupported(), then the DB implementation does
  // the cleanup in the destructor.
  virtual Status Close() { return Status::NotSupported(); }

  // ListColumnFamilies will open the DB specified by argument name
  // and return the list of all column families in that DB
  // through the column_families argument. The ordering of
  // column families in column_families is unspecified.
  static Status ListColumnFamilies(const DBOptions& db_options,
                                   const std::string& name,
                                   std::vector<std::string>* column_families);

  DB() {}
  // No copying allowed
  DB(const DB&) = delete;
  void operator=(const DB&) = delete;

  virtual ~DB();

  // Create a column_family and return the handle of column family
  // through the argument handle.
  virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
                                    const std::string& column_family_name,
                                    ColumnFamilyHandle** handle);
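
  // A minimal create/drop sketch (illustrative, not part of the original
  // header); the "new_cf" name is a hypothetical example. It relies on
  // DropColumnFamily() and DestroyColumnFamilyHandle() declared below:
  //
  //   ColumnFamilyHandle* cf = nullptr;
  //   Status s = db->CreateColumnFamily(ColumnFamilyOptions(), "new_cf", &cf);
  //   assert(s.ok());
  //   // ... use the column family ...
  //   s = db->DropColumnFamily(cf);
  //   assert(s.ok());
  //   s = db->DestroyColumnFamilyHandle(cf);
  //   assert(s.ok());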

  // Bulk create column families with the same column family options.
  // Return the handles of the column families through the argument handles.
  // In case of error, the request may succeed partially, and handles will
  // contain column family handles that it managed to create, and have size
  // equal to the number of created column families.
  virtual Status CreateColumnFamilies(
      const ColumnFamilyOptions& options,
      const std::vector<std::string>& column_family_names,
      std::vector<ColumnFamilyHandle*>* handles);

  // Bulk create column families.
  // Return the handles of the column families through the argument handles.
  // In case of error, the request may succeed partially, and handles will
  // contain column family handles that it managed to create, and have size
  // equal to the number of created column families.
  virtual Status CreateColumnFamilies(
      const std::vector<ColumnFamilyDescriptor>& column_families,
      std::vector<ColumnFamilyHandle*>* handles);

  // Drop a column family specified by column_family handle. This call
  // only records a drop record in the manifest and prevents the column
  // family from flushing and compacting.
  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);

  // Bulk drop column families. This call only records drop records in the
  // manifest and prevents the column families from flushing and compacting.
  // In case of error, the request may succeed partially. User may call
  // ListColumnFamilies to check the result.
  virtual Status DropColumnFamilies(
      const std::vector<ColumnFamilyHandle*>& column_families);

  // Close a column family specified by column_family handle and destroy
  // the column family handle specified to avoid double deletion. This call
  // deletes the column family handle by default. Use this method to close
  // a column family instead of deleting the column family handle directly.
  virtual Status DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family);
// Set the database entry for "key" to "value".
|
|
|
|
// If "key" already exists, it will be overwritten.
|
In-place updates for equal keys and similar sized values
Summary:
Currently for each put, a fresh memory is allocated, and a new entry is added to the memtable with a new sequence number irrespective of whether the key already exists in the memtable. This diff is an attempt to update the value inplace for existing keys. It currently handles a very simple case:
1. Key already exists in the current memtable. Does not inplace update values in immutable memtable or snapshot
2. Latest value type is a 'put' ie kTypeValue
3. New value size is less than existing value, to avoid reallocating memory
TODO: For a put of an existing key, deallocate memory take by values, for other value types till a kTypeValue is found, ie. remove kTypeMerge.
TODO: Update the transaction log, to allow consistent reload of the memtable.
Test Plan: Added a unit test verifying the inplace update. But some other unit tests broken due to invalid sequence number checks. WIll fix them next.
Reviewers: xinyaohu, sumeet, haobo, dhruba
CC: leveldb
Differential Revision: https://reviews.facebook.net/D12423
Automatic commit by arc
11 years ago
|
|
|
// Returns OK on success, and a non-OK status on error.
|
|
|
|
// Note: consider setting options.sync = true.
|
|
|
|
virtual Status Put(const WriteOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
const Slice& value) = 0;
|
|
|
|
virtual Status Put(const WriteOptions& options, const Slice& key,
|
|
|
|
const Slice& value) {
|
|
|
|
return Put(options, DefaultColumnFamily(), key, value);
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
11 years ago
|
|
|
}

  // Remove the database entry (if any) for "key". Returns OK on
  // success, and a non-OK status on error. It is not an error if "key"
  // did not exist in the database.
  // Note: consider setting options.sync = true.
  virtual Status Delete(const WriteOptions& options,
                        ColumnFamilyHandle* column_family,
                        const Slice& key) = 0;
  virtual Status Delete(const WriteOptions& options, const Slice& key) {
    return Delete(options, DefaultColumnFamily(), key);
  }

  // Remove the database entry for "key". Requires that the key exists
  // and was not overwritten. Returns OK on success, and a non-OK status
  // on error. It is not an error if "key" did not exist in the database.
  //
  // If a key is overwritten (by calling Put() multiple times), then the result
  // of calling SingleDelete() on this key is undefined. SingleDelete() only
  // behaves correctly if there has been only one Put() for this key since the
  // previous call to SingleDelete() for this key.
  //
  // This feature is currently an experimental performance optimization
  // for a very specific workload. It is up to the caller to ensure that
  // SingleDelete is only used for a key that is not deleted using Delete() or
  // written using Merge(). Mixing SingleDelete operations with Deletes and
  // Merges can result in undefined behavior.
  //
  // Note: consider setting options.sync = true.
  virtual Status SingleDelete(const WriteOptions& options,
                              ColumnFamilyHandle* column_family,
                              const Slice& key) = 0;
  virtual Status SingleDelete(const WriteOptions& options, const Slice& key) {
    return SingleDelete(options, DefaultColumnFamily(), key);
  }
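
  // A minimal sketch of the intended write-once pattern (illustrative, not
  // part of the original header); the key and value are hypothetical:
  //
  //   Status s = db->Put(WriteOptions(), "one_shot_key", "value");
  //   assert(s.ok());
  //   // The key was written exactly once and never overwritten, so
  //   // SingleDelete is safe here.
  //   s = db->SingleDelete(WriteOptions(), "one_shot_key");
  //   assert(s.ok());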

  // Removes the database entries in the range ["begin_key", "end_key"), i.e.,
  // including "begin_key" and excluding "end_key". Returns OK on success, and
  // a non-OK status on error. It is not an error if no keys exist in the range
  // ["begin_key", "end_key").
  //
  // This feature is now usable in production, with the following caveats:
  // 1) Accumulating many range tombstones in the memtable will degrade read
  // performance; this can be avoided by manually flushing occasionally.
  // 2) Limiting the maximum number of open files in the presence of range
  // tombstones can degrade read performance. To avoid this problem, set
  // max_open_files to -1 whenever possible.
  virtual Status DeleteRange(const WriteOptions& options,
                             ColumnFamilyHandle* column_family,
                             const Slice& begin_key, const Slice& end_key);
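
  // A minimal usage sketch (illustrative, not part of the original header);
  // the key bounds are hypothetical:
  //
  //   // Deletes every key in ["user.000", "user.999"); end key is excluded.
  //   Status s = db->DeleteRange(WriteOptions(), db->DefaultColumnFamily(),
  //                              "user.000", "user.999");
  //   assert(s.ok());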

  // Merge the database entry for "key" with "value". Returns OK on success,
  // and a non-OK status on error. The semantics of this operation are
  // determined by the user-provided merge_operator when opening the DB.
  // Note: consider setting options.sync = true.
  virtual Status Merge(const WriteOptions& options,
                       ColumnFamilyHandle* column_family, const Slice& key,
                       const Slice& value) = 0;
  virtual Status Merge(const WriteOptions& options, const Slice& key,
                       const Slice& value) {
    return Merge(options, DefaultColumnFamily(), key, value);
  }

  // Apply the specified updates to the database.
  // If `updates` contains no update, WAL will still be synced if
  // options.sync=true.
  // Returns OK on success, non-OK on failure.
  // Note: consider setting options.sync = true.
  virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
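
  // A minimal atomic-batch sketch (illustrative, not part of the original
  // header); the keys and the `cf` handle are hypothetical:
  //
  //   WriteBatch batch;
  //   batch.Put("key1", "value1");
  //   batch.Delete("key2");
  //   batch.Put(cf, "key3", "value3");  // another column family, same batch
  //   // All updates in the batch are applied atomically.
  //   Status s = db->Write(WriteOptions(), &batch);
  //   assert(s.ok());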

  // If the database contains an entry for "key" store the
  // corresponding value in *value and return OK.
  //
  // If there is no entry for "key" leave *value unchanged and return
  // a status for which Status::IsNotFound() returns true.
  //
  // May return some other Status on an error.
  virtual inline Status Get(const ReadOptions& options,
                            ColumnFamilyHandle* column_family, const Slice& key,
                            std::string* value) {
    assert(value != nullptr);
    PinnableSlice pinnable_val(value);
    assert(!pinnable_val.IsPinned());
    auto s = Get(options, column_family, key, &pinnable_val);
    if (s.ok() && pinnable_val.IsPinned()) {
      value->assign(pinnable_val.data(), pinnable_val.size());
    }  // else value is already assigned
    return s;
  }
  virtual Status Get(const ReadOptions& options,
                     ColumnFamilyHandle* column_family, const Slice& key,
                     PinnableSlice* value) = 0;
  virtual Status Get(const ReadOptions& options, const Slice& key,
                     std::string* value) {
    return Get(options, DefaultColumnFamily(), key, value);
  }
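
  // A minimal read sketch using the PinnableSlice overload (illustrative,
  // not part of the original header). As the inline adapter above suggests,
  // a pinned result can avoid copying the value into a std::string:
  //
  //   PinnableSlice pinned;
  //   Status s = db->Get(ReadOptions(), db->DefaultColumnFamily(),
  //                      "key1", &pinned);
  //   if (s.ok()) {
  //     // Use pinned.data()/pinned.size() while `pinned` stays in scope.
  //   }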
|
|
|

  // Returns all the merge operands corresponding to the key. If the
  // number of merge operands in the DB for the key is greater than
  // merge_operands_options.expected_max_number_of_operands,
  // no merge operands are returned and the status is Incomplete. Merge
  // operands returned are in the order of insertion.
  // merge_operands - Points to an array of at least
  //             merge_operands_options.expected_max_number_of_operands
  //             entries; the caller is responsible for allocating it. If the
  //             status returned is Incomplete, then number_of_operands will
  //             contain the total number of merge operands found in the DB
  //             for the key.
  virtual Status GetMergeOperands(
      const ReadOptions& options, ColumnFamilyHandle* column_family,
      const Slice& key, PinnableSlice* merge_operands,
      GetMergeOperandsOptions* get_merge_operands_options,
      int* number_of_operands) = 0;
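
  // Illustrative usage sketch, adapted from the example in the change that
  // introduced this API (assumes "db" is an open DB*):
  //
  //   const int kMaxOperands = 100;
  //   std::vector<PinnableSlice> values(kMaxOperands);
  //   GetMergeOperandsOptions merge_operands_info;
  //   merge_operands_info.expected_max_number_of_operands = kMaxOperands;
  //   int number_of_operands = 0;
  //   Status s = db->GetMergeOperands(
  //       ReadOptions(), db->DefaultColumnFamily(), "k1", values.data(),
  //       &merge_operands_info, &number_of_operands);
  //   // On Status::Incomplete(), number_of_operands holds the true count.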

  // If keys[i] does not exist in the database, then the i'th returned
  // status will be one for which Status::IsNotFound() is true, and
  // (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
  // the i'th returned status will have Status::ok() true, and (*values)[i]
  // will store the value associated with keys[i].
  //
  // (*values) will always be resized to be the same size as (keys).
  // Similarly, the number of returned statuses will be the number of keys.
  // Note: keys will not be "de-duplicated". Duplicate keys will return
  // duplicate values in order.
  virtual std::vector<Status> MultiGet(
      const ReadOptions& options,
      const std::vector<ColumnFamilyHandle*>& column_family,
      const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
  virtual std::vector<Status> MultiGet(const ReadOptions& options,
                                       const std::vector<Slice>& keys,
                                       std::vector<std::string>* values) {
    return MultiGet(
        options,
        std::vector<ColumnFamilyHandle*>(keys.size(), DefaultColumnFamily()),
        keys, values);
  }
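
  // Illustrative usage sketch (assumes "db" is an open DB*):
  //
  //   std::vector<Slice> keys = {"k1", "k2"};
  //   std::vector<std::string> values;
  //   std::vector<Status> statuses = db->MultiGet(ReadOptions(), keys,
  //                                               &values);
  //   // statuses[i] and values[i] correspond to keys[i].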
  // Overloaded MultiGet API that improves performance by batching operations
  // in the read path for greater efficiency. Currently, only the block-based
  // table format with full filters is supported. Other table formats such
  // as plain table, block-based table with block-based filters, and
  // partitioned indexes will still work, but will not get any performance
  // benefits.
  // Parameters -
  // options - ReadOptions
  // column_family - ColumnFamilyHandle* that the keys belong to. All the keys
  //                 passed to the API are restricted to a single column family
  // num_keys - Number of keys to look up
  // keys - Pointer to C-style array of key Slices with num_keys elements
  // values - Pointer to C-style array of PinnableSlices with num_keys elements
  // statuses - Pointer to C-style array of Status with num_keys elements
  // sorted_input - If true, it means the input keys are already sorted by key
  //                order, so the MultiGet() API doesn't have to sort them
  //                again. If false, the keys will be copied and sorted
  //                internally by the API - the input array will not be
  //                modified
  virtual void MultiGet(const ReadOptions& options,
                        ColumnFamilyHandle* column_family,
                        const size_t num_keys, const Slice* keys,
                        PinnableSlice* values, Status* statuses,
                        const bool /*sorted_input*/ = false) {
    std::vector<ColumnFamilyHandle*> cf;
    std::vector<Slice> user_keys;
    std::vector<Status> status;
    std::vector<std::string> vals;

    for (size_t i = 0; i < num_keys; ++i) {
      cf.emplace_back(column_family);
      user_keys.emplace_back(keys[i]);
    }
    status = MultiGet(options, cf, user_keys, &vals);
    std::copy(status.begin(), status.end(), statuses);
    for (auto& value : vals) {
      values->PinSelf(value);
      values++;
    }
  }
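
  // Illustrative usage sketch for the batched overload (assumes "db" is an
  // open DB*). Storage for results can live on the stack:
  //
  //   constexpr size_t kNumKeys = 2;
  //   Slice keys[kNumKeys] = {"k1", "k2"};
  //   PinnableSlice values[kNumKeys];
  //   Status statuses[kNumKeys];
  //   db->MultiGet(ReadOptions(), db->DefaultColumnFamily(), kNumKeys, keys,
  //                values, statuses, /*sorted_input=*/false);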

  // Overloaded MultiGet API that improves performance by batching operations
  // in the read path for greater efficiency. Currently, only the block-based
  // table format with full filters is supported. Other table formats such
  // as plain table, block-based table with block-based filters, and
  // partitioned indexes will still work, but will not get any performance
  // benefits.
  // Parameters -
  // options - ReadOptions
  // num_keys - Number of keys to look up
  // column_families - Pointer to C-style array of ColumnFamilyHandle* with
  //                   num_keys elements, where column_families[i] is the
  //                   column family that keys[i] belongs to
  // keys - Pointer to C-style array of key Slices with num_keys elements
  // values - Pointer to C-style array of PinnableSlices with num_keys elements
  // statuses - Pointer to C-style array of Status with num_keys elements
  // sorted_input - If true, it means the input keys are already sorted by key
  //                order, so the MultiGet() API doesn't have to sort them
  //                again. If false, the keys will be copied and sorted
  //                internally by the API - the input array will not be
  //                modified
  virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
                        ColumnFamilyHandle** column_families, const Slice* keys,
                        PinnableSlice* values, Status* statuses,
                        const bool /*sorted_input*/ = false) {
    std::vector<ColumnFamilyHandle*> cf;
    std::vector<Slice> user_keys;
    std::vector<Status> status;
    std::vector<std::string> vals;

    for (size_t i = 0; i < num_keys; ++i) {
      cf.emplace_back(column_families[i]);
      user_keys.emplace_back(keys[i]);
    }
    status = MultiGet(options, cf, user_keys, &vals);
    std::copy(status.begin(), status.end(), statuses);
    for (auto& value : vals) {
      values->PinSelf(value);
      values++;
    }
  }

  // If the key definitely does not exist in the database, then this method
  // returns false; otherwise it returns true. If the caller wants to obtain
  // the value when the key is found in memory, a bool for 'value_found' must
  // be passed. 'value_found' will be true on return if the value has been
  // set properly.
  // This check is potentially lighter-weight than invoking DB::Get(). One way
  // to make this lighter weight is to avoid doing any IOs.
  // The default implementation here returns true and sets 'value_found' to
  // false.
  virtual bool KeyMayExist(const ReadOptions& /*options*/,
                           ColumnFamilyHandle* /*column_family*/,
                           const Slice& /*key*/, std::string* /*value*/,
                           bool* value_found = nullptr) {
    if (value_found != nullptr) {
      *value_found = false;
    }
    return true;
  }
  virtual bool KeyMayExist(const ReadOptions& options, const Slice& key,
                           std::string* value, bool* value_found = nullptr) {
    return KeyMayExist(options, DefaultColumnFamily(), key, value,
                       value_found);
  }
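
  // Illustrative usage sketch (assumes "db" is an open DB*). A false result
  // is definitive; a true result still requires a Get() to confirm:
  //
  //   std::string value;
  //   bool value_found = false;
  //   if (db->KeyMayExist(ReadOptions(), "k1", &value, &value_found)) {
  //     if (!value_found) {
  //       Status s = db->Get(ReadOptions(), "k1", &value);  // confirm
  //     }
  //   }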

  // Return a heap-allocated iterator over the contents of the database.
  // The result of NewIterator() is initially invalid (caller must
  // call one of the Seek methods on the iterator before using it).
  //
  // Caller should delete the iterator when it is no longer needed.
  // The returned iterator should be deleted before this db is deleted.
  virtual Iterator* NewIterator(const ReadOptions& options,
                                ColumnFamilyHandle* column_family) = 0;
  virtual Iterator* NewIterator(const ReadOptions& options) {
    return NewIterator(options, DefaultColumnFamily());
  }
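
  // Illustrative usage sketch (assumes "db" is an open DB*):
  //
  //   std::unique_ptr<Iterator> it(db->NewIterator(ReadOptions()));
  //   for (it->SeekToFirst(); it->Valid(); it->Next()) {
  //     // it->key() and it->value() are valid until the next move.
  //   }
  //   // The iterator must be destroyed before db is deleted.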

  // Returns iterators from a consistent database state across multiple
  // column families. Iterators are heap-allocated and need to be deleted
  // before the db is deleted.
  virtual Status NewIterators(
      const ReadOptions& options,
      const std::vector<ColumnFamilyHandle*>& column_families,
      std::vector<Iterator*>* iterators) = 0;
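
  // Illustrative usage sketch (assumes "db" is an open DB* and "cf1"/"cf2"
  // are ColumnFamilyHandle*). All returned iterators observe the same state:
  //
  //   std::vector<Iterator*> iters;
  //   Status s = db->NewIterators(ReadOptions(), {cf1, cf2}, &iters);
  //   // ... use iters ...
  //   for (Iterator* it : iters) delete it;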

  // Return a handle to the current DB state. Iterators created with
  // this handle will all observe a stable snapshot of the current DB
  // state. The caller must call ReleaseSnapshot(result) when the
  // snapshot is no longer needed.
  //
  // nullptr will be returned if the DB fails to take a snapshot or does
  // not support snapshots.
  virtual const Snapshot* GetSnapshot() = 0;

  // Release a previously acquired snapshot. The caller must not
  // use "snapshot" after this call.
  virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
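
  // Illustrative usage sketch (assumes "db" is an open DB*). Reads through
  // "read_options" observe a frozen view of the DB:
  //
  //   const Snapshot* snap = db->GetSnapshot();
  //   if (snap != nullptr) {
  //     ReadOptions read_options;
  //     read_options.snapshot = snap;
  //     std::string value;
  //     Status s = db->Get(read_options, "k1", &value);
  //     db->ReleaseSnapshot(snap);
  //   }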

#ifndef ROCKSDB_LITE
  // Contains all valid property arguments for GetProperty().
  //
  // NOTE: Property names cannot end in numbers since those are interpreted as
  // arguments, e.g., see kNumFilesAtLevelPrefix.
  struct Properties {
    // "rocksdb.num-files-at-level<N>" - returns string containing the number
    //      of files at level <N>, where <N> is an ASCII representation of a
    //      level number (e.g., "0").
    static const std::string kNumFilesAtLevelPrefix;

    // "rocksdb.compression-ratio-at-level<N>" - returns string containing the
    //      compression ratio of data at level <N>, where <N> is an ASCII
    //      representation of a level number (e.g., "0"). Here, compression
    //      ratio is defined as uncompressed data size / compressed file size.
    //      Returns "-1.0" if there are no open files at level <N>.
    static const std::string kCompressionRatioAtLevelPrefix;

    // "rocksdb.stats" - returns a multi-line string containing the data
    //      described by kCFStats followed by the data described by kDBStats.
    static const std::string kStats;

    // "rocksdb.sstables" - returns a multi-line string summarizing current
    //      SST files.
    static const std::string kSSTables;

    // "rocksdb.cfstats" - Both of "rocksdb.cfstats-no-file-histogram" and
    //      "rocksdb.cf-file-histogram" together. See below for a description
    //      of the two.
    static const std::string kCFStats;

    // "rocksdb.cfstats-no-file-histogram" - returns a multi-line string with
    //      general column family stats per-level over db's lifetime ("L<n>"),
    //      aggregated over db's lifetime ("Sum"), and aggregated over the
    //      interval since the last retrieval ("Int").
    //      It could also be used to return the stats in the format of a map.
    //      In this case there will be a pair of string to array of double for
    //      each level as well as for "Sum". "Int" stats will not be affected
    //      when this form of stats is retrieved.
    static const std::string kCFStatsNoFileHistogram;

    // "rocksdb.cf-file-histogram" - prints out how many file reads go to
    //      each level, as well as the histogram of latency of single
    //      requests.
    static const std::string kCFFileHistogram;

    // "rocksdb.dbstats" - returns a multi-line string with general database
    //      stats, both cumulative (over the db's lifetime) and interval
    //      (since the last retrieval of kDBStats).
    static const std::string kDBStats;

    // "rocksdb.levelstats" - returns a multi-line string containing the
    //      number of files per level and the total size of each level (MB).
    static const std::string kLevelStats;

    // "rocksdb.num-immutable-mem-table" - returns number of immutable
    //      memtables that have not yet been flushed.
    static const std::string kNumImmutableMemTable;

    // "rocksdb.num-immutable-mem-table-flushed" - returns number of immutable
    //      memtables that have already been flushed.
    static const std::string kNumImmutableMemTableFlushed;

    // "rocksdb.mem-table-flush-pending" - returns 1 if a memtable flush is
    //      pending; otherwise, returns 0.
    static const std::string kMemTableFlushPending;

    // "rocksdb.num-running-flushes" - returns the number of currently running
    //      flushes.
    static const std::string kNumRunningFlushes;

    // "rocksdb.compaction-pending" - returns 1 if at least one compaction is
    //      pending; otherwise, returns 0.
    static const std::string kCompactionPending;

    // "rocksdb.num-running-compactions" - returns the number of currently
    //      running compactions.
    static const std::string kNumRunningCompactions;

    // "rocksdb.background-errors" - returns accumulated number of background
    //      errors.
    static const std::string kBackgroundErrors;

    // "rocksdb.cur-size-active-mem-table" - returns approximate size of
    //      active memtable (bytes).
    static const std::string kCurSizeActiveMemTable;

    // "rocksdb.cur-size-all-mem-tables" - returns approximate size of active
    //      and unflushed immutable memtables (bytes).
    static const std::string kCurSizeAllMemTables;

    // "rocksdb.size-all-mem-tables" - returns approximate size of active,
    //      unflushed immutable, and pinned immutable memtables (bytes).
    static const std::string kSizeAllMemTables;

    // "rocksdb.num-entries-active-mem-table" - returns total number of
    //      entries in the active memtable.
    static const std::string kNumEntriesActiveMemTable;

    // "rocksdb.num-entries-imm-mem-tables" - returns total number of entries
    //      in the unflushed immutable memtables.
    static const std::string kNumEntriesImmMemTables;

    // "rocksdb.num-deletes-active-mem-table" - returns total number of delete
    //      entries in the active memtable.
    static const std::string kNumDeletesActiveMemTable;

    // "rocksdb.num-deletes-imm-mem-tables" - returns total number of delete
    //      entries in the unflushed immutable memtables.
    static const std::string kNumDeletesImmMemTables;

    // "rocksdb.estimate-num-keys" - returns estimated number of total keys in
    //      the active and unflushed immutable memtables and storage.
    static const std::string kEstimateNumKeys;

    // "rocksdb.estimate-table-readers-mem" - returns estimated memory used
    //      for reading SST tables, excluding memory used in block cache
    //      (e.g., filter and index blocks).
    static const std::string kEstimateTableReadersMem;

    // "rocksdb.is-file-deletions-enabled" - returns 0 if deletion of obsolete
    //      files is enabled; otherwise, returns a non-zero number.
    static const std::string kIsFileDeletionsEnabled;

    // "rocksdb.num-snapshots" - returns number of unreleased snapshots of the
    //      database.
    static const std::string kNumSnapshots;

    // "rocksdb.oldest-snapshot-time" - returns number representing unix
    //      timestamp of the oldest unreleased snapshot.
    static const std::string kOldestSnapshotTime;

    // "rocksdb.num-live-versions" - returns number of live versions.
    //      `Version` is an internal data structure. See version_set.h for
    //      details. More live versions often mean more SST files are held
    //      from being deleted, by iterators or unfinished compactions.
    static const std::string kNumLiveVersions;

    // "rocksdb.current-super-version-number" - returns number of the current
    //      LSM version. It is a uint64_t integer number, incremented after
    //      there is any change to the LSM tree. The number is not preserved
    //      after restarting the DB. After a DB restart, it will start from 0
    //      again.
    static const std::string kCurrentSuperVersionNumber;

    // "rocksdb.estimate-live-data-size" - returns an estimate of the amount
    //      of live data in bytes.
    static const std::string kEstimateLiveDataSize;

    // "rocksdb.min-log-number-to-keep" - returns the minimum log number of
    //      the log files that should be kept.
    static const std::string kMinLogNumberToKeep;

    // "rocksdb.min-obsolete-sst-number-to-keep" - returns the minimum file
    //      number for an obsolete SST to be kept. The max value of `uint64_t`
    //      will be returned if all obsolete files can be deleted.
    static const std::string kMinObsoleteSstNumberToKeep;

    // "rocksdb.total-sst-files-size" - returns total size (bytes) of all SST
    //      files.
    //      WARNING: may slow down online queries if there are too many files.
    static const std::string kTotalSstFilesSize;

    // "rocksdb.live-sst-files-size" - returns total size (bytes) of all SST
    //      files that belong to the latest LSM tree.
    static const std::string kLiveSstFilesSize;

    // "rocksdb.base-level" - returns the level to which L0 data will be
    //      compacted.
    static const std::string kBaseLevel;

    // "rocksdb.estimate-pending-compaction-bytes" - returns estimated total
    //      number of bytes compaction needs to rewrite to get all levels down
    //      to under target size. Not valid for compaction styles other than
    //      level-based.
    static const std::string kEstimatePendingCompactionBytes;

    // "rocksdb.aggregated-table-properties" - returns a string representation
    //      of the aggregated table properties of the target column family.
    static const std::string kAggregatedTableProperties;

    // "rocksdb.aggregated-table-properties-at-level<N>" - same as the
    //      previous one, but only returns the aggregated table properties of
    //      the specified level "N" at the target column family.
    static const std::string kAggregatedTablePropertiesAtLevel;

    // "rocksdb.actual-delayed-write-rate" - returns the current actual
    //      delayed write rate. 0 means no delay.
    static const std::string kActualDelayedWriteRate;

    // "rocksdb.is-write-stopped" - returns 1 if writes have been stopped.
    static const std::string kIsWriteStopped;

    // "rocksdb.estimate-oldest-key-time" - returns an estimation of the
    //      oldest key timestamp in the DB. Currently only available for
    //      FIFO compaction with
    //      compaction_options_fifo.allow_compaction = false.
    static const std::string kEstimateOldestKeyTime;

    // "rocksdb.block-cache-capacity" - returns block cache capacity.
    static const std::string kBlockCacheCapacity;

    // "rocksdb.block-cache-usage" - returns the memory size for the entries
    //      residing in block cache.
    static const std::string kBlockCacheUsage;

    // "rocksdb.block-cache-pinned-usage" - returns the memory size for the
    //      entries being pinned.
    static const std::string kBlockCachePinnedUsage;

    // "rocksdb.options-statistics" - returns a multi-line string of
    //      options.statistics
    static const std::string kOptionsStatistics;
  };
#endif /* ROCKSDB_LITE */

  // DB implementations can export properties about their state via this
  // method. If "property" is a valid property understood by this DB
  // implementation (see the Properties struct above for valid options), fills
  // "*value" with its current value and returns true. Otherwise, returns
  // false.
  virtual bool GetProperty(ColumnFamilyHandle* column_family,
                           const Slice& property, std::string* value) = 0;
  virtual bool GetProperty(const Slice& property, std::string* value) {
    return GetProperty(DefaultColumnFamily(), property, value);
  }
  virtual bool GetMapProperty(ColumnFamilyHandle* column_family,
                              const Slice& property,
                              std::map<std::string, std::string>* value) = 0;
  virtual bool GetMapProperty(const Slice& property,
                              std::map<std::string, std::string>* value) {
    return GetMapProperty(DefaultColumnFamily(), property, value);
  }

  // Similar to GetProperty(), but only works for a subset of properties whose
  // return value is an integer. Returns the value as an integer. Supported
  // properties:
  //  "rocksdb.num-immutable-mem-table"
  //  "rocksdb.mem-table-flush-pending"
  //  "rocksdb.compaction-pending"
  //  "rocksdb.background-errors"
  //  "rocksdb.cur-size-active-mem-table"
  //  "rocksdb.cur-size-all-mem-tables"
  //  "rocksdb.size-all-mem-tables"
  //  "rocksdb.num-entries-active-mem-table"
  //  "rocksdb.num-entries-imm-mem-tables"
  //  "rocksdb.num-deletes-active-mem-table"
  //  "rocksdb.num-deletes-imm-mem-tables"
  //  "rocksdb.estimate-num-keys"
  //  "rocksdb.estimate-table-readers-mem"
  //  "rocksdb.is-file-deletions-enabled"
  //  "rocksdb.num-snapshots"
  //  "rocksdb.oldest-snapshot-time"
  //  "rocksdb.num-live-versions"
  //  "rocksdb.current-super-version-number"
  //  "rocksdb.estimate-live-data-size"
  //  "rocksdb.min-log-number-to-keep"
  //  "rocksdb.min-obsolete-sst-number-to-keep"
  //  "rocksdb.total-sst-files-size"
  //  "rocksdb.live-sst-files-size"
  //  "rocksdb.base-level"
  //  "rocksdb.estimate-pending-compaction-bytes"
  //  "rocksdb.num-running-compactions"
  //  "rocksdb.num-running-flushes"
  //  "rocksdb.actual-delayed-write-rate"
  //  "rocksdb.is-write-stopped"
  //  "rocksdb.estimate-oldest-key-time"
  //  "rocksdb.block-cache-capacity"
  //  "rocksdb.block-cache-usage"
  //  "rocksdb.block-cache-pinned-usage"
  virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
                              const Slice& property, uint64_t* value) = 0;
  virtual bool GetIntProperty(const Slice& property, uint64_t* value) {
    return GetIntProperty(DefaultColumnFamily(), property, value);
  }
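
  // Illustrative usage sketch (assumes "db" is an open DB*):
  //
  //   std::string stats;
  //   if (db->GetProperty(DB::Properties::kStats, &stats)) {
  //     // stats holds the multi-line human-readable report.
  //   }
  //   uint64_t num_keys = 0;
  //   if (db->GetIntProperty("rocksdb.estimate-num-keys", &num_keys)) {
  //     // num_keys is an estimate across memtables and storage.
  //   }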

  // Reset internal stats for DB and all column families.
  // Note this doesn't reset options.statistics as it is not owned by
  // DB.
  virtual Status ResetStats() {
    return Status::NotSupported("Not implemented");
  }

  // Same as GetIntProperty(), but this one returns the aggregated int
  // property from all column families.
  virtual bool GetAggregatedIntProperty(const Slice& property,
                                        uint64_t* value) = 0;

  // Flags for DB::GetSizeApproximation that specify whether memtable stats,
  // file stats approximation, or both should be included.
  enum SizeApproximationFlags : uint8_t {
    NONE = 0,
    INCLUDE_MEMTABLES = 1 << 0,
    INCLUDE_FILES = 1 << 1
  };

  // For each i in [0,n-1], store in "sizes[i]" the approximate
  // file system space used by keys in "[range[i].start .. range[i].limit)".
  //
  // Note that the returned sizes measure file system space usage, so
  // if the user data compresses by a factor of ten, the returned
  // sizes will be one-tenth the size of the corresponding user data size.
  virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
                                     ColumnFamilyHandle* column_family,
                                     const Range* range, int n,
                                     uint64_t* sizes) = 0;

  // Simpler versions of the GetApproximateSizes() method above.
  // The include_flags argument must be of type DB::SizeApproximationFlags
  // and cannot be NONE.
  virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
                                   const Range* range, int n, uint64_t* sizes,
                                   uint8_t include_flags = INCLUDE_FILES) {
    SizeApproximationOptions options;
    options.include_memtabtles =
        (include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) != 0;
    options.include_files =
        (include_flags & SizeApproximationFlags::INCLUDE_FILES) != 0;
    GetApproximateSizes(options, column_family, range, n, sizes);
  }
  virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes,
                                   uint8_t include_flags = INCLUDE_FILES) {
    GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, include_flags);
  }
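
  // Illustrative usage sketch (assumes "db" is an open DB*):
  //
  //   Range r("a", "z");
  //   uint64_t size = 0;
  //   db->GetApproximateSizes(&r, 1, &size,
  //                           SizeApproximationFlags::INCLUDE_FILES |
  //                               SizeApproximationFlags::INCLUDE_MEMTABLES);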

  // The method is similar to GetApproximateSizes, except it
  // returns approximate number of records in memtables.
  virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
                                           const Range& range,
                                           uint64_t* const count,
                                           uint64_t* const size) = 0;
  virtual void GetApproximateMemTableStats(const Range& range,
                                           uint64_t* const count,
                                           uint64_t* const size) {
    GetApproximateMemTableStats(DefaultColumnFamily(), range, count, size);
  }

  // Deprecated versions of GetApproximateSizes
  ROCKSDB_DEPRECATED_FUNC virtual void GetApproximateSizes(
      const Range* range, int n, uint64_t* sizes, bool include_memtable) {
    uint8_t include_flags = SizeApproximationFlags::INCLUDE_FILES;
    if (include_memtable) {
      include_flags |= SizeApproximationFlags::INCLUDE_MEMTABLES;
    }
    GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, include_flags);
  }
  ROCKSDB_DEPRECATED_FUNC virtual void GetApproximateSizes(
      ColumnFamilyHandle* column_family, const Range* range, int n,
      uint64_t* sizes, bool include_memtable) {
    uint8_t include_flags = SizeApproximationFlags::INCLUDE_FILES;
    if (include_memtable) {
      include_flags |= SizeApproximationFlags::INCLUDE_MEMTABLES;
    }
    GetApproximateSizes(column_family, range, n, sizes, include_flags);
  }

  // Compact the underlying storage for the key range [*begin,*end].
  // The actual compaction interval might be a superset of [*begin, *end].
  // In particular, deleted and overwritten versions are discarded,
  // and the data is rearranged to reduce the cost of operations
  // needed to access the data. This operation should typically only
  // be invoked by users who understand the underlying implementation.
  //
  // begin==nullptr is treated as a key before all keys in the database.
  // end==nullptr is treated as a key after all keys in the database.
  // Therefore the following call will compact the entire database:
  //    db->CompactRange(options, nullptr, nullptr);
  // Note that after the entire database is compacted, all data are pushed
  // down to the last level containing any data. If the total data size after
  // compaction is reduced, that level might not be appropriate for hosting
  // all the files. In this case, the client can set options.change_level to
  // true, to move the files back to the minimum level capable of holding the
  // data set, or to a given level (specified by non-negative
  // options.target_level).
  virtual Status CompactRange(const CompactRangeOptions& options,
                              ColumnFamilyHandle* column_family,
                              const Slice* begin, const Slice* end) = 0;
  virtual Status CompactRange(const CompactRangeOptions& options,
                              const Slice* begin, const Slice* end) {
    return CompactRange(options, DefaultColumnFamily(), begin, end);
  }
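
  // Illustrative usage sketch compacting the whole DB and letting RocksDB
  // move the result to an appropriate level, per the note above (assumes
  // "db" is an open DB*):
  //
  //   CompactRangeOptions cro;
  //   cro.change_level = true;
  //   Status s = db->CompactRange(cro, nullptr, nullptr);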

  ROCKSDB_DEPRECATED_FUNC virtual Status CompactRange(
      ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end,
      bool change_level = false, int target_level = -1,
      uint32_t target_path_id = 0) {
    CompactRangeOptions options;
    options.change_level = change_level;
    options.target_level = target_level;
    options.target_path_id = target_path_id;
    return CompactRange(options, column_family, begin, end);
  }
  ROCKSDB_DEPRECATED_FUNC virtual Status CompactRange(
      const Slice* begin, const Slice* end, bool change_level = false,
      int target_level = -1, uint32_t target_path_id = 0) {
    CompactRangeOptions options;
    options.change_level = change_level;
    options.target_level = target_level;
    options.target_path_id = target_path_id;
    return CompactRange(options, DefaultColumnFamily(), begin, end);
  }

  virtual Status SetOptions(
      ColumnFamilyHandle* /*column_family*/,
      const std::unordered_map<std::string, std::string>& /*new_options*/) {
    return Status::NotSupported("Not implemented");
  }
  virtual Status SetOptions(
      const std::unordered_map<std::string, std::string>& new_options) {
    return SetOptions(DefaultColumnFamily(), new_options);
  }

  virtual Status SetDBOptions(
      const std::unordered_map<std::string, std::string>& new_options) = 0;
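
  // Illustrative usage sketch (assumes "db" is an open DB*; the option names
  // below are examples of dynamically changeable options and are not defined
  // in this header). Options are passed as string key/value pairs:
  //
  //   Status s = db->SetOptions({{"disable_auto_compactions", "true"}});
  //   if (s.ok()) {
  //     s = db->SetDBOptions({{"max_background_jobs", "4"}});
  //   }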
|
|
|
|
|
|
|
|
// CompactFiles() inputs a list of files specified by file numbers and
|
|
|
|
// compacts them to the specified level. Note that the behavior is different
|
|
|
|
// from CompactRange() in that CompactFiles() performs the compaction job
|
|
|
|
// using the CURRENT thread.
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
10 years ago
|
|
|
//
|
|
|
|
// @see GetDataBaseMetaData
|
|
|
|
// @see GetColumnFamilyMetaData
|
|
|
|
virtual Status CompactFiles(
|
|
|
|
const CompactionOptions& compact_options,
|
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const std::vector<std::string>& input_file_names, const int output_level,
|
|
|
|
const int output_path_id = -1,
|
|
|
|
std::vector<std::string>* const output_file_names = nullptr,
|
|
|
|
CompactionJobInfo* compaction_job_info = nullptr) = 0;
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
10 years ago
|
|
|
|
|
|
|
  virtual Status CompactFiles(
      const CompactionOptions& compact_options,
      const std::vector<std::string>& input_file_names, const int output_level,
      const int output_path_id = -1,
      std::vector<std::string>* const output_file_names = nullptr,
      CompactionJobInfo* compaction_job_info = nullptr) {
    return CompactFiles(compact_options, DefaultColumnFamily(),
                        input_file_names, output_level, output_path_id,
                        output_file_names, compaction_job_info);
  }
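  //
  // Example (an illustrative sketch only, not part of the original header; it
  // assumes an open DB* named "db" and compacts the idle L0 files reported by
  // GetColumnFamilyMetaData into L1):
  //
  //   rocksdb::ColumnFamilyMetaData meta;
  //   db->GetColumnFamilyMetaData(&meta);
  //   std::vector<std::string> input_file_names;
  //   for (const auto& file : meta.levels[0].files) {
  //     if (!file.being_compacted) {
  //       input_file_names.push_back(file.name);
  //     }
  //   }
  //   if (!input_file_names.empty()) {
  //     rocksdb::Status s = db->CompactFiles(
  //         rocksdb::CompactionOptions(), input_file_names,
  //         1 /* output_level */);
  //   }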

  // This function will wait until all currently running background processes
  // finish. After it returns, no background process will be run until
  // ContinueBackgroundWork is called
  virtual Status PauseBackgroundWork() = 0;
  virtual Status ContinueBackgroundWork() = 0;
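  //
  // Example (an illustrative sketch only; assumes an open DB* named "db"):
  //
  //   db->PauseBackgroundWork();     // waits for running flushes/compactions
  //   ... // do work that must not race with background jobs
  //   db->ContinueBackgroundWork();  // resume background processing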

  // This function will enable automatic compactions for the given column
  // families if they were previously disabled. The function will first set the
  // disable_auto_compactions option for each column family to 'false', after
  // which it will schedule a flush/compaction.
  //
  // NOTE: Setting disable_auto_compactions to 'false' through SetOptions() API
  // does NOT schedule a flush/compaction afterwards, and only changes the
  // parameter itself within the column family option.
  //
  virtual Status EnableAutoCompaction(
      const std::vector<ColumnFamilyHandle*>& column_family_handles) = 0;
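  //
  // Example (an illustrative sketch only; assumes the DB was opened with
  // disable_auto_compactions=true and "handles" holds the column family
  // handles returned by DB::Open):
  //
  //   // After the initial bulk load finishes:
  //   rocksdb::Status s = db->EnableAutoCompaction(handles);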

  virtual void DisableManualCompaction() = 0;
  virtual void EnableManualCompaction() = 0;

  // Number of levels used for this DB.
  virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
  virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }

  // Maximum level to which a new compacted memtable is pushed if it
  // does not create overlap.
  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0;
  virtual int MaxMemCompactionLevel() {
    return MaxMemCompactionLevel(DefaultColumnFamily());
  }

  // Number of files in level-0 that would stop writes.
  virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0;
  virtual int Level0StopWriteTrigger() {
    return Level0StopWriteTrigger(DefaultColumnFamily());
  }

  // Get DB name -- the exact same name that was provided as an argument to
  // DB::Open()
  virtual const std::string& GetName() const = 0;

  // Get Env object from the DB
  virtual Env* GetEnv() const = 0;

  // Get DB Options that we use. During the process of opening the
  // column family, the options provided when calling DB::Open() or
  // DB::CreateColumnFamily() will have been "sanitized" and transformed
  // in an implementation-defined manner.
  virtual Options GetOptions(ColumnFamilyHandle* column_family) const = 0;
  virtual Options GetOptions() const {
    return GetOptions(DefaultColumnFamily());
  }

  virtual DBOptions GetDBOptions() const = 0;

  // Flush all mem-table data.
  // Flush a single column family, even when atomic flush is enabled. To flush
  // multiple column families, use Flush(options, column_families).
  virtual Status Flush(const FlushOptions& options,
                       ColumnFamilyHandle* column_family) = 0;
  virtual Status Flush(const FlushOptions& options) {
    return Flush(options, DefaultColumnFamily());
  }

  // Flushes multiple column families.
  // If atomic flush is not enabled, Flush(options, column_families) is
  // equivalent to calling Flush(options, column_family) multiple times.
  // If atomic flush is enabled, Flush(options, column_families) will flush all
  // column families specified in 'column_families' up to the latest sequence
  // number at the time when flush is requested.
  // Note that RocksDB 5.15 and earlier may not be able to open later versions
  // with atomic flush enabled.
  virtual Status Flush(
      const FlushOptions& options,
      const std::vector<ColumnFamilyHandle*>& column_families) = 0;
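  //
  // Example (an illustrative sketch only; assumes an open DB* named "db" and
  // column family handles "cf1" and "cf2"):
  //
  //   rocksdb::FlushOptions fo;
  //   fo.wait = true;             // block until the memtable flush completes
  //   db->Flush(fo);              // flush only the default column family
  //   db->Flush(fo, {cf1, cf2});  // flush both (atomically if atomic_flush
  //                               // was set in DBOptions)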

  // Flush the WAL memory buffer to the file. If sync is true, it calls SyncWAL
  // afterwards.
  virtual Status FlushWAL(bool /*sync*/) {
    return Status::NotSupported("FlushWAL not implemented");
  }

  // Sync the WAL. Note that Write() followed by SyncWAL() is not exactly the
  // same as Write() with sync=true: in the latter case the changes won't be
  // visible until the sync is done.
  // Currently only works if allow_mmap_writes = false in Options.
  virtual Status SyncWAL() = 0;
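  //
  // Example (an illustrative sketch only; assumes writes were issued with
  // WriteOptions().sync == false and manual_wal_flush == true in DBOptions):
  //
  //   db->Put(rocksdb::WriteOptions(), "key", "value");
  //   db->FlushWAL(true /* sync */);  // flush the WAL buffer, then fsync it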

  // Lock the WAL. Also flushes the WAL after locking.
  virtual Status LockWAL() {
    return Status::NotSupported("LockWAL not implemented");
  }

  // Unlock the WAL.
  virtual Status UnlockWAL() {
    return Status::NotSupported("UnlockWAL not implemented");
  }

  // The sequence number of the most recent transaction.
  virtual SequenceNumber GetLatestSequenceNumber() const = 0;

  // Instructs DB to preserve deletes with sequence numbers >= passed seqnum.
  // Has no effect if DBOptions.preserve_deletes is set to false.
  // This function assumes that the user calls it with monotonically increasing
  // seqnums (otherwise we can't guarantee that a particular delete hasn't been
  // already processed); returns true if the value was successfully updated,
  // false if the user attempted to call it with seqnum <= current value.
  virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) = 0;

#ifndef ROCKSDB_LITE
  // Prevent file deletions. Compactions will continue to occur,
  // but no obsolete files will be deleted. Calling this multiple
  // times has the same effect as calling it once.
  virtual Status DisableFileDeletions() = 0;

  // Allow compactions to delete obsolete files.
  // If force == true, the call to EnableFileDeletions() will guarantee that
  // file deletions are enabled after the call, even if DisableFileDeletions()
  // was called multiple times before.
  // If force == false, EnableFileDeletions will only enable file deletion
  // after it's been called at least as many times as DisableFileDeletions(),
  // enabling the two methods to be called by two threads concurrently without
  // synchronization -- i.e., file deletions will be enabled only after both
  // threads call EnableFileDeletions()
  virtual Status EnableFileDeletions(bool force = true) = 0;

  // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup

  // Retrieve the list of all files in the database. The files are
  // relative to the dbname and are not absolute paths. Despite being relative
  // paths, the file names begin with "/". The valid size of the manifest file
  // is returned in manifest_file_size. The manifest file is an ever growing
  // file, but only the portion specified by manifest_file_size is valid for
  // this snapshot. Setting flush_memtable to true does Flush before recording
  // the live files. Setting flush_memtable to false is useful when we don't
  // want to wait for flush which may have to wait for compaction to complete
  // taking an indeterminate time.
  //
  // In case you have multiple column families, even if flush_memtable is true,
  // you still need to call GetSortedWalFiles after GetLiveFiles to compensate
  // for new data that arrived to already-flushed column families while other
  // column families were flushing
  virtual Status GetLiveFiles(std::vector<std::string>&,
                              uint64_t* manifest_file_size,
                              bool flush_memtable = true) = 0;

  // Retrieve the sorted list of all wal files with earliest file first
  virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;
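  //
  // Example (an illustrative sketch of the lossless-backup sequence described
  // above; assumes an open DB* named "db"):
  //
  //   db->DisableFileDeletions();  // keep live files stable while copying
  //   std::vector<std::string> live_files;
  //   uint64_t manifest_file_size = 0;
  //   db->GetLiveFiles(live_files, &manifest_file_size,
  //                    true /* flush_memtable */);
  //   rocksdb::VectorLogPtr wal_files;
  //   db->GetSortedWalFiles(wal_files);
  //   ... // copy live_files (manifest truncated to manifest_file_size)
  //   ... // and wal_files to the backup location
  //   db->EnableFileDeletions(false /* force */);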

  // Retrieve information about the current wal file
  //
  // Note that the log might have rolled after this call in which case
  // the current_log_file would not point to the current log file.
  //
  // Additionally, for the sake of optimization current_log_file->StartSequence
  // would always be set to 0
  virtual Status GetCurrentWalFile(
      std::unique_ptr<LogFile>* current_log_file) = 0;

  // Retrieves the creation time of the oldest file in the DB.
  // This API only works if max_open_files = -1; if it is not, the
  // Status returned is Status::NotSupported().
  // The file creation time is set using the env provided to the DB.
  // If the DB was created from a very old release then it's possible that
  // the SST files might not have the file_creation_time property, and even
  // after moving to a newer release it's possible that some files never got
  // compacted and may not have the file_creation_time property. In both cases
  // file_creation_time is considered 0, which means this API will return
  // creation_time = 0 as there wouldn't be a timestamp lower than 0.
  virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) = 0;

  // Note: this API is not yet consistent with WritePrepared transactions.
  // Sets iter to an iterator that is positioned at a write-batch containing
  // seq_number. If the sequence number is nonexistent, it returns an iterator
  // at the first available seq_no after the requested seq_no.
  // Returns Status::OK if iterator is valid.
  // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to
  // use this API, else the WAL files will get
  // cleared aggressively and the iterator might keep getting invalid before
  // an update is read.
  virtual Status GetUpdatesSince(
      SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
      const TransactionLogIterator::ReadOptions& read_options =
          TransactionLogIterator::ReadOptions()) = 0;
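  //
  // Example (an illustrative sketch only; assumes a SequenceNumber named
  // "since" and tails all updates made after it):
  //
  //   std::unique_ptr<rocksdb::TransactionLogIterator> iter;
  //   rocksdb::Status s = db->GetUpdatesSince(since, &iter);
  //   while (s.ok() && iter->Valid()) {
  //     rocksdb::BatchResult batch = iter->GetBatch();
  //     ... // apply batch.writeBatchPtr, starting at batch.sequence
  //     iter->Next();
  //   }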

  // Windows API macro interference
#undef DeleteFile
  // Delete the file name from the db directory and update the internal state
  // to reflect that. Supports deletion of sst and log files only. 'name' must
  // be path relative to the db directory. eg. 000001.sst, /archive/000003.log
  virtual Status DeleteFile(std::string name) = 0;

  // Returns a list of all table files with their level, start key
  // and end key
  virtual void GetLiveFilesMetaData(
      std::vector<LiveFileMetaData>* /*metadata*/) {}

  // Obtains the meta data of the specified column family of the DB.
  virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/,
                                       ColumnFamilyMetaData* /*metadata*/) {}

  // Get the metadata of the default column family.
  void GetColumnFamilyMetaData(ColumnFamilyMetaData* metadata) {
    GetColumnFamilyMetaData(DefaultColumnFamily(), metadata);
  }
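  //
  // Example (an illustrative sketch only; prints per-level file counts for
  // the default column family of an open DB* named "db"):
  //
  //   rocksdb::ColumnFamilyMetaData meta;
  //   db->GetColumnFamilyMetaData(&meta);
  //   for (const auto& level : meta.levels) {
  //     printf("level %d: %zu files, %llu bytes\n", level.level,
  //            level.files.size(), (unsigned long long)level.size);
  //   }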

  // IngestExternalFile() will load a list of external SST files (1) into the
  // DB. Two primary modes are supported:
  // - Duplicate keys in the new files will overwrite existing keys (default)
  // - Duplicate keys will be skipped (set ingest_behind=true)
  // In the first mode we will try to find the lowest possible level that
  // the file can fit in, and ingest the file into this level (2). A file that
  // has a key range that overlaps with the memtable key range will require us
  // to Flush the memtable first before ingesting the file.
  // In the second mode we will always ingest in the bottommost level (see
  // docs to IngestExternalFileOptions::ingest_behind).
  //
  // (1) External SST files can be created using SstFileWriter
  // (2) We will try to ingest the files to the lowest possible level
  //     even if the file compression doesn't match the level compression
  // (3) If IngestExternalFileOptions->ingest_behind is set to true,
  //     we always ingest at the bottommost level, which should be reserved
  //     for this purpose (see DBOptions::allow_ingest_behind flag).
  virtual Status IngestExternalFile(
      ColumnFamilyHandle* column_family,
      const std::vector<std::string>& external_files,
      const IngestExternalFileOptions& options) = 0;

  virtual Status IngestExternalFile(
      const std::vector<std::string>& external_files,
      const IngestExternalFileOptions& options) {
    return IngestExternalFile(DefaultColumnFamily(), external_files, options);
  }
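  //
  // Example (an illustrative sketch only; builds one external SST file with
  // SstFileWriter and ingests it into the default column family; the path
  // "/tmp/file1.sst" is made up for illustration):
  //
  //   rocksdb::Options options;
  //   rocksdb::SstFileWriter writer(rocksdb::EnvOptions(), options);
  //   writer.Open("/tmp/file1.sst");
  //   writer.Put("key1", "value1");  // keys must be added in sorted order
  //   writer.Put("key2", "value2");
  //   writer.Finish();
  //   rocksdb::IngestExternalFileOptions ifo;
  //   ifo.move_files = true;  // link/move instead of copying
  //   db->IngestExternalFile({"/tmp/file1.sst"}, ifo);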

  // IngestExternalFiles() will ingest files for multiple column families, and
  // record the result atomically to the MANIFEST.
  // If this function returns OK, all column families' ingestion must succeed.
  // If this function returns a non-OK status, or the process crashes, then
  // none of the files will be ingested into the database after recovery.
  // Note that it is possible for the application to observe a mixed state
  // during the execution of this function. If the user performs range scan
  // over the column families with iterators, an iterator on one column family
  // may return ingested data, while an iterator on another column family
  // returns old data. Users can use snapshot for a consistent view of data.
  // If your db ingests multiple SST files using this API, i.e. args.size()
  // > 1, then RocksDB 5.15 and earlier will not be able to open it.
  //
  // REQUIRES: each arg corresponds to a different column family: namely, for
  // 0 <= i < j < len(args), args[i].column_family != args[j].column_family.
  virtual Status IngestExternalFiles(
      const std::vector<IngestExternalFileArg>& args) = 0;
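  //
  // Example (an illustrative sketch only; atomically ingests one file into
  // each of two column families "cf1" and "cf2"; the file names are made up):
  //
  //   rocksdb::IngestExternalFileArg arg1, arg2;
  //   arg1.column_family = cf1;
  //   arg1.external_files = {"/tmp/cf1_file.sst"};
  //   arg2.column_family = cf2;
  //   arg2.external_files = {"/tmp/cf2_file.sst"};
  //   rocksdb::Status s = db->IngestExternalFiles({arg1, arg2});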

  // CreateColumnFamilyWithImport() will create a new column family with
  // column_family_name and import external SST files specified in metadata
  // into this column family.
  // (1) External SST files can be created using SstFileWriter.
  // (2) External SST files can be exported from a particular column family in
  //     an existing DB.
  // Option in import_options specifies whether the external files are copied
  // or moved (default is copy). When the option specifies copy, managing files
  // at external_file_path is the caller's responsibility. When the option
  // specifies a move, the call ensures that the specified files at
  // external_file_path are deleted on successful return and files are not
  // modified on any error return.
  // On error return, the column family handle returned will be nullptr.
  // The ColumnFamily will be present on successful return and will not be
  // present on error return. The ColumnFamily may be present on any crash
  // during this call.
  virtual Status CreateColumnFamilyWithImport(
      const ColumnFamilyOptions& options, const std::string& column_family_name,
      const ImportColumnFamilyOptions& import_options,
      const ExportImportFilesMetaData& metadata,
      ColumnFamilyHandle** handle) = 0;
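  //
  // Example (an illustrative sketch only; exports a column family from a
  // source DB* "db" via the Checkpoint utility's ExportColumnFamily(), then
  // imports it into a destination DB* "db2"; "src_handle" and the export
  // directory are made up):
  //
  //   rocksdb::Checkpoint* checkpoint = nullptr;
  //   rocksdb::Checkpoint::Create(db, &checkpoint);
  //   rocksdb::ExportImportFilesMetaData* metadata = nullptr;
  //   checkpoint->ExportColumnFamily(src_handle, "/backup/cf_export",
  //                                  &metadata);
  //   rocksdb::ColumnFamilyHandle* new_handle = nullptr;
  //   db2->CreateColumnFamilyWithImport(rocksdb::ColumnFamilyOptions(),
  //                                     "imported_cf",
  //                                     rocksdb::ImportColumnFamilyOptions(),
  //                                     *metadata, &new_handle);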

  virtual Status VerifyChecksum(const ReadOptions& read_options) = 0;

  virtual Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); }

  // AddFile() is deprecated, please use IngestExternalFile()
  ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
      ColumnFamilyHandle* column_family,
      const std::vector<std::string>& file_path_list, bool move_file = false,
      bool skip_snapshot_check = false) {
    IngestExternalFileOptions ifo;
    ifo.move_files = move_file;
    ifo.snapshot_consistency = !skip_snapshot_check;
    ifo.allow_global_seqno = false;
    ifo.allow_blocking_flush = false;
    return IngestExternalFile(column_family, file_path_list, ifo);
  }

  ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
      const std::vector<std::string>& file_path_list, bool move_file = false,
      bool skip_snapshot_check = false) {
    IngestExternalFileOptions ifo;
    ifo.move_files = move_file;
    ifo.snapshot_consistency = !skip_snapshot_check;
    ifo.allow_global_seqno = false;
    ifo.allow_blocking_flush = false;
    return IngestExternalFile(DefaultColumnFamily(), file_path_list, ifo);
  }

  // AddFile() is deprecated, please use IngestExternalFile()
  ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
      ColumnFamilyHandle* column_family, const std::string& file_path,
      bool move_file = false, bool skip_snapshot_check = false) {
    IngestExternalFileOptions ifo;
    ifo.move_files = move_file;
    ifo.snapshot_consistency = !skip_snapshot_check;
    ifo.allow_global_seqno = false;
    ifo.allow_blocking_flush = false;
    return IngestExternalFile(column_family, {file_path}, ifo);
  }

  ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
      const std::string& file_path, bool move_file = false,
      bool skip_snapshot_check = false) {
    IngestExternalFileOptions ifo;
    ifo.move_files = move_file;
    ifo.snapshot_consistency = !skip_snapshot_check;
    ifo.allow_global_seqno = false;
    ifo.allow_blocking_flush = false;
    return IngestExternalFile(DefaultColumnFamily(), {file_path}, ifo);
  }

  // Load table file with information "file_info" into "column_family"
  ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
      ColumnFamilyHandle* column_family,
      const std::vector<ExternalSstFileInfo>& file_info_list,
      bool move_file = false, bool skip_snapshot_check = false) {
    std::vector<std::string> external_files;
    for (const ExternalSstFileInfo& file_info : file_info_list) {
      external_files.push_back(file_info.file_path);
    }
    IngestExternalFileOptions ifo;
    ifo.move_files = move_file;
    ifo.snapshot_consistency = !skip_snapshot_check;
    ifo.allow_global_seqno = false;
    ifo.allow_blocking_flush = false;
    return IngestExternalFile(column_family, external_files, ifo);
  }

  ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
      const std::vector<ExternalSstFileInfo>& file_info_list,
      bool move_file = false, bool skip_snapshot_check = false) {
    std::vector<std::string> external_files;
    for (const ExternalSstFileInfo& file_info : file_info_list) {
      external_files.push_back(file_info.file_path);
    }
    IngestExternalFileOptions ifo;
    ifo.move_files = move_file;
    ifo.snapshot_consistency = !skip_snapshot_check;
    ifo.allow_global_seqno = false;
    ifo.allow_blocking_flush = false;
    return IngestExternalFile(DefaultColumnFamily(), external_files, ifo);
  }

  ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
      ColumnFamilyHandle* column_family, const ExternalSstFileInfo* file_info,
      bool move_file = false, bool skip_snapshot_check = false) {
    IngestExternalFileOptions ifo;
    ifo.move_files = move_file;
    ifo.snapshot_consistency = !skip_snapshot_check;
    ifo.allow_global_seqno = false;
    ifo.allow_blocking_flush = false;
    return IngestExternalFile(column_family, {file_info->file_path}, ifo);
  }

  ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
      const ExternalSstFileInfo* file_info, bool move_file = false,
      bool skip_snapshot_check = false) {
    IngestExternalFileOptions ifo;
    ifo.move_files = move_file;
    ifo.snapshot_consistency = !skip_snapshot_check;
    ifo.allow_global_seqno = false;
    ifo.allow_blocking_flush = false;
    return IngestExternalFile(DefaultColumnFamily(), {file_info->file_path},
                              ifo);
  }

#endif  // ROCKSDB_LITE

  // Sets the globally unique ID created at database creation time by invoking
  // Env::GenerateUniqueId(), in identity. Returns Status::OK if identity could
  // be set properly
  virtual Status GetDbIdentity(std::string& identity) const = 0;

  // Returns default column family handle
  virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0;

#ifndef ROCKSDB_LITE
  virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
                                          TablePropertiesCollection* props) = 0;
  virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) {
    return GetPropertiesOfAllTables(DefaultColumnFamily(), props);
  }
  virtual Status GetPropertiesOfTablesInRange(
      ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
      TablePropertiesCollection* props) = 0;
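  //
  // Example (an illustrative sketch only; sums entry counts across all live
  // table files of the default column family):
  //
  //   rocksdb::TablePropertiesCollection props;
  //   db->GetPropertiesOfAllTables(&props);
  //   uint64_t num_entries = 0;
  //   for (const auto& entry : props) {
  //     // entry.first is the file name, entry.second the TableProperties
  //     num_entries += entry.second->num_entries;
  //   }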

  virtual Status SuggestCompactRange(ColumnFamilyHandle* /*column_family*/,
                                     const Slice* /*begin*/,
                                     const Slice* /*end*/) {
    return Status::NotSupported("SuggestCompactRange() is not implemented.");
  }

  virtual Status PromoteL0(ColumnFamilyHandle* /*column_family*/,
                           int /*target_level*/) {
    return Status::NotSupported("PromoteL0() is not implemented.");
  }

  // Trace DB operations. Use EndTrace() to stop tracing.
  virtual Status StartTrace(const TraceOptions& /*options*/,
                            std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
    return Status::NotSupported("StartTrace() is not implemented.");
  }

  virtual Status EndTrace() {
    return Status::NotSupported("EndTrace() is not implemented.");
  }

  // Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing.
  virtual Status StartBlockCacheTrace(
      const TraceOptions& /*options*/,
      std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
    return Status::NotSupported("StartBlockCacheTrace() is not implemented.");
  }

  virtual Status EndBlockCacheTrace() {
    return Status::NotSupported("EndBlockCacheTrace() is not implemented.");
  }
#endif  // ROCKSDB_LITE

  // Needed for StackableDB
  virtual DB* GetRootDB() { return this; }

  // Given a window [start_time, end_time), set up a StatsHistoryIterator
  // to access stats history. Note the start_time and end_time are epoch
  // time measured in seconds, and end_time is an exclusive bound.
  virtual Status GetStatsHistory(
      uint64_t /*start_time*/, uint64_t /*end_time*/,
      std::unique_ptr<StatsHistoryIterator>* /*stats_iterator*/) {
    return Status::NotSupported("GetStatsHistory() is not implemented.");
  }
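  //
  // Example (an illustrative sketch only; walks persisted stats snapshots in
  // [start, end), where "start" and "end" are epoch seconds chosen by the
  // caller; requires stats_persist_period_sec > 0 in DBOptions):
  //
  //   std::unique_ptr<rocksdb::StatsHistoryIterator> it;
  //   rocksdb::Status s = db->GetStatsHistory(start, end, &it);
  //   while (s.ok() && it->Valid()) {
  //     uint64_t when = it->GetStatsTime();
  //     const std::map<std::string, uint64_t>& stats = it->GetStatsMap();
  //     ... // consume the snapshot taken at "when"
  //     it->Next();
  //   }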

#ifndef ROCKSDB_LITE
  // Make the secondary instance catch up with the primary by tailing and
  // replaying the MANIFEST and WAL of the primary.
  // Column families created by the primary after the secondary instance starts
  // will be ignored unless the secondary instance closes and restarts with the
  // newly created column families.
  // Column families that exist before the secondary instance starts and are
  // dropped by the primary afterwards will be marked as dropped. However, as
  // long as the secondary instance does not delete the corresponding column
  // family handles, the data of the column family is still accessible to the
  // secondary.
  // TODO: we will support WAL tailing soon.
  virtual Status TryCatchUpWithPrimary() {
    return Status::NotSupported("Supported only by secondary instance");
  }
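  //
  // Example (an illustrative sketch only; opens a read-only secondary
  // instance against a primary's db directory and periodically catches up;
  // the paths are made up):
  //
  //   rocksdb::DB* secondary = nullptr;
  //   rocksdb::Status s = rocksdb::DB::OpenAsSecondary(
  //       rocksdb::Options(), "/primary/db", "/secondary/scratch",
  //       &secondary);
  //   while (s.ok()) {
  //     s = secondary->TryCatchUpWithPrimary();  // replay new MANIFEST state
  //     ... // serve reads from "secondary", then sleep and repeat
  //   }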
#endif  // !ROCKSDB_LITE
};

// Destroy the contents of the specified database.
// Be very careful using this method.
Status DestroyDB(const std::string& name, const Options& options,
                 const std::vector<ColumnFamilyDescriptor>& column_families =
                     std::vector<ColumnFamilyDescriptor>());

#ifndef ROCKSDB_LITE
// If a DB cannot be opened, you may attempt to call this method to
// resurrect as much of the contents of the database as possible.
// Some data may be lost, so be careful when calling this function
// on a database that contains important information.
//
// With this API, we will warn and skip data associated with column families
// not specified in column_families.
//
// @param column_families Descriptors for known column families
Status RepairDB(const std::string& dbname, const DBOptions& db_options,
                const std::vector<ColumnFamilyDescriptor>& column_families);

// @param unknown_cf_opts Options for column families encountered during the
// repair that were not specified in column_families.
Status RepairDB(const std::string& dbname, const DBOptions& db_options,
                const std::vector<ColumnFamilyDescriptor>& column_families,
                const ColumnFamilyOptions& unknown_cf_opts);

// @param options These options will be used for the database and for ALL
// column families encountered during the repair
Status RepairDB(const std::string& dbname, const Options& options);
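//
// Example (an illustrative sketch only; attempts repair with one set of
// options for the DB and all column families, then reopens; the path is made
// up):
//
//   rocksdb::Options options;
//   rocksdb::Status s = rocksdb::RepairDB("/path/to/db", options);
//   if (s.ok()) {
//     rocksdb::DB* db = nullptr;
//     s = rocksdb::DB::Open(options, "/path/to/db", &db);
//   }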

#endif  // ROCKSDB_LITE

}  // namespace rocksdb