|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
|
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include "db/db_impl/db_impl.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
|
|
|
|
// TODO: Share common structure with CompactedDBImpl and DBImplSecondary
|
|
|
|
class DBImplReadOnly : public DBImpl {
|
|
|
|
public:
|
|
|
|
DBImplReadOnly(const DBOptions& options, const std::string& dbname);
|
|
|
|
// No copying allowed
|
|
|
|
DBImplReadOnly(const DBImplReadOnly&) = delete;
|
|
|
|
void operator=(const DBImplReadOnly&) = delete;
|
|
|
|
|
|
|
|
virtual ~DBImplReadOnly();
|
|
|
|
|
|
|
|
// Implementations of the DB interface
|
|
|
|
using DB::Get;
|
|
|
|
virtual Status Get(const ReadOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
PinnableSlice* value) override;
|
|
|
|
Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, PinnableSlice* value,
|
|
|
|
std::string* timestamp) override;
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
11 years ago
|
|
|
|
|
|
|
// TODO: Implement ReadOnly MultiGet?
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
11 years ago
|
|
|
|
|
|
|
using DBImpl::NewIterator;
|
|
|
|
virtual Iterator* NewIterator(const ReadOptions&,
|
|
|
|
ColumnFamilyHandle* column_family) override;
|
|
|
|
|
|
|
|
virtual Status NewIterators(
|
|
|
|
const ReadOptions& options,
|
|
|
|
const std::vector<ColumnFamilyHandle*>& column_families,
|
|
|
|
std::vector<Iterator*>* iterators) override;
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
11 years ago
|
|
|
|
|
|
|
using DBImpl::Put;
|
|
|
|
virtual Status Put(const WriteOptions& /*options*/,
|
|
|
|
ColumnFamilyHandle* /*column_family*/,
|
|
|
|
const Slice& /*key*/, const Slice& /*value*/) override {
|
|
|
|
return Status::NotSupported("Not supported operation in read only mode.");
|
|
|
|
}
|
|
|
|
|
|
|
|
using DBImpl::PutEntity;
|
|
|
|
Status PutEntity(const WriteOptions& /* options */,
|
|
|
|
ColumnFamilyHandle* /* column_family */,
|
|
|
|
const Slice& /* key */,
|
|
|
|
const WideColumns& /* columns */) override {
|
|
|
|
return Status::NotSupported("Not supported operation in read only mode.");
|
|
|
|
}
|
|
|
|
|
|
|
|
using DBImpl::Merge;
|
|
|
|
virtual Status Merge(const WriteOptions& /*options*/,
|
|
|
|
ColumnFamilyHandle* /*column_family*/,
|
|
|
|
const Slice& /*key*/, const Slice& /*value*/) override {
|
|
|
|
return Status::NotSupported("Not supported operation in read only mode.");
|
|
|
|
}
|
|
|
|
using DBImpl::Delete;
|
|
|
|
virtual Status Delete(const WriteOptions& /*options*/,
|
|
|
|
ColumnFamilyHandle* /*column_family*/,
|
|
|
|
const Slice& /*key*/) override {
|
|
|
|
return Status::NotSupported("Not supported operation in read only mode.");
|
|
|
|
}
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
9 years ago
|
|
|
using DBImpl::SingleDelete;
|
|
|
|
virtual Status SingleDelete(const WriteOptions& /*options*/,
|
|
|
|
ColumnFamilyHandle* /*column_family*/,
|
|
|
|
const Slice& /*key*/) override {
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
9 years ago
|
|
|
return Status::NotSupported("Not supported operation in read only mode.");
|
|
|
|
}
|
|
|
|
virtual Status Write(const WriteOptions& /*options*/,
|
|
|
|
WriteBatch* /*updates*/) override {
|
|
|
|
return Status::NotSupported("Not supported operation in read only mode.");
|
|
|
|
}
|
|
|
|
using DBImpl::CompactRange;
|
|
|
|
virtual Status CompactRange(const CompactRangeOptions& /*options*/,
|
|
|
|
ColumnFamilyHandle* /*column_family*/,
|
|
|
|
const Slice* /*begin*/,
|
|
|
|
const Slice* /*end*/) override {
|
|
|
|
return Status::NotSupported("Not supported operation in read only mode.");
|
|
|
|
}
|
|
|
|
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
10 years ago
|
|
|
using DBImpl::CompactFiles;
|
|
|
|
virtual Status CompactFiles(
|
|
|
|
const CompactionOptions& /*compact_options*/,
|
|
|
|
ColumnFamilyHandle* /*column_family*/,
|
|
|
|
const std::vector<std::string>& /*input_file_names*/,
|
|
|
|
const int /*output_level*/, const int /*output_path_id*/ = -1,
|
|
|
|
std::vector<std::string>* const /*output_file_names*/ = nullptr,
|
|
|
|
CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
10 years ago
|
|
|
return Status::NotSupported("Not supported operation in read only mode.");
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual Status DisableFileDeletions() override {
|
|
|
|
return Status::NotSupported("Not supported operation in read only mode.");
|
|
|
|
}
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
10 years ago
|
|
|
|
|
|
|
virtual Status EnableFileDeletions(bool /*force*/) override {
|
|
|
|
return Status::NotSupported("Not supported operation in read only mode.");
|
|
|
|
}
|
|
|
|
virtual Status GetLiveFiles(std::vector<std::string>& ret,
|
|
|
|
uint64_t* manifest_file_size,
|
|
|
|
bool /*flush_memtable*/) override {
|
|
|
|
return DBImpl::GetLiveFiles(ret, manifest_file_size,
|
|
|
|
false /* flush_memtable */);
|
|
|
|
}
|
|
|
|
|
|
|
|
using DBImpl::Flush;
|
|
|
|
virtual Status Flush(const FlushOptions& /*options*/,
|
|
|
|
ColumnFamilyHandle* /*column_family*/) override {
|
|
|
|
return Status::NotSupported("Not supported operation in read only mode.");
|
|
|
|
}
|
|
|
|
|
|
|
|
using DBImpl::SyncWAL;
|
|
|
|
virtual Status SyncWAL() override {
|
|
|
|
return Status::NotSupported("Not supported operation in read only mode.");
|
|
|
|
}
|
|
|
|
|
|
|
|
using DB::IngestExternalFile;
|
|
|
|
virtual Status IngestExternalFile(
|
|
|
|
ColumnFamilyHandle* /*column_family*/,
|
|
|
|
const std::vector<std::string>& /*external_files*/,
|
|
|
|
const IngestExternalFileOptions& /*ingestion_options*/) override {
|
|
|
|
return Status::NotSupported("Not supported operation in read only mode.");
|
|
|
|
}
|
|
|
|
|
Export Import sst files (#5495)
Summary:
Refresh of the earlier change here - https://github.com/facebook/rocksdb/issues/5135
This is a review request for code change needed for - https://github.com/facebook/rocksdb/issues/3469
"Add support for taking snapshot of a column family and creating column family from a given CF snapshot"
We have an implementation for this that we have been testing internally. We have two new APIs that together provide this functionality.
(1) ExportColumnFamily() - This API is modelled after CreateCheckpoint() as below.
// Exports all live SST files of a specified Column Family onto export_dir,
// returning SST files information in metadata.
// - SST files will be created as hard links when the directory specified
// is in the same partition as the db directory, copied otherwise.
// - export_dir should not already exist and will be created by this API.
// - Always triggers a flush.
virtual Status ExportColumnFamily(ColumnFamilyHandle* handle,
const std::string& export_dir,
ExportImportFilesMetaData** metadata);
Internally, the API will DisableFileDeletions(), GetColumnFamilyMetaData(), Parse through
metadata, creating links/copies of all the sst files, EnableFileDeletions() and complete the call by
returning the list of file metadata.
(2) CreateColumnFamilyWithImport() - This API is modeled after IngestExternalFile(), but invoked only during a CF creation as below.
// CreateColumnFamilyWithImport() will create a new column family with
// column_family_name and import external SST files specified in metadata into
// this column family.
// (1) External SST files can be created using SstFileWriter.
// (2) External SST files can be exported from a particular column family in
// an existing DB.
// Option in import_options specifies whether the external files are copied or
// moved (default is copy). When option specifies copy, managing files at
// external_file_path is caller's responsibility. When option specifies a
// move, the call ensures that the specified files at external_file_path are
// deleted on successful return and files are not modified on any error
// return.
// On error return, column family handle returned will be nullptr.
// ColumnFamily will be present on successful return and will not be present
// on error return. ColumnFamily may be present on any crash during this call.
virtual Status CreateColumnFamilyWithImport(
const ColumnFamilyOptions& options, const std::string& column_family_name,
const ImportColumnFamilyOptions& import_options,
const ExportImportFilesMetaData& metadata,
ColumnFamilyHandle** handle);
Internally, this API creates a new CF, parses all the sst files and adds it to the specified column family, at the same level and with same sequence number as in the metadata. Also performs safety checks with respect to overlaps between the sst files being imported.
If incoming sequence number is higher than current local sequence number, local sequence
number is updated to reflect this.
Note, as the sst files is are being moved across Column Families, Column Family name in sst file
will no longer match the actual column family on destination DB. The API does not modify Column
Family name or id in the sst files being imported.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5495
Differential Revision: D16018881
fbshipit-source-id: 9ae2251025d5916d35a9fc4ea4d6707f6be16ff9
5 years ago
|
|
|
using DB::CreateColumnFamilyWithImport;
|
|
|
|
virtual Status CreateColumnFamilyWithImport(
|
|
|
|
const ColumnFamilyOptions& /*options*/,
|
|
|
|
const std::string& /*column_family_name*/,
|
|
|
|
const ImportColumnFamilyOptions& /*import_options*/,
|
|
|
|
const ExportImportFilesMetaData& /*metadata*/,
|
|
|
|
ColumnFamilyHandle** /*handle*/) override {
|
|
|
|
return Status::NotSupported("Not supported operation in read only mode.");
|
|
|
|
}
|
|
|
|
|
|
|
|
// FIXME: some missing overrides for more "write" functions
|
|
|
|
|
|
|
|
protected:
|
|
|
|
Status FlushForGetLiveFiles() override {
|
|
|
|
// No-op for read-only DB
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
// A "helper" function for DB::OpenForReadOnly without column families
|
|
|
|
// to reduce unnecessary I/O
|
|
|
|
// It has the same functionality as DB::OpenForReadOnly with column families
|
|
|
|
// but does not check the existence of dbname in the file system
|
|
|
|
static Status OpenForReadOnlyWithoutCheck(
|
|
|
|
const DBOptions& db_options, const std::string& dbname,
|
|
|
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
|
|
|
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
|
|
|
|
bool error_if_wal_file_exists = false);
|
|
|
|
friend class DB;
|
|
|
|
};
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|
|
|