Move RADOS support to separate repo (#9206)

Summary:
This PR moves RADOS support from the RocksDB repo to a separate repo. The new (temporary?) repo
in this PR serves as an example until we finalize the decision on where RADOS support will be hosted and by whom. At this point,
people can start from the example repo and fork it.

The goal is to include this commit in RocksDB 7.0 release.

Reference:
https://github.com/ajkr/dedupfs by ajkr

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9206

Test Plan:
Follow instructions in https://github.com/riversand963/rocksdb-rados-env/blob/main/README.md and build
test binary `env_librados_test` and run it.

Also, make check

Reviewed By: ajkr

Differential Revision: D33751690

Pulled By: riversand963

fbshipit-source-id: 30466c62afa9e4619847a48567ed158e62835e35
main
Yanqin Jin 3 years ago committed by Facebook GitHub Bot
parent 5d30668cab
commit fa52376117
  1. 21
      .circleci/config.yml
  2. 13
      CMakeLists.txt
  3. 1
      HISTORY.md
  4. 13
      Makefile
  5. 1
      PLUGINS.md
  6. 174
      include/rocksdb/utilities/env_librados.h
  7. 1467
      utilities/env_librados.cc
  8. 122
      utilities/env_librados.md
  9. 1152
      utilities/env_librados_test.cc

@ -106,13 +106,6 @@ commands:
command: | command: |
sudo apt-get update -y && sudo apt-get install -y libbenchmark-dev sudo apt-get update -y && sudo apt-get install -y libbenchmark-dev
install-librados:
steps:
- run:
name: Install librados
command: |
sudo apt-get update -y && sudo apt-get install -y librados-dev
upgrade-cmake: upgrade-cmake:
steps: steps:
- run: - run:
@ -186,17 +179,6 @@ jobs:
- run: make V=1 J=32 -j32 check 2>&1 | .circleci/cat_ignore_eagain - run: make V=1 J=32 -j32 check 2>&1 | .circleci/cat_ignore_eagain
- post-steps - post-steps
build-linux-mem-env-librados:
machine:
image: ubuntu-1604:202104-01
resource_class: 2xlarge
steps:
- pre-steps
- install-gflags
- install-librados
- run: MEM_ENV=1 ROCKSDB_USE_LIBRADOS=1 make V=1 J=32 -j32 check 2>&1 | .circleci/cat_ignore_eagain
- post-steps
build-linux-encrypted-env: build-linux-encrypted-env:
machine: machine:
image: ubuntu-1604:202104-01 image: ubuntu-1604:202104-01
@ -759,9 +741,6 @@ workflows:
jobs: jobs:
- build-linux-cmake - build-linux-cmake
- build-linux-cmake-ubuntu-20 - build-linux-cmake-ubuntu-20
build-linux-mem-env-librados:
jobs:
- build-linux-mem-env-librados
build-linux-encrypted-env: build-linux-encrypted-env:
jobs: jobs:
- build-linux-encrypted-env - build-linux-encrypted-env

@ -1018,12 +1018,6 @@ set(ROCKSDB_SHARED_LIB rocksdb-shared${ARTIFACT_SUFFIX})
option(ROCKSDB_BUILD_SHARED "Build shared versions of the RocksDB libraries" ON) option(ROCKSDB_BUILD_SHARED "Build shared versions of the RocksDB libraries" ON)
# Optional librados support (off by default). When WITH_LIBRADOS is enabled,
# the librados Env source file is appended to SOURCES and `rados` is appended
# to THIRDPARTY_LIBS.
option(WITH_LIBRADOS "Build with librados" OFF)
if(WITH_LIBRADOS)
  list(APPEND SOURCES utilities/env_librados.cc)
  list(APPEND THIRDPARTY_LIBS rados)
endif()
if(WIN32) if(WIN32)
set(SYSTEM_LIBS ${SYSTEM_LIBS} shlwapi.lib rpcrt4.lib) set(SYSTEM_LIBS ${SYSTEM_LIBS} shlwapi.lib rpcrt4.lib)
@ -1349,9 +1343,6 @@ if(WITH_TESTS)
utilities/write_batch_with_index/write_batch_with_index_test.cc utilities/write_batch_with_index/write_batch_with_index_test.cc
) )
endif() endif()
if(WITH_LIBRADOS)
list(APPEND TESTS utilities/env_librados_test.cc)
endif()
if(WITH_FOLLY_DISTRIBUTED_MUTEX) if(WITH_FOLLY_DISTRIBUTED_MUTEX)
list(APPEND TESTS third-party/folly/folly/synchronization/test/DistributedMutexTest.cpp) list(APPEND TESTS third-party/folly/folly/synchronization/test/DistributedMutexTest.cpp)
@ -1391,10 +1382,6 @@ if(WITH_TESTS)
gtest_discover_tests(${exename} DISCOVERY_TIMEOUT 120) gtest_discover_tests(${exename} DISCOVERY_TIMEOUT 120)
add_dependencies(check ${exename}${ARTIFACT_SUFFIX}) add_dependencies(check ${exename}${ARTIFACT_SUFFIX})
endif() endif()
if("${exename}" MATCHES "env_librados_test")
# env_librados_test.cc uses librados directly
target_link_libraries(${exename}${ARTIFACT_SUFFIX} rados)
endif()
endforeach(sourcefile ${TESTS}) endforeach(sourcefile ${TESTS})
if(WIN32) if(WIN32)

@ -2,6 +2,7 @@
## Unreleased ## Unreleased
### Public API changes ### Public API changes
* Remove HDFS support from main repo. * Remove HDFS support from main repo.
* Remove librados support from main repo.
## 6.29.0 (01/21/2022) ## 6.29.0 (01/21/2022)
Note: The next release will be major release 7.0. See https://github.com/facebook/rocksdb/issues/9390 for more info. Note: The next release will be major release 7.0. See https://github.com/facebook/rocksdb/issues/9390 for more info.

@ -214,12 +214,6 @@ am__v_AR_ = $(am__v_AR_$(AM_DEFAULT_VERBOSITY))
am__v_AR_0 = @echo " AR " $@; am__v_AR_0 = @echo " AR " $@;
am__v_AR_1 = am__v_AR_1 =
# Optional librados Env support. When ROCKSDB_USE_LIBRADOS has a non-empty
# value, add the Env implementation to LIB_SOURCES, add its test source to
# TEST_MAIN_SOURCES, and append -lrados to LDFLAGS.
ifdef ROCKSDB_USE_LIBRADOS
LIB_SOURCES += utilities/env_librados.cc
TEST_MAIN_SOURCES += utilities/env_librados_test.cc
LDFLAGS += -lrados
endif
AM_LINK = $(AM_V_CCLD)$(CXX) -L. $(patsubst lib%.a, -l%, $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^)) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) AM_LINK = $(AM_V_CCLD)$(CXX) -L. $(patsubst lib%.a, -l%, $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^)) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
AM_SHARE = $(AM_V_CCLD) $(CXX) $(PLATFORM_SHARED_LDFLAGS)$@ -L. $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^) $(LDFLAGS) -o $@ AM_SHARE = $(AM_V_CCLD) $(CXX) $(PLATFORM_SHARED_LDFLAGS)$@ -L. $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^) $(LDFLAGS) -o $@
@ -561,7 +555,7 @@ ifneq ($(filter check-headers, $(MAKECMDGOALS)),)
# TODO: add/support JNI headers # TODO: add/support JNI headers
DEV_HEADER_DIRS := $(sort include/ $(dir $(ALL_SOURCES))) DEV_HEADER_DIRS := $(sort include/ $(dir $(ALL_SOURCES)))
# Some headers like in port/ are platform-specific # Some headers like in port/ are platform-specific
DEV_HEADERS := $(shell $(FIND) $(DEV_HEADER_DIRS) -type f -name '*.h' | egrep -v 'port/|plugin/|lua/|range_tree/|include/rocksdb/utilities/env_librados.h') DEV_HEADERS := $(shell $(FIND) $(DEV_HEADER_DIRS) -type f -name '*.h' | egrep -v 'port/|plugin/|lua/|range_tree/')
else else
DEV_HEADERS := DEV_HEADERS :=
endif endif
@ -1612,11 +1606,6 @@ env_mirror_test: $(OBJ_DIR)/utilities/env_mirror_test.o $(TEST_LIBRARY) $(LIBRAR
env_timed_test: $(OBJ_DIR)/utilities/env_timed_test.o $(TEST_LIBRARY) $(LIBRARY) env_timed_test: $(OBJ_DIR)/utilities/env_timed_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK) $(AM_LINK)
# Link rule for the env_librados_test binary; the rule only exists when
# ROCKSDB_USE_LIBRADOS has a non-empty value.
ifdef ROCKSDB_USE_LIBRADOS
env_librados_test: $(OBJ_DIR)/utilities/env_librados_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
endif
object_registry_test: $(OBJ_DIR)/utilities/object_registry_test.o $(TEST_LIBRARY) $(LIBRARY) object_registry_test: $(OBJ_DIR)/utilities/object_registry_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK) $(AM_LINK)

@ -3,3 +3,4 @@ This is the list of all known third-party plugins for RocksDB. If something is m
* [Dedupfs](https://github.com/ajkr/dedupfs): an example for plugin developers to reference * [Dedupfs](https://github.com/ajkr/dedupfs): an example for plugin developers to reference
* [HDFS](https://github.com/riversand963/rocksdb-hdfs-env): an Env used for interacting with HDFS. Migrated from main RocksDB repo * [HDFS](https://github.com/riversand963/rocksdb-hdfs-env): an Env used for interacting with HDFS. Migrated from main RocksDB repo
* [ZenFS](https://github.com/westerndigitalcorporation/zenfs): a file system for zoned block devices * [ZenFS](https://github.com/westerndigitalcorporation/zenfs): a file system for zoned block devices
* [RADOS](https://github.com/riversand963/rocksdb-rados-env): an Env used for interacting with RADOS. Migrated from RocksDB main repo.

@ -1,174 +0,0 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include <memory>
#include <string>
#include "rocksdb/status.h"
#include "rocksdb/utilities/env_mirror.h"
#include <rados/librados.hpp>
namespace ROCKSDB_NAMESPACE {
class LibradosWritableFile;
// EnvLibrados is an Env built on top of EnvWrapper that holds a librados
// client (librados::Rados) together with two librados::IoCtx handles, one
// for the DB pool and one for the WAL pool. It overrides the file-system
// related operations declared below. Apart from the inline destructor, all
// member functions are only declared here; their definitions live outside
// this header.
class EnvLibrados : public EnvWrapper {
public:
// Create a brand new sequentially-readable file with the specified name.
// On success, stores a pointer to the new file in *result and returns OK.
// On failure stores nullptr in *result and returns non-OK. If the file does
// not exist, returns a non-OK status.
//
// The returned file will only be accessed by one thread at a time.
Status NewSequentialFile(const std::string& fname,
std::unique_ptr<SequentialFile>* result,
const EnvOptions& options) override;
// Create a brand new random access read-only file with the
// specified name. On success, stores a pointer to the new file in
// *result and returns OK. On failure stores nullptr in *result and
// returns non-OK. If the file does not exist, returns a non-OK
// status.
//
// The returned file may be concurrently accessed by multiple threads.
Status NewRandomAccessFile(const std::string& fname,
std::unique_ptr<RandomAccessFile>* result,
const EnvOptions& options) override;
// Create an object that writes to a new file with the specified
// name. Deletes any existing file with the same name and creates a
// new file. On success, stores a pointer to the new file in
// *result and returns OK. On failure stores nullptr in *result and
// returns non-OK.
//
// The returned file will only be accessed by one thread at a time.
Status NewWritableFile(const std::string& fname,
std::unique_ptr<WritableFile>* result,
const EnvOptions& options) override;
// Reuse an existing file by renaming it and opening it as writable.
Status ReuseWritableFile(const std::string& fname,
const std::string& old_fname,
std::unique_ptr<WritableFile>* result,
const EnvOptions& options) override;
// Create an object that represents a directory. Will fail if directory
// doesn't exist. If the directory exists, it will open the directory
// and create a new Directory object.
//
// On success, stores a pointer to the new Directory in
// *result and returns OK. On failure stores nullptr in *result and
// returns non-OK.
Status NewDirectory(const std::string& name,
std::unique_ptr<Directory>* result) override;
// Returns OK if the named file exists.
// NotFound if the named file does not exist,
// the calling process does not have permission to determine
// whether this file exists, or if the path is invalid.
// IOError if an IO Error was encountered
Status FileExists(const std::string& fname) override;
// Store in *result the names of the children of the specified directory.
// The names are relative to "dir".
// Original contents of *result are dropped.
//
// NOTE(review): std::vector is used here, but this header does not include
// <vector> itself; presumably it is provided transitively by one of the
// headers included above -- confirm.
Status GetChildren(const std::string& dir,
std::vector<std::string>* result) override;
// Delete the named file.
Status DeleteFile(const std::string& fname) override;
// Create the specified directory. Returns error if directory exists.
Status CreateDir(const std::string& dirname) override;
// Create the directory if it is missing. Returns OK if it already exists
// or was created successfully.
Status CreateDirIfMissing(const std::string& dirname) override;
// Delete the specified directory.
Status DeleteDir(const std::string& dirname) override;
// Store the size of fname in *file_size.
Status GetFileSize(const std::string& fname, uint64_t* file_size) override;
// Store the last modification time of fname in *file_mtime.
Status GetFileModificationTime(const std::string& fname,
uint64_t* file_mtime) override;
// Rename file src to target.
Status RenameFile(const std::string& src, const std::string& target) override;
// Hard Link file src to target.
Status LinkFile(const std::string& src, const std::string& target) override;
// Lock the specified file. Used to prevent concurrent access to
// the same db by multiple processes. On failure, stores nullptr in
// *lock and returns non-OK.
//
// On success, stores a pointer to the object that represents the
// acquired lock in *lock and returns OK. The caller should call
// UnlockFile(*lock) to release the lock. If the process exits,
// the lock will be automatically released.
//
// If somebody else already holds the lock, finishes immediately
// with a failure. I.e., this call does not wait for existing locks
// to go away.
//
// May create the named file if it does not already exist.
Status LockFile(const std::string& fname, FileLock** lock) override;
// Release the lock acquired by a previous successful call to LockFile.
// REQUIRES: lock was returned by a successful LockFile() call
// REQUIRES: lock has not already been unlocked.
Status UnlockFile(FileLock* lock) override;
// Get full directory name for this db.
Status GetAbsolutePath(const std::string& db_path,
std::string* output_path) override;
// Get default EnvLibrados
static EnvLibrados* Default();
// Construct an EnvLibrados from a database name, a configuration path and
// the name of the DB pool. The values used for the remaining settings
// (client/cluster name, WAL dir and pool, write buffer size) are chosen by
// the implementation, which is not visible in this header.
explicit EnvLibrados(const std::string& db_name,
const std::string& config_path,
const std::string& db_pool);
// Construct an EnvLibrados with every setting given explicitly. The
// parameter names line up with the private members declared below
// (presumably each is stored in the corresponding member -- confirm against
// the implementation).
explicit EnvLibrados(
const std::string& client_name, // first 3 parameters are
// for RADOS client init
const std::string& cluster_name, const uint64_t flags,
const std::string& db_name, const std::string& config_path,
const std::string& db_pool, const std::string& wal_dir,
const std::string& wal_pool, const uint64_t write_buffer_size);
// Shuts down the RADOS client handle when the Env is destroyed.
~EnvLibrados() { _rados.shutdown(); }
private:
std::string _client_name;
std::string _cluster_name;
uint64_t _flags;
std::string _db_name; // get from user, readable string; Also used as db_id
// for db metadata
std::string _config_path;
librados::Rados _rados; // RADOS client
std::string _db_pool_name;
librados::IoCtx _db_pool_ioctx; // IoCtx for connecting db_pool
std::string _wal_dir; // WAL dir path
std::string _wal_pool_name;
librados::IoCtx _wal_pool_ioctx; // IoCtx for connecting wal_pool
uint64_t _write_buffer_size; // WritableFile buffer max size
/* private function to communicate with rados */
// The helpers below operate on "fid" strings associated with file names
// (presumably an internal file identifier that a file name maps to --
// confirm against the implementation).
//
// NOTE(review): identifiers that begin with an underscore followed by an
// uppercase letter (_CreateFid, _GetFid, ...) are reserved for the
// implementation in C++. Renaming them would also require changing the
// implementation file, so they are left as-is here.
std::string _CreateFid();
Status _GetFid(const std::string& fname, std::string& fid);
Status _GetFid(const std::string& fname, std::string& fid, int fid_len);
Status _RenameFid(const std::string& old_fname, const std::string& new_fname);
Status _AddFid(const std::string& fname, const std::string& fid);
Status _DelFid(const std::string& fname);
Status _GetSubFnames(const std::string& dirname,
std::vector<std::string>* result);
// Returns a pointer to an IoCtx for the given prefix (presumably either the
// DB pool or the WAL pool context, selected by the path prefix -- confirm).
librados::IoCtx* _GetIoctx(const std::string& prefix);
// LibradosWritableFile is granted access to the private members above.
friend class LibradosWritableFile;
};
} // namespace ROCKSDB_NAMESPACE

File diff suppressed because it is too large Load Diff

@ -1,122 +0,0 @@
# Introduction to EnvLibrados
EnvLibrados is a customized RocksDB Env that uses RADOS as the backend file system of RocksDB. It overrides all file-system-related APIs of the default Env. The easiest way to use it is as follows:
```c++
std::string db_name = "test_db";
std::string config_path = "path/to/ceph/config";
DB* db;
Options options;
options.env = EnvLibrados(db_name, config_path);
Status s = DB::Open(options, kDBPath, &db);
...
```
EnvLibrados will then forward all file read/write operations to the RADOS cluster specified by config_path. The default pool is db_name+"_pool".
# Options for EnvLibrados
There are some options that users could set for EnvLibrados.
- write_buffer_size. This variable is the maximum buffer size for a WritableFile. Once the buffer reaches this size, EnvLibrados syncs the buffer content to RADOS and then clears the buffer.
- db_pool. Rather than using the default pool, users can set their own db pool name.
- wal_dir. The dir for WAL files. Because RocksDB only has 2-level structure (dir_name/file_name), the format of wal_dir is "/dir_name"(CAN'T be "/dir1/dir2"). Default wal_dir is "/wal".
- wal_pool. Corresponding pool name for WAL files. Default value is db_name+"_wal_pool"
An example of setting these options looks like the following:
```c++
db_name = "test_db";
db_pool = db_name+"_pool";
wal_dir = "/wal";
wal_pool = db_name+"_wal_pool";
write_buffer_size = 1 << 20;
env_ = new EnvLibrados(db_name, config, db_pool, wal_dir, wal_pool, write_buffer_size);
DB* db;
Options options;
options.env = env_;
// The last level dir name should match the dir name in prefix_pool_map
options.wal_dir = "/tmp/wal";
// open DB
Status s = DB::Open(options, kDBPath, &db);
...
```
# Performance Test
## Compile
Check this [link](https://github.com/facebook/rocksdb/blob/main/INSTALL.md) to install the dependencies of RocksDB. Then you can compile it by running `$ make env_librados_test ROCKSDB_USE_LIBRADOS=1` under `rocksdb/`. The configuration file used by env_librados_test is `../ceph/src/ceph.conf`. For Ubuntu 14.04, just run the following commands:
```bash
$ sudo apt-get install libgflags-dev
$ sudo apt-get install libsnappy-dev
$ sudo apt-get install zlib1g-dev
$ sudo apt-get install libbz2-dev
$ make env_librados_test ROCKSDB_USE_LIBRADOS=1
```
## Test Result
My test environment is Ubuntu 14.04 in VirtualBox with 8 cores and 8G RAM. The test results are as follows.
1. Write (1<<20) keys in random order. The write time under the default env is around 10s, while the write time under EnvLibrados varies from 10s to 30s.
2. Write (1<<20) keys in sequential order. The write time under the default env drops to around 1s, but the write time under EnvLibrados does not change.
3. Read (1<<16) keys from (1<<20) keys in random order. The read time under both Envs is roughly the same, around 1.8s.
# MyRocks Test
## Compile Ceph
See [link](http://docs.ceph.com/docs/master/install/build-ceph/)
## Start RADOS
```bash
cd ceph-path/src
( ( ./stop.sh; rm -rf dev/*; CEPH_NUM_OSD=3 ./vstart.sh --short --localhost -n
-x -d ; ) ) 2>&1
```
## Compile MySQL
```bash
sudo apt-get update
sudo apt-get install g++ cmake libbz2-dev libaio-dev bison \
zlib1g-dev libsnappy-dev
sudo apt-get install libgflags-dev libreadline6-dev libncurses5-dev \
libssl-dev liblz4-dev gdb git
git clone https://github.com/facebook/mysql-5.6.git
cd mysql-5.6
git submodule init
git submodule update
cmake . -DCMAKE_BUILD_TYPE=RelWithDebInfo -DWITH_SSL=system \
-DWITH_ZLIB=bundled -DMYSQL_MAINTAINER_MODE=0 -DENABLED_LOCAL_INFILE=1 -DROCKSDB_USE_LIBRADOS=1
make install -j8
```
Check this [link](https://github.com/facebook/mysql-5.6/wiki/Build-Steps) for the latest compile steps.
## Configure MySQL
The steps to configure MySQL are as follows.
```bash
mkdir -p /etc/mysql
mkdir -p /var/lib/mysql
mkdir -p /etc/mysql/conf.d
echo -e '[mysqld_safe]\nsyslog' > /etc/mysql/conf.d/mysqld_safe_syslog.cnf
cp /usr/share/mysql/my-medium.cnf /etc/mysql/my.cnf
sed -i 's#.*datadir.*#datadir = /var/lib/mysql#g' /etc/mysql/my.cnf
chown mysql:mysql -R /var/lib/mysql
mysql_install_db --user=mysql --ldata=/var/lib/mysql/
export CEPH_CONFIG_PATH="path/of/ceph/config/file"
mysqld_safe -user=mysql --skip-innodb --rocksdb --default-storage-engine=rocksdb --default-tmp-storage-engine=MyISAM &
mysqladmin -u root password
mysql -u root -p
```
Check this [link](https://gist.github.com/shichao-an/f5639ecd551496ac2d70) for detailed information.
```sql
show databases;
create database testdb;
use testdb;
show tables;
CREATE TABLE tbl (id INT AUTO_INCREMENT primary key, str VARCHAR(32));
insert into tbl values (1, "val2");
select * from tbl;
```

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save