From 559525dcbb58f240f2fbe0ad4879d1e3baa83b2a Mon Sep 17 00:00:00 2001 From: Akanksha Mahajan Date: Fri, 18 Feb 2022 16:37:24 -0800 Subject: [PATCH] Add Async Read and Poll APIs in FileSystem (#9564) Summary: This PR adds support for new APIs Async Read that reads the data asynchronously and Poll API that checks if requested read request has completed or not. Usage: In RocksDB, we are currently planning to prefetch data asynchronously during sequential scanning and RocksDB will call these APIs to prefetch more data in advance. Design: - ReadAsync API submits the read request to the underlying FileSystem in order to read data asynchronously. When the read request is completed, the callback function will be called. cb_arg is used by RocksDB to track the original request submitted and IOHandle is used by FileSystem to keep track of IO requests at their level. - The Poll API is added in FileSystem because the call could end up handling completions for multiple different files which is not specific to a FSRandomAccessFile instance. There could be multiple outstanding file reads from different files in the future and they can complete in any order. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9564 Test Plan: Test will be added in a separate PR. Reviewed By: anand1976 Differential Revision: D34226216 Pulled By: akankshamahajan15 fbshipit-source-id: 95e64edafb17f543f7232421d51e2665a3267f69 --- HISTORY.md | 1 + include/rocksdb/file_system.h | 83 ++++++++++++++++++++++++++++++++--- 2 files changed, 78 insertions(+), 6 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 7d70c6433..e740739cd 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -84,6 +84,7 @@ * Extended the column family statistics in the info log so the total amount of garbage in the blob files and the blob file space amplification factor are also logged. Also exposed the blob file space amp via the `rocksdb.blob-stats` DB property. 
* Introduced the API rocksdb_create_dir_if_missing in c.h that calls underlying file system's CreateDirIfMissing API to create the directory. * Added last level and non-last level read statistics: `LAST_LEVEL_READ_*`, `NON_LAST_LEVEL_READ_*`. +* Experimental: Add support for new APIs ReadAsync in FSRandomAccessFile that reads the data asynchronously and Poll API in FileSystem that checks if requested read request has completed or not. ReadAsync takes a callback function. Poll API checks for completion of read IO requests and should call callback functions to indicate completion of read requests. ## 6.29.0 (01/21/2022) Note: The next release will be major release 7.0. See https://github.com/facebook/rocksdb/issues/9390 for more info. diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h index f1159f692..d8b0c6189 100644 --- a/include/rocksdb/file_system.h +++ b/include/rocksdb/file_system.h @@ -221,6 +221,11 @@ struct IODebugContext { } }; +// IOHandle is used by underlying file system to store any information it needs +// during Async Read requests. +using IOHandleDeleter = std::function; +using IOHandle = std::unique_ptr; + // The FileSystem, FSSequentialFile, FSRandomAccessFile, FSWritableFile, // FSRandomRWFileclass, and FSDIrectory classes define the interface between // RocksDB and storage systems, such as Posix filesystems, @@ -640,6 +645,19 @@ class FileSystem : public Customizable { const IOOptions& options, bool* is_dir, IODebugContext* /*dgb*/) = 0; + // EXPERIMENTAL + // Poll for completion of read IO requests. The Poll() method should call the + // callback functions to indicate completion of read requests. If Poll is not + // supported it means caller should be informed of IO completions via the + // callback on another thread. + // + // Default implementation is to return IOStatus::OK. 
+ + virtual IOStatus Poll(std::vector& /*io_handles*/, + size_t /*min_completions*/) { + return IOStatus::OK(); + } + // If you're adding methods here, remember to add them to EnvWrapper too. private: @@ -712,24 +730,36 @@ class FSSequentialFile { // SequentialFileWrapper too. }; -// A read IO request structure for use in MultiRead +// A read IO request structure for use in MultiRead and asynchronous Read APIs. struct FSReadRequest { - // File offset in bytes + // Input parameter that represents the file offset in bytes. uint64_t offset; - // Length to read in bytes. `result` only returns fewer bytes if end of file - // is hit (or `status` is not OK). + // Input parameter that represents the length to read in bytes. `result` only + // returns fewer bytes if end of file is hit (or `status` is not OK). size_t len; // A buffer that MultiRead() can optionally place data in. It can - // ignore this and allocate its own buffer + // ignore this and allocate its own buffer. + // The lifecycle of scratch will be until IO is completed. + // + // In case of asynchronous reads, it's an output parameter and it will be + // maintained until callback has been called. Scratch is allocated by RocksDB + // and will be passed to underlying FileSystem. char* scratch; // Output parameter set by MultiRead() to point to the data buffer, and // the number of valid bytes + // + // In case of asynchronous reads, this output parameter is set by Async Read + // APIs to point to the data buffer, and + // the number of valid bytes. + // Slice result should point to scratch i.e. the data should + // always be read into scratch. Slice result; - // Status of read + // Output parameter set by underlying FileSystem that represents status of + // read request. IOStatus status; }; @@ -825,6 +855,35 @@ class FSRandomAccessFile { return IOStatus::NotSupported("InvalidateCache not supported."); } + // EXPERIMENTAL + // This API reads the requested data in FSReadRequest asynchronously. 
This is + // an asynchronous call, i.e. it should return after submitting the request. + // + // When the read request is completed, callback function specified in cb + // should be called with arguments cb_arg and the result populated in + // FSReadRequest with result and status fields updated by FileSystem. + // cb_arg should be used by the callback to track the original request + // submitted. + // + // This API should also populate IOHandle which should be used by + // underlying FileSystem to store the context in order to distinguish the read + // requests at their side. + // + // req contains the request offset and size passed as input parameter of read + // request and result and status fields are output parameter set by underlying + // FileSystem. The data should always be read into scratch field. + // + // Default implementation is to read the data synchronously. + virtual IOStatus ReadAsync( + FSReadRequest& req, const IOOptions& opts, + std::function cb, void* cb_arg, + IOHandle* /*io_handle*/, IODebugContext* dbg) { + req.status = + Read(req.offset, req.len, opts, &(req.result), req.scratch, dbg); + cb(req, cb_arg); + return IOStatus::OK(); + } + // EXPERIMENTAL // When available, returns the actual temperature for the file. 
This is // useful in case some outside process moves a file from one tier to another, @@ -1410,6 +1469,12 @@ class FileSystemWrapper : public FileSystem { std::string SerializeOptions(const ConfigOptions& config_options, const std::string& header) const override; #endif // ROCKSDB_LITE + + virtual IOStatus Poll(std::vector& io_handles, + size_t min_completions) override { + return target_->Poll(io_handles, min_completions); + } + protected: std::shared_ptr target_; }; @@ -1490,6 +1555,12 @@ class FSRandomAccessFileWrapper : public FSRandomAccessFile { IOStatus InvalidateCache(size_t offset, size_t length) override { return target_->InvalidateCache(offset, length); } + IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts, + std::function cb, + void* cb_arg, IOHandle* io_handle, + IODebugContext* dbg) override { + return target()->ReadAsync(req, opts, cb, cb_arg, io_handle, dbg); + } Temperature GetTemperature() const override { return target_->GetTemperature(); }