From 6ffea6eaf0823566514116d19e89b2ad5eea29c2 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Thu, 31 Oct 2024 20:36:35 +0800 Subject: [PATCH 01/85] feat: Add client code structure and interface for Data, Future and exception --- src/spider/CMakeLists.txt | 28 +++++++++++++ src/spider/client/Data.cpp | 36 ++++++++++++++++ src/spider/client/Data.hpp | 79 ++++++++++++++++++++++++++++++++++++ src/spider/client/Error.hpp | 33 +++++++++++++++ src/spider/client/Future.cpp | 29 +++++++++++++ src/spider/client/Future.hpp | 31 ++++++++++++++ src/spider/client/Spider.hpp | 21 ++++++++++ src/spider/client/Task.cpp | 7 ++++ src/spider/client/Task.hpp | 20 +++++++++ 9 files changed, 284 insertions(+) create mode 100644 src/spider/client/Data.cpp create mode 100644 src/spider/client/Data.hpp create mode 100644 src/spider/client/Error.hpp create mode 100644 src/spider/client/Future.cpp create mode 100644 src/spider/client/Future.hpp create mode 100644 src/spider/client/Spider.hpp create mode 100644 src/spider/client/Task.cpp create mode 100644 src/spider/client/Task.hpp diff --git a/src/spider/CMakeLists.txt b/src/spider/CMakeLists.txt index 944adc7..ae903d4 100644 --- a/src/spider/CMakeLists.txt +++ b/src/spider/CMakeLists.txt @@ -26,3 +26,31 @@ add_executable(spider_worker) target_sources(spider_worker PRIVATE ${SPIDER_WORKER_SOURCES}) target_link_libraries(spider_worker PRIVATE spider_core) add_executable(spider::worker ALIAS spider_worker) + +set(SPIDER_CLIENT_SOURCES + client/Task.cpp + client/Future.cpp + client/Data.cpp + CACHE INTERNAL + "spider client source files" +) + +set(SPIDER_CLIENT_HEADERS + client/Spider.hpp + client/Task.hpp + client/Error.hpp + client/Future.hpp + client/Data.hpp + CACHE INTERNAL + "spider client header files" +) + +add_library(spider_client) +target_sources( + spider_client + PRIVATE + ${SPIDER_CLIENT_SOURCES} + ${SPIDER_CLIENT_HEADERS} +) +target_link_libraries(spider_client PRIVATE spider_core) +add_library(spider::spider ALIAS spider_client) diff --git a/src/spider/client/Data.cpp b/src/spider/client/Data.cpp new file mode 100644 index 0000000..ac28d9a --- /dev/null +++ b/src/spider/client/Data.cpp @@ -0,0 +1,36 @@ +#include "Data.hpp" + +namespace spider { + +class DataImpl {}; + +template +auto Data::get() -> T { + return T(); +} + +template +void Data::set_locality(std::vector const& /*nodes*/, bool /*hard*/) {} + +template +auto Data::Builder::key(std::string const& /*key*/) -> Data::Builder& { + return this; +} + +template +auto Data::Builder::locality(std::vector const& /*nodes*/, bool /*hard*/) + -> Data::Builder& { + return this; +} + +template +auto Data::Builder::cleanup(std::function const& /*f*/) -> Data::Builder& { + return this; +} + +template +auto Data::Builder::build(T&& /*t*/) -> Data { + return Data(); +} + +} // namespace spider diff --git a/src/spider/client/Data.hpp b/src/spider/client/Data.hpp new file mode 100644 index 0000000..776291f --- /dev/null +++ b/src/spider/client/Data.hpp @@ -0,0 +1,79 @@ +#ifndef SPIDER_CLIENT_DATA_HPP +#define SPIDER_CLIENT_DATA_HPP + +#include +#include + +namespace spider { + +class DataImpl; + +template +class Data { +private: + std::unique_ptr m_impl; + +public: + /** + * Gets the values stored in Data. + * @return value stored in Data. + */ + auto get() -> T; + /** + * Indicates that the data is persisted and should not be rollbacked + * on failure recovery. + */ + // Not implemented in milestone 1 + // void mark_persist(); + /** + * Sets locality list of the data. + * @param nodes nodes that has locality + * @param hard true if the locality list is a hard requirement, false otherwise + */ + void set_locality(std::vector const& nodes, bool hard); + + class Builder { + private: + public: + /** + * Sets the key for the data. If no key is provided, Spider generates a key. + * @param key of the data + */ + auto key(std::string const& key) -> Data::Builder&; + /** + * Sets locality list of the data to build. + * @param nodes nodes that has locality + * @param hard true if the locality list is a hard requirement, false otherwise + * @return self + */ + auto locality(std::vector const& nodes, bool hard) -> Data::Builder&; + /** + * Indicates that the data to build is persisted and should not be rollbacked on failure + * recovery. + * @return self + */ + // Data::Builder Builder& mark_persist(); // Not implemented in milestone 1 + /** + * Defines clean up functions of the data to build. + * @param f clean up function of data + */ + auto cleanup(std::function const& f) -> Data::Builder&; + /** + * Defines rollback functions of the data to build. + * @param f rollback function of data + */ + // Not implemented for milestone 1 + // auto rollback(std::function const& f) -> Data::Builder&; + /** + * Builds the data. Stores the value of data into storage with locality list, persisted + * flag, cleanup and rollback functions. + * @param t value of the data + * @return data object + */ + auto build(T&& t) -> Data; + }; +}; + +} // namespace spider + +#endif // SPIDER_CLIENT_DATA_HPP diff --git a/src/spider/client/Error.hpp b/src/spider/client/Error.hpp new file mode 100644 index 0000000..7a1d524 --- /dev/null +++ b/src/spider/client/Error.hpp @@ -0,0 +1,33 @@ +#ifndef SPIDER_CLIENT_ERROR_HPP +#define SPIDER_CLIENT_ERROR_HPP + +#include +#include + +namespace spider { + +class SpiderException : public std::exception { +public: + enum class ErrType : std::uint8_t { + StorageErr, + DuplicateTask, + TaskNotFound, + }; + +private: + ErrType m_type; + std::string m_description; + +public: + SpiderException(ErrType type, std::string description) + : m_type(type), + m_description(std::move(description)) {} + + auto what() -> char const* { return m_description.c_str(); } + + auto get_type() -> ErrType { return m_type; } +}; + +} // namespace spider + +#endif // SPIDER_CLIENT_ERROR_HPP diff --git a/src/spider/client/Future.cpp b/src/spider/client/Future.cpp new file mode 100644 index 0000000..66d56c8 --- /dev/null +++ b/src/spider/client/Future.cpp @@ -0,0 +1,29 @@ +#include "Future.hpp" + +#include +#include + +namespace spider { + +class FutureImpl { + // Implementation details subject to change +private: + boost::uuids::uuid m_id; + +public: + auto value() -> std::string { return boost::uuids::to_string(m_id); } + + auto ready() -> bool { return m_id.is_nil(); } +}; + +template +auto Future::get() -> T { + return T(); +} + +template +auto Future::ready() -> bool { + return true; +} + +} // namespace spider diff --git a/src/spider/client/Future.hpp b/src/spider/client/Future.hpp new file mode 100644 index 0000000..cb69ba0 --- /dev/null +++ b/src/spider/client/Future.hpp @@ -0,0 +1,31 @@ +#ifndef SPIDER_CLIENT_FUTURE_HPP +#define SPIDER_CLIENT_FUTURE_HPP + +#include + +namespace spider { + +class FutureImpl; + +template +class Future { +private: + std::unique_ptr m_impl; + +public: + /** + * Gets the value of the future. Blocks until the value is available. + * @return value of the future + */ + auto get() -> T; + + /** + * Checks if value of the future is ready. + * @return true if future is ready, false otherwise + */ + auto ready() -> bool; +}; + +} // namespace spider + +#endif // SPIDER_CLIENT_FUTURE_HPP diff --git a/src/spider/client/Spider.hpp b/src/spider/client/Spider.hpp new file mode 100644 index 0000000..52e783e --- /dev/null +++ b/src/spider/client/Spider.hpp @@ -0,0 +1,21 @@ +#ifndef SPIDER_CLIENT_SPIDER_HPP +#define SPIDER_CLIENT_SPIDER_HPP + +#include "Error.hpp" +#include "Task.hpp" + +namespace spider { +/** + * Initializes Spider library + */ +void init(); + +/** + * Connects to storage + * @param url url of the storage to connect + */ +void connect(std::string const& url); + +} // namespace spider + +#endif // SPIDER_CLIENT_SPIDER_HPP diff --git a/src/spider/client/Task.cpp b/src/spider/client/Task.cpp new file mode 100644 index 0000000..b9e8874 --- /dev/null +++ b/src/spider/client/Task.cpp @@ -0,0 +1,7 @@ +#include "Task.hpp" + +namespace spider { + +class TaskGraphImpl {}; + +} // namespace spider diff --git a/src/spider/client/Task.hpp b/src/spider/client/Task.hpp new file mode 100644 index 0000000..c1b64e6 --- /dev/null +++ b/src/spider/client/Task.hpp @@ -0,0 +1,20 @@ +#ifndef SPIDER_CLIENT_TASK_HPP +#define SPIDER_CLIENT_TASK_HPP + +#include + +namespace spider { + +class TaskGraphImpl; + +template +class TaskGraph { +private: + std::unique_ptr m_impl; + +public: +}; + +} // namespace spider + +#endif // SPIDER_CLIENT_TASK_HPP From 94af15b14c5be93b87f62182faa42731995ed49d Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Fri, 1 Nov 2024 00:16:34 +0800 Subject: [PATCH 02/85] feat: Split client into two libraries and add interface --- CMakeLists.txt | 4 ++ src/spider/CMakeLists.txt | 57 ++++++++++++++++++--------- src/spider/client/Data.cpp | 6 ++- src/spider/client/Data.hpp | 4 +- src/spider/client/Error.hpp | 33 ---------------- src/spider/client/Future.cpp | 1 + src/spider/client/Spider.hpp | 26 ++++++++++++- src/spider/client/Task.cpp | 10 ++++- src/spider/client/Task.hpp | 69 ++++++++++++++++++++++++++++----- src/spider/client/TaskGraph.cpp | 14 +++++++ src/spider/client/TaskGraph.hpp | 23 +++++++++++ 11 files changed, 181 insertions(+), 66 deletions(-) delete mode 100644 src/spider/client/Error.hpp create mode 100644 src/spider/client/TaskGraph.cpp create mode 100644 src/spider/client/TaskGraph.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 94132ef..b543b89 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,10 @@ project( set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) +# AppleClang complains about file has no symbol and abort the build. +set(CMAKE_CXX_ARCHIVE_CREATE " Scr ") +set(CMAKE_CXX_ARCHIVE_FINISH " -no_warning_for_no_symbols -c ") + # Enable exporting compile commands set(CMAKE_EXPORT_COMPILE_COMMANDS ON diff --git a/src/spider/CMakeLists.txt b/src/spider/CMakeLists.txt index ae903d4..3a75825 100644 --- a/src/spider/CMakeLists.txt +++ b/src/spider/CMakeLists.txt @@ -1,15 +1,16 @@ # set variable as CACHE INTERNAL to access it from other scope -set(SPIDER_CORE_SOURCES +set(SPIDER_CORE_SOURCES storage/MysqlStorage.cpp CACHE INTERNAL "spider core source files") + +set(SPIDER_CORE_HEADERS core/Error.hpp core/Data.hpp core/Task.hpp core/TaskGraph.hpp storage/MetadataStorage.hpp storage/DataStorage.hpp - storage/MysqlStorage.cpp storage/MysqlStorage.hpp CACHE INTERNAL - "spider core source files" + "spider core header files" ) if(SPIDER_USE_STATIC_LIBS) @@ -18,39 +19,57 @@ else() add_library(spider_core SHARED) endif() target_sources(spider_core PRIVATE ${SPIDER_CORE_SOURCES}) +target_sources(spider_core PUBLIC ${SPIDER_CORE_HEADERS}) target_link_libraries(spider_core PUBLIC Boost::boost PRIVATE absl::flat_hash_map) -set(SPIDER_WORKER_SOURCES worker/worker.cpp CACHE INTERNAL "spider worker source files") +set(SPIDER_CLIENT_SHARED_SOURCES + client/Data.cpp + client/Task.cpp + CACHE INTERNAL + "spider client shared source files" +) -add_executable(spider_worker) -target_sources(spider_worker PRIVATE ${SPIDER_WORKER_SOURCES}) -target_link_libraries(spider_worker PRIVATE spider_core) -add_executable(spider::worker ALIAS spider_worker) +set(SPIDER_CLIENT_SHARED_HEADERS + client/Data.hpp + client/Task.hpp + CACHE INTERNAL + "spider client shared header files" +) + +add_library(spider_client_lib) +target_sources(spider_client_lib PRIVATE ${SPIDER_CLIENT_SHARED_SOURCES}) +target_sources(spider_client_lib PUBLIC ${SPIDER_CLIENT_SHARED_HEADERS}) set(SPIDER_CLIENT_SOURCES - client/Task.cpp + client/TaskGraph.cpp client/Future.cpp - client/Data.cpp CACHE INTERNAL "spider client source files" ) set(SPIDER_CLIENT_HEADERS client/Spider.hpp - client/Task.hpp - client/Error.hpp + client/TaskGraph.hpp client/Future.hpp - client/Data.hpp CACHE INTERNAL "spider client header files" ) add_library(spider_client) -target_sources( - spider_client - PRIVATE - ${SPIDER_CLIENT_SOURCES} - ${SPIDER_CLIENT_HEADERS} -) +target_sources(spider_client PRIVATE ${SPIDER_CLIENT_SOURCES}) +target_sources(spider_client PUBLIC ${SPIDER_CLIENT_HEADERS}) target_link_libraries(spider_client PRIVATE spider_core) +target_link_libraries(spider_client PUBLIC spider_client_lib) add_library(spider::spider ALIAS spider_client) + +set(SPIDER_WORKER_SOURCES worker/worker.cpp CACHE INTERNAL "spider worker source files") + +add_executable(spider_worker) +target_sources(spider_worker PRIVATE ${SPIDER_WORKER_SOURCES}) +target_link_libraries( + spider_worker + PRIVATE + spider_core + spider_client_lib +) +add_executable(spider::worker ALIAS spider_worker) diff --git a/src/spider/client/Data.cpp b/src/spider/client/Data.cpp index ac28d9a..a118f58 100644 --- a/src/spider/client/Data.cpp +++ b/src/spider/client/Data.cpp @@ -1,5 +1,9 @@ #include "Data.hpp" +#include +#include +#include + namespace spider { class DataImpl {}; @@ -29,7 +33,7 @@ auto Data::Builder::cleanup(std::function const& /*f*/) -> Data -auto Data::Builder::build(T&& /*t*/) -> Data { +auto Data::Builder::build(T const& /*t*/) -> Data { return Data(); } diff --git a/src/spider/client/Data.hpp b/src/spider/client/Data.hpp index 776291f..dde763d 100644 --- a/src/spider/client/Data.hpp +++ b/src/spider/client/Data.hpp @@ -3,6 +3,8 @@ #include #include +#include +#include namespace spider { @@ -70,7 +72,7 @@ class Data { * @param t value of the data * @return data object */ - auto build(T&& t) -> Data; + auto build(T const& /*t*/) -> Data; }; }; diff --git a/src/spider/client/Error.hpp b/src/spider/client/Error.hpp deleted file mode 100644 index 7a1d524..0000000 --- a/src/spider/client/Error.hpp +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef SPIDER_CLIENT_ERROR_HPP -#define SPIDER_CLIENT_ERROR_HPP - -#include -#include - -namespace spider { - -class SpiderException : public std::exception { -public: - enum class ErrType : std::uint8_t { - StorageErr, - DuplicateTask, - TaskNotFound, - }; - -private: - ErrType m_type; - std::string m_description; - -public: - SpiderException(ErrType type, std::string description) - : m_type(type), - m_description(std::move(description)) {} - - auto what() -> char const* { return m_description.c_str(); } - - auto get_type() -> ErrType { return m_type; } -}; - -} // namespace spider - -#endif // SPIDER_CLIENT_ERROR_HPP diff --git a/src/spider/client/Future.cpp b/src/spider/client/Future.cpp index 66d56c8..bc2d1df 100644 --- a/src/spider/client/Future.cpp +++ b/src/spider/client/Future.cpp @@ -2,6 +2,7 @@ #include #include +#include namespace spider { diff --git a/src/spider/client/Spider.hpp b/src/spider/client/Spider.hpp index 52e783e..acb5a60 100644 --- a/src/spider/client/Spider.hpp +++ b/src/spider/client/Spider.hpp @@ -1,8 +1,16 @@ #ifndef SPIDER_CLIENT_SPIDER_HPP #define SPIDER_CLIENT_SPIDER_HPP -#include "Error.hpp" +#include +#include + +// NOLINTBEGIN(misc-include-cleaner) +#include "Data.hpp" +#include "Future.hpp" #include "Task.hpp" +#include "TaskGraph.hpp" + +// NOLINTEND(misc-include-cleaner) namespace spider { /** @@ -16,6 +24,22 @@ void init(); */ void connect(std::string const& url); +/** + * Registers function to Spider + * @param function function to register + */ +template +void register_task(std::function const& function); + +/** + * Registers function to Spider with timeout + * @param function_name name of the function to register + * @param timeout task is considered straggler after timeout ms, and Spider triggers replicate the + * task + */ +template +void register_task(std::function const& function, float timeout); + } // namespace spider #endif // SPIDER_CLIENT_SPIDER_HPP diff --git a/src/spider/client/Task.cpp b/src/spider/client/Task.cpp index b9e8874..1a14a6c 100644 --- a/src/spider/client/Task.cpp +++ b/src/spider/client/Task.cpp @@ -1,7 +1,15 @@ #include "Task.hpp" +#include +#include + +#include "Data.hpp" + namespace spider { -class TaskGraphImpl {}; +template +auto get_data(std::string const& /*key*/) -> std::optional> { + return std::nullopt; +} } // namespace spider diff --git a/src/spider/client/Task.hpp b/src/spider/client/Task.hpp index c1b64e6..12fc89e 100644 --- a/src/spider/client/Task.hpp +++ b/src/spider/client/Task.hpp @@ -1,20 +1,69 @@ -#ifndef SPIDER_CLIENT_TASK_HPP -#define SPIDER_CLIENT_TASK_HPP +#ifndef SPIDER_CORE_SPIDER_HPP +#define SPIDER_CORE_SPIDER_HPP -#include +#include +#include +#include + +#include "Data.hpp" +#include "Future.hpp" +#include "TaskGraph.hpp" namespace spider { -class TaskGraphImpl; +/** + * Gets data by key. + * This function can be called by a client to get all data or called by a task to get data created + * by it. + * @param key key of the data + * @return std::nullopt if no data with key is stored, the data associated by the key otherwise + */ +template +auto get_data(std::string const& key) -> std::optional>; + +/** + * Add task as a child of current task. + * This function can only be called by a task. + * @param f child task or task graph + */ +template +void add_child(F const& f); +/** + * Binds inputs to a task. Input of the task can be bound from + * outputs of task, forming dependencies between tasks. Input can + * also be a value or a spider::Data. + * This function can be called by a client or by a task + * @param task child task to be bound on + * @param inputs task or task graph whose outputs to bind to f, or value or spider::Data used as + * input + * @return task graph representing the task dependencies. If none of args is a task or task graph, + * returns a task graph with only one task + */ +template +auto bind(std::function const& task, Inputs&&... inputs) + -> spider::TaskGraph; + +/** + * Runs task on Spider. + * This function can be called by a client or by a task. + * @param task task to run + * @param args task input + * @return future of the result + */ template -class TaskGraph { -private: - std::unique_ptr m_impl; +auto run(std::function const& task, Args&&... args) -> Future; -public: -}; +/** + * Runs task graph on Spider. + * This function can be called by a client or by a task. + * @param graph task graph to run + * @param args task input + * @return future of the result + */ +template +auto run(TaskGraph const& graph, Args&&... args) -> Future; } // namespace spider -#endif // SPIDER_CLIENT_TASK_HPP +#endif diff --git a/src/spider/client/TaskGraph.cpp b/src/spider/client/TaskGraph.cpp new file mode 100644 index 0000000..1678292 --- /dev/null +++ b/src/spider/client/TaskGraph.cpp @@ -0,0 +1,14 @@ +#include "TaskGraph.hpp" + +#include "Future.hpp" + +namespace spider { + +class TaskGraphImpl {}; + +template +auto TaskGraph::run(Args&&... /*args*/) -> Future { + return Future(); +} + +} // namespace spider diff --git a/src/spider/client/TaskGraph.hpp b/src/spider/client/TaskGraph.hpp new file mode 100644 index 0000000..b8916f0 --- /dev/null +++ b/src/spider/client/TaskGraph.hpp @@ -0,0 +1,23 @@ +#ifndef SPIDER_CLIENT_TASK_HPP +#define SPIDER_CLIENT_TASK_HPP + +#include + +#include "Future.hpp" + +namespace spider { + +class TaskGraphImpl; + +template +class TaskGraph { +private: + std::unique_ptr m_impl; + +public: + auto run(Args&&... args) -> Future; +}; + +} // namespace spider + +#endif // SPIDER_CLIENT_TASK_HPP From f69523a6f5fc3b6657dbee7ef7d45c81d1452a45 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Fri, 1 Nov 2024 09:14:54 +0800 Subject: [PATCH 03/85] fix: Add boost library for spider_client_lib --- src/spider/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/spider/CMakeLists.txt b/src/spider/CMakeLists.txt index 3a75825..05c4322 100644 --- a/src/spider/CMakeLists.txt +++ b/src/spider/CMakeLists.txt @@ -39,6 +39,7 @@ set(SPIDER_CLIENT_SHARED_HEADERS add_library(spider_client_lib) target_sources(spider_client_lib PRIVATE ${SPIDER_CLIENT_SHARED_SOURCES}) target_sources(spider_client_lib PUBLIC ${SPIDER_CLIENT_SHARED_HEADERS}) +target_link_libraries(spider_client_lib PUBLIC Boost::boost) set(SPIDER_CLIENT_SOURCES client/TaskGraph.cpp From ccf6cc8f542b15ea37f81c868eb4a37713fc002d Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Fri, 1 Nov 2024 09:52:45 +0800 Subject: [PATCH 04/85] style: Improve code style for data based on pr comments --- src/spider/client/Data.cpp | 6 ++-- src/spider/client/Data.hpp | 66 ++++++++++++++++++++------------------ 2 files changed, 38 insertions(+), 34 deletions(-) diff --git a/src/spider/client/Data.cpp b/src/spider/client/Data.cpp index a118f58..c4b30fd 100644 --- a/src/spider/client/Data.cpp +++ b/src/spider/client/Data.cpp @@ -17,18 +17,18 @@ template void Data::set_locality(std::vector const& /*nodes*/, bool /*hard*/) {} template -auto Data::Builder::key(std::string const& /*key*/) -> Data::Builder& { +auto Data::Builder::set_key(std::string const& /*key*/) -> Data::Builder& { return this; } template -auto Data::Builder::locality(std::vector const& /*nodes*/, bool /*hard*/) +auto Data::Builder::set_locality(std::vector const& /*nodes*/, bool /*hard*/) -> Data::Builder& { return this; } template -auto Data::Builder::cleanup(std::function const& /*f*/) -> Data::Builder& { +auto Data::Builder::set_cleanup(std::function const& /*f*/) -> Data::Builder& { return this; } diff --git a/src/spider/client/Data.hpp b/src/spider/client/Data.hpp index dde763d..7702bb4 100644 --- a/src/spider/client/Data.hpp +++ b/src/spider/client/Data.hpp @@ -7,73 +7,77 @@ #include namespace spider { - class DataImpl; +/** + * Data represents metadata of data stored on external storage. Data provides hint for Spider of + * metadata information like locality of the data to improve scheduling decision. + * Example: + * spider::Data disk_file_data = spider::Data::Builder() + * .set_locality("node_address", true) + * .set_cleanup([](std::string cont& path) { std::remove(path); }) + * .build("/path/of/file"); + * + * Data is passed in as input so the tasks can get the value of the data. + * + * Data could also be used as a key-value store. + * Example: + * spider::Data key_value_data = spider::Data::Builder() + * .set_key("key") + * .build("value"); + * + * @tparam T type of the value. T must be a POD. + */ template class Data { -private: - std::unique_ptr m_impl; - public: /** * Gets the values stored in Data. - * @return value stored in Data. + * @return The stored value. */ auto get() -> T; - /** - * Indicates that the data is persisted and should not be rollbacked - * on failure recovery. - */ - // Not implemented in milestone 1 - // void mark_persist(); + /** * Sets locality list of the data. * @param nodes nodes that has locality - * @param hard true if the locality list is a hard requirement, false otherwise + * @param hard true if the locality list is a hard requirement, false otherwise. Hard locality + * requirement means that data can only be accessed from the node in the locality list. */ void set_locality(std::vector const& nodes, bool hard); class Builder { - private: public: /** - * Sets the key for the data. If no key is provided, Spider generates a key. + * Sets the key for the data. * @param key of the data + * @return self */ - auto key(std::string const& key) -> Data::Builder&; + auto set_key(std::string const& key) -> Data::Builder&; /** * Sets locality list of the data to build. * @param nodes nodes that has locality * @param hard true if the locality list is a hard requirement, false otherwise * @return self */ - auto locality(std::vector const& nodes, bool hard) -> Data::Builder&; - /** - * Indicates that the data to build is persisted and should not be rollbacked on failure - * recovery. - * @return self - */ - // Data::Builder Builder& mark_persist(); // Not implemented in milestone 1 + auto set_locality(std::vector const& nodes, bool hard) -> Data::Builder&; + /** * Defines clean up functions of the data to build. * @param f clean up function of data */ - auto cleanup(std::function const& f) -> Data::Builder&; - /** - * Defines rollback functions of the data to build. - * @param f rollback function of data - */ - // Not implemented for milestone 1 - // auto rollback(std::function const& f) -> Data::Builder&; + auto set_cleanup(std::function const& f) -> Data::Builder&; + /** - * Builds the data. Stores the value of data into storage with locality list, persisted - * flag, cleanup and rollback functions. + * Builds the data. Stores the value of data into storage with locality list and cleanup + * functions. * @param t value of the data * @return data object */ auto build(T const& /*t*/) -> Data; }; + +private: + std::unique_ptr m_impl; }; } // namespace spider From 5e26f58be37e9a4da696a9924ccaade2902d0274 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Fri, 1 Nov 2024 10:03:49 +0800 Subject: [PATCH 05/85] fix: Add absl as public library for core --- src/spider/CMakeLists.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/spider/CMakeLists.txt b/src/spider/CMakeLists.txt index 05c4322..0fda693 100644 --- a/src/spider/CMakeLists.txt +++ b/src/spider/CMakeLists.txt @@ -20,7 +20,12 @@ else() endif() target_sources(spider_core PRIVATE ${SPIDER_CORE_SOURCES}) target_sources(spider_core PUBLIC ${SPIDER_CORE_HEADERS}) -target_link_libraries(spider_core PUBLIC Boost::boost PRIVATE absl::flat_hash_map) +target_link_libraries( + spider_core + PUBLIC + Boost::boost + absl::flat_hash_map +) set(SPIDER_CLIENT_SHARED_SOURCES client/Data.cpp From 020093c0f6d76d4831678630183a9a88659aa156 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Fri, 1 Nov 2024 10:32:16 +0800 Subject: [PATCH 06/85] style: Improve code style for client interface based on pr reviea principlew --- src/spider/CMakeLists.txt | 10 +++------- src/spider/client/Future.hpp | 13 ++++++++----- src/spider/client/Spider.hpp | 3 +-- src/spider/client/Task.hpp | 29 +++++++++++++++++++++++------ src/spider/client/TaskGraph.hpp | 22 +++++++++++++++++----- 5 files changed, 52 insertions(+), 25 deletions(-) diff --git a/src/spider/CMakeLists.txt b/src/spider/CMakeLists.txt index 0fda693..1385db3 100644 --- a/src/spider/CMakeLists.txt +++ b/src/spider/CMakeLists.txt @@ -30,6 +30,7 @@ target_link_libraries( set(SPIDER_CLIENT_SHARED_SOURCES client/Data.cpp client/Task.cpp + client/TaskGraph.cpp CACHE INTERNAL "spider client shared source files" ) @@ -37,6 +38,7 @@ set(SPIDER_CLIENT_SHARED_SOURCES set(SPIDER_CLIENT_SHARED_HEADERS client/Data.hpp client/Task.hpp + client/TaskGraph.hpp CACHE INTERNAL "spider client shared header files" ) @@ -46,16 +48,10 @@ target_sources(spider_client_lib PRIVATE ${SPIDER_CLIENT_SHARED_SOURCES}) target_sources(spider_client_lib PUBLIC ${SPIDER_CLIENT_SHARED_HEADERS}) target_link_libraries(spider_client_lib PUBLIC Boost::boost) -set(SPIDER_CLIENT_SOURCES - client/TaskGraph.cpp - client/Future.cpp - CACHE INTERNAL - "spider client source files" -) +set(SPIDER_CLIENT_SOURCES client/Future.cpp CACHE INTERNAL "spider client source files") set(SPIDER_CLIENT_HEADERS client/Spider.hpp - client/TaskGraph.hpp client/Future.hpp CACHE INTERNAL "spider client header files" diff --git a/src/spider/client/Future.hpp b/src/spider/client/Future.hpp index cb69ba0..fe6c38e 100644 --- a/src/spider/client/Future.hpp +++ b/src/spider/client/Future.hpp @@ -4,14 +4,15 @@ #include namespace spider { - class FutureImpl; +/** + * Future represents a value that will be ready. + * + * @tparam T type of the value represented by Future. + */ template class Future { -private: - std::unique_ptr m_impl; - public: /** * Gets the value of the future. Blocks until the value is available. @@ -24,8 +25,10 @@ class Future { * @return true if future is ready, false otherwise */ auto ready() -> bool; -}; +private: + std::unique_ptr m_impl; +}; } // namespace spider #endif // SPIDER_CLIENT_FUTURE_HPP diff --git a/src/spider/client/Spider.hpp b/src/spider/client/Spider.hpp index acb5a60..2424e62 100644 --- a/src/spider/client/Spider.hpp +++ b/src/spider/client/Spider.hpp @@ -32,14 +32,13 @@ template void register_task(std::function const& function); /** - * Registers function to Spider with timeout + * Registers function to Spider with timeout * @param function_name name of the function to register * @param timeout task is considered straggler after timeout ms, and Spider triggers replicate the * task */ template void register_task(std::function const& function, float timeout); - } // namespace spider #endif // SPIDER_CLIENT_SPIDER_HPP diff --git a/src/spider/client/Task.hpp b/src/spider/client/Task.hpp index 12fc89e..ef5db2b 100644 --- a/src/spider/client/Task.hpp +++ b/src/spider/client/Task.hpp @@ -1,3 +1,7 @@ +/** + * Task.hpp include functions that can be called inside a Task. + */ + #ifndef SPIDER_CORE_SPIDER_HPP #define SPIDER_CORE_SPIDER_HPP @@ -10,11 +14,11 @@ #include "TaskGraph.hpp" namespace spider { - /** * Gets data by key. * This function can be called by a client to get all data or called by a task to get data created * by it. + * @tparam T type of the value stored in data * @param key key of the data * @return std::nullopt if no data with key is stored, the data associated by the key otherwise */ @@ -24,16 +28,22 @@ auto get_data(std::string const& key) -> std::optional>; /** * Add task as a child of current task. * This function can only be called by a task. + * @tparam F task graph type or function type for a single task * @param f child task or task graph */ template void add_child(F const& f); /** - * Binds inputs to a task. Input of the task can be bound from - * outputs of task, forming dependencies between tasks. Input can - * also be a value or a spider::Data. - * This function can be called by a client or by a task + * Binds inputs to a task. Input of the task can be bound from outputs of task or task graph, + * forming dependencies between tasks. Input can also be a value or a spider::Data. + * This function can be called by a client or by a task. + * + * @tparam R return type of the task or task graph + * @tparam Args input types of task or task graph + * @tparam Inputs types of task, task graph, spider::Data or POD value + * @tparam GraphInputs input types of the new task graph + * * @param task child task to be bound on * @param inputs task or task graph whose outputs to bind to f, or value or spider::Data used as * input @@ -47,6 +57,10 @@ auto bind(std::function const& task, Inputs&&... inputs) /** * Runs task on Spider. * This function can be called by a client or by a task. + * + * @tparam R return type of the task + * @tparam Args input types of the task + * * @param task task to run * @param args task input * @return future of the result @@ -57,13 +71,16 @@ auto run(std::function const& task, Args&&... args) -> Future; /** * Runs task graph on Spider. * This function can be called by a client or by a task. + * + * @tparam R return type of the task graph + * @tparam Args input types of the task graph + * * @param graph task graph to run * @param args task input * @return future of the result */ template auto run(TaskGraph const& graph, Args&&... args) -> Future; - } // namespace spider #endif diff --git a/src/spider/client/TaskGraph.hpp b/src/spider/client/TaskGraph.hpp index b8916f0..f4675f1 100644 --- a/src/spider/client/TaskGraph.hpp +++ b/src/spider/client/TaskGraph.hpp @@ -6,18 +6,30 @@ #include "Future.hpp" namespace spider { - class TaskGraphImpl; +/** + * TaskGraph represents a DAG of tasks. + * @tparam R return type of the task graph + * @tparam Args input types of the task graph + */ template class TaskGraph { -private: - std::unique_ptr m_impl; - public: + /** + * Runs the task graph. + * + * @tparam Args input types of the task graph + * @tparam R return type of the task graph + * + * @param args inputs of the task graph + * @return future of the result + */ auto run(Args&&... args) -> Future; -}; +private: + std::unique_ptr m_impl; +}; } // namespace spider #endif // SPIDER_CLIENT_TASK_HPP From ee222f0caf4b4857e6141ee795c2bb702d18b876 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Fri, 1 Nov 2024 12:12:49 +0800 Subject: [PATCH 07/85] fix: Try fix clang-tidy find nout found --- CMakeLists.txt | 6 ++++-- src/spider/CMakeLists.txt | 7 ++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b543b89..dfcc0f2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,8 +13,10 @@ set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) # AppleClang complains about file has no symbol and abort the build. -set(CMAKE_CXX_ARCHIVE_CREATE " Scr ") -set(CMAKE_CXX_ARCHIVE_FINISH " -no_warning_for_no_symbols -c ") +if(APPLE) + set(CMAKE_CXX_ARCHIVE_CREATE " Scr ") + set(CMAKE_CXX_ARCHIVE_FINISH " -no_warning_for_no_symbols -c ") +endif() # Enable exporting compile commands set(CMAKE_EXPORT_COMPILE_COMMANDS diff --git a/src/spider/CMakeLists.txt b/src/spider/CMakeLists.txt index 1385db3..2f35055 100644 --- a/src/spider/CMakeLists.txt +++ b/src/spider/CMakeLists.txt @@ -46,7 +46,12 @@ set(SPIDER_CLIENT_SHARED_HEADERS add_library(spider_client_lib) target_sources(spider_client_lib PRIVATE ${SPIDER_CLIENT_SHARED_SOURCES}) target_sources(spider_client_lib PUBLIC ${SPIDER_CLIENT_SHARED_HEADERS}) -target_link_libraries(spider_client_lib PUBLIC Boost::boost) +target_link_libraries( + spider_client_lib + PUBLIC + Boost::boost + absl::flat_hash_map +) set(SPIDER_CLIENT_SOURCES client/Future.cpp CACHE INTERNAL "spider client source files") From 1b0ccacfb7aecad3773d756c1aa7dfb778bb109c Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Sat, 2 Nov 2024 00:47:16 +0800 Subject: [PATCH 08/85] docs: Add quick start doc --- docs/QuickStart.md | 204 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 204 insertions(+) create mode 100644 docs/QuickStart.md diff --git a/docs/QuickStart.md b/docs/QuickStart.md new file mode 100644 index 0000000..916570b --- /dev/null +++ b/docs/QuickStart.md @@ -0,0 +1,204 @@ +# Spider Quick Start Guide + +## Set Up Spider +To get started, first start a database supported by Spider, e.g. MySql. Second, start a scheduler and connect it to the database by running `spider start --scheduler --db --port `. Third, start some workers and connect them to the database by running `spider start --worker --db `. + +## Start a Client +Client first creates a Spider client driver and connects it to the database. Spider automatically cleans up the resource in driver's destructor, but you can close the driver to release the resource early. +```c++ +#include + +auto main(int argc, char **argv) -> int { + spider::Driver driver{}; + driver.connect("db_url"); + + driver.close(); +} +``` + +## Create a Task +In Spider, a task is a non-member function that takes the first argument a `spider::Context` object. It can then take any number of arguments of POD type. + +Task can return any POD type. If a task needs to return more than one result, uses `std::tuple`. + +The `Context` object represents the context of a running task. It provides methods to get the task metadata information like task id. It also supports the creating task inside a task. We will cover this later. +```c++ +auto sum(spider::Context &context, int x, int y) -> int { + return x + y; +} + +auto sort(spider::Context &context, int x, int y) -> std::tuple { + if (x >= y) { + return { x, y }; + } + return { y, x }; +} +``` + +## Run a Task +Spider enables user to run a task on the cluster. First register the functions statically so it is known by Spider. Simply call `Driver::run` and provide the arguments of the task. `Driver::run` returns a `spider::Future` object, which represents the result that will be available in the future. You can call `Future::ready` to check if the value in future is available yet. You can use `Future::get` to block and get the value once it is available. +```c++ +spider::register_task(sum); +spider::register_task(sort); + +auto main(int argc, char **argv) -> int { + // driver initialization skipped + spider::Future sum_future = driver.run(sum, 2); + assert(4 == sum_future.get()); + + spider::Future> sort_future = driver.run(4, 3); + assert(std::tuple{3, 4} == sort_future.get()); +} +``` + +## Group Tasks Together +In real world, running a single task is too simple to be useful. Spider lets you bind outputs of tasks as inputs of another task, similar to `std::bind`. Binding the tasks together forms a dependencies among tasks, which is represented by `spider::TaskGraph`. `TaskGraph` can be further bound into more complicated `TaskGraph` by serving as inputs for another task. You can run the task using `Driver::run` in the same way as running a single task. +```c++ +auto square(spider::Context& context, int x) -> int { + return x * x; +} + +auto square_root(spider::Context& context, int x) -> int { + return sqrt(x); +} +// task registration skipped +auto main(int argc, char **argv) -> auto { + // driver initialization skipped + spider::TaskGraph sum_of_square = spider::bind(sum, square, square); + spider::TaskGraph rss = spider::bind(square_root, sum_of_square); + spider::Future future = driver::run(rss, 3, 4); + assert(5 == future.get()); +} +``` + +## Run Task inside Task +Static task graph is enough to solve a lot of real work problems, but dynamically add tasks on-the-fly could become handy. As mentioned before, spider allows you to add another task as child of the running task by calling `Context::add_child`. + +```c++ +auto gcd(spider::Conect& context, int x, int y) -> std::tuple { + if (x == y) { + std::cout << "gdc is: " << x << std::endl; + return { x, y }; + } + if (x > y) { + context.add_child(gcd); + return { x % y, y }; + } + context.add_child(gcd); + return { x, y % x }; +} +``` + +However, it is impossible to get the return value of the task graph from a client. We have a solution by sharing data using key-value store, which will be discussed later. Another solution is to run task or task graph inside a task and wait for its value, just like a client. This solution is closer to the conventional function call semantic. + +```c++ +auto gcd(spider:Context& context, int x, int y) -> int { + if (x < y) { + std::swap(x, y); + } + while (x != y) { + spider::Future> future = context.run(gcd_impl, x, y); + x = future.get().get().get<0>(); + y = future.get().get().get<1>(); + } + return x; +} + +auto gcd_impl(spider::Context& context, int x, int y) -> std::tuple { + return { x, x % y}; +} +``` + +## Data on External Storage +Often simple POD data are not enough. However, passing large amount of data around is expensive. Usually these data is stored on disk or a distributed storage system. For example, an ETL workload usually reads in data from an external storage, writes temporary data on an external storage, and writes final data into an external storage. + +Spider lets user pass the metadata of these data around in `spider::Data` objects. `Data` stores the value of the metadata information of external data, and provides crucial information to Spider for correct and efficient scheduling and failure recovery. `Data` stores a list of nodes which has locality of the external data, and user can specify if locality is a hard requirement, i.e. task can only run on the nodes in locality list. `Data` can include a `cleanup`function, which will run when the `Data` object is no longer reference by any task and client. `Data` has a persist flag to represent that external data is persisted and do not need to be cleaned up. + +```c++ +struct HdfsFile { + std::string url; +}; + +auto filter(spider::Data input) -> spider::Data { + std::string const output_path = std::format("/path/%s", context.task_id()); + std::string const input_path = input.get().url; + // Create HdfsFile Data first in case task fails and Spider can clean up the data. + spider::Data output = spider::Data::Builder() + .cleanup([](HdfsFile const& file) { delete_hdfs_file(file); }) + .build(HdfsFile { output_path }); + auto file = hdfs_create(output_path); + std::vector nodes = hdfs_get_nodes(file); + output.set_locality(nodes, false); // not hard locality + + run_filter(input_path, file); + + return output; +} + +auto map(spider::Data input) -> spider::Data { + std::string const output_path = "/path/to/output"; + std::string const input_path = input.get().url; + + spider::Data output = spider::Data::Builder() + .cleanup([](HdfaFile const& file) { delete_hdfs_file(file); }) + .build(HdfsFile { output_path }); + + run_map(input_path, output_path); + + // Now that map finishes, the file is persisted on Hdfs as output of job. + output.mark_persist(); + return output; +} + +auto main(int argc, char** argv) -> int { + spider::Data input = spider::Data::Builder() + .mark_persist(true) + .build(HdfsFile { "/path/to/input" }); + spider::Future> future = spider::run( + spider::bind(map, filter), + input); + std::string const output_path = future.get().get().url; + std::cout << "Result is stored in " << output_path << std::endl; +} +``` + +## Data as Key-Value Store +`Data` can also be used a a key-value store. User can specify a key when creating the data, and the data can be accessed later by its key. Notice that a task can only access the `Data` created by itself or passed to it. Client can access any data with the key. +Using the key value store, we can solve the dynamic task result problem. + +```c++ +auto gcd(spider::Context& context, int x, int y, std::string key) + -> std::tuple { + if (x == y) { + spider::Data.Builder() + .set_key(key) + .build(x); + return { x, y, key }; + } + if (x > y) { + context.add_child(gcd); + return { x % y, y, key }; + } + context.add_child(gcd); + return { x, y % x, key }; +} + +auto main(int argc, char** argv) -> int { + std::string const key = "random_key"; + driver.run(gcd, 48, 18, key); + while (!driver.get_data_by_key(key)) { + int value = driver.get_data_by_key(key).get(); + std::cout << "gcd of " << x << " and " << y << " is " << value << std::endl; + } +} +``` + +## Straggler Mitigation +`Driver::register_task` can take a second argument for timeout milliseconds. If a task executes for longer than the specified timeout, Spider spawns another task instance running the same function. The task that finishes first wins. Other running task instances are cancelled, and associated data is cleaned up. + +The new task has a different task id, and it is the responsibility of the user to avoid any data race and deduplicate the output if necessary. + +## Note on Worker Setup +The setup section said that we can start a worker by running `spider start --worker --db `. This is oversimplified. The worker has to know the function it will run. + +When user compiles the client code, an executable and a library are generated. The executable executes the client code as expected. The library contains all the functions registered by user. Worker needs to run with a copy of this library. The actual commands to start a worker is `spider start --worker --db --libs [client_libraries]`. \ No newline at end of file From b3a2e1e6e175b47898afed224082debe6aa2ea84 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Sun, 3 Nov 2024 13:45:45 +0800 Subject: [PATCH 09/85] style: Change markdown headings to sentence style and hard wrap markdown at 100 chars --- docs/{QuickStart.md => quick_start.md} | 109 +++++++++++++++++++------ 1 file changed, 83 insertions(+), 26 deletions(-) rename docs/{QuickStart.md => quick_start.md} (59%) diff --git a/docs/QuickStart.md b/docs/quick_start.md similarity index 59% rename from docs/QuickStart.md rename to docs/quick_start.md index 916570b..fc039da 100644 --- a/docs/QuickStart.md +++ b/docs/quick_start.md @@ -1,10 +1,21 @@ -# Spider Quick Start Guide +# Spider quick start guide -## Set Up Spider -To get started, first start a database supported by Spider, e.g. MySql. Second, start a scheduler and connect it to the database by running `spider start --scheduler --db --port `. Third, start some workers and connect them to the database by running `spider start --worker --db `. +## Set up Spider + +To get started, + +1. Start a database supported by Spider, e.g. MySql. +2. Start a scheduler and connect it to the database by running + `spider start --scheduler --db --port `. +3. Start some workers and connect them to the database by running + `spider start --worker --db `. + +## Start a client + +Client first creates a Spider client driver and connects it to the database. Spider automatically +cleans up the resource in driver's destructor, but you can close the driver to release the resource +early. -## Start a Client -Client first creates a Spider client driver and connects it to the database. Spider automatically cleans up the resource in driver's destructor, but you can close the driver to release the resource early. ```c++ #include @@ -16,12 +27,17 @@ auto main(int argc, char **argv) -> int { } ``` -## Create a Task -In Spider, a task is a non-member function that takes the first argument a `spider::Context` object. It can then take any number of arguments of POD type. +## Create a task + +In Spider, a task is a non-member function that takes the first argument a `spider::Context` object. +It can then take any number of arguments of POD type. Task can return any POD type. If a task needs to return more than one result, uses `std::tuple`. -The `Context` object represents the context of a running task. It provides methods to get the task metadata information like task id. It also supports the creating task inside a task. We will cover this later. +The `Context` object represents the context of a running task. It provides methods to get the task +metadata information like task id. It also supports the creating task inside a task. We will cover +this later. + ```c++ auto sum(spider::Context &context, int x, int y) -> int { return x + y; @@ -35,8 +51,14 @@ auto sort(spider::Context &context, int x, int y) -> std::tuple { } ``` -## Run a Task -Spider enables user to run a task on the cluster. First register the functions statically so it is known by Spider. Simply call `Driver::run` and provide the arguments of the task. `Driver::run` returns a `spider::Future` object, which represents the result that will be available in the future. You can call `Future::ready` to check if the value in future is available yet. You can use `Future::get` to block and get the value once it is available. +## Run a task + +Spider enables user to run a task on the cluster. First register the functions statically so it is +known by Spider. Simply call `Driver::run` and provide the arguments of the task. `Driver::run` +returns a `spider::Future` object, which represents the result that will be available in the future. +You can call `Future::ready` to check if the value in future is available yet. You can use +`Future::get` to block and get the value once it is available. + ```c++ spider::register_task(sum); spider::register_task(sort); @@ -51,8 +73,14 @@ auto main(int argc, char **argv) -> int { } ``` -## Group Tasks Together -In real world, running a single task is too simple to be useful. Spider lets you bind outputs of tasks as inputs of another task, similar to `std::bind`. Binding the tasks together forms a dependencies among tasks, which is represented by `spider::TaskGraph`. `TaskGraph` can be further bound into more complicated `TaskGraph` by serving as inputs for another task. You can run the task using `Driver::run` in the same way as running a single task. +## Group tasks together + +In real world, running a single task is too simple to be useful. Spider lets you bind outputs of +tasks as inputs of another task, similar to `std::bind`. Binding the tasks together forms a +dependencies among tasks, which is represented by `spider::TaskGraph`. `TaskGraph` can be further +bound into more complicated `TaskGraph` by serving as inputs for another task. You can run the task +using `Driver::run` in the same way as running a single task. + ```c++ auto square(spider::Context& context, int x) -> int { return x * x; @@ -71,8 +99,11 @@ auto main(int argc, char **argv) -> auto { } ``` -## Run Task inside Task -Static task graph is enough to solve a lot of real work problems, but dynamically add tasks on-the-fly could become handy. As mentioned before, spider allows you to add another task as child of the running task by calling `Context::add_child`. +## Run task inside task + +Static task graph is enough to solve a lot of real work problems, but dynamically add tasks +on-the-fly could become handy. As mentioned before, spider allows you to add another task as child +of the running task by calling `Context::add_child`. ```c++ auto gcd(spider::Conect& context, int x, int y) -> std::tuple { @@ -89,7 +120,10 @@ auto gcd(spider::Conect& context, int x, int y) -> std::tuple { } ``` -However, it is impossible to get the return value of the task graph from a client. We have a solution by sharing data using key-value store, which will be discussed later. Another solution is to run task or task graph inside a task and wait for its value, just like a client. This solution is closer to the conventional function call semantic. +However, it is impossible to get the return value of the task graph from a client. We have a +solution by sharing data using key-value store, which will be discussed later. Another solution is +to run task or task graph inside a task and wait for its value, just like a client. This solution is +closer to the conventional function call semantic. ```c++ auto gcd(spider:Context& context, int x, int y) -> int { @@ -109,10 +143,20 @@ auto gcd_impl(spider::Context& context, int x, int y) -> std::tuple { } ``` -## Data on External Storage -Often simple POD data are not enough. However, passing large amount of data around is expensive. Usually these data is stored on disk or a distributed storage system. For example, an ETL workload usually reads in data from an external storage, writes temporary data on an external storage, and writes final data into an external storage. +## Data on external storage + +Often simple POD data are not enough. However, passing large amount of data around is expensive. +Usually these data is stored on disk or a distributed storage system. For example, an ETL workload +usually reads in data from an external storage, writes temporary data on an external storage, and +writes final data into an external storage. -Spider lets user pass the metadata of these data around in `spider::Data` objects. `Data` stores the value of the metadata information of external data, and provides crucial information to Spider for correct and efficient scheduling and failure recovery. `Data` stores a list of nodes which has locality of the external data, and user can specify if locality is a hard requirement, i.e. task can only run on the nodes in locality list. `Data` can include a `cleanup`function, which will run when the `Data` object is no longer reference by any task and client. `Data` has a persist flag to represent that external data is persisted and do not need to be cleaned up. +Spider lets user pass the metadata of these data around in `spider::Data` objects. `Data` stores the +value of the metadata information of external data, and provides crucial information to Spider for +correct and efficient scheduling and failure recovery. `Data` stores a list of nodes which has +locality of the external data, and user can specify if locality is a hard requirement, i.e. task can +only run on the nodes in locality list. `Data` can include a `cleanup`function, which will run when +the `Data` object is no longer reference by any task and client. `Data` has a persist flag to +represent that external data is persisted and do not need to be cleaned up. ```c++ struct HdfsFile { @@ -162,8 +206,11 @@ auto main(int argc, char** argv) -> int { } ``` -## Data as Key-Value Store -`Data` can also be used a a key-value store. User can specify a key when creating the data, and the data can be accessed later by its key. Notice that a task can only access the `Data` created by itself or passed to it. Client can access any data with the key. +## Data as key-value store + +`Data` can also be used a a key-value store. User can specify a key when creating the data, and the +data can be accessed later by its key. Notice that a task can only access the `Data` created by +itself or passed to it. Client can access any data with the key. Using the key value store, we can solve the dynamic task result problem. ```c++ @@ -193,12 +240,22 @@ auto main(int argc, char** argv) -> int { } ``` -## Straggler Mitigation -`Driver::register_task` can take a second argument for timeout milliseconds. If a task executes for longer than the specified timeout, Spider spawns another task instance running the same function. The task that finishes first wins. Other running task instances are cancelled, and associated data is cleaned up. +## Straggler mitigation + +`Driver::register_task` can take a second argument for timeout milliseconds. If a task executes for +longer than the specified timeout, Spider spawns another task instance running the same function. +The task that finishes first wins. Other running task instances are cancelled, and associated data +is cleaned up. + +The new task has a different task id, and it is the responsibility of the user to avoid any data +race and deduplicate the output if necessary. -The new task has a different task id, and it is the responsibility of the user to avoid any data race and deduplicate the output if necessary. +## Note on worker setup -## Note on Worker Setup -The setup section said that we can start a worker by running `spider start --worker --db `. This is oversimplified. The worker has to know the function it will run. +The setup section said that we can start a worker by running `spider start --worker --db `. +This is oversimplified. The worker has to know the function it will run. -When user compiles the client code, an executable and a library are generated. The executable executes the client code as expected. The library contains all the functions registered by user. Worker needs to run with a copy of this library. The actual commands to start a worker is `spider start --worker --db --libs [client_libraries]`. \ No newline at end of file +When user compiles the client code, an executable and a library are generated. The executable +executes the client code as expected. The library contains all the functions registered by user. +Worker needs to run with a copy of this library. The actual commands to start a worker is +`spider start --worker --db --libs [client_libraries]`. \ No newline at end of file From ec8f5006bf63810fa43fa3678bb6070f2900af30 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Sun, 3 Nov 2024 14:56:13 +0800 Subject: [PATCH 10/85] docs: Update doc according to pr comments --- docs/quick_start.md | 115 ++++++++++++++++++++++++++++++++------------ 1 file changed, 85 insertions(+), 30 deletions(-) diff --git a/docs/quick_start.md b/docs/quick_start.md index fc039da..0ae8d82 100644 --- a/docs/quick_start.md +++ b/docs/quick_start.md @@ -1,5 +1,18 @@ # Spider quick start guide +## Architecture of Spider + +A Spider cluster is made up of three components: + +* __Database__: Spider stores all the states and data in a fault-tolerant database. +* __Scheduler__: Scheduler is responsible for making scheduling decision when a worker ask for a new + task to run. It also handles garbage collection and failure recovery. +* __Worker__: Worker executes the task it is assigned to. Once it finishes, it updates the task + output in database and contacts scheduler for a new task. + +Users creates a __client__ to run tasks on Spider cluster. It connects to the database to submit new +tasks and get the results. Clients _never_ directly talks to a scheduler or a worker. + ## Set up Spider To get started, @@ -8,41 +21,42 @@ To get started, 2. Start a scheduler and connect it to the database by running `spider start --scheduler --db --port `. 3. Start some workers and connect them to the database by running - `spider start --worker --db `. + `spider start --worker --db `. Starting a worker that can run specific tasks needs to + link to libraries. We'll cover this later. ## Start a client Client first creates a Spider client driver and connects it to the database. Spider automatically -cleans up the resource in driver's destructor, but you can close the driver to release the resource -early. +cleans up the resource in driver's destructor. ```c++ #include auto main(int argc, char **argv) -> int { - spider::Driver driver{}; - driver.connect("db_url"); - - driver.close(); + spider::Driver driver{"db_url"}; } ``` ## Create a task In Spider, a task is a non-member function that takes the first argument a `spider::Context` object. -It can then take any number of arguments of POD type. +It can then take any number of arguments of POD type or `spider::Data` covered +in [later section](#data-on-external-storage). -Task can return any POD type. If a task needs to return more than one result, uses `std::tuple`. +Task can return any POD type. If a task needs to return more than one result, uses `std::tuple` and +makes sure all elements of `std::tuple` are POD or `spider::Data`. The `Context` object represents the context of a running task. It provides methods to get the task metadata information like task id. It also supports the creating task inside a task. We will cover this later. ```c++ +// Task that sums to integers auto sum(spider::Context &context, int x, int y) -> int { return x + y; } +// Task that sorts two integers in non-acesending order auto sort(spider::Context &context, int x, int y) -> std::tuple { if (x >= y) { return { x, y }; @@ -76,10 +90,16 @@ auto main(int argc, char **argv) -> int { ## Group tasks together In real world, running a single task is too simple to be useful. Spider lets you bind outputs of -tasks as inputs of another task, similar to `std::bind`. Binding the tasks together forms a -dependencies among tasks, which is represented by `spider::TaskGraph`. `TaskGraph` can be further -bound into more complicated `TaskGraph` by serving as inputs for another task. You can run the task -using `Driver::run` in the same way as running a single task. +tasks as inputs of another task, similar to `std::bind`. The first argument of `spider::bind` is the +child task. The later arguments are either a `spider::Task` or a `spider::TaskGraph`, whose entire +outputs are used as part of the inputs to the child task, or a POD or +`spider::Data` that is directly used as input. Spider requires that the types of `Task` or +`TaskGraph` outputs or POD type or `spider::Data` matches the input types of child task. + +Binding the tasks together forms a dependencies among tasks, which is represented by +`spider::TaskGraph`. `TaskGraph` can be further bound into more complicated `TaskGraph` by serving +as inputs for another task. You can run the task using `Driver::run` in the same way as running a +single task. ```c++ auto square(spider::Context& context, int x) -> int { @@ -120,10 +140,11 @@ auto gcd(spider::Conect& context, int x, int y) -> std::tuple { } ``` -However, it is impossible to get the return value of the task graph from a client. We have a -solution by sharing data using key-value store, which will be discussed later. Another solution is -to run task or task graph inside a task and wait for its value, just like a client. This solution is -closer to the conventional function call semantic. +However, it is impossible to get the return value of the dynamically created tasks from a client. We +have a solution by sharing data using key-value store, which will be discussed +[later](#data-as-key-value-store). Another solution is to run task or task graph inside a task and +wait for its value, just like a client. This solution is closer to the conventional function call +semantic. ```c++ auto gcd(spider:Context& context, int x, int y) -> int { @@ -163,23 +184,63 @@ struct HdfsFile { std::string url; }; +/** + * In this example, we run a filter and map on the input stored in Hdfs. + * Filter writes its output into a temporary Hdfs file, which will be cleaned + * up by Spider when the task graph finishes. + * Map reads the temporary files and persists the output in Hdfs file. + */ +auto main(int argc, char** argv) -> int { + // Creates a HdfsFile Data to represent the input data stored in Hdfs. + spider::Data input = spider::Data::Builder() + .mark_persist(true) + .build(HdfsFile { "/path/to/input" }); + spider::Future> future = spider::run( + spider::bind(map, filter), + input); + std::string const output_path = future.get().get().url; + std::cout << "Result is stored in " << output_path << std::endl; +} + +/** + * Runs filer on the input data from Hdfs file and write the output into a + * temporary Hdfs file for later tasks. + * + * @param input input file stored in Hdfs + * @return temporary file store in Hdfs + */ auto filter(spider::Data input) -> spider::Data { + // We can use task id as a unique random number. std::string const output_path = std::format("/path/%s", context.task_id()); std::string const input_path = input.get().url; - // Create HdfsFile Data first in case task fails and Spider can clean up the data. + // Creates HdfsFile Data before creating the actual file in Hdfs so Spider + // can clean up the Hdfs file on failure. spider::Data output = spider::Data::Builder() .cleanup([](HdfsFile const& file) { delete_hdfs_file(file); }) .build(HdfsFile { output_path }); auto file = hdfs_create(output_path); + // Hdfs allows reading data from any node, but reading from the nodes where + // file is stored and replicated is faster. std::vector nodes = hdfs_get_nodes(file); output.set_locality(nodes, false); // not hard locality + // Runs the filter run_filter(input_path, file); return output; } +/** + * Runs map on the input data from Hdfs file and persists the output into an + * Hdfs file. + * + * @param input input file stored in Hdfs + * @return persisted output in Hdfs + */ auto map(spider::Data input) -> spider::Data { + // We use hardcoded path for simplicity in this example. You can pass in + // the path as an input to the task or use task id as random name as in + // filter. std::string const output_path = "/path/to/output"; std::string const input_path = input.get().url; @@ -190,20 +251,12 @@ auto map(spider::Data input) -> spider::Data { run_map(input_path, output_path); // Now that map finishes, the file is persisted on Hdfs as output of job. + // We need to inform Spider that the file is not persisted and should not + // be cleaned up. output.mark_persist(); return output; } -auto main(int argc, char** argv) -> int { - spider::Data input = spider::Data::Builder() - .mark_persist(true) - .build(HdfsFile { "/path/to/input" }); - spider::Future> future = spider::run( - spider::bind(map, filter), - input); - std::string const output_path = future.get().get().url; - std::cout << "Result is stored in " << output_path << std::endl; -} ``` ## Data as key-value store @@ -211,10 +264,12 @@ auto main(int argc, char** argv) -> int { `Data` can also be used a a key-value store. User can specify a key when creating the data, and the data can be accessed later by its key. Notice that a task can only access the `Data` created by itself or passed to it. Client can access any data with the key. -Using the key value store, we can solve the dynamic task result problem. + +Using the key value store, we can solve the dynamic task result problem +mentioned [before](#run-task-inside-task). ```c++ -auto gcd(spider::Context& context, int x, int y, std::string key) +auto gcd(spider::Context& context, int x, int y, const char* key) -> std::tuple { if (x == y) { spider::Data.Builder() From 5dc12cb2a2f2fcd0aefcff48b57f8095c093b9fe Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Sun, 3 Nov 2024 23:18:42 +0800 Subject: [PATCH 11/85] docs: Remove the worker note section and put the content in run task section --- docs/quick_start.md | 45 ++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/docs/quick_start.md b/docs/quick_start.md index 0ae8d82..b30678b 100644 --- a/docs/quick_start.md +++ b/docs/quick_start.md @@ -43,12 +43,12 @@ In Spider, a task is a non-member function that takes the first argument a `spid It can then take any number of arguments of POD type or `spider::Data` covered in [later section](#data-on-external-storage). -Task can return any POD type. If a task needs to return more than one result, uses `std::tuple` and -makes sure all elements of `std::tuple` are POD or `spider::Data`. +Task can return any POD type or `spider::Data`. If a task needs to return more than one result, uses +`std::tuple` and makes sure all elements of `std::tuple` are POD or `spider::Data`. -The `Context` object represents the context of a running task. It provides methods to get the task -metadata information like task id. It also supports the creating task inside a task. We will cover -this later. +Spider requires user to register the task function using static `spider::register_task`, which +sets up the function internally in Spider library for later user. Spider requires the function name +to be unique in the cluster. ```c++ // Task that sums to integers @@ -56,27 +56,27 @@ auto sum(spider::Context &context, int x, int y) -> int { return x + y; } -// Task that sorts two integers in non-acesending order +// Task that sorts two integers in non-ascending order auto sort(spider::Context &context, int x, int y) -> std::tuple { if (x >= y) { return { x, y }; } return { y, x }; } + +spider::register_task(sum); +spider::register_task(sort); + ``` ## Run a task -Spider enables user to run a task on the cluster. First register the functions statically so it is -known by Spider. Simply call `Driver::run` and provide the arguments of the task. `Driver::run` -returns a `spider::Future` object, which represents the result that will be available in the future. -You can call `Future::ready` to check if the value in future is available yet. You can use -`Future::get` to block and get the value once it is available. +Spider enables user to run a task on the cluster. Simply call `Driver::run` and provide the +arguments of the task. `Driver::run`returns a `spider::Future` object, which represents the result +that will be available in the future. You can call `Future::ready` to check if the value in future +is available yet. You can use`Future::get` to block and get the value once it is available. ```c++ -spider::register_task(sum); -spider::register_task(sort); - auto main(int argc, char **argv) -> int { // driver initialization skipped spider::Future sum_future = driver.run(sum, 2); @@ -87,6 +87,11 @@ auto main(int argc, char **argv) -> int { } ``` +If you try to compile and run the example code directly, you'll find that it fails because Spider +worker does not know which function to run. User need to compile all the tasks into a shared +library, including the call to `spider::register_task`, and start the worker with the library by +running `spider start --worker --db --libs [client_libraries]`. + ## Group tasks together In real world, running a single task is too simple to be useful. Spider lets you bind outputs of @@ -261,7 +266,7 @@ auto map(spider::Data input) -> spider::Data { ## Data as key-value store -`Data` can also be used a a key-value store. User can specify a key when creating the data, and the +`Data` can also be used as a key-value store. User can specify a key when creating the data, and the data can be accessed later by its key. Notice that a task can only access the `Data` created by itself or passed to it. Client can access any data with the key. @@ -304,13 +309,3 @@ is cleaned up. The new task has a different task id, and it is the responsibility of the user to avoid any data race and deduplicate the output if necessary. - -## Note on worker setup - -The setup section said that we can start a worker by running `spider start --worker --db `. -This is oversimplified. The worker has to know the function it will run. - -When user compiles the client code, an executable and a library are generated. The executable -executes the client code as expected. The library contains all the functions registered by user. -Worker needs to run with a copy of this library. The actual commands to start a worker is -`spider start --worker --db --libs [client_libraries]`. \ No newline at end of file From 0d59c624eaf7266792b86d9ced46ca1df5521820 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Wed, 6 Nov 2024 01:01:22 +0800 Subject: [PATCH 12/85] docs: Return a Job instead of Future for run and support user to pass in client id spider::run now returns a Job represents a running task or task graph, so user can cancel the job. User can now pass in client id when creating the driver, so the recovered client can get all jobs by client id. --- docs/quick_start.md | 53 +++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/docs/quick_start.md b/docs/quick_start.md index b30678b..c9c895f 100644 --- a/docs/quick_start.md +++ b/docs/quick_start.md @@ -27,24 +27,31 @@ To get started, ## Start a client Client first creates a Spider client driver and connects it to the database. Spider automatically -cleans up the resource in driver's destructor. +cleans up the resource in driver's destructor. User can pass in an optional client id. Two drivers +with same client id cannot run at the same time. ```c++ #include auto main(int argc, char **argv) -> int { - spider::Driver driver{"db_url"}; + boost::uuids::string_generator gen; + spider::Driver driver{"db_url", gen(L"01234567-89ab-cdef-0123-456789abcdef")}; } ``` ## Create a task In Spider, a task is a non-member function that takes the first argument a `spider::Context` object. -It can then take any number of arguments of POD type or `spider::Data` covered -in [later section](#data-on-external-storage). +It can then take any number of arguments. The argument of a task must have one of the following +type: -Task can return any POD type or `spider::Data`. If a task needs to return more than one result, uses -`std::tuple` and makes sure all elements of `std::tuple` are POD or `spider::Data`. +1. POD type +2. `spider::Data` covered in [later section](#data-on-external-storage) +3. `std::vector` of POD type and `spider::Data` + +Task can return any value of the valid argument type listed above. If a task needs to return more +than one result, uses `std::tuple` and makes sure all elements of the tuple are of a valid argument +type. Spider requires user to register the task function using static `spider::register_task`, which sets up the function internally in Spider library for later user. Spider requires the function name @@ -72,18 +79,20 @@ spider::register_task(sort); ## Run a task Spider enables user to run a task on the cluster. Simply call `Driver::run` and provide the -arguments of the task. `Driver::run`returns a `spider::Future` object, which represents the result -that will be available in the future. You can call `Future::ready` to check if the value in future -is available yet. You can use`Future::get` to block and get the value once it is available. +arguments of the task. `Driver::run`returns a `spider::Job` object, which represents the running +task. `spider::Job` takes the output type of the task graph as template argument. You can call +`Job::state` to check the state of the running task, and `Job::get_result` to block and get the task +result. User can send a cancel signal to Spider by calling `Job::cancel`. Client can get all running +jobs submitted by itself by calling `Driver::get_jobs`. ```c++ auto main(int argc, char **argv) -> int { // driver initialization skipped - spider::Future sum_future = driver.run(sum, 2); - assert(4 == sum_future.get()); + spider::Job sum_job = driver.run(sum, 2); + assert(4 == sum_job.get_result()); - spider::Future> sort_future = driver.run(4, 3); - assert(std::tuple{3, 4} == sort_future.get()); + spider::Job> sort_job = driver.run(4, 3); + assert(std::tuple{3, 4} == sort_job.get_result()); } ``` @@ -119,8 +128,8 @@ auto main(int argc, char **argv) -> auto { // driver initialization skipped spider::TaskGraph sum_of_square = spider::bind(sum, square, square); spider::TaskGraph rss = spider::bind(square_root, sum_of_square); - spider::Future future = driver::run(rss, 3, 4); - assert(5 == future.get()); + spider::Job job = driver::run(rss, 3, 4); + assert(5 == job.get_result()); } ``` @@ -128,7 +137,9 @@ auto main(int argc, char **argv) -> auto { Static task graph is enough to solve a lot of real work problems, but dynamically add tasks on-the-fly could become handy. As mentioned before, spider allows you to add another task as child -of the running task by calling `Context::add_child`. +of the running task by calling `Context::add_child`. `Context::add_child` can also add a task graph +as child. Task graph can be constructed by `Context::bind`, which has the same signature and +semantic as`spider::bind`. ```c++ auto gcd(spider::Conect& context, int x, int y) -> std::tuple { @@ -157,9 +168,9 @@ auto gcd(spider:Context& context, int x, int y) -> int { std::swap(x, y); } while (x != y) { - spider::Future> future = context.run(gcd_impl, x, y); - x = future.get().get().get<0>(); - y = future.get().get().get<1>(); + spider::Job> job = context.run(gcd_impl, x, y); + x = job.get_result().get<0>(); + y = job.get_result().get<1>(); } return x; } @@ -200,10 +211,10 @@ auto main(int argc, char** argv) -> int { spider::Data input = spider::Data::Builder() .mark_persist(true) .build(HdfsFile { "/path/to/input" }); - spider::Future> future = spider::run( + spider::Job> job = spider::run( spider::bind(map, filter), input); - std::string const output_path = future.get().get().url; + std::string const output_path = job.get_result().get().url; std::cout << "Result is stored in " << output_path << std::endl; } From fec5e73faa48f64fe4d8b083277d4bf466ffa7e9 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Thu, 14 Nov 2024 17:08:35 +0000 Subject: [PATCH 13/85] Change future to job --- src/spider/CMakeLists.txt | 4 +-- src/spider/client/Future.cpp | 30 ---------------- src/spider/client/Future.hpp | 34 ------------------ src/spider/client/Job.cpp | 41 ++++++++++++++++++++++ src/spider/client/Job.hpp | 61 +++++++++++++++++++++++++++++++++ src/spider/client/Spider.hpp | 2 +- src/spider/client/Task.hpp | 6 ++-- src/spider/client/TaskGraph.cpp | 6 ++-- src/spider/client/TaskGraph.hpp | 4 +-- 9 files changed, 113 insertions(+), 75 deletions(-) delete mode 100644 src/spider/client/Future.cpp delete mode 100644 src/spider/client/Future.hpp create mode 100644 src/spider/client/Job.cpp create mode 100644 src/spider/client/Job.hpp diff --git a/src/spider/CMakeLists.txt b/src/spider/CMakeLists.txt index 55b58b7..374e2f8 100644 --- a/src/spider/CMakeLists.txt +++ b/src/spider/CMakeLists.txt @@ -54,11 +54,11 @@ target_link_libraries( absl::flat_hash_map ) -set(SPIDER_CLIENT_SOURCES client/Future.cpp CACHE INTERNAL "spider client source files") +set(SPIDER_CLIENT_SOURCES client/Job.cpp CACHE INTERNAL "spider client source files") set(SPIDER_CLIENT_HEADERS client/Spider.hpp - client/Future.hpp + client/Job.hpp CACHE INTERNAL "spider client header files" ) diff --git a/src/spider/client/Future.cpp b/src/spider/client/Future.cpp deleted file mode 100644 index bc2d1df..0000000 --- a/src/spider/client/Future.cpp +++ /dev/null @@ -1,30 +0,0 @@ -#include "Future.hpp" - -#include -#include -#include - -namespace spider { - -class FutureImpl { - // Implementation details subject to change -private: - boost::uuids::uuid m_id; - -public: - auto value() -> std::string { return boost::uuids::to_string(m_id); } - - auto ready() -> bool { return m_id.is_nil(); } -}; - -template -auto Future::get() -> T { - return T(); -} - -template -auto Future::ready() -> bool { - return true; -} - -} // namespace spider diff --git a/src/spider/client/Future.hpp b/src/spider/client/Future.hpp deleted file mode 100644 index fe6c38e..0000000 --- a/src/spider/client/Future.hpp +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef SPIDER_CLIENT_FUTURE_HPP -#define SPIDER_CLIENT_FUTURE_HPP - -#include - -namespace spider { -class FutureImpl; - -/** - * Future represents a value that will be ready. - * - * @tparam T type of the value represented by Future. - */ -template -class Future { -public: - /** - * Gets the value of the future. Blocks until the value is available. - * @return value of the future - */ - auto get() -> T; - - /** - * Checks if value of the future is ready. - * @return true if future is ready, false otherwise - */ - auto ready() -> bool; - -private: - std::unique_ptr m_impl; -}; -} // namespace spider - -#endif // SPIDER_CLIENT_FUTURE_HPP diff --git a/src/spider/client/Job.cpp b/src/spider/client/Job.cpp new file mode 100644 index 0000000..0885992 --- /dev/null +++ b/src/spider/client/Job.cpp @@ -0,0 +1,41 @@ +#include "Job.hpp" + +#include +#include +#include + +namespace spider { + +class JobImpl { + // Implementation details subject to change +public: + auto get_status() -> JobStatus { + if (m_id.is_nil()) { + return JobStatus::Cancel; + } + return JobStatus::Running; + } + +private: + boost::uuids::uuid m_id; +}; + +template +auto Job::wait_complete() {} + +template +auto Job::get_status() -> JobStatus { + return m_impl->get_status(); +} + +template +auto Job::get_result() -> T { + return T{}; +} + +template +auto Job::get_error() -> std::pair { + return std::make_pair("", ""); +} + +} // namespace spider diff --git a/src/spider/client/Job.hpp b/src/spider/client/Job.hpp new file mode 100644 index 0000000..7dac35f --- /dev/null +++ b/src/spider/client/Job.hpp @@ -0,0 +1,61 @@ +#ifndef SPIDER_CLIENT_FUTURE_HPP +#define SPIDER_CLIENT_FUTURE_HPP + +#include +#include +#include +#include + +namespace spider { +class JobImpl; + +enum class JobStatus : uint8_t { + Running, + Succeed, + Fail, + Cancel, +}; + +/** + * Job represents a running task graph. + * + * @tparam T output type of the job. + */ +template +class Job { +public: + /** + * Waits for the job to complete. + */ + auto wait_complete(); + + /** + * Gets the status of the job. + * + * @return Status of the job. + */ + auto get_status() -> JobStatus; + + /** + * Get the result of the succeeded job. + * It is undefined behavior to call on job that is in other status. + * + * @return Result of the job. + */ + auto get_result() -> T; + + /** + * Get the error message of the failed job. + * It is undefined behavior to call on job that is in other status. + * + * @return first is the function that fails. second is the error message provided in + * context::abort + */ + auto get_error() -> std::pair; + +private: + std::unique_ptr m_impl; +}; +} // namespace spider + +#endif // SPIDER_CLIENT_FUTURE_HPP diff --git a/src/spider/client/Spider.hpp b/src/spider/client/Spider.hpp index 2424e62..1877b67 100644 --- a/src/spider/client/Spider.hpp +++ b/src/spider/client/Spider.hpp @@ -6,7 +6,7 @@ // NOLINTBEGIN(misc-include-cleaner) #include "Data.hpp" -#include "Future.hpp" +#include "Job.hpp" #include "Task.hpp" #include "TaskGraph.hpp" diff --git a/src/spider/client/Task.hpp b/src/spider/client/Task.hpp index ef5db2b..a7afe62 100644 --- a/src/spider/client/Task.hpp +++ b/src/spider/client/Task.hpp @@ -10,7 +10,7 @@ #include #include "Data.hpp" -#include "Future.hpp" +#include "Job.hpp" #include "TaskGraph.hpp" namespace spider { @@ -66,7 +66,7 @@ auto bind(std::function const& task, Inputs&&... inputs) * @return future of the result */ template -auto run(std::function const& task, Args&&... args) -> Future; +auto run(std::function const& task, Args&&... args) -> Job; /** * Runs task graph on Spider. @@ -80,7 +80,7 @@ auto run(std::function const& task, Args&&... args) -> Future; * @return future of the result */ template -auto run(TaskGraph const& graph, Args&&... args) -> Future; +auto run(TaskGraph const& graph, Args&&... args) -> Job; } // namespace spider #endif diff --git a/src/spider/client/TaskGraph.cpp b/src/spider/client/TaskGraph.cpp index 1678292..b14a800 100644 --- a/src/spider/client/TaskGraph.cpp +++ b/src/spider/client/TaskGraph.cpp @@ -1,14 +1,14 @@ #include "TaskGraph.hpp" -#include "Future.hpp" +#include "Job.hpp" namespace spider { class TaskGraphImpl {}; template -auto TaskGraph::run(Args&&... /*args*/) -> Future { - return Future(); +auto TaskGraph::run(Args&&... /*args*/) -> Job { + return Job(); } } // namespace spider diff --git a/src/spider/client/TaskGraph.hpp b/src/spider/client/TaskGraph.hpp index f4675f1..2bc7919 100644 --- a/src/spider/client/TaskGraph.hpp +++ b/src/spider/client/TaskGraph.hpp @@ -3,7 +3,7 @@ #include -#include "Future.hpp" +#include "Job.hpp" namespace spider { class TaskGraphImpl; @@ -25,7 +25,7 @@ class TaskGraph { * @param args inputs of the task graph * @return future of the result */ - auto run(Args&&... args) -> Future; + auto run(Args&&... args) -> Job; private: std::unique_ptr m_impl; From a5e799bf6c09b7ac2984c5717d0b4acbc25905e7 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Thu, 14 Nov 2024 17:56:56 +0000 Subject: [PATCH 14/85] Change task to context --- src/spider/CMakeLists.txt | 4 +- src/spider/client/Context.cpp | 19 +++++++ src/spider/client/Context.hpp | 94 +++++++++++++++++++++++++++++++++++ src/spider/client/Spider.hpp | 61 ++++++++++++++++++++++- src/spider/client/Task.cpp | 15 ------ src/spider/client/Task.hpp | 86 -------------------------------- 6 files changed, 174 insertions(+), 105 deletions(-) create mode 100644 src/spider/client/Context.cpp create mode 100644 src/spider/client/Context.hpp delete mode 100644 src/spider/client/Task.cpp delete mode 100644 src/spider/client/Task.hpp diff --git a/src/spider/CMakeLists.txt b/src/spider/CMakeLists.txt index 374e2f8..2bed7be 100644 --- a/src/spider/CMakeLists.txt +++ b/src/spider/CMakeLists.txt @@ -30,7 +30,7 @@ target_link_libraries(spider_core PRIVATE fmt::fmt) set(SPIDER_CLIENT_SHARED_SOURCES client/Data.cpp - client/Task.cpp + client/Context.cpp client/TaskGraph.cpp CACHE INTERNAL "spider client shared source files" @@ -38,7 +38,7 @@ set(SPIDER_CLIENT_SHARED_SOURCES set(SPIDER_CLIENT_SHARED_HEADERS client/Data.hpp - client/Task.hpp + client/Context.hpp client/TaskGraph.hpp CACHE INTERNAL "spider client shared header files" diff --git a/src/spider/client/Context.cpp b/src/spider/client/Context.cpp new file mode 100644 index 0000000..bcfaf5d --- /dev/null +++ b/src/spider/client/Context.cpp @@ -0,0 +1,19 @@ +#include "Context.hpp" + +#include + +namespace spider { + +class ContextImpl { +public: + [[nodiscard]] auto get_id() const -> boost::uuids::uuid { return m_id; } + +private: + boost::uuids::uuid m_id; +}; + +auto Context::get_id() const -> boost::uuids::uuid { + return m_impl->get_id(); +} + +} // namespace spider diff --git a/src/spider/client/Context.hpp b/src/spider/client/Context.hpp new file mode 100644 index 0000000..c128c3b --- /dev/null +++ b/src/spider/client/Context.hpp @@ -0,0 +1,94 @@ +#ifndef SPIDER_CORE_SPIDER_HPP +#define SPIDER_CORE_SPIDER_HPP + +#include +#include +#include +#include +#include + +#include "Data.hpp" +#include "Job.hpp" +#include "TaskGraph.hpp" + +namespace spider { +class ContextImpl; + +class Context { +public: + /** + * Aborts the current running task and job. This function never returns. + * + * @param message error message indicating the reason of failure + */ + auto abort(std::string const& message); + + /** + * Gets id of the current running task instance. + * + * @return id of the current running task instance. + */ + [[nodiscard]] auto get_id() const -> boost::uuids::uuid; + + /** + * Gets data by key. Cannot get data created by other tasks. + * + * @tparam T type of the value stored in data + * @param key key of the data + * @return std::nullopt if no data with key is stored, the data associated by the key otherwise + */ + template + auto get_data(std::string const& key) -> std::optional>; + + /** + * Binds inputs to a task. Input of the task can be bound from outputs of task or task graph, + * forming dependencies between tasks. Input can also be a value or a spider::Data. + * + * @tparam R return type of the task or task graph + * @tparam Args input types of task or task graph + * @tparam Inputs types of task, task graph, spider::Data or POD value + * @tparam GraphInputs input types of the new task graph + * + * @param task child task to be bound on + * @param inputs task or task graph whose outputs to bind to f, or value or spider::Data used as + * input + * @return task graph representing the task dependencies. If none of args is a task or task + * graph, returns a task graph with only one task + */ + template + auto + bind(std::function const& task, Inputs&&... inputs) -> TaskGraph; + + /** + * Runs task on Spider. + * + * @tparam R return type of the task + * @tparam Args input types of the task + * + * @param task task to run + * @param args task input + * @return job representing the running task + */ + template + auto run(std::function const& task, Args&&... args) -> Job; + + /** + * Runs task graph on Spider. + * + * @tparam R return type of the task graph + * @tparam Args input types of the task graph + * + * @param graph task graph to run + * @param args task input + * @return job representing the running task graph + */ + template + auto run(TaskGraph const& graph, Args&&... args) -> Job; + +private: + std::unique_ptr m_impl; +}; + +} // namespace spider + +#endif diff --git a/src/spider/client/Spider.hpp b/src/spider/client/Spider.hpp index 1877b67..e13fed6 100644 --- a/src/spider/client/Spider.hpp +++ b/src/spider/client/Spider.hpp @@ -2,12 +2,13 @@ #define SPIDER_CLIENT_SPIDER_HPP #include +#include #include // NOLINTBEGIN(misc-include-cleaner) +#include "Context.hpp" #include "Data.hpp" #include "Job.hpp" -#include "Task.hpp" #include "TaskGraph.hpp" // NOLINTEND(misc-include-cleaner) @@ -33,12 +34,68 @@ void register_task(std::function const& function); /** * Registers function to Spider with timeout - * @param function_name name of the function to register + * @param function function to register * @param timeout task is considered straggler after timeout ms, and Spider triggers replicate the * task */ template void register_task(std::function const& function, float timeout); + +/** + * Gets data by key. + * + * @tparam T type of the value stored in data + * @param key key of the data + * @return std::nullopt if no data with key is stored, the data associated by the key otherwise + */ +template +auto get_data(std::string const& key) -> std::optional>; + +/** + * Binds inputs to a task. Input of the task can be bound from outputs of task or task graph, + * forming dependencies between tasks. Input can also be a value or a spider::Data. + * + * @tparam R return type of the task or task graph + * @tparam Args input types of task or task graph + * @tparam Inputs types of task, task graph, spider::Data or POD value + * @tparam GraphInputs input types of the new task graph + * + * @param task child task to be bound on + * @param inputs task or task graph whose outputs to bind to f, or value or spider::Data used as + * input + * @return task graph representing the task dependencies. If none of args is a task or task graph, + * returns a task graph with only one task + */ +template +auto bind(std::function const& task, Inputs&&... inputs) + -> TaskGraph; + +/** + * Runs task on Spider. + * + * @tparam R return type of the task + * @tparam Args input types of the task + * + * @param task task to run + * @param args task input + * @return job representing the running task + */ +template +auto run(std::function const& task, Args&&... args) -> Job; + +/** + * Runs task graph on Spider. + * + * @tparam R return type of the task graph + * @tparam Args input types of the task graph + * + * @param graph task graph to run + * @param args task input + * @return job representing the running task graph + */ +template +auto run(TaskGraph const& graph, Args&&... args) -> Job; + } // namespace spider #endif // SPIDER_CLIENT_SPIDER_HPP diff --git a/src/spider/client/Task.cpp b/src/spider/client/Task.cpp deleted file mode 100644 index 1a14a6c..0000000 --- a/src/spider/client/Task.cpp +++ /dev/null @@ -1,15 +0,0 @@ -#include "Task.hpp" - -#include -#include - -#include "Data.hpp" - -namespace spider { - -template -auto get_data(std::string const& /*key*/) -> std::optional> { - return std::nullopt; -} - -} // namespace spider diff --git a/src/spider/client/Task.hpp b/src/spider/client/Task.hpp deleted file mode 100644 index a7afe62..0000000 --- a/src/spider/client/Task.hpp +++ /dev/null @@ -1,86 +0,0 @@ -/** - * Task.hpp include functions that can be called inside a Task. - */ - -#ifndef SPIDER_CORE_SPIDER_HPP -#define SPIDER_CORE_SPIDER_HPP - -#include -#include -#include - -#include "Data.hpp" -#include "Job.hpp" -#include "TaskGraph.hpp" - -namespace spider { -/** - * Gets data by key. - * This function can be called by a client to get all data or called by a task to get data created - * by it. - * @tparam T type of the value stored in data - * @param key key of the data - * @return std::nullopt if no data with key is stored, the data associated by the key otherwise - */ -template -auto get_data(std::string const& key) -> std::optional>; - -/** - * Add task as a child of current task. - * This function can only be called by a task. - * @tparam F task graph type or function type for a single task - * @param f child task or task graph - */ -template -void add_child(F const& f); - -/** - * Binds inputs to a task. Input of the task can be bound from outputs of task or task graph, - * forming dependencies between tasks. Input can also be a value or a spider::Data. - * This function can be called by a client or by a task. - * - * @tparam R return type of the task or task graph - * @tparam Args input types of task or task graph - * @tparam Inputs types of task, task graph, spider::Data or POD value - * @tparam GraphInputs input types of the new task graph - * - * @param task child task to be bound on - * @param inputs task or task graph whose outputs to bind to f, or value or spider::Data used as - * input - * @return task graph representing the task dependencies. If none of args is a task or task graph, - * returns a task graph with only one task - */ -template -auto bind(std::function const& task, Inputs&&... inputs) - -> spider::TaskGraph; - -/** - * Runs task on Spider. - * This function can be called by a client or by a task. - * - * @tparam R return type of the task - * @tparam Args input types of the task - * - * @param task task to run - * @param args task input - * @return future of the result - */ -template -auto run(std::function const& task, Args&&... args) -> Job; - -/** - * Runs task graph on Spider. - * This function can be called by a client or by a task. - * - * @tparam R return type of the task graph - * @tparam Args input types of the task graph - * - * @param graph task graph to run - * @param args task input - * @return future of the result - */ -template -auto run(TaskGraph const& graph, Args&&... args) -> Job; -} // namespace spider - -#endif From 1b15b5d6f0d4757aac9f886e74fce4e5abfa7a16 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Sat, 16 Nov 2024 14:52:19 +0000 Subject: [PATCH 15/85] Remove TaskGraph::run to simplify interface --- src/spider/client/TaskGraph.hpp | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/spider/client/TaskGraph.hpp b/src/spider/client/TaskGraph.hpp index 2bc7919..3e8d8c6 100644 --- a/src/spider/client/TaskGraph.hpp +++ b/src/spider/client/TaskGraph.hpp @@ -16,17 +16,6 @@ class TaskGraphImpl; template class TaskGraph { public: - /** - * Runs the task graph. - * - * @tparam Args input types of the task graph - * @tparam R return type of the task graph - * - * @param args inputs of the task graph - * @return future of the result - */ - auto run(Args&&... args) -> Job; - private: std::unique_ptr m_impl; }; From 98104f1492ff68bc506576435d817c574a33e58a Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Sat, 16 Nov 2024 16:03:47 +0000 Subject: [PATCH 16/85] Add separate key-value store interface --- src/spider/CMakeLists.txt | 2 ++ src/spider/client/Context.hpp | 16 +++++++++++++++- src/spider/client/Data.cpp | 5 ----- src/spider/client/Data.hpp | 9 ++------- src/spider/client/KeyValueData.cpp | 14 ++++++++++++++ src/spider/client/KeyValueData.hpp | 25 +++++++++++++++++++++++++ src/spider/client/TaskGraph.cpp | 7 ------- src/spider/client/TaskGraph.hpp | 2 -- 8 files changed, 58 insertions(+), 22 deletions(-) create mode 100644 src/spider/client/KeyValueData.cpp create mode 100644 src/spider/client/KeyValueData.hpp diff --git a/src/spider/CMakeLists.txt b/src/spider/CMakeLists.txt index 2bed7be..d07c1c7 100644 --- a/src/spider/CMakeLists.txt +++ b/src/spider/CMakeLists.txt @@ -30,6 +30,7 @@ target_link_libraries(spider_core PRIVATE fmt::fmt) set(SPIDER_CLIENT_SHARED_SOURCES client/Data.cpp + client/KeyValueData.cpp client/Context.cpp client/TaskGraph.cpp CACHE INTERNAL @@ -38,6 +39,7 @@ set(SPIDER_CLIENT_SHARED_SOURCES set(SPIDER_CLIENT_SHARED_HEADERS client/Data.hpp + client/KeyValueData.hpp client/Context.hpp client/TaskGraph.hpp CACHE INTERNAL diff --git a/src/spider/client/Context.hpp b/src/spider/client/Context.hpp index c128c3b..fb09361 100644 --- a/src/spider/client/Context.hpp +++ b/src/spider/client/Context.hpp @@ -40,6 +40,21 @@ class Context { template auto get_data(std::string const& key) -> std::optional>; + /** + * Insert the key-value pair into the key value store. Overwrite the existing value stored if + * key already exists. + * @param key key of the key-value pair + * @param value value of the key-value pair + */ + auto insert_kv(std::string const& key, std::string const& value); + + /** + * Get the value based on the key. Client can only get the value created by itself. + * @param key key to lookup + * @return std::nullopt if key not in storage, corresponding value if key in storage + */ + auto get_kv(std::string const& key) -> std::optional; + /** * Binds inputs to a task. Input of the task can be bound from outputs of task or task graph, * forming dependencies between tasks. Input can also be a value or a spider::Data. @@ -88,7 +103,6 @@ class Context { private: std::unique_ptr m_impl; }; - } // namespace spider #endif diff --git a/src/spider/client/Data.cpp b/src/spider/client/Data.cpp index c4b30fd..14be155 100644 --- a/src/spider/client/Data.cpp +++ b/src/spider/client/Data.cpp @@ -16,11 +16,6 @@ auto Data::get() -> T { template void Data::set_locality(std::vector const& /*nodes*/, bool /*hard*/) {} -template -auto Data::Builder::set_key(std::string const& /*key*/) -> Data::Builder& { - return this; -} - template auto Data::Builder::set_locality(std::vector const& /*nodes*/, bool /*hard*/) -> Data::Builder& { diff --git a/src/spider/client/Data.hpp b/src/spider/client/Data.hpp index 7702bb4..1dfacd0 100644 --- a/src/spider/client/Data.hpp +++ b/src/spider/client/Data.hpp @@ -47,12 +47,6 @@ class Data { class Builder { public: - /** - * Sets the key for the data. - * @param key of the data - * @return self - */ - auto set_key(std::string const& key) -> Data::Builder&; /** * Sets locality list of the data to build. * @param nodes nodes that has locality @@ -64,6 +58,7 @@ class Data { /** * Defines clean up functions of the data to build. * @param f clean up function of data + * @return self */ auto set_cleanup(std::function const& f) -> Data::Builder&; @@ -73,7 +68,7 @@ class Data { * @param t value of the data * @return data object */ - auto build(T const& /*t*/) -> Data; + auto build(T const& t) -> Data; }; private: diff --git a/src/spider/client/KeyValueData.cpp b/src/spider/client/KeyValueData.cpp new file mode 100644 index 0000000..53840b5 --- /dev/null +++ b/src/spider/client/KeyValueData.cpp @@ -0,0 +1,14 @@ +#include "KeyValueData.hpp" + +#include +#include + +namespace spider { + +auto insert_kv(std::string const& /*key*/, std::string const& /*value*/) {} + +auto get_kv(std::string const& /*key*/) -> std::optional { + return std::nullopt; +} + +} // namespace spider diff --git a/src/spider/client/KeyValueData.hpp b/src/spider/client/KeyValueData.hpp new file mode 100644 index 0000000..29b5805 --- /dev/null +++ b/src/spider/client/KeyValueData.hpp @@ -0,0 +1,25 @@ +#ifndef SPIDER_CLIENT_KEYVALUEDATA_HPP +#define SPIDER_CLIENT_KEYVALUEDATA_HPP + +#include +#include + +namespace spider { + +/** + * Insert the key-value pair into the key value store. Overwrite the existing value stored if key + * already exists. + * @param key key of the key-value pair + * @param value value of the key-value pair + */ +auto insert_kv(std::string const& key, std::string const& value); + +/** + * Get the value based on the key. Client can only get the value created by itself. + * @param key key to lookup + * @return std::nullopt if key not in storage, corresponding value if key in storage + */ +auto get_kv(std::string const& key) -> std::optional; + +} // namespace spider +#endif // SPIDER_CLIENT_KEYVALUEDATA_HPP diff --git a/src/spider/client/TaskGraph.cpp b/src/spider/client/TaskGraph.cpp index b14a800..e06f1b6 100644 --- a/src/spider/client/TaskGraph.cpp +++ b/src/spider/client/TaskGraph.cpp @@ -1,14 +1,7 @@ #include "TaskGraph.hpp" -#include "Job.hpp" - namespace spider { class TaskGraphImpl {}; -template -auto TaskGraph::run(Args&&... /*args*/) -> Job { - return Job(); -} - } // namespace spider diff --git a/src/spider/client/TaskGraph.hpp b/src/spider/client/TaskGraph.hpp index 3e8d8c6..f37efcf 100644 --- a/src/spider/client/TaskGraph.hpp +++ b/src/spider/client/TaskGraph.hpp @@ -3,8 +3,6 @@ #include -#include "Job.hpp" - namespace spider { class TaskGraphImpl; From cb369fdfb7ef2432557f63fc8c409e60273136aa Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Tue, 19 Nov 2024 08:23:00 -0500 Subject: [PATCH 17/85] Edit some docstrings. --- src/spider/client/Context.hpp | 36 ++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/src/spider/client/Context.hpp b/src/spider/client/Context.hpp index fb09361..15b34cf 100644 --- a/src/spider/client/Context.hpp +++ b/src/spider/client/Context.hpp @@ -19,39 +19,45 @@ class Context { /** * Aborts the current running task and job. This function never returns. * - * @param message error message indicating the reason of failure + * @param message Error message indicating the reason for the abort. */ auto abort(std::string const& message); /** - * Gets id of the current running task instance. - * - * @return id of the current running task instance. + * @return ID of the current running task instance. */ [[nodiscard]] auto get_id() const -> boost::uuids::uuid; /** - * Gets data by key. Cannot get data created by other tasks. + * Gets data by key. + * + * NOTE: Callers cannot get data created by other tasks, but they can get data created by + * previous instances of the same task. * - * @tparam T type of the value stored in data - * @param key key of the data - * @return std::nullopt if no data with key is stored, the data associated by the key otherwise + * @tparam T Type of the value stored in data + * @param key Key of the data. + * @return An optional containing the data if the given key exists, or `std::nullopt` otherwise. */ template auto get_data(std::string const& key) -> std::optional>; /** - * Insert the key-value pair into the key value store. Overwrite the existing value stored if - * key already exists. - * @param key key of the key-value pair - * @param value value of the key-value pair + * Inserts the given key-value pair into the key-value store, overwriting any existing value. + * + * @param key + * @param value */ auto insert_kv(std::string const& key, std::string const& value); /** - * Get the value based on the key. Client can only get the value created by itself. - * @param key key to lookup - * @return std::nullopt if key not in storage, corresponding value if key in storage + * Gets the value corresponding to the given key. + * + * NOTE: Callers cannot get values created by other tasks, but they can get values created by + * previous instances of the same task. + * + * @param key + * @return An optional containing the value if the given key exists, or `std::nullopt` + * otherwise. */ auto get_kv(std::string const& key) -> std::optional; From b4a6f36699ac195980b052c20c34fedbe1100cf1 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Tue, 19 Nov 2024 23:18:11 +0000 Subject: [PATCH 18/85] Fix include guard --- src/spider/client/Context.hpp | 6 +++--- src/spider/client/Job.hpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/spider/client/Context.hpp b/src/spider/client/Context.hpp index fb09361..21477a5 100644 --- a/src/spider/client/Context.hpp +++ b/src/spider/client/Context.hpp @@ -1,5 +1,5 @@ -#ifndef SPIDER_CORE_SPIDER_HPP -#define SPIDER_CORE_SPIDER_HPP +#ifndef SPIDER_CLIENT_CONTEXT_HPP +#define SPIDER_CLIENT_CONTEXT_HPP #include #include @@ -105,4 +105,4 @@ class Context { }; } // namespace spider -#endif +#endif // SPIDER_CLIENT_CONTEXT_HPP diff --git a/src/spider/client/Job.hpp b/src/spider/client/Job.hpp index 7dac35f..7daad73 100644 --- a/src/spider/client/Job.hpp +++ b/src/spider/client/Job.hpp @@ -1,5 +1,5 @@ -#ifndef SPIDER_CLIENT_FUTURE_HPP -#define SPIDER_CLIENT_FUTURE_HPP +#ifndef SPIDER_CLIENT_JOB_HPP +#define SPIDER_CLIENT_JOB_HPP #include #include @@ -58,4 +58,4 @@ class Job { }; } // namespace spider -#endif // SPIDER_CLIENT_FUTURE_HPP +#endif // SPIDER_CLIENT_JOB_HPP From 70547aec0be8b847d8e151e42bfe4ef5acd666a3 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Wed, 20 Nov 2024 02:43:44 +0000 Subject: [PATCH 19/85] Add serialzable concept --- src/spider/client/Spider.hpp | 5 ++--- src/spider/core/Serializer.hpp | 14 ++++++++++++++ src/spider/worker/FunctionManager.hpp | 4 ++-- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/spider/client/Spider.hpp b/src/spider/client/Spider.hpp index e13fed6..f652517 100644 --- a/src/spider/client/Spider.hpp +++ b/src/spider/client/Spider.hpp @@ -5,13 +5,12 @@ #include #include -// NOLINTBEGIN(misc-include-cleaner) +// IWYU pragma: begin_exports #include "Context.hpp" #include "Data.hpp" #include "Job.hpp" #include "TaskGraph.hpp" - -// NOLINTEND(misc-include-cleaner) +// IWYU pragma: end_exports namespace spider { /** diff --git a/src/spider/core/Serializer.hpp b/src/spider/core/Serializer.hpp index 79c3f48..68c34ed 100644 --- a/src/spider/core/Serializer.hpp +++ b/src/spider/core/Serializer.hpp @@ -40,4 +40,18 @@ struct msgpack::adaptor::pack { } }; +template +concept Serialzable = requires(T t) { + { + msgpack::pack(msgpack::sbuffer{}, t) + }; +}; + +template +concept DeSerialzable = requires(T t) { + { + msgpack::object{}.convert(t) + }; +}; + #endif // SPIDER_CORE_SERIALIZER_HPP diff --git a/src/spider/worker/FunctionManager.hpp b/src/spider/worker/FunctionManager.hpp index 487bbb6..56ae2ba 100644 --- a/src/spider/worker/FunctionManager.hpp +++ b/src/spider/worker/FunctionManager.hpp @@ -34,13 +34,13 @@ struct signature; template struct signature { - using args_t = std::tuple>...>; + using args_t = std::tuple...>; using ret_t = R; }; template struct signature { - using args_t = std::tuple>...>; + using args_t = std::tuple...>; using ret_t = R; }; From c7763769118b16a84a08eed5c08177695280476d Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Wed, 20 Nov 2024 03:02:33 +0000 Subject: [PATCH 20/85] Fix clang-tidy --- src/spider/client/Context.hpp | 2 +- src/spider/client/Spider.hpp | 1 + src/spider/core/Serializer.hpp | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/spider/client/Context.hpp b/src/spider/client/Context.hpp index 8879eeb..8912eec 100644 --- a/src/spider/client/Context.hpp +++ b/src/spider/client/Context.hpp @@ -111,4 +111,4 @@ class Context { }; } // namespace spider -#endif // SPIDER_CLIENT_CONTEXT_HPP +#endif // SPIDER_CLIENT_CONTEXT_HPP diff --git a/src/spider/client/Spider.hpp b/src/spider/client/Spider.hpp index f652517..1fdae07 100644 --- a/src/spider/client/Spider.hpp +++ b/src/spider/client/Spider.hpp @@ -10,6 +10,7 @@ #include "Data.hpp" #include "Job.hpp" #include "TaskGraph.hpp" + // IWYU pragma: end_exports namespace spider { diff --git a/src/spider/core/Serializer.hpp b/src/spider/core/Serializer.hpp index 68c34ed..0431bbc 100644 --- a/src/spider/core/Serializer.hpp +++ b/src/spider/core/Serializer.hpp @@ -43,7 +43,7 @@ struct msgpack::adaptor::pack { template concept Serialzable = requires(T t) { { - msgpack::pack(msgpack::sbuffer{}, t) + msgpack::pack(msgpack::sbuffer{}, t) }; }; From 49c571edbe5324f0622473a29792df68a5a2e0da Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Wed, 20 Nov 2024 06:07:02 +0000 Subject: [PATCH 21/85] Fix typo --- src/spider/client/KeyValueData.cpp | 14 -------------- src/spider/client/KeyValueData.hpp | 25 ------------------------- src/spider/core/Serializer.hpp | 7 +++++-- 3 files changed, 5 insertions(+), 41 deletions(-) delete mode 100644 src/spider/client/KeyValueData.cpp delete mode 100644 src/spider/client/KeyValueData.hpp diff --git a/src/spider/client/KeyValueData.cpp b/src/spider/client/KeyValueData.cpp deleted file mode 100644 index 53840b5..0000000 --- a/src/spider/client/KeyValueData.cpp +++ /dev/null @@ -1,14 +0,0 @@ -#include "KeyValueData.hpp" - -#include -#include - -namespace spider { - -auto insert_kv(std::string const& /*key*/, std::string const& /*value*/) {} - -auto get_kv(std::string const& /*key*/) -> std::optional { - return std::nullopt; -} - -} // namespace spider diff --git a/src/spider/client/KeyValueData.hpp b/src/spider/client/KeyValueData.hpp deleted file mode 100644 index 29b5805..0000000 --- a/src/spider/client/KeyValueData.hpp +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef SPIDER_CLIENT_KEYVALUEDATA_HPP -#define SPIDER_CLIENT_KEYVALUEDATA_HPP - -#include -#include - -namespace spider { - -/** - * Insert the key-value pair into the key value store. Overwrite the existing value stored if key - * already exists. - * @param key key of the key-value pair - * @param value value of the key-value pair - */ -auto insert_kv(std::string const& key, std::string const& value); - -/** - * Get the value based on the key. Client can only get the value created by itself. - * @param key key to lookup - * @return std::nullopt if key not in storage, corresponding value if key in storage - */ -auto get_kv(std::string const& key) -> std::optional; - -} // namespace spider -#endif // SPIDER_CLIENT_KEYVALUEDATA_HPP diff --git a/src/spider/core/Serializer.hpp b/src/spider/core/Serializer.hpp index 0431bbc..d540379 100644 --- a/src/spider/core/Serializer.hpp +++ b/src/spider/core/Serializer.hpp @@ -41,17 +41,20 @@ struct msgpack::adaptor::pack { }; template -concept Serialzable = requires(T t) { +concept SerializableImpl = requires(T t) { { msgpack::pack(msgpack::sbuffer{}, t) }; }; template -concept DeSerialzable = requires(T t) { +concept DeSerialzableImpl = requires(T t) { { msgpack::object{}.convert(t) }; }; +template +concept Serializable = SerializableImpl && DeSerialzableImpl; + #endif // SPIDER_CORE_SERIALIZER_HPP From 43d7e16b33d8ec1d40f3f1b9b4af337fc13d7209 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Wed, 20 Nov 2024 06:58:29 +0000 Subject: [PATCH 22/85] Fix clang-tidy --- src/spider/CMakeLists.txt | 6 +- src/spider/client/Context.hpp | 9 +- src/spider/client/Data.cpp | 12 +-- src/spider/client/Data.hpp | 6 +- src/spider/client/Driver.cpp | 0 src/spider/client/Driver.hpp | 118 ++++++++++++++++++++++++++ src/spider/client/Job.cpp | 10 ++- src/spider/client/Job.hpp | 4 +- src/spider/client/Spider.hpp | 101 ---------------------- src/spider/client/TaskGraph.hpp | 4 +- src/spider/client/spider.hpp | 13 +++ src/spider/worker/FunctionManager.hpp | 2 +- tests/worker/test-FunctionManager.cpp | 6 +- 13 files changed, 166 insertions(+), 125 deletions(-) create mode 100644 src/spider/client/Driver.cpp create mode 100644 src/spider/client/Driver.hpp delete mode 100644 src/spider/client/Spider.hpp create mode 100644 src/spider/client/spider.hpp diff --git a/src/spider/CMakeLists.txt b/src/spider/CMakeLists.txt index 08fc91b..7e0516d 100644 --- a/src/spider/CMakeLists.txt +++ b/src/spider/CMakeLists.txt @@ -30,7 +30,7 @@ target_link_libraries(spider_core PRIVATE fmt::fmt) set(SPIDER_CLIENT_SHARED_SOURCES client/Data.cpp - client/KeyValueData.cpp + client/Driver.cpp client/Context.cpp client/TaskGraph.cpp CACHE INTERNAL @@ -39,7 +39,7 @@ set(SPIDER_CLIENT_SHARED_SOURCES set(SPIDER_CLIENT_SHARED_HEADERS client/Data.hpp - client/KeyValueData.hpp + client/Driver.hpp client/Context.hpp client/TaskGraph.hpp CACHE INTERNAL @@ -59,7 +59,7 @@ target_link_libraries( set(SPIDER_CLIENT_SOURCES client/Job.cpp CACHE INTERNAL "spider client source files") set(SPIDER_CLIENT_HEADERS - client/Spider.hpp + client/spider.hpp client/Job.hpp CACHE INTERNAL "spider client header files" diff --git a/src/spider/client/Context.hpp b/src/spider/client/Context.hpp index 8912eec..80e2862 100644 --- a/src/spider/client/Context.hpp +++ b/src/spider/client/Context.hpp @@ -7,6 +7,7 @@ #include #include +#include "../core/Serializer.hpp" #include "Data.hpp" #include "Job.hpp" #include "TaskGraph.hpp" @@ -38,7 +39,7 @@ class Context { * @param key Key of the data. * @return An optional containing the data if the given key exists, or `std::nullopt` otherwise. */ - template + template auto get_data(std::string const& key) -> std::optional>; /** @@ -76,7 +77,7 @@ class Context { * @return task graph representing the task dependencies. If none of args is a task or task * graph, returns a task graph with only one task */ - template + template auto bind(std::function const& task, Inputs&&... inputs) -> TaskGraph; @@ -90,7 +91,7 @@ class Context { * @param args task input * @return job representing the running task */ - template + template auto run(std::function const& task, Args&&... args) -> Job; /** @@ -103,7 +104,7 @@ class Context { * @param args task input * @return job representing the running task graph */ - template + template auto run(TaskGraph const& graph, Args&&... args) -> Job; private: diff --git a/src/spider/client/Data.cpp b/src/spider/client/Data.cpp index 14be155..0f272b5 100644 --- a/src/spider/client/Data.cpp +++ b/src/spider/client/Data.cpp @@ -4,30 +4,32 @@ #include #include +#include "../core/Serializer.hpp" + namespace spider { class DataImpl {}; -template +template auto Data::get() -> T { return T(); } -template +template void Data::set_locality(std::vector const& /*nodes*/, bool /*hard*/) {} -template +template auto Data::Builder::set_locality(std::vector const& /*nodes*/, bool /*hard*/) -> Data::Builder& { return this; } -template +template auto Data::Builder::set_cleanup(std::function const& /*f*/) -> Data::Builder& { return this; } -template +template auto Data::Builder::build(T const& /*t*/) -> Data { return Data(); } diff --git a/src/spider/client/Data.hpp b/src/spider/client/Data.hpp index 1dfacd0..8226bea 100644 --- a/src/spider/client/Data.hpp +++ b/src/spider/client/Data.hpp @@ -6,6 +6,8 @@ #include #include +#include "../core/Serializer.hpp" + namespace spider { class DataImpl; @@ -26,9 +28,9 @@ class DataImpl; * .set_key("key") * .build("value"); * - * @tparam T type of the value. T must be a POD. + * @tparam T type of the value. */ -template +template class Data { public: /** diff --git a/src/spider/client/Driver.cpp b/src/spider/client/Driver.cpp new file mode 100644 index 0000000..e69de29 diff --git a/src/spider/client/Driver.hpp b/src/spider/client/Driver.hpp new file mode 100644 index 0000000..a4b8815 --- /dev/null +++ b/src/spider/client/Driver.hpp @@ -0,0 +1,118 @@ +#ifndef SPIDER_CLIENT_DRIVER_HPP +#define SPIDER_CLIENT_DRIVER_HPP + +#include +#include +#include +#include + +#include "../core/Serializer.hpp" +#include "../worker/FunctionManager.hpp" +#include "Data.hpp" +#include "Job.hpp" +#include "TaskGraph.hpp" + +// NOLINTBEGIN(cppcoreguidelines-macro-usage) +/** + * Registers function to Spider + * @param func function to register + */ +#define SPIDER_REGISTER_TASK(func) SPIDER_WORKER_REGISTER_TASK(func) + +/** + * Registers function to Spider with timeout + * @param func function to register + * @param timeout task is considered straggler after timeout ms, and Spider triggers replicating + * the task + */ +#define SPIDER_REGISTER_TASK_TIMEOUT(func, timeout) SPIDER_WORKER_REGISTER_TASK(func) + +// NOLINTEND(cppcoreguidelines-macro-usage) + +namespace spider { +class DriverImpl; + +class Driver { +public: + /** + * Connects to storage + * @param url url of the storage to connect + */ + void connect(std::string const& url); + + /** + * Gets data by key. + * + * @tparam T type of the value stored in data + * @param key key of the data + * @return std::nullopt if no data with key is stored, the data associated by the key otherwise + */ + template + auto get_data(std::string const& key) -> std::optional>; + + /** + * Insert the key-value pair into the key value store. Overwrite the existing value stored if + * key already exists. + * @param key key of the key-value pair + * @param value value of the key-value pair + */ + auto insert_kv(std::string const& key, std::string const& value); + + /** + * Get the value based on the key. Client can only get the value created by itself. + * @param key key to lookup + * @return std::nullopt if key not in storage, corresponding value if key in storage + */ + auto get_kv(std::string const& key) -> std::optional; + + /** + * Binds inputs to a task. Input of the task can be bound from outputs of task or task graph, + * forming dependencies between tasks. Input can also be a value or a spider::Data. + * + * @tparam R return type of the task or task graph + * @tparam Args input types of task or task graph + * @tparam Inputs types of task, task graph, spider::Data or POD value + * @tparam GraphInputs input types of the new task graph + * + * @param task child task to be bound on + * @param inputs task or task graph whose outputs to bind to f, or value or spider::Data used as + * input + * @return task graph representing the task dependencies. If none of args is a task or task + * graph, returns a task graph with only one task + */ + template + auto + bind(std::function const& task, Inputs&&... inputs) -> TaskGraph; + + /** + * Runs task on Spider. + * + * @tparam R return type of the task + * @tparam Args input types of the task + * + * @param task task to run + * @param args task input + * @return job representing the running task + */ + template + auto run(std::function const& task, Args&&... args) -> Job; + + /** + * Runs task graph on Spider. + * + * @tparam R return type of the task graph + * @tparam Args input types of the task graph + * + * @param graph task graph to run + * @param args task input + * @return job representing the running task graph + */ + template + auto run(TaskGraph const& graph, Args&&... args) -> Job; + +private: + std::unique_ptr m_impl; +}; +} // namespace spider + +#endif // SPIDER_CLIENT_DRIVER_HPP diff --git a/src/spider/client/Job.cpp b/src/spider/client/Job.cpp index 0885992..0a42cd4 100644 --- a/src/spider/client/Job.cpp +++ b/src/spider/client/Job.cpp @@ -4,6 +4,8 @@ #include #include +#include "../core/Serializer.hpp" + namespace spider { class JobImpl { @@ -20,20 +22,20 @@ class JobImpl { boost::uuids::uuid m_id; }; -template +template auto Job::wait_complete() {} -template +template auto Job::get_status() -> JobStatus { return m_impl->get_status(); } -template +template auto Job::get_result() -> T { return T{}; } -template +template auto Job::get_error() -> std::pair { return std::make_pair("", ""); } diff --git a/src/spider/client/Job.hpp b/src/spider/client/Job.hpp index 7daad73..bd31477 100644 --- a/src/spider/client/Job.hpp +++ b/src/spider/client/Job.hpp @@ -6,6 +6,8 @@ #include #include +#include "../core/Serializer.hpp" + namespace spider { class JobImpl; @@ -21,7 +23,7 @@ enum class JobStatus : uint8_t { * * @tparam T output type of the job. */ -template +template class Job { public: /** diff --git a/src/spider/client/Spider.hpp b/src/spider/client/Spider.hpp deleted file mode 100644 index 1fdae07..0000000 --- a/src/spider/client/Spider.hpp +++ /dev/null @@ -1,101 +0,0 @@ -#ifndef SPIDER_CLIENT_SPIDER_HPP -#define SPIDER_CLIENT_SPIDER_HPP - -#include -#include -#include - -// IWYU pragma: begin_exports -#include "Context.hpp" -#include "Data.hpp" -#include "Job.hpp" -#include "TaskGraph.hpp" - -// IWYU pragma: end_exports - -namespace spider { -/** - * Initializes Spider library - */ -void init(); - -/** - * Connects to storage - * @param url url of the storage to connect - */ -void connect(std::string const& url); - -/** - * Registers function to Spider - * @param function function to register - */ -template -void register_task(std::function const& function); - -/** - * Registers function to Spider with timeout - * @param function function to register - * @param timeout task is considered straggler after timeout ms, and Spider triggers replicate the - * task - */ -template -void register_task(std::function const& function, float timeout); - -/** - * Gets data by key. - * - * @tparam T type of the value stored in data - * @param key key of the data - * @return std::nullopt if no data with key is stored, the data associated by the key otherwise - */ -template -auto get_data(std::string const& key) -> std::optional>; - -/** - * Binds inputs to a task. Input of the task can be bound from outputs of task or task graph, - * forming dependencies between tasks. Input can also be a value or a spider::Data. - * - * @tparam R return type of the task or task graph - * @tparam Args input types of task or task graph - * @tparam Inputs types of task, task graph, spider::Data or POD value - * @tparam GraphInputs input types of the new task graph - * - * @param task child task to be bound on - * @param inputs task or task graph whose outputs to bind to f, or value or spider::Data used as - * input - * @return task graph representing the task dependencies. If none of args is a task or task graph, - * returns a task graph with only one task - */ -template -auto bind(std::function const& task, Inputs&&... inputs) - -> TaskGraph; - -/** - * Runs task on Spider. - * - * @tparam R return type of the task - * @tparam Args input types of the task - * - * @param task task to run - * @param args task input - * @return job representing the running task - */ -template -auto run(std::function const& task, Args&&... args) -> Job; - -/** - * Runs task graph on Spider. - * - * @tparam R return type of the task graph - * @tparam Args input types of the task graph - * - * @param graph task graph to run - * @param args task input - * @return job representing the running task graph - */ -template -auto run(TaskGraph const& graph, Args&&... args) -> Job; - -} // namespace spider - -#endif // SPIDER_CLIENT_SPIDER_HPP diff --git a/src/spider/client/TaskGraph.hpp b/src/spider/client/TaskGraph.hpp index f37efcf..ef8c04a 100644 --- a/src/spider/client/TaskGraph.hpp +++ b/src/spider/client/TaskGraph.hpp @@ -3,6 +3,8 @@ #include +#include "../core/Serializer.hpp" + namespace spider { class TaskGraphImpl; @@ -11,7 +13,7 @@ class TaskGraphImpl; * @tparam R return type of the task graph * @tparam Args input types of the task graph */ -template +template class TaskGraph { public: private: diff --git a/src/spider/client/spider.hpp b/src/spider/client/spider.hpp new file mode 100644 index 0000000..41a1f6a --- /dev/null +++ b/src/spider/client/spider.hpp @@ -0,0 +1,13 @@ +#ifndef SPIDER_CLIENT_SPIDER_HPP +#define SPIDER_CLIENT_SPIDER_HPP + +// IWYU pragma: begin_exports +#include "Context.hpp" +#include "Data.hpp" +#include "Driver.hpp" +#include "Job.hpp" +#include "TaskGraph.hpp" + +// IWYU pragma: end_exports + +#endif // SPIDER_CLIENT_SPIDER_HPP diff --git a/src/spider/worker/FunctionManager.hpp b/src/spider/worker/FunctionManager.hpp index 56ae2ba..f43de8f 100644 --- a/src/spider/worker/FunctionManager.hpp +++ b/src/spider/worker/FunctionManager.hpp @@ -17,7 +17,7 @@ #include "../core/MsgPack.hpp" // IWYU pragma: keep -#define REGISTER_TASK(func) \ +#define SPIDER_WORKER_REGISTER_TASK(func) \ spider::core::FunctionManager::get_instance().register_function(#func, func); namespace spider::core { diff --git a/tests/worker/test-FunctionManager.cpp b/tests/worker/test-FunctionManager.cpp index 4f772f0..b08c6b6 100644 --- a/tests/worker/test-FunctionManager.cpp +++ b/tests/worker/test-FunctionManager.cpp @@ -22,7 +22,7 @@ auto data_test(spider::core::Data const& data) -> spider::core::Data { } TEST_CASE("Register and run function with POD inputs", "[core]") { - REGISTER_TASK(int_test); + SPIDER_WORKER_REGISTER_TASK(int_test); spider::core::FunctionManager& manager = spider::core::FunctionManager::get_instance(); @@ -60,7 +60,7 @@ TEST_CASE("Register and run function with POD inputs", "[core]") { } TEST_CASE("Register and run function with tuple return", "[core]") { - REGISTER_TASK(tuple_ret_test); + SPIDER_WORKER_REGISTER_TASK(tuple_ret_test); spider::core::FunctionManager& manager = spider::core::FunctionManager::get_instance(); @@ -75,7 +75,7 @@ TEST_CASE("Register and run function with tuple return", "[core]") { } TEST_CASE("Register and run function with data", "[core]") { - REGISTER_TASK(data_test); + SPIDER_WORKER_REGISTER_TASK(data_test); spider::core::FunctionManager& manager = spider::core::FunctionManager::get_instance(); From 5e8e1dd2c90a3138a264bdb7f59ed1446d9e8436 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Wed, 20 Nov 2024 06:59:38 +0000 Subject: [PATCH 23/85] Remove macOS build --- CMakeLists.txt | 6 ------ 1 file changed, 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index eb85269..0ec3d43 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,12 +12,6 @@ project( set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) -# AppleClang complains about file has no symbol and abort the build. -if(APPLE) - set(CMAKE_CXX_ARCHIVE_CREATE " Scr ") - set(CMAKE_CXX_ARCHIVE_FINISH " -no_warning_for_no_symbols -c ") -endif() - # Enable exporting compile commands set(CMAKE_EXPORT_COMPILE_COMMANDS ON From fe23c3c2bd43a93082f9828e75ebace83955aec8 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Wed, 20 Nov 2024 07:34:57 +0000 Subject: [PATCH 24/85] Change driver constructor --- src/spider/client/Driver.hpp | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/spider/client/Driver.hpp b/src/spider/client/Driver.hpp index a4b8815..13fe01e 100644 --- a/src/spider/client/Driver.hpp +++ b/src/spider/client/Driver.hpp @@ -35,10 +35,19 @@ class DriverImpl; class Driver { public: /** - * Connects to storage - * @param url url of the storage to connect + * Create a spider driver that connects to a storage. + * + * @param url storage url + */ + explicit Driver(std::string const& url); + + /** + * Create a spider driver that connects to a storage. + * + * @param url storage url + * @param id client id */ - void connect(std::string const& url); + Driver(std::string const& url, boost::uuids::uuid id); /** * Gets data by key. @@ -71,7 +80,7 @@ class Driver { * * @tparam R return type of the task or task graph * @tparam Args input types of task or task graph - * @tparam Inputs types of task, task graph, spider::Data or POD value + * @tparam Inputs types of task, task graph, spider::Data or Serializable * @tparam GraphInputs input types of the new task graph * * @param task child task to be bound on @@ -80,7 +89,7 @@ class Driver { * @return task graph representing the task dependencies. If none of args is a task or task * graph, returns a task graph with only one task */ - template + template auto bind(std::function const& task, Inputs&&... inputs) -> TaskGraph; From 064edd8fa5a6e6eb1005daaabfe9fcc36dd9a3d1 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Wed, 20 Nov 2024 07:47:44 +0000 Subject: [PATCH 25/85] Add exception to interface --- src/spider/CMakeLists.txt | 1 + src/spider/client/Exception.hpp | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) create mode 100644 src/spider/client/Exception.hpp diff --git a/src/spider/CMakeLists.txt b/src/spider/CMakeLists.txt index 7e0516d..131434b 100644 --- a/src/spider/CMakeLists.txt +++ b/src/spider/CMakeLists.txt @@ -42,6 +42,7 @@ set(SPIDER_CLIENT_SHARED_HEADERS client/Driver.hpp client/Context.hpp client/TaskGraph.hpp + client/Exception.hpp CACHE INTERNAL "spider client shared header files" ) diff --git a/src/spider/client/Exception.hpp b/src/spider/client/Exception.hpp new file mode 100644 index 0000000..c18b03e --- /dev/null +++ b/src/spider/client/Exception.hpp @@ -0,0 +1,19 @@ +#ifndef SPIDER_CLIENT_EXCEPTION_HPP +#define SPIDER_CLIENT_EXCEPTION_HPP + +#include +#include + +namespace spider { + +enum class ExceptionCode: std::uint8_t { + ConnectionError, + +}; + +struct SpiderException: std::exception { + ExceptionCode code; +}; + +} +#endif // SPIDER_CLIENT_EXCEPTION_HPP From e7c5240a8de909b58cef8b74322893ce3d56de14 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Wed, 20 Nov 2024 07:53:22 +0000 Subject: [PATCH 26/85] Change run to start --- src/spider/client/Context.hpp | 8 ++++---- src/spider/client/Driver.hpp | 8 ++++---- src/spider/client/Exception.hpp | 9 ++++----- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/spider/client/Context.hpp b/src/spider/client/Context.hpp index 80e2862..4bf6456 100644 --- a/src/spider/client/Context.hpp +++ b/src/spider/client/Context.hpp @@ -82,7 +82,7 @@ class Context { bind(std::function const& task, Inputs&&... inputs) -> TaskGraph; /** - * Runs task on Spider. + * Starts task on Spider. * * @tparam R return type of the task * @tparam Args input types of the task @@ -92,10 +92,10 @@ class Context { * @return job representing the running task */ template - auto run(std::function const& task, Args&&... args) -> Job; + auto start(std::function const& task, Args&&... args) -> Job; /** - * Runs task graph on Spider. + * Starts task graph on Spider. * * @tparam R return type of the task graph * @tparam Args input types of the task graph @@ -105,7 +105,7 @@ class Context { * @return job representing the running task graph */ template - auto run(TaskGraph const& graph, Args&&... args) -> Job; + auto start(TaskGraph const& graph, Args&&... args) -> Job; private: std::unique_ptr m_impl; diff --git a/src/spider/client/Driver.hpp b/src/spider/client/Driver.hpp index 13fe01e..62bd763 100644 --- a/src/spider/client/Driver.hpp +++ b/src/spider/client/Driver.hpp @@ -94,7 +94,7 @@ class Driver { bind(std::function const& task, Inputs&&... inputs) -> TaskGraph; /** - * Runs task on Spider. + * Starts task on Spider. * * @tparam R return type of the task * @tparam Args input types of the task @@ -104,10 +104,10 @@ class Driver { * @return job representing the running task */ template - auto run(std::function const& task, Args&&... args) -> Job; + auto start(std::function const& task, Args&&... args) -> Job; /** - * Runs task graph on Spider. + * Starts task graph on Spider. * * @tparam R return type of the task graph * @tparam Args input types of the task graph @@ -117,7 +117,7 @@ class Driver { * @return job representing the running task graph */ template - auto run(TaskGraph const& graph, Args&&... args) -> Job; + auto start(TaskGraph const& graph, Args&&... args) -> Job; private: std::unique_ptr m_impl; diff --git a/src/spider/client/Exception.hpp b/src/spider/client/Exception.hpp index c18b03e..9eabd89 100644 --- a/src/spider/client/Exception.hpp +++ b/src/spider/client/Exception.hpp @@ -6,14 +6,13 @@ namespace spider { -enum class ExceptionCode: std::uint8_t { +enum class ExceptionCode : std::uint8_t { ConnectionError, - }; -struct SpiderException: std::exception { +struct SpiderException : std::exception { ExceptionCode code; }; -} -#endif // SPIDER_CLIENT_EXCEPTION_HPP +} // namespace spider +#endif // SPIDER_CLIENT_EXCEPTION_HPP From b0b414e8a3caed7c92d59648043955a2151a980f Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Wed, 20 Nov 2024 07:58:18 +0000 Subject: [PATCH 27/85] Add get jobs to driver --- src/spider/client/Driver.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/spider/client/Driver.hpp b/src/spider/client/Driver.hpp index 62bd763..0081387 100644 --- a/src/spider/client/Driver.hpp +++ b/src/spider/client/Driver.hpp @@ -5,6 +5,7 @@ #include #include #include +#include #include "../core/Serializer.hpp" #include "../worker/FunctionManager.hpp" @@ -119,6 +120,13 @@ class Driver { template auto start(TaskGraph const& graph, Args&&... args) -> Job; + /** + * Gets all jobs started by drivers with same client id. + * + * @return ids of the jobs + */ + auto get_jobs() -> std::vector; + private: std::unique_ptr m_impl; }; From 97761e1a4b3f7df04d7653d8abb52c4c76a51500 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Wed, 20 Nov 2024 08:00:05 +0000 Subject: [PATCH 28/85] Add get jobs in context --- src/spider/client/Context.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/spider/client/Context.hpp b/src/spider/client/Context.hpp index 4bf6456..a33e33e 100644 --- a/src/spider/client/Context.hpp +++ b/src/spider/client/Context.hpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "../core/Serializer.hpp" #include "Data.hpp" @@ -107,6 +108,13 @@ class Context { template auto start(TaskGraph const& graph, Args&&... args) -> Job; + /** + * Gets all jobs started by the task. + * + * @return ids of the jobs + */ + auto get_jobs() -> std::vector; + private: std::unique_ptr m_impl; }; From 91d36f2287fe63297ff355e7ede1da4ee94113ea Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Wed, 20 Nov 2024 08:42:22 +0000 Subject: [PATCH 29/85] Update doc with new interface --- docs/quick_start.md | 147 +++++++++++++++++--------------------------- 1 file changed, 57 insertions(+), 90 deletions(-) diff --git a/docs/quick_start.md b/docs/quick_start.md index c9c895f..b8fce29 100644 --- a/docs/quick_start.md +++ b/docs/quick_start.md @@ -31,7 +31,7 @@ cleans up the resource in driver's destructor. User can pass in an optional clie with same client id cannot run at the same time. ```c++ -#include +#include auto main(int argc, char **argv) -> int { boost::uuids::string_generator gen; @@ -42,20 +42,14 @@ auto main(int argc, char **argv) -> int { ## Create a task In Spider, a task is a non-member function that takes the first argument a `spider::Context` object. -It can then take any number of arguments. The argument of a task must have one of the following -type: +It can then take any number of arguments which is `Serializable`. -1. POD type -2. `spider::Data` covered in [later section](#data-on-external-storage) -3. `std::vector` of POD type and `spider::Data` +Tasks can return any `Serialiable` value. If a task needs to return more than one result, uses +`std::tuple` and makes sure all elements of the tuple are `Serializable`. -Task can return any value of the valid argument type listed above. If a task needs to return more -than one result, uses `std::tuple` and makes sure all elements of the tuple are of a valid argument -type. - -Spider requires user to register the task function using static `spider::register_task`, which -sets up the function internally in Spider library for later user. Spider requires the function name -to be unique in the cluster. +Spider requires user to register the task function by calling `SPIDER_REGISTER_TASK` statically, +which sets up the function internally in Spider library for later user. Spider requires the function +name to be unique in the cluster. ```c++ // Task that sums to integers @@ -71,19 +65,19 @@ auto sort(spider::Context &context, int x, int y) -> std::tuple { return { y, x }; } -spider::register_task(sum); -spider::register_task(sort); +SPIDER_REGISTER_TASK(sum); +SPIDER_REGISTER_TASK(sort); ``` ## Run a task -Spider enables user to run a task on the cluster. Simply call `Driver::run` and provide the -arguments of the task. `Driver::run`returns a `spider::Job` object, which represents the running +Spider enables user to start a task on the cluster. Simply call `Driver::start` and provide the +arguments of the task. `Driver::start`returns a `spider::Job` object, which represents the running task. `spider::Job` takes the output type of the task graph as template argument. You can call -`Job::state` to check the state of the running task, and `Job::get_result` to block and get the task -result. User can send a cancel signal to Spider by calling `Job::cancel`. Client can get all running -jobs submitted by itself by calling `Driver::get_jobs`. +`Job::state` to check the state of the running task, and `Job::wait_complete` to block until job +ends and `Job::get_result`. User can send a cancel signal to Spider by calling `Job::cancel`. Client +can get all running jobs submitted by itself by calling `Driver::get_jobs`. ```c++ auto main(int argc, char **argv) -> int { @@ -91,14 +85,15 @@ auto main(int argc, char **argv) -> int { spider::Job sum_job = driver.run(sum, 2); assert(4 == sum_job.get_result()); - spider::Job> sort_job = driver.run(4, 3); + spider::Job> sort_job = driver.start(4, 3); + sort_job.wait_complete(); assert(std::tuple{3, 4} == sort_job.get_result()); } ``` If you try to compile and run the example code directly, you'll find that it fails because Spider worker does not know which function to run. User need to compile all the tasks into a shared -library, including the call to `spider::register_task`, and start the worker with the library by +library, including the call to `SPIDER_REGISTER_TASK`, and start the worker with the library by running `spider start --worker --db --libs [client_libraries]`. ## Group tasks together @@ -106,7 +101,7 @@ running `spider start --worker --db --libs [client_libraries]`. In real world, running a single task is too simple to be useful. Spider lets you bind outputs of tasks as inputs of another task, similar to `std::bind`. The first argument of `spider::bind` is the child task. The later arguments are either a `spider::Task` or a `spider::TaskGraph`, whose entire -outputs are used as part of the inputs to the child task, or a POD or +outputs are used as part of the inputs to the child task, or a `Serializable` or `spider::Data` that is directly used as input. Spider requires that the types of `Task` or `TaskGraph` outputs or POD type or `spider::Data` matches the input types of child task. @@ -128,39 +123,17 @@ auto main(int argc, char **argv) -> auto { // driver initialization skipped spider::TaskGraph sum_of_square = spider::bind(sum, square, square); spider::TaskGraph rss = spider::bind(square_root, sum_of_square); - spider::Job job = driver::run(rss, 3, 4); + spider::Job job = driver::start(rss, 3, 4); + job.wait_complete(); assert(5 == job.get_result()); } ``` ## Run task inside task -Static task graph is enough to solve a lot of real work problems, but dynamically add tasks -on-the-fly could become handy. As mentioned before, spider allows you to add another task as child -of the running task by calling `Context::add_child`. `Context::add_child` can also add a task graph -as child. Task graph can be constructed by `Context::bind`, which has the same signature and -semantic as`spider::bind`. - -```c++ -auto gcd(spider::Conect& context, int x, int y) -> std::tuple { - if (x == y) { - std::cout << "gdc is: " << x << std::endl; - return { x, y }; - } - if (x > y) { - context.add_child(gcd); - return { x % y, y }; - } - context.add_child(gcd); - return { x, y % x }; -} -``` - -However, it is impossible to get the return value of the dynamically created tasks from a client. We -have a solution by sharing data using key-value store, which will be discussed -[later](#data-as-key-value-store). Another solution is to run task or task graph inside a task and -wait for its value, just like a client. This solution is closer to the conventional function call -semantic. +Static task graph is enough to solve a lot of real work problems, but dynamically run task graphs +on-the-fly could become handy. Running a task graph inside task is the same as running it from a +client. ```c++ auto gcd(spider:Context& context, int x, int y) -> int { @@ -168,7 +141,8 @@ auto gcd(spider:Context& context, int x, int y) -> int { std::swap(x, y); } while (x != y) { - spider::Job> job = context.run(gcd_impl, x, y); + spider::Job> job = context.start(gcd_impl, x, y); + job.wait_complete(); x = job.get_result().get<0>(); y = job.get_result().get<1>(); } @@ -182,10 +156,10 @@ auto gcd_impl(spider::Context& context, int x, int y) -> std::tuple { ## Data on external storage -Often simple POD data are not enough. However, passing large amount of data around is expensive. -Usually these data is stored on disk or a distributed storage system. For example, an ETL workload -usually reads in data from an external storage, writes temporary data on an external storage, and -writes final data into an external storage. +Often simple `Serializable` value are not enough. However, passing large amount of data around is +expensive. Usually these data is stored on disk or a distributed storage system. For example, an ETL +workload usually reads in data from an external storage, writes temporary data on an external +storage, and writes final data into an external storage. Spider lets user pass the metadata of these data around in `spider::Data` objects. `Data` stores the value of the metadata information of external data, and provides crucial information to Spider for @@ -207,13 +181,15 @@ struct HdfsFile { * Map reads the temporary files and persists the output in Hdfs file. */ auto main(int argc, char** argv) -> int { + // driver initialization skipped // Creates a HdfsFile Data to represent the input data stored in Hdfs. spider::Data input = spider::Data::Builder() .mark_persist(true) .build(HdfsFile { "/path/to/input" }); - spider::Job> job = spider::run( - spider::bind(map, filter), + spider::Job> job = driver::start( + driver::bind(map, filter), input); + job.wait_complete(); std::string const output_path = job.get_result().get().url; std::cout << "Result is stored in " << output_path << std::endl; } @@ -275,48 +251,39 @@ auto map(spider::Data input) -> spider::Data { ``` -## Data as key-value store +## Using key-value store when tasks restart -`Data` can also be used as a key-value store. User can specify a key when creating the data, and the -data can be accessed later by its key. Notice that a task can only access the `Data` created by -itself or passed to it. Client can access any data with the key. - -Using the key value store, we can solve the dynamic task result problem -mentioned [before](#run-task-inside-task). +Spider provides exactly-once semantics in failure recovery. To achieve this, Spider restarts some +tasks after a task fails. Tasks might want to keep some data around after restart. However, all the +`Data` objects created by tasks are cleaned up on restart. Spider provides a key-value store for +the restarted tasks and restarted clients to retrieve values stored by previous run by `insert_kv` +and `get_kv` from `Context` or `Driver`. Note that a task or client can only get the value created +by itself, and the two different tasks can store two different values using the same key. ```c++ -auto gcd(spider::Context& context, int x, int y, const char* key) - -> std::tuple { - if (x == y) { - spider::Data.Builder() - .set_key(key) - .build(x); - return { x, y, key }; - } - if (x > y) { - context.add_child(gcd); - return { x % y, y, key }; +auto long_running(spider::Context& context) { + std::optional state_option = context.get_kv("state"); + if (!state_option.has_value()) { + long_compute_0(); + context.store_kv("state", "0"); } - context.add_child(gcd); - return { x, y % x, key }; -} - -auto main(int argc, char** argv) -> int { - std::string const key = "random_key"; - driver.run(gcd, 48, 18, key); - while (!driver.get_data_by_key(key)) { - int value = driver.get_data_by_key(key).get(); - std::cout << "gcd of " << x << " and " << y << " is " << value << std::endl; + std::string state = context.get_kv("state").value(); + switch (std::stoi(state)) { + case 0: + long_compute_1(); + context.store_kv("state", "1") // Keep running after update key-value store + case 1: + long_compute_2(); } } ``` ## Straggler mitigation -`Driver::register_task` can take a second argument for timeout milliseconds. If a task executes for -longer than the specified timeout, Spider spawns another task instance running the same function. -The task that finishes first wins. Other running task instances are cancelled, and associated data -is cleaned up. +`SPIDER_REGISTER_TASK_TIMEOUT` is same as `SPIDER_REGISTER_TASK`, but accepts a second argument as +timeout in milliseconds. If a task instance executes for longer than the specified timeout, Spider +spawns another task instance running the same function. The task instance that finishes first wins. +Other running task instances are cancelled, and associated data is cleaned up. -The new task has a different task id, and it is the responsibility of the user to avoid any data +The new task instance has a different id, and it is the responsibility of the user to avoid any data race and deduplicate the output if necessary. From 84c2f412831236c1b94e4eeeb4dd975a35b30244 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Wed, 20 Nov 2024 08:59:43 +0000 Subject: [PATCH 30/85] Fix clang-tidy --- src/spider/client/Context.hpp | 2 +- src/spider/client/Driver.hpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/spider/client/Context.hpp b/src/spider/client/Context.hpp index a33e33e..f76c235 100644 --- a/src/spider/client/Context.hpp +++ b/src/spider/client/Context.hpp @@ -78,7 +78,7 @@ class Context { * @return task graph representing the task dependencies. If none of args is a task or task * graph, returns a task graph with only one task */ - template + template auto bind(std::function const& task, Inputs&&... inputs) -> TaskGraph; diff --git a/src/spider/client/Driver.hpp b/src/spider/client/Driver.hpp index 0081387..ebd93c8 100644 --- a/src/spider/client/Driver.hpp +++ b/src/spider/client/Driver.hpp @@ -1,6 +1,7 @@ #ifndef SPIDER_CLIENT_DRIVER_HPP #define SPIDER_CLIENT_DRIVER_HPP +#include #include #include #include From f7ab0133f772b999778dc20340b8f58c87c6f463 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Thu, 21 Nov 2024 08:05:45 -0500 Subject: [PATCH 31/85] Refactor Context.hpp. --- src/spider/client/Context.hpp | 76 ++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 37 deletions(-) diff --git a/src/spider/client/Context.hpp b/src/spider/client/Context.hpp index f76c235..2445afb 100644 --- a/src/spider/client/Context.hpp +++ b/src/spider/client/Context.hpp @@ -36,12 +36,12 @@ class Context { * NOTE: Callers cannot get data created by other tasks, but they can get data created by * previous instances of the same task. * - * @tparam T Type of the value stored in data - * @param key Key of the data. + * @tparam Value + * @param key * @return An optional containing the data if the given key exists, or `std::nullopt` otherwise. */ - template - auto get_data(std::string const& key) -> std::optional>; + template + auto get_data(std::string const& key) -> std::optional>; /** * Inserts the given key-value pair into the key-value store, overwriting any existing value. @@ -67,51 +67,53 @@ class Context { * Binds inputs to a task. Input of the task can be bound from outputs of task or task graph, * forming dependencies between tasks. Input can also be a value or a spider::Data. * - * @tparam R return type of the task or task graph - * @tparam Args input types of task or task graph - * @tparam Inputs types of task, task graph, spider::Data or POD value - * @tparam GraphInputs input types of the new task graph - * - * @param task child task to be bound on - * @param inputs task or task graph whose outputs to bind to f, or value or spider::Data used as - * input - * @return task graph representing the task dependencies. If none of args is a task or task - * graph, returns a task graph with only one task + * @tparam ReturnType Return type for both the task and the resulting `TaskGraph`. + * @tparam TaskParams + * @tparam Inputs + * @tparam GraphParams + * @param task + * @param inputs Inputs to bind to `task`. If an input is a `Task` or `TaskGraph`, their + * outputs will be bound to the inputs of `task`. + * @return A `TaskGraph` of the inputs bound to `task`. */ - template - auto - bind(std::function const& task, Inputs&&... inputs) -> TaskGraph; + template < + Serializable ReturnType, + Serializable... TaskParams, + class... Inputs, + Serializable... GraphParams> + auto bind(std::function const& task, Inputs&&... inputs) + -> TaskGraph; /** - * Starts task on Spider. - * - * @tparam R return type of the task - * @tparam Args input types of the task + * Starts running a task with the given inputs on Spider. * - * @param task task to run - * @param args task input - * @return job representing the running task + * @tparam ReturnType + * @tparam Params + * @param task + * @param inputs + * @return A job representing the running task. */ - template - auto start(std::function const& task, Args&&... args) -> Job; + template + auto + start(std::function const& task, Params&&... inputs) -> Job; /** - * Starts task graph on Spider. - * - * @tparam R return type of the task graph - * @tparam Args input types of the task graph + * Starts running a task graph with the given inputs on Spider. * - * @param graph task graph to run - * @param args task input - * @return job representing the running task graph + * @tparam ReturnType + * @tparam Params + * @param graph + * @param inputs + * @return A job representing the running task graph. */ - template - auto start(TaskGraph const& graph, Args&&... args) -> Job; + template + auto + start(TaskGraph const& graph, Params&&... inputs) -> Job; /** - * Gets all jobs started by the task. + * Gets all jobs started by this task. * - * @return ids of the jobs + * @return IDs of the jobs. */ auto get_jobs() -> std::vector; From 302e68adffb8f0dfcccd5220c2f1b6daffba711a Mon Sep 17 00:00:00 2001 From: sitaowang1998 Date: Thu, 21 Nov 2024 09:01:42 -0500 Subject: [PATCH 32/85] style: Fix header guard name --- src/spider/client/TaskGraph.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/spider/client/TaskGraph.hpp b/src/spider/client/TaskGraph.hpp index ef8c04a..3ef986d 100644 --- a/src/spider/client/TaskGraph.hpp +++ b/src/spider/client/TaskGraph.hpp @@ -1,5 +1,5 @@ -#ifndef SPIDER_CLIENT_TASK_HPP -#define SPIDER_CLIENT_TASK_HPP +#ifndef SPIDER_CLIENT_TASKGRAPH_CPP +#define SPIDER_CLIENT_TASKGRAPH_CPP #include @@ -21,4 +21,4 @@ class TaskGraph { }; } // namespace spider -#endif // SPIDER_CLIENT_TASK_HPP +#endif // SPIDER_CLIENT_TASKGRAPH_CPP From a2dc8bc75cdeec5976efbaf12b19367ecb040326 Mon Sep 17 00:00:00 2001 From: sitaowang1998 Date: Thu, 21 Nov 2024 09:05:46 -0500 Subject: [PATCH 33/85] style: Rename Context to TaskContext --- docs/quick_start.md | 25 ++++++++++--------- src/spider/CMakeLists.txt | 4 +-- .../client/{Context.cpp => TaskContext.cpp} | 6 ++--- .../client/{Context.hpp => TaskContext.hpp} | 2 +- src/spider/client/spider.hpp | 2 +- 5 files changed, 20 insertions(+), 19 deletions(-) rename src/spider/client/{Context.cpp => TaskContext.cpp} (75%) rename src/spider/client/{Context.hpp => TaskContext.hpp} (99%) diff --git a/docs/quick_start.md b/docs/quick_start.md index b8fce29..e6656ef 100644 --- a/docs/quick_start.md +++ b/docs/quick_start.md @@ -41,8 +41,8 @@ auto main(int argc, char **argv) -> int { ## Create a task -In Spider, a task is a non-member function that takes the first argument a `spider::Context` object. -It can then take any number of arguments which is `Serializable`. +In Spider, a task is a non-member function that takes the first argument a `spider::TaskContext` +object. It can then take any number of arguments which is `Serializable`. Tasks can return any `Serialiable` value. If a task needs to return more than one result, uses `std::tuple` and makes sure all elements of the tuple are `Serializable`. @@ -53,12 +53,12 @@ name to be unique in the cluster. ```c++ // Task that sums to integers -auto sum(spider::Context &context, int x, int y) -> int { +auto sum(spider::TaskContext &context, int x, int y) -> int { return x + y; } // Task that sorts two integers in non-ascending order -auto sort(spider::Context &context, int x, int y) -> std::tuple { +auto sort(spider::TaskContext &context, int x, int y) -> std::tuple { if (x >= y) { return { x, y }; } @@ -111,11 +111,11 @@ as inputs for another task. You can run the task using `Driver::run` in the same single task. ```c++ -auto square(spider::Context& context, int x) -> int { +auto square(spider::TaskContext& context, int x) -> int { return x * x; } -auto square_root(spider::Context& context, int x) -> int { +auto square_root(spider::TaskContext& context, int x) -> int { return sqrt(x); } // task registration skipped @@ -136,7 +136,7 @@ on-the-fly could become handy. Running a task graph inside task is the same as r client. ```c++ -auto gcd(spider:Context& context, int x, int y) -> int { +auto gcd(spider:TaskContext& context, int x, int y) -> int { if (x < y) { std::swap(x, y); } @@ -149,7 +149,7 @@ auto gcd(spider:Context& context, int x, int y) -> int { return x; } -auto gcd_impl(spider::Context& context, int x, int y) -> std::tuple { +auto gcd_impl(spider::TaskContext& context, int x, int y) -> std::tuple { return { x, x % y}; } ``` @@ -201,7 +201,7 @@ auto main(int argc, char** argv) -> int { * @param input input file stored in Hdfs * @return temporary file store in Hdfs */ -auto filter(spider::Data input) -> spider::Data { +auto filter(spider::TaskContext& context, spider::Data input) -> spider::Data { // We can use task id as a unique random number. std::string const output_path = std::format("/path/%s", context.task_id()); std::string const input_path = input.get().url; @@ -229,7 +229,7 @@ auto filter(spider::Data input) -> spider::Data { * @param input input file stored in Hdfs * @return persisted output in Hdfs */ -auto map(spider::Data input) -> spider::Data { +auto map(spider::TaskContext& context, spider::Data input) -> spider::Data { // We use hardcoded path for simplicity in this example. You can pass in // the path as an input to the task or use task id as random name as in // filter. @@ -257,11 +257,12 @@ Spider provides exactly-once semantics in failure recovery. To achieve this, Spi tasks after a task fails. Tasks might want to keep some data around after restart. However, all the `Data` objects created by tasks are cleaned up on restart. Spider provides a key-value store for the restarted tasks and restarted clients to retrieve values stored by previous run by `insert_kv` -and `get_kv` from `Context` or `Driver`. Note that a task or client can only get the value created +and `get_kv` from `TaskContext` or `Driver`. Note that a task or client can only get the value +created by itself, and the two different tasks can store two different values using the same key. ```c++ -auto long_running(spider::Context& context) { +auto long_running(spider::TaskContext& context) { std::optional state_option = context.get_kv("state"); if (!state_option.has_value()) { long_compute_0(); diff --git a/src/spider/CMakeLists.txt b/src/spider/CMakeLists.txt index 131434b..38ad307 100644 --- a/src/spider/CMakeLists.txt +++ b/src/spider/CMakeLists.txt @@ -31,7 +31,7 @@ target_link_libraries(spider_core PRIVATE fmt::fmt) set(SPIDER_CLIENT_SHARED_SOURCES client/Data.cpp client/Driver.cpp - client/Context.cpp + client/TaskContext.cpp client/TaskGraph.cpp CACHE INTERNAL "spider client shared source files" @@ -40,7 +40,7 @@ set(SPIDER_CLIENT_SHARED_SOURCES set(SPIDER_CLIENT_SHARED_HEADERS client/Data.hpp client/Driver.hpp - client/Context.hpp + client/TaskContext.hpp client/TaskGraph.hpp client/Exception.hpp CACHE INTERNAL diff --git a/src/spider/client/Context.cpp b/src/spider/client/TaskContext.cpp similarity index 75% rename from src/spider/client/Context.cpp rename to src/spider/client/TaskContext.cpp index bcfaf5d..48298c8 100644 --- a/src/spider/client/Context.cpp +++ b/src/spider/client/TaskContext.cpp @@ -1,7 +1,7 @@ -#include "Context.hpp" - #include +#include "TaskContext.hpp" + namespace spider { class ContextImpl { @@ -12,7 +12,7 @@ class ContextImpl { boost::uuids::uuid m_id; }; -auto Context::get_id() const -> boost::uuids::uuid { +auto TaskContext::get_id() const -> boost::uuids::uuid { return m_impl->get_id(); } diff --git a/src/spider/client/Context.hpp b/src/spider/client/TaskContext.hpp similarity index 99% rename from src/spider/client/Context.hpp rename to src/spider/client/TaskContext.hpp index 2445afb..a093d91 100644 --- a/src/spider/client/Context.hpp +++ b/src/spider/client/TaskContext.hpp @@ -16,7 +16,7 @@ namespace spider { class ContextImpl; -class Context { +class TaskContext { public: /** * Aborts the current running task and job. This function never returns. diff --git a/src/spider/client/spider.hpp b/src/spider/client/spider.hpp index 41a1f6a..67fd005 100644 --- a/src/spider/client/spider.hpp +++ b/src/spider/client/spider.hpp @@ -2,10 +2,10 @@ #define SPIDER_CLIENT_SPIDER_HPP // IWYU pragma: begin_exports -#include "Context.hpp" #include "Data.hpp" #include "Driver.hpp" #include "Job.hpp" +#include "TaskContext.hpp" #include "TaskGraph.hpp" // IWYU pragma: end_exports From 046e740d76923cd37eacbd9dee3db84733949fed Mon Sep 17 00:00:00 2001 From: sitaowang1998 Date: Thu, 21 Nov 2024 09:32:13 -0500 Subject: [PATCH 34/85] style: Add missing class docstring --- src/spider/client/Data.hpp | 6 ------ src/spider/client/Driver.cpp | 17 +++++++++++++++++ src/spider/client/Driver.hpp | 4 ++++ src/spider/client/TaskContext.cpp | 4 ++-- src/spider/client/TaskContext.hpp | 5 +++++ 5 files changed, 28 insertions(+), 8 deletions(-) diff --git a/src/spider/client/Data.hpp b/src/spider/client/Data.hpp index 8226bea..7bf4b42 100644 --- a/src/spider/client/Data.hpp +++ b/src/spider/client/Data.hpp @@ -22,12 +22,6 @@ class DataImpl; * * Data is passed in as input so the tasks can get the value of the data. * - * Data could also be used as a key-value store. - * Example: - * spider::Data key_value_data = spider::Data::Builder() - * .set_key("key") - * .build("value"); - * * @tparam T type of the value. */ template diff --git a/src/spider/client/Driver.cpp b/src/spider/client/Driver.cpp index e69de29..0ae8bd3 100644 --- a/src/spider/client/Driver.cpp +++ b/src/spider/client/Driver.cpp @@ -0,0 +1,17 @@ +#include "Driver.hpp" + +#include + +namespace spider { + +class DriverImpl { +public: + DriverImpl() = default; + +private: + boost::uuids::uuid m_id; +}; + +Driver::Driver(std::string const& /*url*/) {} + +} // namespace spider diff --git a/src/spider/client/Driver.hpp b/src/spider/client/Driver.hpp index ebd93c8..a638f36 100644 --- a/src/spider/client/Driver.hpp +++ b/src/spider/client/Driver.hpp @@ -34,6 +34,10 @@ namespace spider { class DriverImpl; +/** + * Driver provides Spider functionalities for a client, e.g. accessing data storage, creating new + * jobs. + */ class Driver { public: /** diff --git a/src/spider/client/TaskContext.cpp b/src/spider/client/TaskContext.cpp index 48298c8..e9502bf 100644 --- a/src/spider/client/TaskContext.cpp +++ b/src/spider/client/TaskContext.cpp @@ -1,7 +1,7 @@ -#include - #include "TaskContext.hpp" +#include + namespace spider { class ContextImpl { diff --git a/src/spider/client/TaskContext.hpp b/src/spider/client/TaskContext.hpp index a093d91..4882bf8 100644 --- a/src/spider/client/TaskContext.hpp +++ b/src/spider/client/TaskContext.hpp @@ -16,6 +16,11 @@ namespace spider { class ContextImpl; +/** + * TaskContext provides a task with all Spider functionalities, e.g. getting task instance id, + * accessing data storage, creating and waiting for new jobs, etc. + * TaskContext is provided as first argument to a task. + */ class TaskContext { public: /** From 92d64898def424e2d96df6aebedf01f7a265baec Mon Sep 17 00:00:00 2001 From: sitaowang1998 Date: Thu, 21 Nov 2024 10:29:21 -0500 Subject: [PATCH 35/85] feat: Add concepts for task argument --- src/spider/CMakeLists.txt | 1 + src/spider/client/Concepts.hpp | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 src/spider/client/Concepts.hpp diff --git a/src/spider/CMakeLists.txt b/src/spider/CMakeLists.txt index 38ad307..0c67d03 100644 --- a/src/spider/CMakeLists.txt +++ b/src/spider/CMakeLists.txt @@ -43,6 +43,7 @@ set(SPIDER_CLIENT_SHARED_HEADERS client/TaskContext.hpp client/TaskGraph.hpp client/Exception.hpp + client/Concepts.hpp CACHE INTERNAL "spider client shared header files" ) diff --git a/src/spider/client/Concepts.hpp b/src/spider/client/Concepts.hpp new file mode 100644 index 0000000..effd197 --- /dev/null +++ b/src/spider/client/Concepts.hpp @@ -0,0 +1,26 @@ +#ifndef SPIDER_CLIENT_CONCEPTS_HPP +#define SPIDER_CLIENT_CONCEPTS_HPP + +#include "../core/Serializer.hpp" +#include "Data.hpp" + +namespace spider { + +namespace { +template class> +struct IsSpecialization: public std::false_type {}; + +template