From 6adb2902107c29ba8770e20a5e8a5a1dd0f750c0 Mon Sep 17 00:00:00 2001 From: reyna-abhyankar Date: Sat, 24 Aug 2024 23:43:58 -0700 Subject: [PATCH 01/22] temporary weight adjust index --- lib/local-execution/src/local_slots_backing.cc | 14 +++++++++++--- .../test/src/test_local_slots_backing.cc | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index 0ec9068c6a..c8d186a0fe 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -76,13 +76,22 @@ GenericTensorAccessorW const & TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( OpTaskBinding const &binding, layer_guid_t const &op_guid) const { TensorSlotsBacking mapping; + int num_inputs = 0; + for (auto const &tensor_binding : binding.get_tensor_bindings()) { + if (tensor_binding.first.is_grad == IsGrad::NO && tensor_binding.second.role == TensorRole::INPUT) { + num_inputs += 1; + } + } + for (auto const &tensor_binding : binding.get_tensor_bindings()) { SlotGradId slot_grad_id = tensor_binding.first; OpTensorSpec tensor_spec = tensor_binding.second; std::vector tensor_guids; + int weight_adjusted_idx = 0; switch (tensor_spec.role) { - case TensorRole::INPUT: case TensorRole::WEIGHT: + weight_adjusted_idx = num_inputs; + case TensorRole::INPUT: assert(contains_key(this->input_tensor_slots, op_guid)); tensor_guids = this->input_tensor_slots.at(op_guid); break; @@ -96,10 +105,9 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( // "type_is_unformattable" error } - assert(tensor_guids.size() > tensor_spec.idx); IsGrad is_grad = slot_grad_id.is_grad; GenericTensorAccessorW tensor_backing = - this->get_tensor_backing(tensor_guids.at(tensor_spec.idx), is_grad); + this->get_tensor_backing(tensor_guids.at(weight_adjusted_idx + tensor_spec.idx), is_grad); mapping.insert({slot_grad_id, tensor_backing}); } diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc index 542aa66087..e31e7cf2b4 100644 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ b/lib/local-execution/test/src/test_local_slots_backing.cc @@ -188,7 +188,7 @@ TEST_SUITE(FF_TEST_SUITE) { b.bind(QUERY, input_tensor(0)); b.bind(KEY, input_tensor(1)); b.bind(VALUE, input_tensor(2)); - b.bind(WEIGHTS, weight_tensor(3)); + b.bind(WEIGHTS, weight_tensor(0)); b.bind(OUTPUT, output_tensor(0)); b.bind_grad(QUERY, input_tensor(0)); From 61697c2a30338ae39fa10ef35899f519c8d2e514 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 27 Aug 2024 01:45:52 -0700 Subject: [PATCH 02/22] Loss function --- lib/kernels/CMakeLists.txt | 1 + .../include/kernels/optimizer_kernels.h | 6 +- ...timizer_kernel.cu => optimizer_kernels.cu} | 1 + .../generic_task_impl_function.h | 33 +++++++ .../local-execution/local_slots_backing.h | 11 ++- .../local-execution/local_training_backing.h | 11 ++- .../include/local-execution}/loss_functions.h | 22 ++--- .../model_training_instance.struct.toml | 26 +++++ .../task_arg_spec.variant.toml | 18 ++++ .../task_impl_function.variant.toml | 5 + .../include/local-execution/task_invocation.h | 71 ++++++++++++++ .../include/local-execution/task_signature.h | 57 +++++++++++ .../task_signature.struct.toml | 29 ++++++ .../tensor_guid_slot_spec.struct.toml | 27 ++++++ .../tensor_guid_spec.struct.toml | 22 +++++ .../src/generic_task_impl_function.cc | 53 ++++++++++ 
.../src/local_cost_estimator.cc | 3 +- .../src/local_slots_backing.cc | 51 +++++++++- .../src/local_training_backing.cc | 50 +++++++++- .../src/loss_functions.cc | 96 ++++++++----------- lib/local-execution/src/ops/attention.cc | 2 +- .../local-execution => src}/ops/attention.h | 0 lib/local-execution/src/task_invocation.cc | 49 ++++++++++ lib/local-execution/src/task_signature.cc | 25 +++++ .../src/task_signature_impl.cc | 2 +- .../test/src/test_task_registry.cc | 1 - .../op-attrs/ops/loss_attrs.variant.toml | 22 +++++ .../op-attrs/ops/loss_function.enum.toml | 23 +++++ .../include/op-attrs/ops/loss_functions.h | 68 +------------ .../op-attrs/ops/other_loss_attrs.struct.toml | 18 ++++ ...arse_categorical_ce_loss_attrs.struct.toml | 14 +++ lib/op-attrs/src/loss_functions.cc | 25 ++--- 32 files changed, 671 insertions(+), 171 deletions(-) rename lib/kernels/src/cuda/{optimizer_kernel.cu => optimizer_kernels.cu} (99%) create mode 100644 lib/local-execution/include/local-execution/generic_task_impl_function.h rename lib/{runtime/src => local-execution/include/local-execution}/loss_functions.h (63%) create mode 100644 lib/local-execution/include/local-execution/model_training_instance.struct.toml create mode 100644 lib/local-execution/include/local-execution/task_arg_spec.variant.toml create mode 100644 lib/local-execution/include/local-execution/task_invocation.h create mode 100644 lib/local-execution/include/local-execution/task_signature.h create mode 100644 lib/local-execution/include/local-execution/task_signature.struct.toml create mode 100644 lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml create mode 100644 lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml create mode 100644 lib/local-execution/src/generic_task_impl_function.cc rename lib/{runtime => local-execution}/src/loss_functions.cc (63%) rename lib/local-execution/{include/local-execution => src}/ops/attention.h (100%) create mode 100644 lib/local-execution/src/task_invocation.cc create mode 100644 lib/local-execution/src/task_signature.cc create mode 100644 lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml create mode 100644 lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml create mode 100644 lib/op-attrs/include/op-attrs/ops/other_loss_attrs.struct.toml create mode 100644 lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index 8ccd7c1011..54fa3c9583 100644 --- a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -8,6 +8,7 @@ file(GLOB_RECURSE SRC LIST_DIRECTORIES False src/*.cc src/cuda/cuda_helper.cu + src/cuda/loss_functions_kernels.cu src/cuda/ops/*.cu ) diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index 9ca6bf8e2b..fcbf9454f8 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -1,7 +1,8 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H -#include "device.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" namespace FlexFlow { @@ -20,7 +21,8 @@ void sgd_nccl_update_task_gpu(ffStream_t, float lr, float momentum, bool nesterov, - float weight_decay PerDeviceFFHandle const &, + float weight_decay, + PerDeviceFFHandle const &, float const *weight_grad_ptr, size_t size, float *weight_ptr, diff --git 
a/lib/kernels/src/cuda/optimizer_kernel.cu b/lib/kernels/src/cuda/optimizer_kernels.cu similarity index 99% rename from lib/kernels/src/cuda/optimizer_kernel.cu rename to lib/kernels/src/cuda/optimizer_kernels.cu index 439eed9dec..1bb38b2870 100644 --- a/lib/kernels/src/cuda/optimizer_kernel.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -13,6 +13,7 @@ * limitations under the License. */ +#include "device.h" #include "kernels/optimizer_kernels.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/generic_task_impl_function.h b/lib/local-execution/include/local-execution/generic_task_impl_function.h new file mode 100644 index 0000000000..425740f61d --- /dev/null +++ b/lib/local-execution/include/local-execution/generic_task_impl_function.h @@ -0,0 +1,33 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_GENERIC_TASK_IMPL_FUNCTION_H +#define _FLEXFLOW_LOCAL_EXECUTION_GENERIC_TASK_IMPL_FUNCTION_H + +#include "local-execution/device_specific_device_states.dtg.h" +#include "local-execution/task_argument_accessor.h" + +namespace FlexFlow { + +struct GenericTaskImplFunction { + + void (*function_ptr)(TaskArgumentAccessor const &); + + bool operator==(GenericTaskImplFunction const &) const; + bool operator!=(GenericTaskImplFunction const &) const; + bool operator<(GenericTaskImplFunction const &) const; + bool operator>(GenericTaskImplFunction const &) const; + bool operator<=(GenericTaskImplFunction const &) const; + bool operator>=(GenericTaskImplFunction const &) const; +}; + +std::string format_as(GenericTaskImplFunction const &x); +std::ostream &operator<<(std::ostream &s, GenericTaskImplFunction const &x); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash<::FlexFlow::GenericTaskImplFunction> { + size_t operator()(::FlexFlow::GenericTaskImplFunction const &) const; +}; +} // namespace std + +#endif diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h index 6a0c28e988..312a13cc01 100644 --- a/lib/local-execution/include/local-execution/local_slots_backing.h +++ b/lib/local-execution/include/local-execution/local_slots_backing.h @@ -7,6 +7,7 @@ #include "local-execution/op_task_invocation.h" #include "local-execution/per_device_op_state.h" #include "local-execution/runtime_arg_config.h" +#include "local-execution/task_invocation.h" namespace FlexFlow { @@ -19,23 +20,29 @@ struct LocalSlotsBacking { public: void add_per_device_op_state(layer_guid_t const &, DeviceSpecificDeviceStates const &); + void allocate_label_tensor(tensor_guid_t const &, + ComputationGraph const &, + Allocator &); void allocate_outgoing_tensors(layer_guid_t const &, ComputationGraph const &, Allocator &); TensorSlotsBacking construct_tensor_slots_backing(OpTaskBinding const &, layer_guid_t const &) const; + TensorSlotsBacking construct_tensor_slots_backing(TaskBinding const &) const; ArgSlotsBacking construct_arg_slots_backing(OpTaskBinding const &, layer_guid_t const &) const; + ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &) const; ConcreteArgSpec resolve_runtime_arg_ref_spec(RuntimeArgRefSpec const &) const; ConcreteArgSpec resolve_op_arg_ref_spec(OpArgRefSpec const &, layer_guid_t const &) const; + GenericTensorAccessorW const &get_tensor_backing(tensor_guid_t const &, + IsGrad) const; + private: bool is_tensor_allocated(tensor_guid_t const &) const; bool is_gradient_tensor_allocated(tensor_guid_t const &) const; - GenericTensorAccessorW const 
&get_tensor_backing(tensor_guid_t const &,
-                                             IsGrad) const;
 
 public:
   // tensors
diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h
index b398bb8cc3..55983086c2 100644
--- a/lib/local-execution/include/local-execution/local_training_backing.h
+++ b/lib/local-execution/include/local-execution/local_training_backing.h
@@ -2,7 +2,9 @@
 #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H
 
 #include "local-execution/local_slots_backing.h"
+#include "local-execution/model_training_instance.dtg.h"
 #include "local-execution/task_registry.h"
+#include "op-attrs/ops/loss_functions.h"
 
 namespace FlexFlow {
 
@@ -13,15 +15,17 @@ struct LocalTrainingBacking {
   LocalTrainingBacking(Allocator const &,
                        ComputationGraph const &,
                        TensorBackingMap const &,
-                       RuntimeArgConfig const &);
+                       RuntimeArgConfig const &,
+                       std::optional<ModelTrainingInstance> const &);
 
   void execute_init();
   PerLayerElapsedTime execute_forward();
   PerLayerElapsedTime execute_backward();
   void execute_update();
 
-  TaskArgumentAccessor get_task_arg_accessor(OpTaskInvocation const &,
-                                             layer_guid_t const &) const;
+  TaskArgumentAccessor get_task_arg_accessor(TaskInvocation const &) const;
+  TaskArgumentAccessor get_op_task_arg_accessor(OpTaskInvocation const &,
+                                                layer_guid_t const &) const;
 
 private:
   DeviceSpecificDeviceStates call_init_task_impl(task_id_t,
@@ -33,6 +37,7 @@ struct LocalTrainingBacking {
   ComputationGraph computation_graph;
   TaskRegistry task_registry;
   LocalSlotsBacking local_slots_backing;
+  std::optional<ModelTrainingInstance> training_instance;
 };
 
 } // namespace FlexFlow
diff --git a/lib/runtime/src/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h
similarity index 63%
rename from lib/runtime/src/loss_functions.h
rename to lib/local-execution/include/local-execution/loss_functions.h
index 620ebc6936..e5e81b60a7 100644
--- a/lib/runtime/src/loss_functions.h
+++ b/lib/local-execution/include/local-execution/loss_functions.h
@@ -13,24 +13,20 @@
  * limitations under the License.
*/ -#ifndef _FF_LOSS_FUNCTIONS_H_ -#define _FF_LOSS_FUNCTIONS_H_ +#ifndef _FLEXFLOW_LOSS_FUNCTIONS_H_ +#define _FLEXFLOW_LOSS_FUNCTIONS_H_ +#include "local-execution/task_impl_function.dtg.h" +#include "local-execution/task_invocation.h" +#include "local-execution/task_signature.h" #include "op-attrs/ops/loss_functions.h" -#include "pcg/operator.h" -#include "pcg/parallel_tensor.h" -#include "pcg/parallel_tensor_guid_t.h" -#include "task_spec/task_invocation.h" -#include "tasks.h" namespace FlexFlow { -template <> -void register_task(); - -TaskInvocation backward(LossAttrs const &, - parallel_tensor_guid_t logit, - parallel_tensor_guid_t label); +TaskImplFunction get_loss_bwd_task_impl(); +TaskSignature get_loss_bwd_signature(); +TaskInvocation + backward(LossAttrs const &, tensor_guid_t logit, tensor_guid_t label); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/model_training_instance.struct.toml b/lib/local-execution/include/local-execution/model_training_instance.struct.toml new file mode 100644 index 0000000000..ea7e8d24ab --- /dev/null +++ b/lib/local-execution/include/local-execution/model_training_instance.struct.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "ModelTrainingInstance" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "utils/optional.h", + "op-attrs/ops/loss_attrs.dtg.h", + "pcg/tensor_guid_t.dtg.h", +] + +[[fields]] +name = "loss_attrs" +type = "::FlexFlow::LossAttrs" + +[[fields]] +name = "label_tensor" +type = "::FlexFlow::tensor_guid_t" + +[[fields]] +name = "logit_tensor" +type = "::FlexFlow::tensor_guid_t" diff --git a/lib/local-execution/include/local-execution/task_arg_spec.variant.toml b/lib/local-execution/include/local-execution/task_arg_spec.variant.toml new file mode 100644 index 0000000000..a6df0c8a7d --- /dev/null +++ b/lib/local-execution/include/local-execution/task_arg_spec.variant.toml @@ -0,0 +1,18 @@ +namespace = "FlexFlow" +name = "TaskArgSpec" +features = [ + "eq" +] + +includes = [ + "local-execution/concrete_arg.h", + "local-execution/runtime_arg_ref.h" +] + +[[values]] +type = "::FlexFlow::ConcreteArgSpec" +key = "concrete_arg_spec" + +[[values]] +type = "::FlexFlow::RuntimeArgRefSpec" +key = "runtime_arg_ref" diff --git a/lib/local-execution/include/local-execution/task_impl_function.variant.toml b/lib/local-execution/include/local-execution/task_impl_function.variant.toml index a12be37da2..1be18bebfa 100644 --- a/lib/local-execution/include/local-execution/task_impl_function.variant.toml +++ b/lib/local-execution/include/local-execution/task_impl_function.variant.toml @@ -10,6 +10,7 @@ features = [ includes = [ "local-execution/init_task_impl_function.h", "local-execution/fwd_bwd_task_impl_function.h", + "local-execution/generic_task_impl_function.h", ] [[values]] @@ -19,3 +20,7 @@ key = "init_task_impl_function" [[values]] type = "::FlexFlow::FwdBwdTaskImplFunction" key = "fwd_bwd_task_impl_function" + +[[values]] +type = "::FlexFlow::GenericTaskImplFunction" +key = "generic_task_impl_function" diff --git a/lib/local-execution/include/local-execution/task_invocation.h b/lib/local-execution/include/local-execution/task_invocation.h new file mode 100644 index 0000000000..2317c65c02 --- /dev/null +++ b/lib/local-execution/include/local-execution/task_invocation.h @@ -0,0 +1,71 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_INVOCATION_H +#define _FLEXFLOW_LOCAL_EXECUTION_TASK_INVOCATION_H + +#include "local-execution/slot_grad_id.dtg.h" +#include "local-execution/slot_id_t.dtg.h" 
+#include "local-execution/task_arg_spec.dtg.h"
+#include "local-execution/task_id_t.dtg.h"
+#include "local-execution/task_signature.dtg.h"
+#include "local-execution/tensor_guid_spec.dtg.h"
+
+namespace FlexFlow {
+
+struct TaskBinding {
+  TaskBinding() = default;
+
+  void bind(int, TensorGuidSpec const &);
+  void bind(slot_id_t, TensorGuidSpec const &);
+
+  template <typename T>
+  void bind_arg(int name, T const &t) {
+    this->bind_arg(slot_id_t{name}, t);
+  }
+
+  template <typename T>
+  void bind_arg(slot_id_t name, T const &t) {
+    this->insert_arg_spec(name, TaskArgSpec{ConcreteArgSpec::create(t)});
+  }
+
+  template <typename T>
+  void bind_arg(int name, RuntimeArgRef<T> const &t) {
+    this->bind_arg(slot_id_t{name}, t);
+  }
+
+  template <typename T>
+  void bind_arg(slot_id_t name, RuntimeArgRef<T> const &ref) {
+    this->insert_arg_spec(name, TaskArgSpec{RuntimeArgRefSpec::create(ref)});
+  }
+
+  bool operator==(TaskBinding const &other) const;
+  bool operator!=(TaskBinding const &other) const;
+
+  std::unordered_map<SlotGradId, TensorGuidSpec> const &
+      get_tensor_bindings() const;
+  std::unordered_map<slot_id_t, TaskArgSpec> const &get_arg_bindings() const;
+
+private:
+  std::unordered_map<SlotGradId, TensorGuidSpec> tensor_bindings;
+  std::unordered_map<slot_id_t, TaskArgSpec> arg_bindings;
+
+private:
+  void insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec);
+  std::tuple<decltype(tensor_bindings) const &, decltype(arg_bindings) const &>
+      tie() const;
+};
+
+struct TaskInvocation {
+public:
+  TaskInvocation() = delete;
+  TaskInvocation(task_id_t task_id, TaskBinding const &binding)
+      : task_id(task_id), binding(binding) {}
+
+public:
+  task_id_t task_id;
+  TaskBinding binding;
+};
+
+bool is_invocation_valid(TaskSignature const &sig, TaskInvocation const &inv);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/local-execution/include/local-execution/task_signature.h b/lib/local-execution/include/local-execution/task_signature.h
new file mode 100644
index 0000000000..d31a67e027
--- /dev/null
+++ b/lib/local-execution/include/local-execution/task_signature.h
@@ -0,0 +1,57 @@
+#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_H
+#define _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_H
+
+// #include "local-execution/tensor_guid_slot_spec.dtg.h"
+// #include "local-execution/serialization.h"
+// #include "utils/hash/unordered_map.h"
+// #include "utils/hash/unordered_set.h"
+// #include "utils/type_index.h"
+
+#include "local-execution/task_signature.dtg.h"
+
+namespace FlexFlow {
+
+TaskSignature make_empty_task_signature();
+
+void add_slot(TaskSignature &,
+              int name,
+              IsGrad,
+              SlotType slot_type = SlotType::TENSOR);
+void add_slot(TaskSignature &,
+              slot_id_t name,
+              IsGrad,
+              SlotType slot_type = SlotType::TENSOR);
+
+template <typename T>
+void add_arg_slot(TaskSignature &task_signature, int name) {
+  add_arg_slot<T>(task_signature, slot_id_t{name});
+}
+
+template <typename T>
+void add_arg_slot(TaskSignature &task_signature, slot_id_t name) {
+  // static_assert(is_serializable<T>::value, "Type must be serializable");
+  task_signature.task_arg_types.insert({name, get_type_index_for_type<T>()});
+}
+
+template <typename T>
+void add_return_value(TaskSignature &task_signature) {
+  task_signature.return_value = get_type_index_for_type<T>();
+}
+
+// adds arg_slot without checking is_serializable, used for arguments that are
+// deviceSpecific
+template <typename T>
+void add_unchecked_arg_slot(TaskSignature &task_signature, int name) {
+  add_unchecked_arg_slot<T>(task_signature, slot_id_t{name});
+}
+
+// adds arg_slot without checking is_serializable, used for arguments that are
+// deviceSpecific
+template <typename T>
+void add_unchecked_arg_slot(TaskSignature &task_signature, slot_id_t name) {
+  task_signature.task_arg_types.insert({name, get_type_index_for_type<T>()});
+}
+
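+// Usage sketch (not part of the API itself; slot names and attr types are
+// assumed from the loss task added later in this patch -- compare
+// get_loss_bwd_signature() in loss_functions.cc below):
+//
+//   TaskSignature sig = make_empty_task_signature();
+//   add_slot(sig, LOGIT, IsGrad::NO);
+//   add_slot(sig, LABEL, IsGrad::NO);
+//   add_slot(sig, LOGIT, IsGrad::YES);
+//   add_arg_slot<LossAttrs>(sig, ATTRS);
+//   add_arg_slot<ProfilingSettings>(sig, PROFILING);
+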
+} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/task_signature.struct.toml b/lib/local-execution/include/local-execution/task_signature.struct.toml new file mode 100644 index 0000000000..f86f7b0c57 --- /dev/null +++ b/lib/local-execution/include/local-execution/task_signature.struct.toml @@ -0,0 +1,29 @@ +namespace = "FlexFlow" +name = "TaskSignature" +features = [ + "eq", + "fmt", +] + +includes = [ + "local-execution/tensor_guid_slot_spec.dtg.h", + "utils/type_index.h", + "utils/optional.h" +] + +src_includes = [ + "utils/fmt/unordered_map.h", + "utils/fmt/unordered_set.h", +] + +[[fields]] +name = "return_value" +type = "std::optional" + +[[fields]] +name = "task_arg_types" +type = "std::unordered_map<::FlexFlow::slot_id_t, std::type_index>" + +[[fields]] +name = "tensor_guid_slots" +type = "std::unordered_set<::FlexFlow::TensorGuidSlotSpec>" diff --git a/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml b/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml new file mode 100644 index 0000000000..4b3e5b2674 --- /dev/null +++ b/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml @@ -0,0 +1,27 @@ +namespace = "FlexFlow" +name = "TensorGuidSlotSpec" +features = [ + "eq", + "fmt", + "hash", + "ord", +] + +includes = [ + "local-execution/slot_id_t.dtg.h", + "local-execution/slot_type.dtg.h", + "local-execution/is_grad.dtg.h", +] + +[[fields]] +name = "name" +type = "::FlexFlow::slot_id_t" + +[[fields]] +name = "slot_type" +type = "::FlexFlow::SlotType" + +[[fields]] +name = "is_grad" +type = "::FlexFlow::IsGrad" + diff --git a/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml b/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml new file mode 100644 index 0000000000..a51d6ccf1b --- /dev/null +++ b/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml @@ -0,0 +1,22 @@ +namespace = "FlexFlow" +name = "TensorGuidSpec" +features = [ + "eq", + "fmt", + "hash", + "ord" +] + +includes = [ + "pcg/tensor_guid_t.dtg.h", + "local-execution/is_grad.dtg.h", +] + +[[fields]] +name = "tensor_guid" +type = "::FlexFlow::tensor_guid_t" + +[[fields]] +name = "is_grad" +type = "::FlexFlow::IsGrad" + diff --git a/lib/local-execution/src/generic_task_impl_function.cc b/lib/local-execution/src/generic_task_impl_function.cc new file mode 100644 index 0000000000..87d4db53e6 --- /dev/null +++ b/lib/local-execution/src/generic_task_impl_function.cc @@ -0,0 +1,53 @@ +#include "local-execution/generic_task_impl_function.h" + +namespace FlexFlow { + +bool GenericTaskImplFunction::operator==( + GenericTaskImplFunction const &other) const { + return this->function_ptr == other.function_ptr; +} + +bool GenericTaskImplFunction::operator!=( + GenericTaskImplFunction const &other) const { + return this->function_ptr != other.function_ptr; +} + +bool GenericTaskImplFunction::operator<( + GenericTaskImplFunction const &other) const { + return this->function_ptr < other.function_ptr; +} + +bool GenericTaskImplFunction::operator>( + GenericTaskImplFunction const &other) const { + return this->function_ptr > other.function_ptr; +} + +bool GenericTaskImplFunction::operator<=( + GenericTaskImplFunction const &other) const { + return this->function_ptr <= other.function_ptr; +} + +bool GenericTaskImplFunction::operator>=( + GenericTaskImplFunction const &other) const { + return this->function_ptr >= other.function_ptr; +} + +std::string 
format_as(GenericTaskImplFunction const &x) {
+  std::ostringstream oss;
+  oss << "<GenericTaskImplFunction>";
+  return oss.str();
+}
+std::ostream &operator<<(std::ostream &s, GenericTaskImplFunction const &x) {
+  return s << fmt::to_string(x);
+}
+
+} // namespace FlexFlow
+
+namespace std {
+size_t hash<::FlexFlow::GenericTaskImplFunction>::operator()(
+    ::FlexFlow::GenericTaskImplFunction const &x) const {
+  return std::hash<decltype(x.function_ptr)>{}(x.function_ptr);
+}
+} // namespace std
diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc
index d4e0467cbf..1ca422d8e1 100644
--- a/lib/local-execution/src/local_cost_estimator.cc
+++ b/lib/local-execution/src/local_cost_estimator.cc
@@ -75,7 +75,8 @@ CostDetails LocalCostEstimator::estimate_cost(
   LocalTrainingBacking local_backing(allocator,
                                      cg_builder.computation_graph,
                                      tensor_backing_map,
-                                     this->runtime_arg_config);
+                                     this->runtime_arg_config,
+                                     std::nullopt);
 
   local_backing.execute_init();
   PerLayerElapsedTime fwd = local_backing.execute_forward();
diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc
index c8d186a0fe..967f8d9ba3 100644
--- a/lib/local-execution/src/local_slots_backing.cc
+++ b/lib/local-execution/src/local_slots_backing.cc
@@ -15,6 +15,14 @@ void LocalSlotsBacking::add_per_device_op_state(
   this->per_device_op_states.insert({op_guid, device_state});
 }
 
+void LocalSlotsBacking::allocate_label_tensor(tensor_guid_t const &label_tensor,
+                                              ComputationGraph const &cg,
+                                              Allocator &allocator) {
+  GenericTensorAccessorW tensor_backing =
+      allocator.allocate_tensor(get_tensor_attrs(cg, label_tensor).shape);
+  this->tensor_mapping.insert({label_tensor, tensor_backing});
+}
+
 void LocalSlotsBacking::allocate_outgoing_tensors(
     layer_guid_t const &layer_guid,
     ComputationGraph const &computation_graph,
@@ -78,7 +86,8 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
   TensorSlotsBacking mapping;
   int num_inputs = 0;
   for (auto const &tensor_binding : binding.get_tensor_bindings()) {
-    if (tensor_binding.first.is_grad == IsGrad::NO && tensor_binding.second.role == TensorRole::INPUT) {
+    if (tensor_binding.first.is_grad == IsGrad::NO &&
+        tensor_binding.second.role == TensorRole::INPUT) {
       num_inputs += 1;
     }
   }
@@ -90,7 +99,7 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
     int weight_adjusted_idx = 0;
     switch (tensor_spec.role) {
       case TensorRole::WEIGHT:
-        weight_adjusted_idx = num_inputs;
+        weight_adjusted_idx = num_inputs;
       case TensorRole::INPUT:
         assert(contains_key(this->input_tensor_slots, op_guid));
         tensor_guids = this->input_tensor_slots.at(op_guid);
@@ -106,14 +115,30 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
     }
 
     IsGrad is_grad = slot_grad_id.is_grad;
-    GenericTensorAccessorW tensor_backing =
-        this->get_tensor_backing(tensor_guids.at(weight_adjusted_idx + tensor_spec.idx), is_grad);
+    GenericTensorAccessorW tensor_backing = this->get_tensor_backing(
+        tensor_guids.at(weight_adjusted_idx + tensor_spec.idx), is_grad);
 
     mapping.insert({slot_grad_id, tensor_backing});
   }
 
   return mapping;
 }
 
+TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
+    TaskBinding const &binding) const {
+  TensorSlotsBacking mapping;
+
+  for (auto const &tensor_binding : binding.get_tensor_bindings()) {
+    SlotGradId slot_grad_id = tensor_binding.first;
+    TensorGuidSpec tensor_spec = tensor_binding.second;
+
+    GenericTensorAccessorW accessor =
+        this->get_tensor_backing(tensor_spec.tensor_guid, slot_grad_id.is_grad);
+
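+    // Note: unlike the OpTaskBinding path above, a TaskBinding slot already
+    // carries a concrete tensor_guid_t, so no input/weight index adjustment
+    // is needed; the guid/grad pair resolves directly to its backing.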
mapping.insert({slot_grad_id, accessor}); + } + + return mapping; +} + ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing( OpTaskBinding const &binding, layer_guid_t const &op_guid) const { ArgSlotsBacking mapping; @@ -135,6 +160,24 @@ ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing( return mapping; } +ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing( + TaskBinding const &binding) const { + ArgSlotsBacking mapping; + for (auto const &arg_binding : binding.get_arg_bindings()) { + slot_id_t arg_slot = arg_binding.first; + TaskArgSpec task_arg_spec = arg_binding.second; + + mapping.insert({arg_slot, + task_arg_spec.visit(overload{ + [&](RuntimeArgRefSpec const &s) { + return this->resolve_runtime_arg_ref_spec(s); + }, + [](ConcreteArgSpec const &s) { return s; }, + })}); + } + return mapping; +} + ConcreteArgSpec LocalSlotsBacking::resolve_op_arg_ref_spec( OpArgRefSpec const &op_arg_ref_spec, layer_guid_t const &op_guid) const { if (op_arg_ref_spec.holds()) { diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index a2ee06a95a..f54d0ddaad 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -1,4 +1,5 @@ #include "local-execution/local_training_backing.h" +#include "local-execution/loss_functions.h" #include "local-execution/task_signature_impl.h" #include "utils/containers/reversed.h" #include "utils/exception.h" @@ -9,10 +10,12 @@ LocalTrainingBacking::LocalTrainingBacking( Allocator const &allocator, ComputationGraph const &computation_graph, TensorBackingMap const &tensor_backing_mapping, - RuntimeArgConfig const &runtime_arg_config) + RuntimeArgConfig const &runtime_arg_config, + std::optional const &training_instance) : allocator(allocator), computation_graph(computation_graph), local_slots_backing(tensor_backing_mapping, runtime_arg_config), - task_registry(empty_task_registry()) { + task_registry(empty_task_registry()), + training_instance(training_instance) { for (layer_guid_t const &node : topological_ordering(computation_graph)) { ComputationGraphOpAttrs attrs = @@ -25,6 +28,13 @@ LocalTrainingBacking::LocalTrainingBacking( // register tasks register_tasks_for_layer(this->task_registry, node, attrs); } + + if (this->training_instance.has_value()) { + this->local_slots_backing.allocate_label_tensor( + this->training_instance.value().label_tensor, + computation_graph, + this->allocator); + } } DeviceSpecificDeviceStates @@ -56,7 +66,7 @@ void LocalTrainingBacking::execute_init() { OpTaskInvocation invocation = init(attrs); TaskArgumentAccessor accessor = - this->get_task_arg_accessor(invocation, operator_node); + this->get_op_task_arg_accessor(invocation, operator_node); DeviceSpecificDeviceStates device_state = this->call_init_task_impl(invocation.task_id, accessor); this->local_slots_backing.add_per_device_op_state(operator_node, @@ -67,6 +77,7 @@ void LocalTrainingBacking::execute_init() { PerLayerElapsedTime LocalTrainingBacking::execute_forward() { PerLayerElapsedTime per_op_elapsed_time; + for (layer_guid_t const &operator_node : topological_ordering(this->computation_graph)) { if (this->task_registry.forward_task_ids.at(operator_node).has_value()) { @@ -75,17 +86,35 @@ PerLayerElapsedTime LocalTrainingBacking::execute_forward() { OpTaskInvocation invocation = forward(attrs); TaskArgumentAccessor accessor = - this->get_task_arg_accessor(invocation, operator_node); + this->get_op_task_arg_accessor(invocation, 
operator_node); std::optional elapsed_time = this->call_task_impl(invocation.task_id, accessor); per_op_elapsed_time.insert({operator_node, elapsed_time}); } } + return per_op_elapsed_time; } PerLayerElapsedTime LocalTrainingBacking::execute_backward() { PerLayerElapsedTime per_op_elapsed_time; + + // compute loss + if (this->training_instance.has_value()) { + ModelTrainingInstance unwrapped_training_instance = + training_instance.value(); + TaskInvocation loss_invocation = + backward(unwrapped_training_instance.loss_attrs, + unwrapped_training_instance.logit_tensor, + unwrapped_training_instance.label_tensor); + assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation)); + TaskArgumentAccessor loss_accessor = + this->get_task_arg_accessor(loss_invocation); + TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl(); + loss_impl_fn.get().function_ptr(loss_accessor); + } + + // backward through computation graph for (layer_guid_t const &operator_node : reversed(topological_ordering(this->computation_graph))) { if (this->task_registry.backward_task_ids.at(operator_node).has_value()) { @@ -94,7 +123,7 @@ PerLayerElapsedTime LocalTrainingBacking::execute_backward() { OpTaskInvocation invocation = backward(attrs); TaskArgumentAccessor accessor = - this->get_task_arg_accessor(invocation, operator_node); + this->get_op_task_arg_accessor(invocation, operator_node); std::optional elapsed_time = this->call_task_impl(invocation.task_id, accessor); per_op_elapsed_time.insert({operator_node, elapsed_time}); @@ -108,6 +137,17 @@ void LocalTrainingBacking::execute_update() { } TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor( + TaskInvocation const &invocation) const { + TensorSlotsBacking tensor_slots_backing = + this->local_slots_backing.construct_tensor_slots_backing( + invocation.binding); + ArgSlotsBacking arg_slots_backing = + this->local_slots_backing.construct_arg_slots_backing(invocation.binding); + return TaskArgumentAccessor::create( + this->allocator, tensor_slots_backing, arg_slots_backing); +} + +TaskArgumentAccessor LocalTrainingBacking::get_op_task_arg_accessor( OpTaskInvocation const &invocation, layer_guid_t const &op_guid) const { TensorSlotsBacking tensor_slots_backing = this->local_slots_backing.construct_tensor_slots_backing( diff --git a/lib/runtime/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc similarity index 63% rename from lib/runtime/src/loss_functions.cc rename to lib/local-execution/src/loss_functions.cc index b0d5ac2029..6b23d5da51 100644 --- a/lib/runtime/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -13,56 +13,44 @@ * limitations under the License. 
*/ -#include "loss_functions.h" +#include "op-attrs/ops/loss_functions.h" #include "kernels/loss_function_kernels.h" -#include "legion.h" -#include "runtime/profiling.h" -#include "task_spec/task_argument_accessor.h" +#include "local-execution/loss_functions.h" +#include "local-execution/profiling.h" namespace FlexFlow { -enum LossSlots { - LOGIT_GRAD, - LOGIT, - LABEL, - LOSS_ATTRS, - BATCH_SIZE, - PROFILING_SETTINGS -}; +enum Slots { LOGIT, LABEL, ATTRS, PROFILING }; -TaskInvocation backward_invocation(LossAttrs const &attrs, - EnableProfiling enable_profiling, - parallel_tensor_guid_t logit, - parallel_tensor_guid_t label) { - auto binding = IndexTaskBinding{LOGIT}; - StandardTypedTaskArg arg = attrs; - binding.bind_arg(LOSS_ATTRS, attrs); - binding.bind(LOGIT, logit); - binding.bind(LABEL, label); - binding.bind(LOGIT_GRAD, grad(logit)); - binding.bind_arg(PROFILING_SETTINGS, profiling_settings()); +TaskSignature get_loss_bwd_signature() { + TaskSignature sig = make_empty_task_signature(); + add_slot(sig, LOGIT, IsGrad::NO); + add_slot(sig, LABEL, IsGrad::NO); + add_slot(sig, LOGIT, IsGrad::YES); + add_arg_slot(sig, ATTRS); + add_arg_slot(sig, PROFILING); + return sig; +} + +TaskInvocation + backward(LossAttrs const &attrs, tensor_guid_t logit, tensor_guid_t label) { + TaskBinding b; + b.bind(LOGIT, TensorGuidSpec{logit, IsGrad::NO}); + b.bind(LABEL, TensorGuidSpec{label, IsGrad::NO}); + b.bind(LOGIT, TensorGuidSpec{logit, IsGrad::YES}); + b.bind_arg(ATTRS, attrs); + b.bind_arg(PROFILING, profiling_settings()); - /* if ((logit_domain != part_domain) || (label_domain != part_domain)) { */ // TODO @lockshaw make sure this is still checked - /* fprintf(stderr, */ - /* "Encounter inconsistency in parallelizing loss computation"); */ - /* assert(false); */ - /* } */ - return {LOSS_BWD_TASK_ID, binding}; + return {task_id_t::LOSS_BWD_TASK_ID, b}; } -static void - loss_backward_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - auto attrs = acc.get_argument(LOSS_ATTRS); - auto profiling_settings = - acc.get_argument(PROFILING_SETTINGS); - auto batch_size = acc.get_argument(BATCH_SIZE); - auto logit_grad = acc.get_tensor(LOGIT_GRAD); +static void backward_task_impl(TaskArgumentAccessor const &acc) { + auto attrs = acc.get_argument(ATTRS); + auto profiling = acc.get_argument(PROFILING); + auto logit_grad = acc.get_tensor_grad(LOGIT); auto logit = acc.get_tensor(LOGIT); auto label = acc.get_tensor(LABEL); + int batch_size = label.shape.at(ff_dim_t{0}); LossFunction loss_type = get_loss_function(attrs); float scale_factor = 1.0f / batch_size; @@ -73,7 +61,7 @@ static void if (loss_type == LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY) { // assertion the outter-most dim is replica dim and replica degree is 1 - auto scce_attrs = get(attrs); + auto scce_attrs = attrs.get(); size_t ndim = logit.shape.num_dims(); assert(logit.shape.at(legion_dim_t(ndim - 1)) == 1); int num_samples = logit.shape.at(legion_dim_t(ndim - 2)); @@ -86,19 +74,19 @@ static void ndim - 1)); // TODO FIXME something seems wrong here, isn't the // numerator guaranteed to be 1? 
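                                 // (per the amended comment in PATCH 03/22
                                 // below: not necessarily -- an outermost
                                 // parallel/replica dim can make this ratio
                                 // exceed 1)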
} - assert(label.shape.sub_shape(legion_dim_t(1), nullopt) == - logit.shape.sub_shape(legion_dim_t(1), nullopt)); + assert(label.shape.sub_shape(legion_dim_t(1), std::nullopt) == + logit.shape.sub_shape(legion_dim_t(1), std::nullopt)); assert(k * label.shape.at(legion_dim_t(ndim - 1)) == logit.shape.at(legion_dim_t(ndim - 1))); assert(label.shape.at(legion_dim_t(0)) == 1); profile(sparse_categorical_crossentropy_loss_backward_kernel, - profiling_settings, + profiling, "[SparseCategoricalCrossEntropyLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), get_int32_ptr(label), - logit.shape.get_volume(), + get_volume(logit.shape), get_volume(logit_grad.shape), num_samples, num_classes, @@ -115,7 +103,7 @@ static void switch (loss_type) { case LossFunction::CATEGORICAL_CROSSENTROPY: { profile(categorical_crossentropy_loss_backward_kernel, - profiling_settings, + profiling, "[CategoricalCrossEntropyLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), @@ -127,7 +115,7 @@ static void } case LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE: { profile(mean_squared_error_avg_loss_backward_kernel, - profiling_settings, + profiling, "[MeanSquaredErrorAvgLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), @@ -139,7 +127,7 @@ static void } case LossFunction::IDENTITY: { profile(identity_loss_backward_kernel, - profiling_settings, + profiling, "[IdentityLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), @@ -156,16 +144,8 @@ static void } } -template <> -void register_task() { - TaskSignature sig; - sig.add_arg_slot(LOSS_ATTRS); - sig.add_arg_slot(PROFILING_SETTINGS); - sig.add_slot(LOGIT, {SlotType::TENSOR, Permissions::RO}); - sig.add_slot(LABEL, {SlotType::TENSOR, Permissions::RO}); - sig.add_slot(LOGIT_GRAD, {SlotType::TENSOR, Permissions::RW}); - - register_task(LOSS_BWD_TASK_ID, "Loss Backward", sig, loss_backward_task); +TaskImplFunction get_loss_bwd_task_impl() { + return TaskImplFunction{GenericTaskImplFunction{backward_task_impl}}; } } // namespace FlexFlow diff --git a/lib/local-execution/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc index eebef9039d..8ede2cb38b 100644 --- a/lib/local-execution/src/ops/attention.cc +++ b/lib/local-execution/src/ops/attention.cc @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "local-execution/ops/attention.h" +#include "attention.h" #include "kernels/attention_kernels.h" #include "local-execution/op_task_signature.h" #include "op-attrs/ops/attention.h" diff --git a/lib/local-execution/include/local-execution/ops/attention.h b/lib/local-execution/src/ops/attention.h similarity index 100% rename from lib/local-execution/include/local-execution/ops/attention.h rename to lib/local-execution/src/ops/attention.h diff --git a/lib/local-execution/src/task_invocation.cc b/lib/local-execution/src/task_invocation.cc new file mode 100644 index 0000000000..c64af5332e --- /dev/null +++ b/lib/local-execution/src/task_invocation.cc @@ -0,0 +1,49 @@ +#include "local-execution/task_invocation.h" +#include "utils/containers/contains_key.h" + +namespace FlexFlow { + +void TaskBinding::bind(int name, TensorGuidSpec const &tensor_guid_spec) { + this->bind(slot_id_t{name}, tensor_guid_spec); +} + +void TaskBinding::bind(slot_id_t name, TensorGuidSpec const &tensor_guid_spec) { + this->tensor_bindings.insert( + {SlotGradId{name, tensor_guid_spec.is_grad}, tensor_guid_spec}); +} + +void TaskBinding::insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec) { + assert(!contains_key(this->arg_bindings, name)); + this->arg_bindings.insert({name, arg_spec}); +} + +bool TaskBinding::operator==(TaskBinding const &other) const { + return this->tie() == other.tie(); +} + +bool TaskBinding::operator!=(TaskBinding const &other) const { + return this->tie() != other.tie(); +} + +std::tuple const &, + std::unordered_map const &> + TaskBinding::tie() const { + return std::tie(this->tensor_bindings, this->arg_bindings); +} + +std::unordered_map const & + TaskBinding::get_tensor_bindings() const { + return this->tensor_bindings; +} + +std::unordered_map const & + TaskBinding::get_arg_bindings() const { + return this->arg_bindings; +} + +bool is_invocation_valid(TaskSignature const &sig, TaskInvocation const &inv) { + // TODO: implement signature checking + return true; +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/task_signature.cc b/lib/local-execution/src/task_signature.cc new file mode 100644 index 0000000000..3bba9e2c8a --- /dev/null +++ b/lib/local-execution/src/task_signature.cc @@ -0,0 +1,25 @@ +#include "local-execution/task_signature.h" + +namespace FlexFlow { + +TaskSignature make_empty_task_signature() { + return TaskSignature(std::nullopt, {}, {}); +} + +void add_slot(TaskSignature &task_signature, + int name, + IsGrad is_grad, + SlotType slot_type) { + add_slot(task_signature, slot_id_t{name}, is_grad, slot_type); +} + +void add_slot(TaskSignature &task_signature, + slot_id_t name, + IsGrad is_grad, + SlotType slot_type) { + TensorGuidSlotSpec tensor_guid_slot_spec = + TensorGuidSlotSpec{name, slot_type, is_grad}; + task_signature.tensor_guid_slots.insert(tensor_guid_slot_spec); +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/task_signature_impl.cc b/lib/local-execution/src/task_signature_impl.cc index ca428aad25..16b7870601 100644 --- a/lib/local-execution/src/task_signature_impl.cc +++ b/lib/local-execution/src/task_signature_impl.cc @@ -1,5 +1,5 @@ #include "local-execution/task_signature_impl.h" -#include "local-execution/ops/attention.h" +#include "ops/attention.h" #include "ops/batch_matmul.h" #include "ops/batch_norm.h" #include "ops/cast.h" diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc index fa3b068425..2c3a6c1d63 100644 --- 
a/lib/local-execution/test/src/test_task_registry.cc +++ b/lib/local-execution/test/src/test_task_registry.cc @@ -1,7 +1,6 @@ #include "doctest/doctest.h" #include "kernels/local_cuda_allocator.h" #include "local-execution/local_cost_estimator.h" -#include "local-execution/ops/attention.h" #include "local-execution/task_signature_impl.h" #include "pcg/computation_graph_builder.h" #include "utils/fmt/optional.h" diff --git a/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml b/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml new file mode 100644 index 0000000000..8a4f38839c --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml @@ -0,0 +1,22 @@ +namespace = "FlexFlow" +name = "LossAttrs" +features = [ + "eq", + "ord", + "hash", + "json", + "fmt", +] + +includes = [ + "op-attrs/ops/sparse_categorical_ce_loss_attrs.dtg.h", + "op-attrs/ops/other_loss_attrs.dtg.h" +] + +[[values]] +type = "::FlexFlow::SparseCategoricalCrossEntropyLossAttrs" +key = "sparse_categorical_ce_loss_attrs" + +[[values]] +type = "::FlexFlow::OtherLossAttrs" +key = "other_loss_attrs" diff --git a/lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml b/lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml new file mode 100644 index 0000000000..b9cd13eabf --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml @@ -0,0 +1,23 @@ +namespace = "FlexFlow" +name = "LossFunction" +features = [ + "hash", + "json", + "rapidcheck", + "fmt", +] + +[[values]] +name = "CATEGORICAL_CROSSENTROPY" + +[[values]] +name = "SPARSE_CATEGORICAL_CROSSENTROPY" + +[[values]] +name = "MEAN_SQUARED_ERROR_AVG_REDUCE" + +[[values]] +name = "MEAN_SQUARED_ERROR_SUM_REDUCE" + +[[values]] +name = "IDENTITY" diff --git a/lib/op-attrs/include/op-attrs/ops/loss_functions.h b/lib/op-attrs/include/op-attrs/ops/loss_functions.h index 58d372d9e5..9fb0597197 100644 --- a/lib/op-attrs/include/op-attrs/ops/loss_functions.h +++ b/lib/op-attrs/include/op-attrs/ops/loss_functions.h @@ -2,74 +2,16 @@ #define _FLEXFLOW_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_LOSS_FUNCTIONS_H #include "core.h" -#include "utils/exception.h" -#include "utils/visitable.h" -#include +#include "loss_attrs.dtg.h" +#include "loss_function.dtg.h" +#include "other_loss_attrs.dtg.h" +#include "sparse_categorical_ce_loss_attrs.dtg.h" namespace FlexFlow { -enum class LossFunction { - CATEGORICAL_CROSSENTROPY, - SPARSE_CATEGORICAL_CROSSENTROPY, - MEAN_SQUARED_ERROR_AVG_REDUCE, - MEAN_SQUARED_ERROR_SUM_REDUCE, - IDENTITY -}; - -LossFunction parse_loss_function_name(std::string const &); - -struct SparseCategoricalCrossEntropyLossAttrs { - req replace_labels; // for aggregate_spec: More predictions than labels -}; -FF_VISITABLE_STRUCT(SparseCategoricalCrossEntropyLossAttrs, replace_labels); -CHECK_VALID_OP_ATTR(SparseCategoricalCrossEntropyLossAttrs); - -struct OtherLossAttrs { - req loss_type; -}; -FF_VISITABLE_STRUCT(OtherLossAttrs, loss_type); -CHECK_VALID_OP_ATTR(OtherLossAttrs); - -using LossAttrs = - std::variant; - -LossFunction get_loss_function(OtherLossAttrs const &); -LossFunction get_loss_function(SparseCategoricalCrossEntropyLossAttrs const &); LossFunction get_loss_function(LossAttrs const &); +LossFunction parse_loss_name(std::string const &raw_name); } // namespace FlexFlow -namespace fmt { - -template <> -struct formatter<::FlexFlow::LossFunction> : formatter { - template - auto format(::FlexFlow::LossFunction d, FormatContext &ctx) const - -> decltype(ctx.out()) { - using namespace FlexFlow; - - string_view name = "unknown"; - 
switch (d) { - case LossFunction::CATEGORICAL_CROSSENTROPY: - name = "CategoricalCrossEntropy"; - break; - case LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY: - name = "SparseCategoricalCrossEntropy"; - break; - case LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE: - name = "MeanSquaredErrorAvgReduce"; - break; - case LossFunction::MEAN_SQUARED_ERROR_SUM_REDUCE: - name = "MeanSquaredErrorSumReduce"; - break; - case LossFunction::IDENTITY: - name = "Identity"; - break; - } - return formatter::format(name, ctx); - } -}; - -} // namespace fmt - #endif diff --git a/lib/op-attrs/include/op-attrs/ops/other_loss_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/other_loss_attrs.struct.toml new file mode 100644 index 0000000000..81055f5835 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ops/other_loss_attrs.struct.toml @@ -0,0 +1,18 @@ +namespace = "FlexFlow" +name = "OtherLossAttrs" +features = [ + "eq", + "ord", + "hash", + "json", + "rapidcheck", + "fmt", +] + +includes = [ + "op-attrs/ops/loss_function.dtg.h" +] + +[[fields]] +name = "loss_type" +type = "::FlexFlow::LossFunction" diff --git a/lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml new file mode 100644 index 0000000000..21378a1154 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml @@ -0,0 +1,14 @@ +namespace = "FlexFlow" +name = "SparseCategoricalCrossEntropyLossAttrs" +features = [ + "eq", + "ord", + "hash", + "json", + "rapidcheck", + "fmt", +] + +[[fields]] +name = "replace_labels" +type = "bool" diff --git a/lib/op-attrs/src/loss_functions.cc b/lib/op-attrs/src/loss_functions.cc index 094e117d77..cae88be453 100644 --- a/lib/op-attrs/src/loss_functions.cc +++ b/lib/op-attrs/src/loss_functions.cc @@ -1,27 +1,18 @@ #include "op-attrs/ops/loss_functions.h" #include "utils/containers/transform.h" +#include "utils/exception.h" +#include "utils/overload.h" #include #include namespace FlexFlow { -LossFunction get_loss_type(OtherLossAttrs const &attrs) { - return attrs.loss_type; -} -LossFunction - get_loss_type(SparseCategoricalCrossEntropyLossAttrs const &attrs) { - return LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY; -} - -struct GetLossFunction { - template - LossFunction operator()(T const &t) { - return get_loss_type(t); - } -}; - -LossFunction get_loss_type(LossAttrs const &attrs) { - return visit(GetLossFunction{}, attrs); +LossFunction get_loss_function(LossAttrs const &attrs) { + return attrs.visit( + overload{[&](SparseCategoricalCrossEntropyLossAttrs const &s) { + return LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY; + }, + [&](OtherLossAttrs const &s) { return s.loss_type; }}); } LossFunction parse_loss_name(std::string const &raw_name) { From b56c046b3bc44586bae96b59476b6c384f922837 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 27 Aug 2024 04:58:33 -0700 Subject: [PATCH 03/22] Add cuda test for loss function --- lib/kernels/CMakeLists.txt | 2 +- lib/kernels/include/kernels/array_shape.h | 10 ++- lib/kernels/src/array_shape.cc | 36 +++++++- lib/kernels/src/cuda/cuda_helper.cu | 6 ++ lib/kernels/src/device.h | 1 + .../local-execution/local_slots_backing.h | 4 - .../src/local_slots_backing.cc | 8 -- .../src/local_training_backing.cc | 11 ++- lib/local-execution/src/loss_functions.cc | 22 ++--- lib/local-execution/src/ops/element_unary.cc | 8 +- .../src/task_signature_impl.cc | 4 +- .../test/src/test_loss_function.cc | 88 +++++++++++++++++++ 12 files changed, 
159 insertions(+), 41 deletions(-)
 create mode 100644 lib/local-execution/test/src/test_loss_function.cc

diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt
index 54fa3c9583..baac58f8e3 100644
--- a/lib/kernels/CMakeLists.txt
+++ b/lib/kernels/CMakeLists.txt
@@ -8,7 +8,7 @@ file(GLOB_RECURSE SRC
   LIST_DIRECTORIES False
   src/*.cc
   src/cuda/cuda_helper.cu
-  src/cuda/loss_functions_kernels.cu
+  src/cuda/loss_function_kernels.cu
   src/cuda/ops/*.cu
 )
 
diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h
index 5de9fae7ad..c95c447574 100644
--- a/lib/kernels/include/kernels/array_shape.h
+++ b/lib/kernels/include/kernels/array_shape.h
@@ -42,9 +42,13 @@ struct ArrayShape {
   std::optional<std::size_t> at_maybe(legion_dim_t) const;
   std::optional<std::size_t> at_maybe(ff_dim_t) const;
 
-  ArrayShape
-      sub_shape(std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
-                std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const;
+  ArrayShape sub_shape(legion_dim_t start, ff_dim_t end) const;
+
+  ArrayShape sub_shape(std::optional<ff_dim_t> start,
+                       std::optional<ff_dim_t> end) const;
+
+  ArrayShape sub_shape(std::optional<legion_dim_t> start,
+                       std::optional<legion_dim_t> end) const;
 
 public:
   LegionTensorDims dims;
diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
index d5e2f1167d..bf80c6b5c1 100644
--- a/lib/kernels/src/array_shape.cc
+++ b/lib/kernels/src/array_shape.cc
@@ -50,12 +50,42 @@ std::size_t ArrayShape::at(ff_dim_t idx) const {
   return dims.at(legion_dim_from_ff_dim(idx, this->num_dims()));
 }
 
-ArrayShape ArrayShape::sub_shape(
-    std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
-    std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const {
+// ArrayShape ArrayShape::sub_shape(
+//     std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
+//     std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const {
+//   NOT_IMPLEMENTED();
+// }
+
+ArrayShape ArrayShape::sub_shape(legion_dim_t start, ff_dim_t end) const {
   NOT_IMPLEMENTED();
 }
 
+ArrayShape ArrayShape::sub_shape(std::optional<ff_dim_t> start,
+                                 std::optional<ff_dim_t> end) const {
+  std::vector<std::size_t> new_shape;
+  ff_dim_t start_idx = start.value_or(ff_dim_t{0});
+  ff_dim_t end_idx = end.value_or(ff_dim_t{this->num_dims()});
+
+  while (start_idx < end_idx) {
+    new_shape.push_back(this->at(start_idx));
+    start_idx = ff_dim_t{start_idx.value + 1};
+  }
+  return ArrayShape{new_shape};
+}
+
+ArrayShape ArrayShape::sub_shape(std::optional<legion_dim_t> start,
+                                 std::optional<legion_dim_t> end) const {
+  std::vector<std::size_t> new_shape;
+  legion_dim_t start_idx = start.value_or(legion_dim_t{0});
+  legion_dim_t end_idx = end.value_or(legion_dim_t{this->num_dims()});
+
+  while (start_idx < end_idx) {
+    new_shape.push_back(this->at(start_idx));
+    start_idx = add_to_legion_dim(start_idx, 1);
+  }
+  return ArrayShape{new_shape};
+}
+
 std::optional<std::size_t> ArrayShape::at_maybe(legion_dim_t index) const {
   if (index.value < dims.size()) {
     return dims.at(index);
diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu
index 2ff02038f4..5a303ca15e 100644
--- a/lib/kernels/src/cuda/cuda_helper.cu
+++ b/lib/kernels/src/cuda/cuda_helper.cu
@@ -35,6 +35,12 @@ __global__ void scale_kernel(float *ptr, coord_t size, float a, float b) {
   }
 }
 
+__global__ void scale_kernel(float *ptr, unsigned long size, float a, float b) {
+  CUDA_KERNEL_LOOP(i, size) {
+    ptr[i] = (b - a) * ptr[i] + a;
+  }
+}
+
 __global__ void ones_kernel(float *ptr, coord_t size) {
   CUDA_KERNEL_LOOP(i, size) {
     ptr[i] = 1.0f;
diff --git a/lib/kernels/src/device.h b/lib/kernels/src/device.h
index ceff2f92ff..e32805fde3 100644
--- a/lib/kernels/src/device.h
+++ b/lib/kernels/src/device.h
@@ -71,6 +71,7 @@ inline int GET_BLOCKS(int const N) {
 }
 
 __global__ void scale_kernel(float *ptr, size_t size, float a, float b);
+__global__ void
scale_kernel(float *ptr, unsigned long size, float a, float b); __global__ void ones_kernel(float *ptr, size_t size); diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h index 312a13cc01..1f35bdd304 100644 --- a/lib/local-execution/include/local-execution/local_slots_backing.h +++ b/lib/local-execution/include/local-execution/local_slots_backing.h @@ -20,9 +20,6 @@ struct LocalSlotsBacking { public: void add_per_device_op_state(layer_guid_t const &, DeviceSpecificDeviceStates const &); - void allocate_label_tensor(tensor_guid_t const &, - ComputationGraph const &, - Allocator &); void allocate_outgoing_tensors(layer_guid_t const &, ComputationGraph const &, Allocator &); @@ -40,7 +37,6 @@ struct LocalSlotsBacking { GenericTensorAccessorW const &get_tensor_backing(tensor_guid_t const &, IsGrad) const; -private: bool is_tensor_allocated(tensor_guid_t const &) const; bool is_gradient_tensor_allocated(tensor_guid_t const &) const; diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index 967f8d9ba3..787c7dda86 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -15,14 +15,6 @@ void LocalSlotsBacking::add_per_device_op_state( this->per_device_op_states.insert({op_guid, device_state}); } -void LocalSlotsBacking::allocate_label_tensor(tensor_guid_t const &label_tensor, - ComputationGraph const &cg, - Allocator &allocator) { - GenericTensorAccessorW tensor_backing = - allocator.allocate_tensor(get_tensor_attrs(cg, label_tensor).shape); - this->tensor_mapping.insert({label_tensor, tensor_backing}); -} - void LocalSlotsBacking::allocate_outgoing_tensors( layer_guid_t const &layer_guid, ComputationGraph const &computation_graph, diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index f54d0ddaad..98bfe7683e 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -1,6 +1,8 @@ #include "local-execution/local_training_backing.h" #include "local-execution/loss_functions.h" #include "local-execution/task_signature_impl.h" +#include "utils/containers/contains.h" +#include "utils/containers/contains_key.h" #include "utils/containers/reversed.h" #include "utils/exception.h" @@ -30,10 +32,11 @@ LocalTrainingBacking::LocalTrainingBacking( } if (this->training_instance.has_value()) { - this->local_slots_backing.allocate_label_tensor( - this->training_instance.value().label_tensor, - computation_graph, - this->allocator); + // label and logit tensor should be allocated + assert(this->local_slots_backing.is_tensor_allocated( + this->training_instance.value().label_tensor)); + assert(this->local_slots_backing.is_tensor_allocated( + this->training_instance.value().logit_tensor)); } } diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index 6b23d5da51..771d175a7d 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -50,7 +50,8 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { auto logit_grad = acc.get_tensor_grad(LOGIT); auto logit = acc.get_tensor(LOGIT); auto label = acc.get_tensor(LABEL); - int batch_size = label.shape.at(ff_dim_t{0}); + int batch_size = logit.shape.at(legion_dim_t{1}); + // assuming logit shape is [parallel dim(?), batch dim, num classes] 
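+  // (ArrayShape uses legion dim order, which counts from the innermost/last
+  // dimension, so legion_dim_t{1} is the batch dim and legion_dim_t{0} the
+  // class dim regardless of whether the optional parallel dim is present)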
LossFunction loss_type = get_loss_function(attrs); float scale_factor = 1.0f / batch_size; @@ -60,19 +61,18 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { } if (loss_type == LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY) { - // assertion the outter-most dim is replica dim and replica degree is 1 + // label shape is [parallel dim(?), batch dim, 1] auto scce_attrs = attrs.get(); size_t ndim = logit.shape.num_dims(); - assert(logit.shape.at(legion_dim_t(ndim - 1)) == 1); - int num_samples = logit.shape.at(legion_dim_t(ndim - 2)); - int num_classes = logit.shape.get_volume() / num_samples; + int num_classes = logit.shape.at(legion_dim_t{0}); assert(logit_grad.shape == logit.shape); int k = 1; if (scce_attrs.replace_labels) { k = logit.shape.at(legion_dim_t(ndim - 1)) / label.shape.at(legion_dim_t( ndim - 1)); // TODO FIXME something seems wrong here, isn't the - // numerator guaranteed to be 1? + // numerator guaranteed to be 1? <--- this is not the + // case because of the potential parallel dim } assert(label.shape.sub_shape(legion_dim_t(1), std::nullopt) == logit.shape.sub_shape(legion_dim_t(1), std::nullopt)); @@ -85,21 +85,17 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { "[SparseCategoricalCrossEntropyLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), - get_int32_ptr(label), + reinterpret_cast(get_float_ptr(label)), get_volume(logit.shape), get_volume(logit_grad.shape), - num_samples, + batch_size, num_classes, k, scale_factor); } else { assert(logit.shape == label.shape); assert(logit_grad.shape == logit.shape); - // assertion the outter-most dim is replica dim and replica degree is 1 - size_t ndim = logit.shape.num_dims(); - assert(logit.shape.at(legion_dim_t(ndim - 1)) == 1); - int num_samples = label.shape.at(legion_dim_t(ndim - 1)); - int num_channels = logit.shape.get_volume() / num_samples; + int num_channels = logit.shape.at(legion_dim_t{0}); switch (loss_type) { case LossFunction::CATEGORICAL_CROSSENTROPY: { profile(categorical_crossentropy_loss_backward_kernel, diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc index a52ebb8089..502afb5f9f 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -34,7 +34,9 @@ OpTaskInvocation forward(ElementUnaryAttrs const &attrs) { b.bind(INPUT, input_tensor(0)); b.bind(OUTPUT, output_tensor(0)); + b.bind_arg(ATTRS, attrs); + b.bind_arg(HANDLE, ff_handle()); b.bind_arg(PROFILING, profiling_settings()); b.bind_arg(PER_DEVICE_STATE, per_device_op_state()); @@ -51,8 +53,8 @@ OpTaskInvocation backward(ElementUnaryAttrs const &attrs) { static DeviceSpecificDeviceStates init_task_impl(TaskArgumentAccessor const &acc) { - auto const &attrs = acc.get_argument(ATTRS); - ProfilingSettings profiling = acc.get_argument(PROFILING); + auto attrs = acc.get_argument(ATTRS); + ParallelTensorShape input_shape = acc.get_argument(INPUT_SHAPE); @@ -68,7 +70,7 @@ static DeviceSpecificDeviceStates static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - auto const &attrs = acc.get_argument(ATTRS); + auto attrs = acc.get_argument(ATTRS); auto handle = acc.get_argument(HANDLE); diff --git a/lib/local-execution/src/task_signature_impl.cc b/lib/local-execution/src/task_signature_impl.cc index 16b7870601..3072b9a8bd 100644 --- a/lib/local-execution/src/task_signature_impl.cc +++ 
diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc
index a52ebb8089..502afb5f9f 100644
--- a/lib/local-execution/src/ops/element_unary.cc
+++ b/lib/local-execution/src/ops/element_unary.cc
@@ -34,7 +34,9 @@ OpTaskInvocation forward(ElementUnaryAttrs const &attrs) {
   b.bind(INPUT, input_tensor(0));
   b.bind(OUTPUT, output_tensor(0));
 
+  b.bind_arg(ATTRS, attrs);
+  b.bind_arg(HANDLE, ff_handle());
   b.bind_arg(PROFILING, profiling_settings());
   b.bind_arg(PER_DEVICE_STATE, per_device_op_state<ElementUnaryPerDeviceState>());
 
@@ -51,8 +53,8 @@ OpTaskInvocation backward(ElementUnaryAttrs const &attrs) {
 
 static DeviceSpecificDeviceStates
     init_task_impl(TaskArgumentAccessor const &acc) {
-  auto const &attrs = acc.get_argument<ElementUnaryAttrs>(ATTRS);
-  ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  auto attrs = acc.get_argument<ElementUnaryAttrs>(ATTRS);
+
   ParallelTensorShape input_shape =
       acc.get_argument<ParallelTensorShape>(INPUT_SHAPE);
 
@@ -68,7 +70,7 @@ static DeviceSpecificDeviceStates
 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);
-  auto const &attrs = acc.get_argument<ElementUnaryAttrs>(ATTRS);
+  auto attrs = acc.get_argument<ElementUnaryAttrs>(ATTRS);
 
   auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
 
diff --git a/lib/local-execution/src/task_signature_impl.cc b/lib/local-execution/src/task_signature_impl.cc
index 16b7870601..3072b9a8bd 100644
--- a/lib/local-execution/src/task_signature_impl.cc
+++ b/lib/local-execution/src/task_signature_impl.cc
@@ -50,8 +50,8 @@ TaskSignatureAndImpl get_task_sig_impl(task_id_t const &task_id) {
       return TaskSignatureAndImpl{get_element_unary_fwd_task_impl(),
                                   get_element_unary_fwd_signature()};
     case task_id_t::ELEMENTUNARY_BWD_TASK_ID:
-      return TaskSignatureAndImpl{get_element_binary_bwd_task_impl(),
-                                  get_element_binary_bwd_signature()};
+      return TaskSignatureAndImpl{get_element_unary_bwd_task_impl(),
+                                  get_element_unary_bwd_signature()};
     case task_id_t::CONV2D_INIT_TASK_ID:
       return TaskSignatureAndImpl{get_conv_2d_init_task_impl(),
                                   get_conv_2d_init_signature()};
diff --git a/lib/local-execution/test/src/test_loss_function.cc b/lib/local-execution/test/src/test_loss_function.cc
new file mode 100644
index 0000000000..73ab02646e
--- /dev/null
+++ b/lib/local-execution/test/src/test_loss_function.cc
@@ -0,0 +1,88 @@
+#include "doctest/doctest.h"
+#include "kernels/local_cuda_allocator.h"
+#include "kernels/managed_per_device_ff_handle.h"
+#include "kernels/managed_ff_stream.h"
+#include "pcg/computation_graph_builder.h"
+#include "test_utils.h"
+#include "local-execution/local_training_backing.h"
+
+namespace FlexFlow {
+
+TEST_SUITE(FF_CUDA_TEST_SUITE) {
+  TEST_CASE("Loss Function Local Execution") {
+    // initialize runtime configs
+    ManagedPerDeviceFFHandle managed_handle{};
+
+    RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{
+      DeviceSpecific<PerDeviceFFHandle>::create(managed_handle.raw_handle()),
+      EnableProfiling::NO,
+      ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}
+    };
+
+    // construct graph
+    ComputationGraphBuilder cg_builder;
+
+    size_t batch_size = 10;
+    size_t data_dim = 100;
+    TensorShape input_shape = TensorShape{TensorDims{FFOrdered<size_t>{batch_size, data_dim}}, DataType::FLOAT};
+    tensor_guid_t input_tensor = cg_builder.create_tensor(input_shape, CreateGrad::YES);
+
+    float scalar = 4.0;
+    tensor_guid_t logit_tensor = cg_builder.scalar_multiply(input_tensor, scalar);
+
+    // allocate memory
+    Allocator allocator = create_local_cuda_memory_allocator();
+    TensorBackingMap tensor_backing_map;
+    GenericTensorAccessorW input_backing = allocator.allocate_tensor(input_shape);
+    tensor_backing_map.insert({input_tensor, input_backing});
+
+    SUBCASE("SparseCategoricalCrossEntropyLossAttrs") {
+      TensorShape label_shape = TensorShape{TensorDims{FFOrdered<size_t>{batch_size, 1}}, DataType::FLOAT};
+      tensor_guid_t label_tensor = cg_builder.create_tensor(label_shape, CreateGrad::NO);
+      GenericTensorAccessorW label_backing = allocator.allocate_tensor(label_shape);
+      tensor_backing_map.insert({label_tensor, label_backing});
+      ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}},
+                                                                            label_tensor, logit_tensor};
+      LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance);
+      local_backing.execute_init();
+      local_backing.execute_forward();
+      local_backing.execute_backward();
+    }
+
+    SUBCASE("OtherAttrs") {
+      tensor_guid_t label_tensor = cg_builder.create_tensor(input_shape, CreateGrad::NO);
+      GenericTensorAccessorW label_backing = allocator.allocate_tensor(input_shape);
+      tensor_backing_map.insert({label_tensor, label_backing});
+
+      SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") {
+        ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{OtherLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}},
+                                                                              label_tensor, logit_tensor};
+        LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance);
+        local_backing.execute_init();
+        local_backing.execute_forward();
+        local_backing.execute_backward();
+      }
+
+      SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") {
+        ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{OtherLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
+                                                                              label_tensor, logit_tensor};
+        LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance);
+        local_backing.execute_init();
+        local_backing.execute_forward();
+        local_backing.execute_backward();
+      }
+
+      SUBCASE("LossFunction::IDENTITY") {
+        ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{OtherLossAttrs{LossFunction::IDENTITY}},
+                                                                              label_tensor, logit_tensor};
+        LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance);
+        local_backing.execute_init();
+        local_backing.execute_forward();
+        local_backing.execute_backward();
+      }
+
+    }
+  }
+}
+
+} // namespace FlexFlow
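
Aside: every SUBCASE above drives the same three-phase sequence. A minimal sketch of that flow, not part of the patch series, assuming only the methods the test already calls (the template parameter stands in for LocalTrainingBacking so the snippet is self-contained):

// Sketch: one training pass with the local-execution API exercised above.
template <typename Backing>
void run_one_pass(Backing &backing) {
  backing.execute_init();     // per-layer init tasks (device states)
  backing.execute_forward();  // forward tasks, producing the logit tensor
  backing.execute_backward(); // loss backward first, then ops in reverse order
}
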
From f75a3d4c1cc85ae60d6254ddcabbb40b6f2338ad Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Tue, 27 Aug 2024 05:00:16 -0700
Subject: [PATCH 04/22] Format

---
 lib/kernels/src/array_shape.cc     |  6 --
 .../test/src/test_loss_function.cc | 89 +++++++++++++------
 2 files changed, 61 insertions(+), 34 deletions(-)

diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
index bf80c6b5c1..69f04d6d34 100644
--- a/lib/kernels/src/array_shape.cc
+++ b/lib/kernels/src/array_shape.cc
@@ -50,12 +50,6 @@ std::size_t ArrayShape::at(ff_dim_t idx) const {
   return dims.at(legion_dim_from_ff_dim(idx, this->num_dims()));
 }
 
-// ArrayShape ArrayShape::sub_shape(
-//     std::optional<legion_dim_t> start,
-//     std::optional<legion_dim_t> end) const {
-//   NOT_IMPLEMENTED();
-// }
-
 ArrayShape ArrayShape::sub_shape(legion_dim_t start, ff_dim_t end) const {
   NOT_IMPLEMENTED();
 }
diff --git a/lib/local-execution/test/src/test_loss_function.cc b/lib/local-execution/test/src/test_loss_function.cc
index 73ab02646e..9e60c1b979 100644
--- a/lib/local-execution/test/src/test_loss_function.cc
+++ b/lib/local-execution/test/src/test_loss_function.cc
@@ -1,10 +1,10 @@
 #include "doctest/doctest.h"
 #include "kernels/local_cuda_allocator.h"
-#include "kernels/managed_per_device_ff_handle.h"
 #include "kernels/managed_ff_stream.h"
+#include "kernels/managed_per_device_ff_handle.h"
+#include "local-execution/local_training_backing.h"
 #include "pcg/computation_graph_builder.h"
 #include "test_utils.h"
-#include "local-execution/local_training_backing.h"
 
 namespace FlexFlow {
 
@@ -14,73 +14,106 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
     ManagedPerDeviceFFHandle managed_handle{};
 
     RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{
-      DeviceSpecific<PerDeviceFFHandle>::create(managed_handle.raw_handle()),
-      EnableProfiling::NO,
-      ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}
-    };
+        DeviceSpecific<PerDeviceFFHandle>::create(managed_handle.raw_handle()),
+        EnableProfiling::NO,
+        ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}};
 
     // construct graph
     ComputationGraphBuilder cg_builder;
 
     size_t batch_size = 10;
     size_t data_dim = 100;
-    TensorShape input_shape = TensorShape{TensorDims{FFOrdered<size_t>{batch_size, data_dim}}, DataType::FLOAT};
-    tensor_guid_t input_tensor = cg_builder.create_tensor(input_shape, CreateGrad::YES);
+    TensorShape input_shape = TensorShape{
+        TensorDims{FFOrdered<size_t>{batch_size, data_dim}}, DataType::FLOAT};
+    tensor_guid_t input_tensor =
+        cg_builder.create_tensor(input_shape, CreateGrad::YES);
 
     float scalar = 4.0;
-    tensor_guid_t logit_tensor = cg_builder.scalar_multiply(input_tensor, scalar);
+    tensor_guid_t logit_tensor =
+        cg_builder.scalar_multiply(input_tensor, scalar);
 
     // allocate memory
     Allocator allocator = create_local_cuda_memory_allocator();
     TensorBackingMap tensor_backing_map;
-    GenericTensorAccessorW input_backing = allocator.allocate_tensor(input_shape);
+    GenericTensorAccessorW input_backing =
+        allocator.allocate_tensor(input_shape);
     tensor_backing_map.insert({input_tensor, input_backing});
 
     SUBCASE("SparseCategoricalCrossEntropyLossAttrs") {
-      TensorShape label_shape = TensorShape{TensorDims{FFOrdered<size_t>{batch_size, 1}}, DataType::FLOAT};
-      tensor_guid_t label_tensor = cg_builder.create_tensor(label_shape, CreateGrad::NO);
-      GenericTensorAccessorW label_backing = allocator.allocate_tensor(label_shape);
+      TensorShape label_shape = TensorShape{
+          TensorDims{FFOrdered<size_t>{batch_size, 1}}, DataType::FLOAT};
+      tensor_guid_t label_tensor =
+          cg_builder.create_tensor(label_shape, CreateGrad::NO);
+      GenericTensorAccessorW label_backing =
+          allocator.allocate_tensor(label_shape);
       tensor_backing_map.insert({label_tensor, label_backing});
-      ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}},
-                                                                            label_tensor, logit_tensor};
-      LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance);
+      ModelTrainingInstance model_training_instance = ModelTrainingInstance{
+          LossAttrs{
+              SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}},
+          label_tensor,
+          logit_tensor};
+      LocalTrainingBacking local_backing(allocator,
+                                         cg_builder.computation_graph,
+                                         tensor_backing_map,
+                                         runtime_arg_config,
+                                         model_training_instance);
       local_backing.execute_init();
       local_backing.execute_forward();
       local_backing.execute_backward();
     }
 
     SUBCASE("OtherAttrs") {
-      tensor_guid_t label_tensor = cg_builder.create_tensor(input_shape, CreateGrad::NO);
-      GenericTensorAccessorW label_backing = allocator.allocate_tensor(input_shape);
+      tensor_guid_t label_tensor =
+          cg_builder.create_tensor(input_shape, CreateGrad::NO);
+      GenericTensorAccessorW label_backing =
+          allocator.allocate_tensor(input_shape);
       tensor_backing_map.insert({label_tensor, label_backing});
 
       SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") {
-        ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{OtherLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}},
-                                                                              label_tensor, logit_tensor};
-        LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance);
+        ModelTrainingInstance model_training_instance = ModelTrainingInstance{
+            LossAttrs{OtherLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}},
+            label_tensor,
+            logit_tensor};
+        LocalTrainingBacking local_backing(allocator,
+                                           cg_builder.computation_graph,
+                                           tensor_backing_map,
+                                           runtime_arg_config,
+                                           model_training_instance);
         local_backing.execute_init();
         local_backing.execute_forward();
         local_backing.execute_backward();
       }
 
       SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") {
-        ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{OtherLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
-                                                                              label_tensor, logit_tensor};
-        LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance);
+        ModelTrainingInstance model_training_instance = ModelTrainingInstance{
+            LossAttrs{
+                OtherLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
+            label_tensor,
+            logit_tensor};
+        LocalTrainingBacking local_backing(allocator,
+                                           cg_builder.computation_graph,
+                                           tensor_backing_map,
+                                           runtime_arg_config,
+                                           model_training_instance);
         local_backing.execute_init();
         local_backing.execute_forward();
         local_backing.execute_backward();
       }
 
       SUBCASE("LossFunction::IDENTITY") {
-        ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{OtherLossAttrs{LossFunction::IDENTITY}},
-                                                                              label_tensor, logit_tensor};
-        LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance);
+        ModelTrainingInstance model_training_instance = ModelTrainingInstance{
+            LossAttrs{OtherLossAttrs{LossFunction::IDENTITY}},
+            label_tensor,
+            logit_tensor};
+        LocalTrainingBacking local_backing(allocator,
+                                           cg_builder.computation_graph,
+                                           tensor_backing_map,
+                                           runtime_arg_config,
+                                           model_training_instance);
         local_backing.execute_init();
         local_backing.execute_forward();
         local_backing.execute_backward();
       }
-
     }
   }
 }
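
Before the optimizer-kernel refactor in the next patch, it helps to pin down the update rule those kernels implement. A scalar reference version, not part of the patch and written under the assumption that the kernels follow the conventional SGD-with-momentum formulation (only their signatures, not their bodies, are visible in the diff context):

// Sketch: per-element SGD update with optional momentum and Nesterov
// lookahead, mirroring the lr/momentum/nesterov/weight_decay parameters
// threaded through sgd_ps_update_task_gpu below.
void sgd_update_scalar(float lr, float momentum, bool nesterov,
                       float weight_decay, float grad, float &v, float &w) {
  float g = grad + weight_decay * w; // fold L2 regularization into the gradient
  if (momentum > 0.0f) {
    v = momentum * v + g;            // update the velocity buffer (sgd_v)
    g = nesterov ? g + momentum * v : v;
  }
  w -= lr * g;                       // descend
}
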
From f74711fb71685ef95c10770646e39fdf3acd27a0 Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Tue, 27 Aug 2024 12:30:16 -0700
Subject: [PATCH 05/22] Refactor and build optimizer kernels, op

---
 lib/kernels/CMakeLists.txt                     |   1 +
 lib/kernels/include/kernels/array_shape.h      |   3 +
 .../include/kernels/optimizer_kernels.h        |   3 +
 lib/kernels/src/array_shape.cc                 |   8 +
 lib/kernels/src/cuda/optimizer_kernels.cu      | 167 +++++++-------
 .../include/local-execution/loss_functions.h   |   4 +-
 .../include/local-execution/optimizer.h        |  22 ++
 .../src/local_training_backing.cc              |   9 +-
 lib/local-execution/src/optimizer.cc           | 205 ++++++++++++++++++
 lib/pcg/include/pcg/optimizer_attrs.h          |  14 --
 .../include/pcg/optimizer_attrs.variant.toml   |  23 ++
 .../adam_optimizer_attrs.struct.toml           |   4 +
 12 files changed, 370 insertions(+), 93 deletions(-)
 create mode 100644 lib/local-execution/include/local-execution/optimizer.h
 create mode 100644 lib/local-execution/src/optimizer.cc
 delete mode 100644 lib/pcg/include/pcg/optimizer_attrs.h
 create mode 100644 lib/pcg/include/pcg/optimizer_attrs.variant.toml

diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt
index baac58f8e3..5a6a0d1357 100644
--- a/lib/kernels/CMakeLists.txt
+++ b/lib/kernels/CMakeLists.txt
@@ -9,6 +9,7 @@ file(GLOB_RECURSE SRC
   src/*.cc
   src/cuda/cuda_helper.cu
   src/cuda/loss_function_kernels.cu
+  src/cuda/optimizer_kernels.cu
   src/cuda/ops/*.cu
 )
 
diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h
index c95c447574..6b0b57b57f 100644
--- a/lib/kernels/include/kernels/array_shape.h
+++ b/lib/kernels/include/kernels/array_shape.h
@@ -50,6 +50,9 @@ struct ArrayShape {
   ArrayShape sub_shape(std::optional<legion_dim_t> start,
                        std::optional<legion_dim_t> end) const;
 
+  bool operator==(ArrayShape const &) const;
+  bool operator!=(ArrayShape const &) const;
+
 public:
   LegionTensorDims dims;
 };
diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h
index fcbf9454f8..ed7c2778dd 100644
--- a/lib/kernels/include/kernels/optimizer_kernels.h
+++ b/lib/kernels/include/kernels/optimizer_kernels.h
@@ -34,6 +34,8 @@ void adam_ps_update_task_gpu(ffStream_t,
                              float beta2,
                              float weight_decay,
                              float epsilon,
+                             size_t size,
+                             int num_replicas,
                              float const *weight_grad_ptr,
                              float *adam_m_ptr,
                              float *adam_v_ptr,
@@ -45,6 +47,7 @@ void adam_nccl_update_task_gpu(ffStream_t,
                                float beta2,
                                float weight_decay,
                                float epsilon,
+                               size_t size,
                                PerDeviceFFHandle const &,
                                float const *weight_grad_ptr,
                                float *adam_m_ptr,
diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
index 69f04d6d34..ddfa3964e3 100644
--- a/lib/kernels/src/array_shape.cc
+++ b/lib/kernels/src/array_shape.cc
@@ -101,6 +101,14 @@ TensorShape get_tensor_shape(ArrayShape const &shape, DataType dtype) {
                      dtype};
 }
 
+bool ArrayShape::operator==(ArrayShape const & other) const {
+  return this->dims == other.dims;
+}
+
+bool ArrayShape::operator!=(ArrayShape const & other) const {
+  return this->dims != other.dims;
+}
+
 std::string format_as(ArrayShape const &x) {
   std::ostringstream oss;
   oss << "<
diff --git a/lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernels.cu
     scale_kernel<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
-        (float *)w_grad_ptr, src, size, 1.0f);
+        (float *)weight_grad_ptr, src, size, 1.0f);
   }
   // checkCUDA(cudaDeviceSynchronize());
   // Step 2: SGD update
   sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
       size,
-      op->lr,
-      op->weight_decay,
-      op->momentum,
-      op->nesterov,
-      w_grad_ptr,
-      v_ptr,
-      w_ptr);
+      lr,
+      weight_decay,
+      momentum,
+      nesterov,
+      weight_grad_ptr,
+      sgd_v_ptr,
+      weight_ptr);
   // checkCUDA(cudaDeviceSynchronize());
 }
 
 #ifdef FF_USE_NCCL
-__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op,
-                                                 PerDeviceOpState const *meta,
-                                                 float const *w_grad_ptr,
-                                                 size_t size,
-                                                 float *w_ptr,
-                                                 float *v_ptr) {
+void sgd_nccl_update_task_gpu(cudaStream_t stream,
+                              float lr,
+                              float momentum,
+                              bool nesterov,
+                              float weight_decay,
+                              PerDeviceFFHandle const & handle,
+                              float const *weight_grad_ptr,
+                              size_t size,
+                              float *weight_ptr,
+                              float *sgd_v_ptr) {
   // Use NCCL to sync gradients
   // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr);
-  cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
-  checkNCCL(ncclAllReduce(w_grad_ptr,
-                          (float *)w_grad_ptr,
+  checkNCCL(ncclAllReduce(weight_grad_ptr,
+                          (float *)weight_grad_ptr,
                           size,
-                          ncclFloat,
-                          ncclSum,
-                          meta->handle.ncclComm,
+                          ncclDataType_t::ncclFloat,
+                          ncclRedOp_t::ncclSum,
+                          handle.ncclComm,
                           stream));
   // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr);
   // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]");
 
@@ -94,13 +101,13 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op,
   // Step 2: SGD update
   sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
       size,
-      op->lr,
-      op->weight_decay,
-      op->momentum,
-      op->nesterov,
-      w_grad_ptr,
-      v_ptr,
-      w_ptr);
+      lr,
+      weight_decay,
+      momentum,
+      nesterov,
+      weight_grad_ptr,
+      sgd_v_ptr,
+      weight_ptr);
   // checkCUDA(cudaDeviceSynchronize());
 }
 #endif
@@ -145,20 +152,24 @@ __global__ void adam_update(int count,
   }
 }
 
-__host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op,
-                                                float const *w_grad_ptr,
-                                                size_t size,
-                                                int num_replicas,
-                                                float *w_ptr,
-                                                float *v_ptr,
-                                                float *m_ptr) {
-  cudaStream_t stream;
+void adam_ps_update_task_gpu(cudaStream_t stream,
+                             float alpha_t,
+                             float beta1,
+                             float beta2,
+                             float weight_decay,
+                             float epsilon,
+                             size_t size,
+                             int num_replicas,
+                             float const *weight_grad_ptr,
+                             float *adam_m_ptr,
+                             float *adam_v_ptr,
+                             float *weight_ptr) {
   checkCUDA(get_legion_stream(&stream));
   // Step 1: Gather gradients in the first replica
   for (int i = 1; i < num_replicas; i++) {
-    float const *src = w_grad_ptr + i * size;
+    float const *src = weight_grad_ptr + i * size;
     add_kernel<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
-        size, 1.0f, src, (float *)w_grad_ptr);
+        size, 1.0f, src, (float *)weight_grad_ptr);
   }
   // checkCUDA(cudaDeviceSynchronize());
   // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n",
@@ -166,50 +177,54 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op,
   // Step 2: Adam update
   adam_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
       size,
-      op->alpha_t,
-      op->beta1,
-      op->beta2,
-      op->weight_decay,
-      op->epsilon,
-      w_grad_ptr,
-      m_ptr,
-      v_ptr,
-      w_ptr);
+      alpha_t,
+      beta1,
+      beta2,
+      weight_decay,
+      epsilon,
+      weight_grad_ptr,
+      adam_m_ptr,
+      adam_v_ptr,
+      weight_ptr);
   // checkCUDA(cudaDeviceSynchronize());
 }
 
 #ifdef FF_USE_NCCL
-__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op,
-                                                  PerDeviceOpState const *meta,
-                                                  float const *w_grad_ptr,
-                                                  size_t size,
-                                                  float *w_ptr,
-                                                  float *v_ptr,
-                                                  float *m_ptr) {
+void adam_nccl_update_task_gpu(cudaStream_t stream,
+                               float alpha_t,
+                               float beta1,
+                               float beta2,
+                               float weight_decay,
+                               float epsilon,
+                               size_t size,
+                               PerDeviceFFHandle const & handle,
+                               float const *weight_grad_ptr,
+                               float *adam_m_ptr,
+                               float *adam_v_ptr,
+                               float *weight_ptr) {
   // Use NCCL to sync gradients
-  cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
-  checkNCCL(ncclAllReduce(w_grad_ptr,
-                          (float *)w_grad_ptr,
+  checkNCCL(ncclAllReduce(weight_grad_ptr,
+                          (float *)weight_grad_ptr,
                           size,
-                          ncclFloat,
-                          ncclSum,
-                          meta->handle.ncclComm,
+                          ncclDataType_t::ncclFloat,
+                          ncclRedOp_t::ncclSum,
+                          handle.ncclComm,
                           stream));
   // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n",
   //         op->alpha, op->alpha_t, op->weight_decay);
   // Step 2: Adam update
   adam_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
       size,
-      op->alpha_t,
-      op->beta1,
-      op->beta2,
-      op->weight_decay,
-      op->epsilon,
-      w_grad_ptr,
-      m_ptr,
-      v_ptr,
-      w_ptr);
+      alpha_t,
+      beta1,
+      beta2,
+      weight_decay,
+      epsilon,
+      weight_grad_ptr,
+      adam_m_ptr,
+      adam_v_ptr,
+      weight_ptr);
   // checkCUDA(cudaDeviceSynchronize());
 }
 #endif
diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h
index e5e81b60a7..58405536d8 100644
--- a/lib/local-execution/include/local-execution/loss_functions.h
+++ b/lib/local-execution/include/local-execution/loss_functions.h
@@ -13,8 +13,8 @@
  * limitations under the License.
  */
 
-#ifndef _FLEXFLOW_LOSS_FUNCTIONS_H_
-#define _FLEXFLOW_LOSS_FUNCTIONS_H_
+#ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_
+#define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_
 
 #include "local-execution/task_impl_function.dtg.h"
 #include "local-execution/task_invocation.h"
diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h
new file mode 100644
index 0000000000..4702352568
--- /dev/null
+++ b/lib/local-execution/include/local-execution/optimizer.h
@@ -0,0 +1,22 @@
+#ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_
+#define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_
+
+#include "local-execution/task_impl_function.dtg.h"
+#include "local-execution/task_invocation.h"
+#include "local-execution/task_signature.h"
+#include "pcg/optimizers/sgd_optimizer_attrs.dtg.h"
+#include "pcg/optimizers/adam_optimizer_attrs.dtg.h"
+
+namespace FlexFlow {
+
+TaskSignature get_sgd_update_signature();
+TaskInvocation sgd_update(SGDOptimizerAttrs const &);
+TaskImplFunction get_sgd_update_task_impl();
+
+TaskSignature get_adam_update_signature();
+TaskInvocation adam_update(SGDOptimizerAttrs const &);
+TaskImplFunction get_adam_update_task_impl();
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc
index 98bfe7683e..c8f5f279d2 100644
--- a/lib/local-execution/src/local_training_backing.cc
+++ b/lib/local-execution/src/local_training_backing.cc
@@ -4,6 +4,7 @@
 #include "utils/containers/contains.h"
 #include "utils/containers/contains_key.h"
 #include "utils/containers/reversed.h"
+#include "utils/containers/get_only.h"
 #include "utils/exception.h"
 
 namespace FlexFlow {
@@ -136,7 +137,13 @@ PerLayerElapsedTime LocalTrainingBacking::execute_backward() {
 }
 
 void LocalTrainingBacking::execute_update() {
-  NOT_IMPLEMENTED();
+  for (layer_guid_t const &node: topological_ordering(this->computation_graph)) {
+    LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node);
+    if (layer_attrs.attrs.has<WeightAttrs>()) {
+      tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node));
+      // TODO: handle momentum vectors separately? handle different updates?
+    }
+  }
 }
 
 TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor(
diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc
new file mode 100644
index 0000000000..2f45802978
--- /dev/null
+++ b/lib/local-execution/src/optimizer.cc
@@ -0,0 +1,205 @@
+#include "kernels/optimizer_kernels.h"
+#include "local-execution/optimizer.h"
+#include "local-execution/profiling.h"
+
+namespace FlexFlow {
+
+enum Slots {
+  ATTRS,
+  WEIGHT,
+  SGD_V,
+  PROFILING,
+  ADAM_M,
+  ADAM_V,
+  HANDLE
+};
+
+TaskSignature get_sgd_update_signature() {
+  TaskSignature sig = make_empty_task_signature();
+  add_slot(sig, WEIGHT, IsGrad::YES);
+  add_slot(sig, WEIGHT, IsGrad::NO);
+  add_slot(sig, SGD_V, IsGrad::YES);
+  add_arg_slot<SGDOptimizerAttrs>(sig, ATTRS);
+  add_arg_slot<ProfilingSettings>(sig, PROFILING);
+  if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
+    add_unchecked_arg_slot<PerDeviceFFHandle>(sig, HANDLE);
+  }
+  return sig;
+}
+
+TaskInvocation sgd_update(SGDOptimizerAttrs const & attrs,
+                          tensor_guid_t const & weight,
+                          tensor_guid_t const & sgd_v) {
+  TaskBinding b;
+  b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::YES});
+  b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::NO});
+  if (attrs.momentum > 0.0f) {
+    b.bind(SGD_V, TensorGuidSpec{sgd_v, IsGrad::YES});
+  }
+  b.bind_arg(ATTRS, attrs);
+  b.bind_arg(PROFILING, profiling_settings());
+
+  if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
+    b.bind_arg(HANDLE, ff_handle());
+    return {task_id_t::SGD_UPD_NCCL_TASK_ID, b};
+  }
+  return {task_id_t::SGD_UPD_PS_TASK_ID, b};
+}
+
+static void sgd_update_task_impl(TaskArgumentAccessor const & acc) {
+  auto attrs = acc.get_argument<SGDOptimizerAttrs>(ATTRS);
+  auto weight_grad = acc.get_tensor_grad(WEIGHT);
+  auto weight = acc.get_tensor(WEIGHT);
+  auto profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+
+  assert (weight.shape == weight_grad.shape);
+  size_t size = weight_grad.shape.get_volume();
+
+  assert (weight_grad.shape.get_volume() % weight.shape.get_volume() == 0);
+  size_t num_replicas = weight_grad.shape.get_volume() / weight.shape.get_volume();
+
+  float *sgd_v_ptr;
+  if (attrs.momentum > 0.0f) {
+    auto sgd_v = acc.get_tensor(SGD_V);
+    assert (sgd_v.shape == weight.shape);
+    sgd_v_ptr = sgd_v.get_float_ptr();
+  }
+
+  if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
+    auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
+    profile(sgd_nccl_update_task_gpu,
+        profiling,
+        "[SGD NCCL] update_time = %.2lfms\n",
+        attrs.lr,
+        attrs.momentum,
+        attrs.nesterov,
+        attrs.weight_decay,
+        handle,
+        weight_grad.get_float_ptr(),
+        size,
+        weight.get_float_ptr(),
+        sgd_v_ptr);
+
+  } else {
+    profile(sgd_ps_update_task_gpu,
+        profiling,
+        "[SGD PS] update_time = %.2lfms\n",
+        attrs.lr,
+        attrs.momentum,
+        attrs.nesterov,
+        attrs.weight_decay,
+        weight_grad.get_float_ptr(),
+        size,
+        num_replicas,
+        weight.get_float_ptr(),
+        sgd_v_ptr);
+  }
+}
+
+TaskImplFunction get_sgd_update_task_impl() {
+  return TaskImplFunction{GenericTaskImplFunction{sgd_update_task_impl}};
+}
+
+TaskSignature get_adam_update_signature() {
+  TaskSignature sig = make_empty_task_signature();
+  add_slot(sig, WEIGHT, IsGrad::YES);
+  add_slot(sig, WEIGHT, IsGrad::NO);
+  add_slot(sig, ADAM_V, IsGrad::YES);
+  add_slot(sig, ADAM_M, IsGrad::YES);
+  add_arg_slot<AdamOptimizerAttrs>(sig, ATTRS);
+  add_arg_slot<ProfilingSettings>(sig, PROFILING);
+  if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
+    add_unchecked_arg_slot<PerDeviceFFHandle>(sig, HANDLE);
+  }
+  return sig;
+}
+
+TaskInvocation adam_update(AdamOptimizerAttrs const & attrs,
+                           tensor_guid_t const & weight,
+                           tensor_guid_t const & adam_v,
+                           tensor_guid_t const & adam_m) {
+  TaskBinding b;
+  b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::YES});
+  b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::NO});
+  b.bind(ADAM_M, TensorGuidSpec{adam_m, IsGrad::YES});
+  b.bind(ADAM_V, TensorGuidSpec{adam_v, IsGrad::YES});
+  b.bind_arg(ATTRS, attrs);
+  b.bind_arg(PROFILING, profiling_settings());
+
+  if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
+    b.bind_arg(HANDLE, ff_handle());
+    return {task_id_t::ADAM_UPD_NCCL_TASK_ID, b};
+  }
+  return {task_id_t::ADAM_UPD_PS_TASK_ID, b};
+}
+
+static void adam_update_task_impl(TaskArgumentAccessor const & acc) {
+  auto attrs = acc.get_argument<AdamOptimizerAttrs>(ATTRS);
+  auto weight_grad = acc.get_tensor_grad(WEIGHT);
+  auto weight = acc.get_tensor(WEIGHT);
+  auto v_tensor = acc.get_tensor(ADAM_V);
+  auto m_tensor = acc.get_tensor(ADAM_M);
+
+  auto profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+
+  assert (weight.shape == weight_grad.shape);
+  size_t size = weight_grad.shape.get_volume();
+
+  assert (weight_grad.shape.get_volume() % weight.shape.get_volume() == 0);
+  size_t num_replicas = weight_grad.shape.get_volume() / weight.shape.get_volume();
+
+  if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
+    auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
+    profile(adam_nccl_update_task_gpu,
+        profiling,
+        "[Adam NCCL] update_time = %.2lfms\n",
+        attrs.alpha_t,
+        attrs.beta1,
+        attrs.beta2,
+        attrs.weight_decay,
+        attrs.epsilon,
+        size,
+        handle,
+        weight_grad.get_float_ptr(),
+        m_tensor.get_float_ptr(),
+        v_tensor.get_float_ptr(),
+        weight.get_float_ptr());
+  } else {
+    profile(adam_ps_update_task_gpu,
+        profiling,
+        "[Adam PS] update_time = %.2lfms\n",
+        attrs.alpha_t,
+        attrs.beta1,
+        attrs.beta2,
+        attrs.weight_decay,
+        attrs.epsilon,
+        size,
+        num_replicas,
+        weight_grad.get_float_ptr(),
+        m_tensor.get_float_ptr(),
+        v_tensor.get_float_ptr(),
+        weight.get_float_ptr());
+  }
+}
+
+AdamOptimizerAttrs next(AdamOptimizerAttrs const & old) {
+  double new_beta1_t = old.beta1_t * old.beta1;
+  double new_beta2_t = old.beta2_t * old.beta2;
+  double new_alpha_t = old.alpha * sqrt(1 - new_beta2_t) / (1 - new_beta1_t);
+  return AdamOptimizerAttrs{
+    old.alpha,
+    old.beta1,
+    old.beta2,
+    old.weight_decay,
+    new_alpha_t,
+    new_beta1_t,
+    new_beta2_t,
+    old.epsilon
+  };
+}
+
+TaskImplFunction get_adam_update_task_impl() {
+  return TaskImplFunction{GenericTaskImplFunction{adam_update_task_impl}};
+}
+
+}
diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h
deleted file mode 100644
index 4bac74b999..0000000000
--- a/lib/pcg/include/pcg/optimizer_attrs.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef _FLEXFLOW_PCG_INCLUDE_PCG_OPTIMIZER_H
-#define _FLEXFLOW_PCG_INCLUDE_PCG_OPTIMIZER_H
-
-#include "pcg/optimizers/adam_optimizer_attrs.h"
-#include "pcg/optimizers/sgd_optimizer_attrs.h"
-#include "utils/variant.h"
-
-namespace FlexFlow {
-
-using OptimizerAttrs = std::variant<SGDOptimizerAttrs, AdamOptimizerAttrs>;
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/pcg/include/pcg/optimizer_attrs.variant.toml b/lib/pcg/include/pcg/optimizer_attrs.variant.toml
new file mode 100644
index 0000000000..585c150700
--- /dev/null
+++ b/lib/pcg/include/pcg/optimizer_attrs.variant.toml
@@ -0,0 +1,23 @@
+namespace = "FlexFlow"
+name = "OptimizerAttrs"
+features = [
+  "eq",
+  "ord",
+  "hash",
+  "json",
+  "fmt",
+  "rapidcheck",
+]
+
+includes = [
+  "pcg/optimizers/sgd_optimizer_attrs.dtg.h",
+  "pcg/optimizers/adam_optimizer_attrs.dtg.h",
+]
+
+[[values]]
+type = "::FlexFlow::SGDOptimizerAttrs"
+key = "sgd_optimizer"
+
+[[values]]
+type = "::FlexFlow::AdamOptimizerAttrs"
+key = "adam_optimizer"
diff --git a/lib/pcg/include/pcg/optimizers/adam_optimizer_attrs.struct.toml b/lib/pcg/include/pcg/optimizers/adam_optimizer_attrs.struct.toml
index fd3e83cc4a..c25baa6c89 100644
--- a/lib/pcg/include/pcg/optimizers/adam_optimizer_attrs.struct.toml
+++ b/lib/pcg/include/pcg/optimizers/adam_optimizer_attrs.struct.toml
@@ -36,3 +36,7 @@ type = "double"
 [[fields]]
 name = "beta2_t"
 type = "double"
+
+[[fields]]
+name = "epsilon"
+type = "double"
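
The epsilon field added above completes the hyperparameter set for a standard Adam step. The per-step bookkeeping that the next patch implements in next() follows the usual bias-correction scheme; as a self-contained sketch (not part of the patch series):

#include <cmath>

// Sketch: advancing Adam's running powers of beta and the bias-corrected
// step size, matching the arithmetic in next() below.
struct AdamSchedule {
  double alpha, beta1, beta2, alpha_t, beta1_t, beta2_t;
};

AdamSchedule advance(AdamSchedule s) {
  s.beta1_t *= s.beta1; // beta1^t after t steps
  s.beta2_t *= s.beta2; // beta2^t after t steps
  s.alpha_t = s.alpha * std::sqrt(1 - s.beta2_t) / (1 - s.beta1_t);
  return s;
}
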
From 40c62526336ffbbee069988126047dcdad64a1ce Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Tue, 27 Aug 2024 13:40:10 -0700
Subject: [PATCH 06/22] Finish optimizer local backing

---
 lib/kernels/src/array_shape.cc                 |   4 +-
 lib/kernels/src/cuda/optimizer_kernels.cu      |  40 +++--
 .../local-execution/local_slots_backing.h      |  11 +-
 .../local-execution/local_training_backing.h   |   3 +-
 .../local-execution/model_training_instance.h  |  13 ++
 .../model_training_instance.struct.toml        |   5 +
 .../include/local-execution/optimizer.h        |  18 ++-
 .../src/local_cost_estimator.cc                |   3 +-
 .../src/local_slots_backing.cc                 |  21 +++
 .../src/local_training_backing.cc              |  49 ++++++-
 .../src/model_training_instance.cc             |  26 ++++
 lib/local-execution/src/optimizer.cc           | 137 ++++++++++--------
 .../test/src/test_loss_function.cc             |  47 +++---
 lib/pcg/include/pcg/computation_graph.h        |   4 +
 lib/pcg/include/pcg/optimizer_attrs.h          |  13 ++
 lib/pcg/src/pcg/computation_graph.cc           |  13 ++
 lib/pcg/src/pcg/optimizer_attrs.cc             |  14 ++
 17 files changed, 300 insertions(+), 121 deletions(-)
 create mode 100644 lib/local-execution/include/local-execution/model_training_instance.h
 create mode 100644 lib/local-execution/src/model_training_instance.cc
 create mode 100644 lib/pcg/include/pcg/optimizer_attrs.h
 create mode 100644 lib/pcg/src/pcg/optimizer_attrs.cc

diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
index ddfa3964e3..054e16e90a 100644
--- a/lib/kernels/src/array_shape.cc
+++ b/lib/kernels/src/array_shape.cc
@@ -101,11 +101,11 @@ TensorShape get_tensor_shape(ArrayShape const &shape, DataType dtype) {
                      dtype};
 }
 
-bool ArrayShape::operator==(ArrayShape const & other) const {
+bool ArrayShape::operator==(ArrayShape const &other) const {
   return this->dims == other.dims;
 }
 
-bool ArrayShape::operator!=(ArrayShape const & other) const {
+bool ArrayShape::operator!=(ArrayShape const &other) const {
   return this->dims != other.dims;
 }
 
diff --git a/lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernels.cu
index 7d1d720ba0..2eaf30b21f 100644
--- a/lib/kernels/src/cuda/optimizer_kernels.cu
+++ b/lib/kernels/src/cuda/optimizer_kernels.cu
@@ -14,8 +14,8 @@
  */
 
 #include "device.h"
-#include "kernels/optimizer_kernels.h"
 #include "kernels/nccl.h"
+#include "kernels/optimizer_kernels.h"
 
 namespace FlexFlow {
 
@@ -62,15 +62,14 @@ void sgd_ps_update_task_gpu(cudaStream_t stream,
   }
   // checkCUDA(cudaDeviceSynchronize());
   // Step 2: SGD update
-  sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
-      size,
-      lr,
-      weight_decay,
-      momentum,
-      nesterov,
-      weight_grad_ptr,
-      sgd_v_ptr,
-      weight_ptr);
+  sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(size,
+                                                                lr,
+                                                                weight_decay,
+                                                                momentum,
+                                                                nesterov,
+                                                                weight_grad_ptr,
+                                                                sgd_v_ptr,
+                                                                weight_ptr);
   // checkCUDA(cudaDeviceSynchronize());
 }
 
@@ -80,7 +79,7 @@ void sgd_nccl_update_task_gpu(cudaStream_t stream,
                               float momentum,
                               bool nesterov,
                               float weight_decay,
-                              PerDeviceFFHandle const & handle,
+                              PerDeviceFFHandle const &handle,
                               float const *weight_grad_ptr,
                               size_t size,
                               float *weight_ptr,
@@ -99,15 +98,14 @@ void sgd_nccl_update_task_gpu(cudaStream_t stream,
   // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]");
 
   // Step 2: SGD update
-  sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
-      size,
-      lr,
-      weight_decay,
-      momentum,
-      nesterov,
-      weight_grad_ptr,
-      sgd_v_ptr,
-      weight_ptr);
+  sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(size,
+                                                                lr,
+                                                                weight_decay,
+                                                                momentum,
+                                                                nesterov,
+                                                                weight_grad_ptr,
+                                                                sgd_v_ptr,
+                                                                weight_ptr);
   // checkCUDA(cudaDeviceSynchronize());
 }
 #endif
@@ -197,7 +195,7 @@ void adam_nccl_update_task_gpu(cudaStream_t stream,
                                float weight_decay,
                                float epsilon,
                                size_t size,
-                               PerDeviceFFHandle const & handle,
+                               PerDeviceFFHandle const &handle,
                                float const *weight_grad_ptr,
                                float *adam_m_ptr,
                                float *adam_v_ptr,
diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h
index 1f35bdd304..439113c873 100644
--- a/lib/local-execution/include/local-execution/local_slots_backing.h
+++ b/lib/local-execution/include/local-execution/local_slots_backing.h
@@ -1,6 +1,6 @@
-#ifndef _FLEXFLOW_LOCAL_EXECUTION_SLOT_REGISTRY_H
-#define _FLEXFLOW_LOCAL_EXECUTION_SLOT_REGISTRY_H
+#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_SLOTS_BACKING_H
+#define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_SLOTS_BACKING_H
 
 #include "kernels/accessor.h"
 #include "local-execution/local_task_argument_accessor.h"
@@ -23,6 +23,11 @@ struct LocalSlotsBacking {
   void allocate_outgoing_tensors(layer_guid_t const &,
                                  ComputationGraph const &,
                                  Allocator &);
+  void allocate_optimizer_tensors(layer_guid_t const &weight_layer,
+                                  tensor_guid_t const &,
+                                  ComputationGraph const &,
+                                  Allocator &,
+                                  TaskSignature const &);
   TensorSlotsBacking construct_tensor_slots_backing(OpTaskBinding const &,
                                                     layer_guid_t const &) const;
   TensorSlotsBacking construct_tensor_slots_backing(TaskBinding const &) const;
@@ -48,6 +53,8 @@ struct LocalSlotsBacking {
       input_tensor_slots;
   std::unordered_map<layer_guid_t, std::vector<tensor_guid_t>>
       output_tensor_slots;
+  std::unordered_map<tensor_guid_t, std::vector<tensor_guid_t>>
+      weight_optimizer_tensor_guids;
 
   // arguments
   std::unordered_map
diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h
index 55983086c2..d2586038f0 100644
--- a/lib/local-execution/include/local-execution/local_training_backing.h
+++ b/lib/local-execution/include/local-execution/local_training_backing.h
@@ -4,7 +4,6 @@
 #include "local-execution/local_slots_backing.h"
 #include "local-execution/model_training_instance.dtg.h"
 #include "local-execution/task_registry.h"
-#include "op-attrs/ops/loss_functions.h"
 
 namespace FlexFlow {
 
@@ -16,7 +15,7 @@ struct LocalTrainingBacking {
                        ComputationGraph const &,
                        TensorBackingMap const &,
                        RuntimeArgConfig const &,
-                       std::optional<ModelTrainingInstance> const &);
+                       std::optional<ModelTrainingInstance> &);
 
   void execute_init();
   PerLayerElapsedTime execute_forward();
diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h
new file mode 100644
index 0000000000..7ea027a636
--- /dev/null
+++ b/lib/local-execution/include/local-execution/model_training_instance.h
@@ -0,0 +1,13 @@
+
+#ifndef _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H
+#define _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H
+
+#include "local-execution/model_training_instance.dtg.h"
+
+namespace FlexFlow {
+
+ModelTrainingInstance next(ModelTrainingInstance const & old);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/local-execution/include/local-execution/model_training_instance.struct.toml b/lib/local-execution/include/local-execution/model_training_instance.struct.toml
index ea7e8d24ab..e3ff397e39 100644
--- a/lib/local-execution/include/local-execution/model_training_instance.struct.toml
+++ b/lib/local-execution/include/local-execution/model_training_instance.struct.toml
@@ -11,6 +11,7 @@ includes = [
   "utils/optional.h",
   "op-attrs/ops/loss_attrs.dtg.h",
   "pcg/tensor_guid_t.dtg.h",
+  "pcg/optimizer_attrs.dtg.h",
 ]
 
 [[fields]]
@@ -24,3 +25,7 @@ type = "::FlexFlow::tensor_guid_t"
 [[fields]]
 name = "logit_tensor"
 type = "::FlexFlow::tensor_guid_t"
+
+[[fields]]
+name = "optimizer_attrs"
+type = "::FlexFlow::OptimizerAttrs"
diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h
index 4702352568..53dcad63de 100644
--- a/lib/local-execution/include/local-execution/optimizer.h
+++ b/lib/local-execution/include/local-execution/optimizer.h
@@ -4,17 +4,29 @@
 #include "local-execution/task_impl_function.dtg.h"
 #include "local-execution/task_invocation.h"
 #include "local-execution/task_signature.h"
-#include "pcg/optimizers/sgd_optimizer_attrs.dtg.h"
+#include "pcg/optimizer_attrs.dtg.h"
 #include "pcg/optimizers/adam_optimizer_attrs.dtg.h"
+#include "pcg/optimizers/sgd_optimizer_attrs.dtg.h"
 
 namespace FlexFlow {
 
+TaskSignature get_update_signature(OptimizerAttrs const &);
+TaskInvocation get_update_invocation(OptimizerAttrs const &,
+                                     tensor_guid_t const &weight,
+                                     std::vector<tensor_guid_t> const &);
+TaskImplFunction get_update_task_impl(OptimizerAttrs const &);
+
 TaskSignature get_sgd_update_signature();
-TaskInvocation sgd_update(SGDOptimizerAttrs const &);
+TaskInvocation sgd_update(SGDOptimizerAttrs const &,
+                          tensor_guid_t const &weight,
+                          tensor_guid_t const &);
 TaskImplFunction get_sgd_update_task_impl();
 
 TaskSignature get_adam_update_signature();
-TaskInvocation adam_update(SGDOptimizerAttrs const &);
+TaskInvocation adam_update(AdamOptimizerAttrs const &,
+                           tensor_guid_t const &weight,
+                           tensor_guid_t const &,
+                           tensor_guid_t const &);
 TaskImplFunction get_adam_update_task_impl();
 
 } // namespace FlexFlow
diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc
index 1ca422d8e1..a39d55adff 100644
--- a/lib/local-execution/src/local_cost_estimator.cc
+++ b/lib/local-execution/src/local_cost_estimator.cc
@@ -72,11 +72,12 @@ CostDetails LocalCostEstimator::estimate_cost(
       get_vector_piece_attrs(weights),
       get_vector_piece_attrs(outputs));
 
+  std::optional<ModelTrainingInstance> model_training_instance = std::nullopt;
   LocalTrainingBacking local_backing(allocator,
                                      cg_builder.computation_graph,
                                      tensor_backing_map,
                                      this->runtime_arg_config,
-                                     std::nullopt);
+                                     model_training_instance);
 
   local_backing.execute_init();
   PerLayerElapsedTime fwd = local_backing.execute_forward();
diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc
index 787c7dda86..5059f29abd 100644
--- a/lib/local-execution/src/local_slots_backing.cc
+++ b/lib/local-execution/src/local_slots_backing.cc
@@ -47,6 +47,27 @@ void LocalSlotsBacking::allocate_outgoing_tensors(
   this->output_tensor_slots.insert({layer_guid, outgoing_tensors});
 }
 
+void LocalSlotsBacking::allocate_optimizer_tensors(
+    layer_guid_t const &weight_layer,
+    tensor_guid_t const &weight,
+    ComputationGraph const &cg,
+    Allocator &allocator,
+    TaskSignature const &sig) {
+  GenericTensorAccessorW weight_backing =
+      get_tensor_backing(weight, IsGrad::NO);
+  int num_buffer_tensors =
+      sig.tensor_guid_slots.size() - 2; // ignore 2 (weight and weight_grad)
+  std::vector<tensor_guid_t> buffer_tensors =
+      get_new_tensor_guids_for_layer_without_graph_insertion(
+          cg, weight_layer, num_buffer_tensors);
+  for (auto const &tensor_guid : buffer_tensors) {
+    GenericTensorAccessorW buffer_backing = allocator.allocate_tensor(
+        get_tensor_shape(weight_backing.shape, weight_backing.data_type));
+    this->gradient_tensor_mapping.insert({tensor_guid, buffer_backing});
+  }
+  this->weight_optimizer_tensor_guids.insert({weight, buffer_tensors});
+}
+
 bool LocalSlotsBacking::is_tensor_allocated(
     tensor_guid_t const &tensor_id) const {
   return contains_key(this->tensor_mapping, tensor_id);
diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc
index c8f5f279d2..eb49f16df1 100644
--- a/lib/local-execution/src/local_training_backing.cc
+++ b/lib/local-execution/src/local_training_backing.cc
@@ -1,10 +1,12 @@
 #include "local-execution/local_training_backing.h"
 #include "local-execution/loss_functions.h"
+#include "local-execution/model_training_instance.h"
+#include "local-execution/optimizer.h"
 #include "local-execution/task_signature_impl.h"
 #include "utils/containers/contains.h"
 #include "utils/containers/contains_key.h"
-#include "utils/containers/reversed.h"
 #include "utils/containers/get_only.h"
+#include "utils/containers/reversed.h"
 #include "utils/exception.h"
 
 namespace FlexFlow {
@@ -14,22 +16,33 @@ LocalTrainingBacking::LocalTrainingBacking(
     ComputationGraph const &computation_graph,
     TensorBackingMap const &tensor_backing_mapping,
     RuntimeArgConfig const &runtime_arg_config,
-    std::optional<ModelTrainingInstance> const &training_instance)
+    std::optional<ModelTrainingInstance> &training_instance)
     : allocator(allocator), computation_graph(computation_graph),
       local_slots_backing(tensor_backing_mapping, runtime_arg_config),
       task_registry(empty_task_registry()),
      training_instance(training_instance) {
-  for (layer_guid_t const &node : topological_ordering(computation_graph)) {
+  for (layer_guid_t const &node :
+       topological_ordering(this->computation_graph)) {
     ComputationGraphOpAttrs attrs =
-        get_layer_attrs(computation_graph, node).attrs;
+        get_layer_attrs(this->computation_graph, node).attrs;
 
     // allocate outgoing tensors
     this->local_slots_backing.allocate_outgoing_tensors(
-        node, computation_graph, this->allocator);
+        node, this->computation_graph, this->allocator);
 
     // register tasks
     register_tasks_for_layer(this->task_registry, node, attrs);
+
+    // allocate optimizer buffers
+    if (attrs.has<WeightAttrs>() && this->training_instance.has_value()) {
+      OptimizerAttrs attrs = this->training_instance.value().optimizer_attrs;
+      TaskSignature sig = get_update_signature(attrs);
+      tensor_guid_t weight_tensor =
+          get_only(get_outgoing_tensors(this->computation_graph, node));
+      this->local_slots_backing.allocate_optimizer_tensors(
+          node, weight_tensor, this->computation_graph, this->allocator, sig);
+    }
   }
 
   if (this->training_instance.has_value()) {
@@ -137,13 +150,33 @@ PerLayerElapsedTime LocalTrainingBacking::execute_backward() {
 }
 
 void LocalTrainingBacking::execute_update() {
-  for (layer_guid_t const &node: topological_ordering(this->computation_graph)) {
+  assert(this->training_instance.has_value());
+  OptimizerAttrs attrs = this->training_instance.value().optimizer_attrs;
+
+  for (layer_guid_t const &node :
+       topological_ordering(this->computation_graph)) {
     LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node);
     if (layer_attrs.attrs.has<WeightAttrs>()) {
-      tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node));
-      // TODO: handle momentum vectors separately? handle different updates?
+      // get tensors
+      tensor_guid_t weight_tensor =
+          get_only(get_outgoing_tensors(this->computation_graph, node));
+      std::vector<tensor_guid_t> buffer_tensors =
+          this->local_slots_backing.weight_optimizer_tensor_guids.at(
+              weight_tensor);
+
+      // get invocation
+      TaskInvocation invocation =
+          get_update_invocation(attrs, weight_tensor, buffer_tensors);
+      assert(is_invocation_valid(get_update_signature(attrs), invocation));
+
+      // execute update
+      TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+      TaskImplFunction update_impl_fn = get_update_task_impl(attrs);
+      update_impl_fn.get<GenericTaskImplFunction>().function_ptr(accessor);
     }
   }
+
+  this->training_instance = next(this->training_instance.value());
 }
 
 TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor(
diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc
new file mode 100644
index 0000000000..646e3ac588
--- /dev/null
+++ b/lib/local-execution/src/model_training_instance.cc
@@ -0,0 +1,26 @@
+#include "local-execution/model_training_instance.h"
+
+namespace FlexFlow {
+
+ModelTrainingInstance next(ModelTrainingInstance const & old_training_instance) {
+  if (old_training_instance.optimizer_attrs.has<AdamOptimizerAttrs>()) {
+    AdamOptimizerAttrs old = old_training_instance.optimizer_attrs.get<AdamOptimizerAttrs>();
+    double new_beta1_t = old.beta1_t * old.beta1;
+    double new_beta2_t = old.beta2_t * old.beta2;
+    double new_alpha_t = old.alpha * sqrt(1 - new_beta2_t) / (1 - new_beta1_t);
+    OptimizerAttrs new_attrs = OptimizerAttrs{AdamOptimizerAttrs{
+        old.alpha,
+        old.beta1,
+        old.beta2,
+        old.weight_decay,
+        new_alpha_t,
+        new_beta1_t,
+        new_beta2_t,
+        old.epsilon
+    }};
+    return ModelTrainingInstance{old_training_instance.loss_attrs, old_training_instance.label_tensor, old_training_instance.logit_tensor, new_attrs};
+  }
+  return old_training_instance;
+}
+
+} // namespace FlexFlow
diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc
index 2f45802978..1b1173c70e 100644
--- a/lib/local-execution/src/optimizer.cc
+++ b/lib/local-execution/src/optimizer.cc
@@ -1,18 +1,11 @@
-#include "kernels/optimizer_kernels.h"
 #include "local-execution/optimizer.h"
+#include "kernels/optimizer_kernels.h"
 #include "local-execution/profiling.h"
+#include "utils/overload.h"
 
 namespace FlexFlow {
 
-enum Slots {
-  ATTRS,
-  WEIGHT,
-  SGD_V,
-  PROFILING,
-  ADAM_M,
-  ADAM_V,
-  HANDLE
-};
+enum Slots { ATTRS, WEIGHT, SGD_V, PROFILING, ADAM_M, ADAM_V, HANDLE };
 
 TaskSignature get_sgd_update_signature() {
   TaskSignature sig = make_empty_task_signature();
@@ -27,9 +20,9 @@ TaskSignature get_sgd_update_signature() {
   return sig;
 }
 
-TaskInvocation sgd_update(SGDOptimizerAttrs const & attrs,
-                          tensor_guid_t const & weight,
-                          tensor_guid_t const & sgd_v) {
+TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs,
+                          tensor_guid_t const &weight,
+                          tensor_guid_t const &sgd_v) {
   TaskBinding b;
   b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::YES});
   b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::NO});
@@ -46,53 +39,54 @@ TaskInvocation sgd_update(SGDOptimizerAttrs const & attrs,
   return {task_id_t::SGD_UPD_PS_TASK_ID, b};
 }
 
-static void sgd_update_task_impl(TaskArgumentAccessor const & acc) {
+static void sgd_update_task_impl(TaskArgumentAccessor const &acc) {
   auto attrs = acc.get_argument<SGDOptimizerAttrs>(ATTRS);
   auto weight_grad = acc.get_tensor_grad(WEIGHT);
   auto weight = acc.get_tensor(WEIGHT);
   auto profiling = acc.get_argument<ProfilingSettings>(PROFILING);
 
-  assert (weight.shape == weight_grad.shape);
+  assert(weight.shape == weight_grad.shape);
   size_t size = weight_grad.shape.get_volume();
 
-  assert (weight_grad.shape.get_volume() % weight.shape.get_volume() == 0);
-  size_t num_replicas = weight_grad.shape.get_volume() / weight.shape.get_volume();
+  assert(weight_grad.shape.get_volume() % weight.shape.get_volume() == 0);
+  size_t num_replicas =
+      weight_grad.shape.get_volume() / weight.shape.get_volume();
 
   float *sgd_v_ptr;
   if (attrs.momentum > 0.0f) {
     auto sgd_v = acc.get_tensor(SGD_V);
-    assert (sgd_v.shape == weight.shape);
+    assert(sgd_v.shape == weight.shape);
     sgd_v_ptr = sgd_v.get_float_ptr();
   }
 
   if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
     auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
     profile(sgd_nccl_update_task_gpu,
-        profiling,
-        "[SGD NCCL] update_time = %.2lfms\n",
-        attrs.lr,
-        attrs.momentum,
-        attrs.nesterov,
-        attrs.weight_decay,
-        handle,
-        weight_grad.get_float_ptr(),
-        size,
-        weight.get_float_ptr(),
-        sgd_v_ptr);
+            profiling,
+            "[SGD NCCL] update_time = %.2lfms\n",
+            attrs.lr,
+            attrs.momentum,
+            attrs.nesterov,
+            attrs.weight_decay,
+            handle,
+            weight_grad.get_float_ptr(),
+            size,
+            weight.get_float_ptr(),
+            sgd_v_ptr);
 
   } else {
     profile(sgd_ps_update_task_gpu,
-        profiling,
-        "[SGD PS] update_time = %.2lfms\n",
-        attrs.lr,
-        attrs.momentum,
-        attrs.nesterov,
-        attrs.weight_decay,
-        weight_grad.get_float_ptr(),
-        size,
-        num_replicas,
-        weight.get_float_ptr(),
-        sgd_v_ptr);
+            profiling,
+            "[SGD PS] update_time = %.2lfms\n",
+            attrs.lr,
+            attrs.momentum,
+            attrs.nesterov,
+            attrs.weight_decay,
+            weight_grad.get_float_ptr(),
+            size,
+            num_replicas,
+            weight.get_float_ptr(),
+            sgd_v_ptr);
  }
 }
 
@@ -114,10 +108,10 @@ TaskSignature get_adam_update_signature() {
   return sig;
 }
 
-TaskInvocation adam_update(AdamOptimizerAttrs const & attrs,
-                           tensor_guid_t const & weight,
-                           tensor_guid_t const & adam_v,
-                           tensor_guid_t const & adam_m) {
+TaskInvocation adam_update(AdamOptimizerAttrs const &attrs,
+                           tensor_guid_t const &weight,
+                           tensor_guid_t const &adam_v,
+                           tensor_guid_t const &adam_m) {
   TaskBinding b;
   b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::YES});
   b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::NO});
@@ -133,7 +127,7 @@ TaskInvocation adam_update(AdamOptimizerAttrs const & attrs,
   return {task_id_t::ADAM_UPD_PS_TASK_ID, b};
 }
 
-static void adam_update_task_impl(TaskArgumentAccessor const & acc) {
+static void adam_update_task_impl(TaskArgumentAccessor const &acc) {
   auto attrs = acc.get_argument<AdamOptimizerAttrs>(ATTRS);
   auto weight_grad = acc.get_tensor_grad(WEIGHT);
   auto weight = acc.get_tensor(WEIGHT);
@@ -142,11 +136,12 @@ static void adam_update_task_impl(TaskArgumentAccessor const & acc) {
 
   auto profiling = acc.get_argument<ProfilingSettings>(PROFILING);
 
-  assert (weight.shape == weight_grad.shape);
+  assert(weight.shape == weight_grad.shape);
   size_t size = weight_grad.shape.get_volume();
 
-  assert (weight_grad.shape.get_volume() % weight.shape.get_volume() == 0);
-  size_t num_replicas = weight_grad.shape.get_volume() / weight.shape.get_volume();
+  assert(weight_grad.shape.get_volume() % weight.shape.get_volume() == 0);
+  size_t num_replicas =
+      weight_grad.shape.get_volume() / weight.shape.get_volume();
 
   if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
     auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
@@ -182,24 +177,38 @@ static void adam_update_task_impl(TaskArgumentAccessor const & acc) {
   }
 }
 
-AdamOptimizerAttrs next(AdamOptimizerAttrs const & old) {
-  double new_beta1_t = old.beta1_t * old.beta1;
-  double new_beta2_t = old.beta2_t * old.beta2;
-  double new_alpha_t = old.alpha * sqrt(1 - new_beta2_t) / (1 - new_beta1_t);
-  return AdamOptimizerAttrs{
-    old.alpha,
-    old.beta1,
-    old.beta2,
-    old.weight_decay,
-    new_alpha_t,
-    new_beta1_t,
-    new_beta2_t,
-    old.epsilon
-  };
-}
-
 TaskImplFunction get_adam_update_task_impl() {
   return TaskImplFunction{GenericTaskImplFunction{adam_update_task_impl}};
 }
 
+TaskSignature get_update_signature(OptimizerAttrs const &attrs) {
+  return attrs.visit<TaskSignature>(overload{
+      [&](SGDOptimizerAttrs const &s) { return get_sgd_update_signature(); },
+      [&](AdamOptimizerAttrs const &s) {
+        return get_adam_update_signature();
+      }});
+}
+
+TaskInvocation
+    get_update_invocation(OptimizerAttrs const &attrs,
+                          tensor_guid_t const &weight,
+                          std::vector<tensor_guid_t> const &buffer_tensors) {
+  return attrs.visit<TaskInvocation>(
+      overload{[&](SGDOptimizerAttrs const &s) {
+                 return sgd_update(s, weight, buffer_tensors.at(0));
+               },
+               [&](AdamOptimizerAttrs const &s) {
+                 return adam_update(
+                     s, weight, buffer_tensors.at(0), buffer_tensors.at(1));
+               }});
+}
+
+TaskImplFunction get_update_task_impl(OptimizerAttrs const &attrs) {
+  return attrs.visit<TaskImplFunction>(overload{
+      [&](SGDOptimizerAttrs const &s) { return get_sgd_update_task_impl(); },
+      [&](AdamOptimizerAttrs const &s) {
+        return get_adam_update_task_impl();
+      }});
+}
+
-}
+} // namespace FlexFlow
diff --git a/lib/local-execution/test/src/test_loss_function.cc b/lib/local-execution/test/src/test_loss_function.cc
index 9e60c1b979..3d9946c89c 100644
--- a/lib/local-execution/test/src/test_loss_function.cc
+++ b/lib/local-execution/test/src/test_loss_function.cc
@@ -4,6 +4,7 @@
 #include "kernels/managed_per_device_ff_handle.h"
 #include "local-execution/local_training_backing.h"
 #include "pcg/computation_graph_builder.h"
+#include "pcg/optimizer_attrs.h"
 #include "test_utils.h"
 
 namespace FlexFlow {
@@ -18,6 +19,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         EnableProfiling::NO,
         ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}};
 
+    OptimizerAttrs optimizer_attrs = make_empty_sgd_attrs();
+
     // construct graph
     ComputationGraphBuilder cg_builder;
 
@@ -47,11 +50,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
       GenericTensorAccessorW label_backing =
          allocator.allocate_tensor(label_shape);
       tensor_backing_map.insert({label_tensor, label_backing});
-      ModelTrainingInstance model_training_instance = ModelTrainingInstance{
-          LossAttrs{
-              SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}},
-          label_tensor,
-          logit_tensor};
+      std::optional<ModelTrainingInstance> model_training_instance =
+          ModelTrainingInstance{
+              LossAttrs{SparseCategoricalCrossEntropyLossAttrs{
+                  /*replace_labels=*/false}},
+              label_tensor,
+              logit_tensor,
+              optimizer_attrs};
       LocalTrainingBacking local_backing(allocator,
                                          cg_builder.computation_graph,
                                          tensor_backing_map,
@@ -70,10 +75,12 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
       tensor_backing_map.insert({label_tensor, label_backing});
 
       SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") {
-        ModelTrainingInstance model_training_instance = ModelTrainingInstance{
-            LossAttrs{OtherLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}},
-            label_tensor,
-            logit_tensor};
+        std::optional<ModelTrainingInstance> model_training_instance =
+            ModelTrainingInstance{LossAttrs{OtherLossAttrs{
+                                      LossFunction::CATEGORICAL_CROSSENTROPY}},
+                                  label_tensor,
+                                  logit_tensor,
+                                  optimizer_attrs};
         LocalTrainingBacking local_backing(allocator,
                                            cg_builder.computation_graph,
                                            tensor_backing_map,
@@ -85,11 +92,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
       }
 
       SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") {
-        ModelTrainingInstance model_training_instance = ModelTrainingInstance{
-            LossAttrs{
-                OtherLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
-            label_tensor,
-            logit_tensor};
+        std::optional<ModelTrainingInstance> model_training_instance =
+            ModelTrainingInstance{
+                LossAttrs{OtherLossAttrs{
+                    LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
+                label_tensor,
+                logit_tensor,
+                optimizer_attrs};
         LocalTrainingBacking local_backing(allocator,
                                            cg_builder.computation_graph,
                                            tensor_backing_map,
@@ -101,10 +110,12 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
       }
 
       SUBCASE("LossFunction::IDENTITY") {
-        ModelTrainingInstance model_training_instance = ModelTrainingInstance{
-            LossAttrs{OtherLossAttrs{LossFunction::IDENTITY}},
-            label_tensor,
-            logit_tensor};
+        std::optional<ModelTrainingInstance> model_training_instance =
+            ModelTrainingInstance{
+                LossAttrs{OtherLossAttrs{LossFunction::IDENTITY}},
+                label_tensor,
+                logit_tensor,
+                optimizer_attrs};
         LocalTrainingBacking local_backing(allocator,
                                            cg_builder.computation_graph,
                                            tensor_backing_map,
diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h
index 46d5b22afb..6fbac987ec 100644
--- a/lib/pcg/include/pcg/computation_graph.h
+++ b/lib/pcg/include/pcg/computation_graph.h
@@ -32,6 +32,10 @@ LayerAttrs get_layer_attrs(ComputationGraph const &cg, layer_guid_t const &n);
 layer_guid_t get_layer_by_name(ComputationGraph const &cg,
                                std::string const &name);
 
+std::vector<tensor_guid_t>
+    get_new_tensor_guids_for_layer_without_graph_insertion(
+        ComputationGraph const &, layer_guid_t const &n, int num_tensors);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h
new file mode 100644
index 0000000000..b154116a4d
--- /dev/null
+++ b/lib/pcg/include/pcg/optimizer_attrs.h
@@ -0,0 +1,13 @@
+#ifndef _FLEXFLOW_LIB_PCG_INCLUDE_PCG_OPTIMIZER_ATTRS_H
+#define _FLEXFLOW_LIB_PCG_INCLUDE_PCG_OPTIMIZER_ATTRS_H
+
+#include "pcg/optimizer_attrs.dtg.h"
+
+namespace FlexFlow {
+
+OptimizerAttrs make_empty_sgd_attrs();
+OptimizerAttrs make_empty_adam_attrs();
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc
index afa1774858..23ddd98f3c 100644
--- a/lib/pcg/src/pcg/computation_graph.cc
+++ b/lib/pcg/src/pcg/computation_graph.cc
@@ -64,4 +64,17 @@ layer_guid_t get_layer_by_name(ComputationGraph const &cg,
   return get_only(found);
 }
 
+std::vector<tensor_guid_t>
+    get_new_tensor_guids_for_layer_without_graph_insertion(
+        ComputationGraph const &cg, layer_guid_t const &n, int num_tensors) {
+  std::vector<tensor_guid_t> new_tensor_guids;
+  int num_outgoing_tensors = get_outgoing_tensors(cg, n).size();
+
+  for (int i = 0; i < num_tensors; ++i) {
+    new_tensor_guids.push_back(
+        tensor_guid_t{DataflowOutput{n.raw_node, num_outgoing_tensors + i}});
+  }
+  return new_tensor_guids;
+}
+
 } // namespace FlexFlow
diff --git a/lib/pcg/src/pcg/optimizer_attrs.cc b/lib/pcg/src/pcg/optimizer_attrs.cc
new file mode 100644
index 0000000000..a1c2a2e6d4
--- /dev/null
+++ b/lib/pcg/src/pcg/optimizer_attrs.cc
@@ -0,0 +1,14 @@
+#include "pcg/optimizer_attrs.h"
+
+namespace FlexFlow {
+
+OptimizerAttrs make_empty_sgd_attrs() {
+  return OptimizerAttrs{SGDOptimizerAttrs{0.0, 0.0, false, 0.0}};
+}
+
+OptimizerAttrs make_empty_adam_attrs() {
+  return OptimizerAttrs{AdamOptimizerAttrs{0.0, 0.0, 0.0, 0.0,
+                                           0.0, 0.0, 0.0, 0.0}};
+}
+
+} // namespace FlexFlow
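
The get_update_* trio above dispatches on OptimizerAttrs with FlexFlow's overload helper. The same idiom in plain standard C++, as a self-contained sketch whose names are illustrative rather than FlexFlow's:

#include <variant>

// Sketch: variant-plus-overload dispatch, including the buffer-count
// contract assumed by get_update_invocation (SGD binds one sgd_v buffer
// when momentum is enabled; Adam binds adam_v and adam_m).
struct SgdAttrs { float lr, momentum; };
struct AdamAttrs { double alpha, beta1, beta2; };
using Attrs = std::variant<SgdAttrs, AdamAttrs>;

template <class... Fs> struct overload : Fs... { using Fs::operator()...; };
template <class... Fs> overload(Fs...) -> overload<Fs...>;

int num_update_buffers(Attrs const &attrs) {
  return std::visit(overload{
      [](SgdAttrs const &s) { return s.momentum > 0.0f ? 1 : 0; },
      [](AdamAttrs const &) { return 2; },
  }, attrs);
}
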
From ad9b9eac557d1d84f3226019a62fddbe3b163cef Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Tue, 27 Aug 2024 14:04:56 -0700
Subject: [PATCH 07/22] Format

---
 .../local-execution/model_training_instance.h |  4 +--
 .../src/model_training_instance.cc            | 31 ++++++++++---------
 lib/pcg/include/pcg/optimizer_attrs.h         |  2 +-
 lib/pcg/src/pcg/optimizer_attrs.cc            |  6 ++--
 4 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h
index 7ea027a636..afc8fa7472 100644
--- a/lib/local-execution/include/local-execution/model_training_instance.h
+++ b/lib/local-execution/include/local-execution/model_training_instance.h
@@ -5,8 +5,8 @@
 #include "local-execution/model_training_instance.dtg.h"
 
 namespace FlexFlow {
- 
-ModelTrainingInstance next(ModelTrainingInstance const & old);
+
+ModelTrainingInstance next(ModelTrainingInstance const &old);
 
 } // namespace FlexFlow
 
diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc
index 646e3ac588..d34cc5d49a 100644
--- a/lib/local-execution/src/model_training_instance.cc
+++ b/lib/local-execution/src/model_training_instance.cc
@@ -1,24 +1,27 @@
 #include "local-execution/model_training_instance.h"
 
 namespace FlexFlow {
- 
-ModelTrainingInstance next(ModelTrainingInstance const & old_training_instance) {
+
+ModelTrainingInstance next(ModelTrainingInstance const &old_training_instance) {
   if (old_training_instance.optimizer_attrs.has<AdamOptimizerAttrs>()) {
-    AdamOptimizerAttrs old = old_training_instance.optimizer_attrs.get<AdamOptimizerAttrs>();
+    AdamOptimizerAttrs old =
+        old_training_instance.optimizer_attrs.get<AdamOptimizerAttrs>();
     double new_beta1_t = old.beta_t * old.beta1;
     double new_beta2_t = old.beta2_t * old.beta2;
     double new_alpha_t = old.alpha * sqrt(1 - new_beta2_t) / (1 - new_beta1_t);
-    OptimizerAttrs new_attrs = OptimizerAttrs{AdamOptimizerAttrs{
-      old.alpha,
-      old.beta1,
-      old.beta2,
-      old.weight_decay,
-      new_alpha_t,
-      new_beta1_t,
-      new_beta2_t,
-      old.epsilon
-    }};
-    return ModelTrainingInstance{old_training_instance.loss_attrs, old_training_instance.label_tensor, old_training_instance.logit_tensor, new_attrs};
+    OptimizerAttrs new_attrs =
+        OptimizerAttrs{AdamOptimizerAttrs{old.alpha,
+                                          old.beta1,
+                                          old.beta2,
+                                          old.weight_decay,
+                                          new_alpha_t,
+                                          new_beta1_t,
+                                          new_beta2_t,
+                                          old.epsilon}};
+    return ModelTrainingInstance{old_training_instance.loss_attrs,
+                                 old_training_instance.label_tensor,
+                                 old_training_instance.logit_tensor,
+                                 new_attrs};
   }
   return old_training_instance;
 }
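
The bookkeeping reformatted above is Adam's standard bias-correction schedule. Writing t for the number of completed steps, each call to next() advances the running terms as

    \beta_{1,t} = \beta_1^{t}, \qquad
    \beta_{2,t} = \beta_2^{t}, \qquad
    \alpha_t = \alpha \cdot \sqrt{1 - \beta_{2,t}} \,/\, (1 - \beta_{1,t})

i.e. it multiplies beta_t by beta1 and beta2_t by beta2, then recomputes the effective step size alpha_t; SGD instances pass through unchanged.
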
diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h
index b154116a4d..550bf12cc8 100644
--- a/lib/pcg/include/pcg/optimizer_attrs.h
+++ b/lib/pcg/include/pcg/optimizer_attrs.h
@@ -4,7 +4,7 @@
 #include "pcg/optimizer_attrs.dtg.h"
 
 namespace FlexFlow {
- 
+
 OptimizerAttrs make_empty_sgd_attrs();
 OptimizerAttrs make_empty_adam_attrs();
 
diff --git a/lib/pcg/src/pcg/optimizer_attrs.cc b/lib/pcg/src/pcg/optimizer_attrs.cc
index a1c2a2e6d4..d51070b10d 100644
--- a/lib/pcg/src/pcg/optimizer_attrs.cc
+++ b/lib/pcg/src/pcg/optimizer_attrs.cc
@@ -1,14 +1,14 @@
 #include "pcg/optimizer_attrs.h"
 
 namespace FlexFlow {
- 
+
 OptimizerAttrs make_empty_sgd_attrs() {
   return OptimizerAttrs{SGDOptimizerAttrs{0.0, 0.0, false, 0.0}};
 }
 
 OptimizerAttrs make_empty_adam_attrs() {
-  return OptimizerAttrs{AdamOptimizerAttrs{0.0, 0.0, 0.0, 0.0,
-                                           0.0, 0.0, 0.0, 0.0}};
+  return OptimizerAttrs{
+      AdamOptimizerAttrs{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}};
 }
 
 } // namespace FlexFlow

From 1ddfadeebdcdcdabe8a84a03ec51fb5bcb02bfd4 Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Tue, 27 Aug 2024 14:17:21 -0700
Subject: [PATCH 08/22] E2E update test

---
 ...test_loss_function.cc => test_loss_e2e.cc} |   2 +-
 .../test/src/test_update_e2e.cc               | 128 ++++++++++++++++++
 2 files changed, 129 insertions(+), 1 deletion(-)
 rename lib/local-execution/test/src/{test_loss_function.cc => test_loss_e2e.cc} (99%)
 create mode 100644 lib/local-execution/test/src/test_update_e2e.cc

diff --git a/lib/local-execution/test/src/test_loss_function.cc b/lib/local-execution/test/src/test_loss_e2e.cc
similarity index 99%
rename from lib/local-execution/test/src/test_loss_function.cc
rename to lib/local-execution/test/src/test_loss_e2e.cc
index 3d9946c89c..15bf089b6b 100644
--- a/lib/local-execution/test/src/test_loss_function.cc
+++ b/lib/local-execution/test/src/test_loss_e2e.cc
@@ -10,7 +10,7 @@
 namespace FlexFlow {
 
 TEST_SUITE(FF_CUDA_TEST_SUITE) {
-  TEST_CASE("Loss Function Local Execution") {
+  TEST_CASE("Local Execution E2E") {
     // initialize runtime configs
     ManagedPerDeviceFFHandle managed_handle{};
 
diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc
new file mode 100644
index 0000000000..7f7a90d9a3
--- /dev/null
+++ b/lib/local-execution/test/src/test_update_e2e.cc
@@ -0,0 +1,128 @@
+#include "doctest/doctest.h"
+#include "kernels/local_cuda_allocator.h"
+#include "kernels/managed_ff_stream.h"
+#include "kernels/managed_per_device_ff_handle.h"
+#include "local-execution/local_training_backing.h"
+#include "pcg/computation_graph_builder.h"
+#include "pcg/optimizer_attrs.h"
+#include "test_utils.h"
+
+namespace FlexFlow {
+
+TEST_SUITE(FF_CUDA_TEST_SUITE) {
+  TEST_CASE("Local Execution Update E2E") {
+    // initialize runtime configs
+    ManagedPerDeviceFFHandle managed_handle{};
+
+    RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{
+        DeviceSpecific<PerDeviceFFHandle>::create(managed_handle.raw_handle()),
+        EnableProfiling::NO,
+        ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}};
+
+    // construct graph
+    ComputationGraphBuilder cg_builder;
+
+    size_t batch_size = 10;
+    size_t data_dim = 100;
+    TensorShape input_shape = TensorShape{
+        TensorDims{FFOrdered<size_t>{batch_size, data_dim}}, DataType::FLOAT};
+    tensor_guid_t input_tensor =
+        cg_builder.create_tensor(input_shape, CreateGrad::YES);
+
+    float scalar = 4.0;
+    tensor_guid_t logit_tensor =
+        cg_builder.scalar_multiply(input_tensor, scalar);
+
+    // allocate memory
+    Allocator allocator = create_local_cuda_memory_allocator();
+    TensorBackingMap tensor_backing_map;
+    GenericTensorAccessorW input_backing =
+        allocator.allocate_tensor(input_shape);
+    tensor_backing_map.insert({input_tensor, input_backing});
+
+    tensor_guid_t label_tensor =
+        cg_builder.create_tensor(input_shape, CreateGrad::NO);
+    GenericTensorAccessorW label_backing =
+        allocator.allocate_tensor(input_shape);
+    tensor_backing_map.insert({label_tensor, label_backing});
+
+    SUBCASE("SGDOptimizerAttrs") {
+      SUBCASE("momentum=0") {
+        OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{
+          /*lr=*/0.001,
+          /*momentum=*/0.0f,
+          /*nesterov=*/false,
+          /*weight_decay=*/0.001
+        }};
+        std::optional<ModelTrainingInstance> model_training_instance =
+            ModelTrainingInstance{
+                LossAttrs{OtherLossAttrs{
+                    LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
+                label_tensor,
+                logit_tensor, optimizer_attrs};
+        LocalTrainingBacking local_backing(allocator,
+                                    cg_builder.computation_graph,
+                                    tensor_backing_map,
+                                    runtime_arg_config,
+                                    model_training_instance);
+        local_backing.execute_init();
+        local_backing.execute_forward();
+        local_backing.execute_backward();
+        local_backing.execute_update();
+      }
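+      // Note: the "momentum=0.9" subcase below also exercises the velocity
+      // buffer: SGD's update task consumes one grad-buffer tensor (the
+      // velocity v), while Adam's (further down) consumes two (v and m).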
+      SUBCASE("momentum=0.9") {
+        OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{
+          /*lr=*/0.001,
+          /*momentum=*/0.9,
+          /*nesterov=*/false,
+          /*weight_decay=*/0.001
+        }};
+        std::optional<ModelTrainingInstance> model_training_instance =
+            ModelTrainingInstance{
+                LossAttrs{OtherLossAttrs{
+                    LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
+                label_tensor,
+                logit_tensor, optimizer_attrs};
+        LocalTrainingBacking local_backing(allocator,
+                                    cg_builder.computation_graph,
+                                    tensor_backing_map,
+                                    runtime_arg_config,
+                                    model_training_instance);
+        local_backing.execute_init();
+        local_backing.execute_forward();
+        local_backing.execute_backward();
+        local_backing.execute_update();
+      }
+    }
+    SUBCASE("AdamOptimizerAttrs") {
+      OptimizerAttrs optimizer_attrs = OptimizerAttrs{AdamOptimizerAttrs{
+        /*alpha=*/ 0.001,
+        /*beta1=*/ 0.9,
+        /*beta2=*/ 0.999,
+        /*weight_decay=*/ 0.001,
+        /*alpha_t=*/ 0.001,
+        /*beta_t=*/ 0.9,
+        /*beta2_t=*/ 0.999,
+        /*epsilon=*/ 1e-8
+        }
+      };
+      std::optional<ModelTrainingInstance> model_training_instance =
+          ModelTrainingInstance{
+              LossAttrs{OtherLossAttrs{
+                  LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
+              label_tensor,
+              logit_tensor, optimizer_attrs};
+      LocalTrainingBacking local_backing(allocator,
+                                  
cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); @@ -95,28 +95,27 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } } SUBCASE("AdamOptimizerAttrs") { - OptimizerAttrs optimizer_attrs = OptimizerAttrs{AdamOptimizerAttrs{ - /*alpha=*/ 0.001, - /*beta1=*/ 0.9, - /*beta2=*/ 0.999, - /*weight_decay=*/ 0.001, - /*alpha_t=*/ 0.001, - /*beta_t=*/ 0.9, - /*beta2_t=*/ 0.999, - /*epsilon=*/ 1e-8 - } - }; + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, + /*beta1=*/0.9, + /*beta2=*/0.999, + /*weight_decay=*/0.001, + /*alpha_t=*/0.001, + /*beta_t=*/0.9, + /*beta2_t=*/0.999, + /*epsilon=*/1e-8}}; std::optional model_training_instance = ModelTrainingInstance{ - LossAttrs{OtherLossAttrs{ - LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, + LossAttrs{ + OtherLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, label_tensor, - logit_tensor, optimizer_attrs}; + logit_tensor, + optimizer_attrs}; LocalTrainingBacking local_backing(allocator, - cg_builder.computation_graph, - tensor_backing_map, - runtime_arg_config, - model_training_instance); + cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); From 59635d827e02dfcc26274784c9d7315985bf86cb Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 11 Sep 2024 12:59:05 -0700 Subject: [PATCH 10/22] Small fixes --- lib/kernels/src/cuda/cuda_helper.cu | 10 +-- lib/kernels/src/device.h | 1 - .../fwd_bwd_op_task_impl_function.h | 32 ++++++++++ .../fwd_bwd_task_impl_function.h | 32 ---------- .../init_op_task_impl_function.h | 33 ++++++++++ .../local-execution/init_task_impl_function.h | 33 ---------- .../model_training_instance.struct.toml | 1 - .../include/local-execution/optimizer.h | 8 +-- .../task_impl_function.variant.toml | 12 ++-- .../include/local-execution/task_signature.h | 23 ++++--- .../task_signature.struct.toml | 9 ++- .../src/fwd_bwd_op_task_impl_function.cc | 54 ++++++++++++++++ .../src/fwd_bwd_task_impl_function.cc | 54 ---------------- .../src/init_op_task_impl_function.cc | 47 ++++++++++++++ .../src/init_task_impl_function.cc | 47 -------------- .../src/local_slots_backing.cc | 64 ++++++++----------- .../src/local_training_backing.cc | 8 +-- .../src/model_training_instance.cc | 3 +- lib/local-execution/src/ops/attention.cc | 6 +- lib/local-execution/src/ops/batch_matmul.cc | 4 +- lib/local-execution/src/ops/batch_norm.cc | 6 +- lib/local-execution/src/ops/cast.cc | 4 +- lib/local-execution/src/ops/combine.cc | 4 +- lib/local-execution/src/ops/concat.cc | 4 +- lib/local-execution/src/ops/conv_2d.cc | 6 +- lib/local-execution/src/ops/dropout.cc | 6 +- lib/local-execution/src/ops/element_binary.cc | 6 +- lib/local-execution/src/ops/element_unary.cc | 6 +- lib/local-execution/src/ops/flat.cc | 4 +- lib/local-execution/src/ops/gather.cc | 6 +- lib/local-execution/src/ops/layer_norm.cc | 6 +- lib/local-execution/src/ops/linear.cc | 6 +- lib/local-execution/src/ops/pool_2d.cc | 6 +- lib/local-execution/src/ops/reduce.cc | 6 +- lib/local-execution/src/ops/reduction.cc | 4 +- lib/local-execution/src/ops/repartition.cc | 6 +- lib/local-execution/src/ops/replicate.cc | 4 +- lib/local-execution/src/ops/reshape.cc | 6 +- lib/local-execution/src/ops/reverse.cc | 4 +- lib/local-execution/src/ops/softmax.cc | 6 +- 
lib/local-execution/src/ops/split.cc          |  4 +-
 lib/local-execution/src/ops/topk.cc           |  6 +-
 lib/local-execution/src/ops/transpose.cc      |  6 +-
 lib/local-execution/src/optimizer.cc          | 17 ++---
 lib/local-execution/src/task_invocation.cc    |  3 +-
 lib/local-execution/test/src/test_loss_e2e.cc | 12 ++--
 .../test/src/test_update_e2e.cc               |  6 +-
 .../op-attrs/ops/loss_attrs.variant.toml      |  6 +-
 .../include/op-attrs/ops/loss_functions.h     | 10 +--
 ...=> nonconfigurable_loss_attrs.struct.toml} |  2 +-
 lib/op-attrs/src/loss_functions.cc            |  2 +-
 lib/pcg/include/pcg/optimizer_attrs.h         | 13 ----
 lib/pcg/src/pcg/optimizer_attrs.cc            | 14 ----
 53 files changed, 327 insertions(+), 361 deletions(-)
 create mode 100644 lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h
 delete mode 100644 lib/local-execution/include/local-execution/fwd_bwd_task_impl_function.h
 create mode 100644 lib/local-execution/include/local-execution/init_op_task_impl_function.h
 delete mode 100644 lib/local-execution/include/local-execution/init_task_impl_function.h
 create mode 100644 lib/local-execution/src/fwd_bwd_op_task_impl_function.cc
 delete mode 100644 lib/local-execution/src/fwd_bwd_task_impl_function.cc
 create mode 100644 lib/local-execution/src/init_op_task_impl_function.cc
 delete mode 100644 lib/local-execution/src/init_task_impl_function.cc
 rename lib/op-attrs/include/op-attrs/ops/{other_loss_attrs.struct.toml => nonconfigurable_loss_attrs.struct.toml} (86%)
 delete mode 100644 lib/pcg/include/pcg/optimizer_attrs.h
 delete mode 100644 lib/pcg/src/pcg/optimizer_attrs.cc

diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu
index 5a303ca15e..4ad22b3a57 100644
--- a/lib/kernels/src/cuda/cuda_helper.cu
+++ b/lib/kernels/src/cuda/cuda_helper.cu
@@ -29,19 +29,13 @@ cudaError_t get_legion_stream(cudaStream_t *stream) {
 #error "Unknown device, please make sure if CUDA is enabled"
 #endif
 
-__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) {
+__global__ void scale_kernel(float *ptr, size_t size, float a, float b) {
   CUDA_KERNEL_LOOP(i, size) {
     ptr[i] = (b - a) * ptr[i] + a;
   }
 }
 
-__global__ void scale_kernel(float *ptr, unsigned long size, float a, float b) {
-  CUDA_KERNEL_LOOP(i, size) {
-    ptr[i] = (b - a) * ptr[i] + a;
-  }
-}
-
-__global__ void ones_kernel(float *ptr, coord_t size) {
+__global__ void ones_kernel(float *ptr, size_t size) {
   CUDA_KERNEL_LOOP(i, size) {
     ptr[i] = 1.0f;
   }
diff --git a/lib/kernels/src/device.h b/lib/kernels/src/device.h
index e32805fde3..ceff2f92ff 100644
--- a/lib/kernels/src/device.h
+++ b/lib/kernels/src/device.h
@@ -71,7 +71,6 @@ inline int GET_BLOCKS(int const N) {
 }
 
 __global__ void scale_kernel(float *ptr, size_t size, float a, float b);
-__global__ void scale_kernel(float *ptr, unsigned long size, float a, float b);
 
 __global__ void ones_kernel(float *ptr, size_t size);
 
diff --git a/lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h b/lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h
new file mode 100644
index 0000000000..cc82291f6a
--- /dev/null
+++ b/lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h
@@ -0,0 +1,32 @@
+#ifndef _FLEXFLOW_LOCAL_EXECUTION_FWD_BWD_TASK_IMPL_FUNCTION_H
+#define _FLEXFLOW_LOCAL_EXECUTION_FWD_BWD_TASK_IMPL_FUNCTION_H
+
+#include "local-execution/task_argument_accessor.h"
+
+namespace FlexFlow {
+
+struct FwdBwdOpTaskImplFunction {
+
+  std::optional<float> (*function_ptr)(TaskArgumentAccessor const &);
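+  // The optional<float> is the task's measured elapsed time when profiling
+  // is enabled (see ProfilingSettings), and std::nullopt otherwise.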
+
+  bool operator==(FwdBwdOpTaskImplFunction const &) const;
+  bool operator!=(FwdBwdOpTaskImplFunction const &) const;
+  bool operator<(FwdBwdOpTaskImplFunction const &) const;
+  bool operator>(FwdBwdOpTaskImplFunction const &) const;
+  bool operator<=(FwdBwdOpTaskImplFunction const &) const;
+  bool operator>=(FwdBwdOpTaskImplFunction const &) const;
+};
+
+std::string format_as(FwdBwdOpTaskImplFunction const &x);
+std::ostream &operator<<(std::ostream &s, FwdBwdOpTaskImplFunction const &x);
+
+} // namespace FlexFlow
+
+namespace std {
+template <>
+struct hash<::FlexFlow::FwdBwdOpTaskImplFunction> {
+  size_t operator()(::FlexFlow::FwdBwdOpTaskImplFunction const &) const;
+};
+} // namespace std
+
+#endif
diff --git a/lib/local-execution/include/local-execution/fwd_bwd_task_impl_function.h b/lib/local-execution/include/local-execution/fwd_bwd_task_impl_function.h
deleted file mode 100644
index 7f80af77f3..0000000000
--- a/lib/local-execution/include/local-execution/fwd_bwd_task_impl_function.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef _FLEXFLOW_LOCAL_EXECUTION_FWD_BWD_TASK_IMPL_FUNCTION_H
-#define _FLEXFLOW_LOCAL_EXECUTION_FWD_BWD_TASK_IMPL_FUNCTION_H
-
-#include "local-execution/task_argument_accessor.h"
-
-namespace FlexFlow {
-
-struct FwdBwdTaskImplFunction {
-
-  std::optional<float> (*function_ptr)(TaskArgumentAccessor const &);
-
-  bool operator==(FwdBwdTaskImplFunction const &) const;
-  bool operator!=(FwdBwdTaskImplFunction const &) const;
-  bool operator<(FwdBwdTaskImplFunction const &) const;
-  bool operator>(FwdBwdTaskImplFunction const &) const;
-  bool operator<=(FwdBwdTaskImplFunction const &) const;
-  bool operator>=(FwdBwdTaskImplFunction const &) const;
-};
-
-std::string format_as(FwdBwdTaskImplFunction const &x);
-std::ostream &operator<<(std::ostream &s, FwdBwdTaskImplFunction const &x);
-
-} // namespace FlexFlow
-
-namespace std {
-template <>
-struct hash<::FlexFlow::FwdBwdTaskImplFunction> {
-  size_t operator()(::FlexFlow::FwdBwdTaskImplFunction const &) const;
-};
-} // namespace std
-
-#endif
diff --git a/lib/local-execution/include/local-execution/init_op_task_impl_function.h b/lib/local-execution/include/local-execution/init_op_task_impl_function.h
new file mode 100644
index 0000000000..7b23a2bc64
--- /dev/null
+++ b/lib/local-execution/include/local-execution/init_op_task_impl_function.h
@@ -0,0 +1,33 @@
+#ifndef _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H
+#define _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H
+
+#include "local-execution/device_specific_device_states.dtg.h"
+#include "local-execution/task_argument_accessor.h"
+
+namespace FlexFlow {
+
+struct InitOpTaskImplFunction {
+
+  DeviceSpecificDeviceStates (*function_ptr)(TaskArgumentAccessor const &);
+
+  bool operator==(InitOpTaskImplFunction const &) const;
+  bool operator!=(InitOpTaskImplFunction const &) const;
+  bool operator<(InitOpTaskImplFunction const &) const;
+  bool operator>(InitOpTaskImplFunction const &) const;
+  bool operator<=(InitOpTaskImplFunction const &) const;
+  bool operator>=(InitOpTaskImplFunction const &) const;
+};
+
+std::string format_as(InitOpTaskImplFunction const &x);
+std::ostream &operator<<(std::ostream &s, InitOpTaskImplFunction const &x);
+
+} // namespace FlexFlow
+
+namespace std {
+template <>
+struct hash<::FlexFlow::InitOpTaskImplFunction> {
+  size_t operator()(::FlexFlow::InitOpTaskImplFunction const &) const;
+};
+} // namespace std
+
+#endif
diff --git a/lib/local-execution/include/local-execution/init_task_impl_function.h b/lib/local-execution/include/local-execution/init_task_impl_function.h
deleted 
file mode 100644
index b85944e13a..0000000000
--- a/lib/local-execution/include/local-execution/init_task_impl_function.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H
-#define _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H
-
-#include "local-execution/device_specific_device_states.dtg.h"
-#include "local-execution/task_argument_accessor.h"
-
-namespace FlexFlow {
-
-struct InitTaskImplFunction {
-
-  DeviceSpecificDeviceStates (*function_ptr)(TaskArgumentAccessor const &);
-
-  bool operator==(InitTaskImplFunction const &) const;
-  bool operator!=(InitTaskImplFunction const &) const;
-  bool operator<(InitTaskImplFunction const &) const;
-  bool operator>(InitTaskImplFunction const &) const;
-  bool operator<=(InitTaskImplFunction const &) const;
-  bool operator>=(InitTaskImplFunction const &) const;
-};
-
-std::string format_as(InitTaskImplFunction const &x);
-std::ostream &operator<<(std::ostream &s, InitTaskImplFunction const &x);
-
-} // namespace FlexFlow
-
-namespace std {
-template <>
-struct hash<::FlexFlow::InitTaskImplFunction> {
-  size_t operator()(::FlexFlow::InitTaskImplFunction const &) const;
-};
-} // namespace std
-
-#endif
diff --git a/lib/local-execution/include/local-execution/model_training_instance.struct.toml b/lib/local-execution/include/local-execution/model_training_instance.struct.toml
index e3ff397e39..b460d6bd3a 100644
--- a/lib/local-execution/include/local-execution/model_training_instance.struct.toml
+++ b/lib/local-execution/include/local-execution/model_training_instance.struct.toml
@@ -8,7 +8,6 @@ features = [
 ]
 
 includes = [
-  "utils/optional.h",
   "op-attrs/ops/loss_attrs.dtg.h",
   "pcg/tensor_guid_t.dtg.h",
   "pcg/optimizer_attrs.dtg.h",
diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h
index 53dcad63de..e1f11b8a68 100644
--- a/lib/local-execution/include/local-execution/optimizer.h
+++ b/lib/local-execution/include/local-execution/optimizer.h
@@ -13,20 +13,20 @@ namespace FlexFlow {
 TaskSignature get_update_signature(OptimizerAttrs const &);
 TaskInvocation get_update_invocation(OptimizerAttrs const &,
                                      tensor_guid_t const &weight,
-                                     std::vector<tensor_guid_t> const &);
+                                     std::vector<tensor_guid_t> const &grad_buffer_tensors);
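+// Grad-buffer convention: SGD's invocation consumes {v}; Adam's consumes
+// {v, m} (see get_update_invocation in optimizer.cc).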
 TaskImplFunction get_update_task_impl(OptimizerAttrs const &);
 
 TaskSignature get_sgd_update_signature();
 TaskInvocation sgd_update(SGDOptimizerAttrs const &,
                           tensor_guid_t const &weight,
-                          tensor_guid_t const &);
+                          tensor_guid_t const &sgd_v);
 TaskImplFunction get_sgd_update_task_impl();
 
 TaskSignature get_adam_update_signature();
 TaskInvocation adam_update(AdamOptimizerAttrs const &,
                            tensor_guid_t const &weight,
-                           tensor_guid_t const &,
-                           tensor_guid_t const &);
+                           tensor_guid_t const &adam_v,
+                           tensor_guid_t const &adam_m);
 TaskImplFunction get_adam_update_task_impl();
 
 } // namespace FlexFlow
diff --git a/lib/local-execution/include/local-execution/task_impl_function.variant.toml b/lib/local-execution/include/local-execution/task_impl_function.variant.toml
index 1be18bebfa..48cab9eb01 100644
--- a/lib/local-execution/include/local-execution/task_impl_function.variant.toml
+++ b/lib/local-execution/include/local-execution/task_impl_function.variant.toml
@@ -8,18 +8,18 @@ features = [
 ]
 
 includes = [
-  "local-execution/init_task_impl_function.h",
-  "local-execution/fwd_bwd_task_impl_function.h",
+  "local-execution/init_op_task_impl_function.h",
+  "local-execution/fwd_bwd_op_task_impl_function.h",
   "local-execution/generic_task_impl_function.h",
]

[[values]]
-type = "::FlexFlow::InitTaskImplFunction"
-key = "init_task_impl_function"
+type = "::FlexFlow::InitOpTaskImplFunction"
+key = "init_op_task_impl_function"

[[values]]
-type = "::FlexFlow::FwdBwdTaskImplFunction"
-key = "fwd_bwd_task_impl_function"
+type = "::FlexFlow::FwdBwdOpTaskImplFunction"
+key = "fwd_bwd_op_task_impl_function"

[[values]]
type = "::FlexFlow::GenericTaskImplFunction"
key = "generic_task_impl_function"
diff --git a/lib/local-execution/include/local-execution/task_signature.h b/lib/local-execution/include/local-execution/task_signature.h
index d31a67e027..ed28f8eaea 100644
--- a/lib/local-execution/include/local-execution/task_signature.h
+++ b/lib/local-execution/include/local-execution/task_signature.h
@@ -1,13 +1,8 @@
 #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_H
 #define _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_H
 
-// #include "local-execution/tensor_guid_slot_spec.dtg.h"
-// #include "local-execution/serialization.h"
-// #include "utils/hash/unordered_map.h"
-// #include "utils/hash/unordered_set.h"
-// #include "utils/type_index.h"
-
 #include "local-execution/task_signature.dtg.h"
+#include "utils/type_index.h"
 
 namespace FlexFlow {
 
@@ -38,15 +33,23 @@ void add_return_value(TaskSignature &task_signature) {
   task_signature.return_value = get_type_index_for_type<T>();
 }
 
-// adds arg_slot without checking is_serializable, used for arguments that are
-// deviceSpecific
+/**
+ * @brief Adds an argument slot without checking if it is serializable.
+ *
+ * This function is used for arguments that are device-specific.
+ */
+
 template <typename T>
 void add_unchecked_arg_slot(TaskSignature &task_signature, int name) {
   add_unchecked_arg_slot<T>(task_signature, slot_id_t{name});
 }
 
-// adds arg_slot without checking is_serializable, used for arguments that are
-// deviceSpecific
+/**
+ * @brief Adds an argument slot without checking if it is serializable.
+ *
+ * This function is used for arguments that are device-specific.
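+ *
+ * Illustrative usage (assumes an existing signature `sig` and a slot id
+ * `HANDLE`, as the optimizer update tasks use for the per-device handle):
+ *
+ *   add_unchecked_arg_slot<PerDeviceFFHandle>(sig, HANDLE);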
+ */
+
 template <typename T>
 void add_unchecked_arg_slot(TaskSignature &task_signature, slot_id_t name) {
   task_signature.task_arg_types.insert({name, get_type_index_for_type<T>()});
diff --git a/lib/local-execution/include/local-execution/task_signature.struct.toml b/lib/local-execution/include/local-execution/task_signature.struct.toml
index f86f7b0c57..fd15df91d5 100644
--- a/lib/local-execution/include/local-execution/task_signature.struct.toml
+++ b/lib/local-execution/include/local-execution/task_signature.struct.toml
@@ -3,17 +3,22 @@ name = "TaskSignature"
 features = [
   "eq",
   "fmt",
+  "hash"
 ]
 
 includes = [
   "local-execution/tensor_guid_slot_spec.dtg.h",
-  "utils/type_index.h",
-  "utils/optional.h"
+  "<optional>",
+  "<typeindex>"
 ]
 
 src_includes = [
   "utils/fmt/unordered_map.h",
   "utils/fmt/unordered_set.h",
+  "utils/hash/unordered_map.h",
+  "utils/hash/unordered_set.h",
+  "utils/fmt/optional.h",
+  "utils/type_index.h"
 ]
 
 [[fields]]
diff --git a/lib/local-execution/src/fwd_bwd_op_task_impl_function.cc b/lib/local-execution/src/fwd_bwd_op_task_impl_function.cc
new file mode 100644
index 0000000000..308dbfd3ae
--- /dev/null
+++ b/lib/local-execution/src/fwd_bwd_op_task_impl_function.cc
@@ -0,0 +1,54 @@
+#include "local-execution/fwd_bwd_op_task_impl_function.h"
+
+namespace FlexFlow {
+
+bool FwdBwdOpTaskImplFunction::operator==(
+    FwdBwdOpTaskImplFunction const &other) const {
+  return this->function_ptr == other.function_ptr;
+}
+
+bool FwdBwdOpTaskImplFunction::operator!=(
+    FwdBwdOpTaskImplFunction const &other) const {
+  return this->function_ptr != other.function_ptr;
+}
+
+bool FwdBwdOpTaskImplFunction::operator<(
+    FwdBwdOpTaskImplFunction const &other) const {
+  return this->function_ptr < other.function_ptr;
+}
+
+bool FwdBwdOpTaskImplFunction::operator>(
+    FwdBwdOpTaskImplFunction const &other) const {
+  return this->function_ptr > other.function_ptr;
+}
+
+bool FwdBwdOpTaskImplFunction::operator<=(
+    FwdBwdOpTaskImplFunction const &other) const {
+  return this->function_ptr <= other.function_ptr;
+}
+
+bool FwdBwdOpTaskImplFunction::operator>=(
+    FwdBwdOpTaskImplFunction const &other) const {
+  return this->function_ptr >= other.function_ptr;
+}
+
+std::string format_as(FwdBwdOpTaskImplFunction const &x) {
+  std::ostringstream oss;
+  oss << "<FwdBwdOpTaskImplFunction>";
+  return oss.str();
+}
+
+std::ostream &operator<<(std::ostream &s, FwdBwdOpTaskImplFunction const &x) {
+  return s << fmt::to_string(x);
+}
+
+} // namespace FlexFlow
+
+namespace std {
+size_t hash<::FlexFlow::FwdBwdOpTaskImplFunction>::operator()(
+    ::FlexFlow::FwdBwdOpTaskImplFunction const &x) const {
+  return std::hash<decltype(x.function_ptr)>{}(x.function_ptr);
+}
+} // namespace std
diff --git a/lib/local-execution/src/fwd_bwd_task_impl_function.cc b/lib/local-execution/src/fwd_bwd_task_impl_function.cc
deleted file mode 100644
index f85d7cec61..0000000000
--- a/lib/local-execution/src/fwd_bwd_task_impl_function.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-#include "local-execution/fwd_bwd_task_impl_function.h"
-
-namespace FlexFlow {
-
-bool FwdBwdTaskImplFunction::operator==(
-    FwdBwdTaskImplFunction const &other) const {
-  return this->function_ptr == other.function_ptr;
-}
-
-bool FwdBwdTaskImplFunction::operator!=(
-    FwdBwdTaskImplFunction const &other) const {
-  return this->function_ptr != other.function_ptr;
-}
-
-bool FwdBwdTaskImplFunction::operator<(
-    FwdBwdTaskImplFunction const &other) const {
-  return this->function_ptr < other.function_ptr;
-}
-
-bool FwdBwdTaskImplFunction::operator>(
-    FwdBwdTaskImplFunction const &other) const {
-  return this->function_ptr > other.function_ptr;
-}
-
-bool 
FwdBwdTaskImplFunction::operator<=(
-    FwdBwdTaskImplFunction const &other) const {
-  return this->function_ptr <= other.function_ptr;
-}
-
-bool FwdBwdTaskImplFunction::operator>=(
-    FwdBwdTaskImplFunction const &other) const {
-  return this->function_ptr >= other.function_ptr;
-}
-
-std::string format_as(FwdBwdTaskImplFunction const &x) {
-  std::ostringstream oss;
-  oss << "<FwdBwdTaskImplFunction>";
-  return oss.str();
-}
-
-std::ostream &operator<<(std::ostream &s, FwdBwdTaskImplFunction const &x) {
-  return s << fmt::to_string(x);
-}
-
-} // namespace FlexFlow
-
-namespace std {
-size_t hash<::FlexFlow::FwdBwdTaskImplFunction>::operator()(
-    ::FlexFlow::FwdBwdTaskImplFunction const &x) const {
-  return std::hash<decltype(x.function_ptr)>{}(x.function_ptr);
-}
-} // namespace std
diff --git a/lib/local-execution/src/init_op_task_impl_function.cc b/lib/local-execution/src/init_op_task_impl_function.cc
new file mode 100644
index 0000000000..1c946982f5
--- /dev/null
+++ b/lib/local-execution/src/init_op_task_impl_function.cc
@@ -0,0 +1,47 @@
+#include "local-execution/init_op_task_impl_function.h"
+
+namespace FlexFlow {
+
+bool InitOpTaskImplFunction::operator==(InitOpTaskImplFunction const &other) const {
+  return this->function_ptr == other.function_ptr;
+}
+
+bool InitOpTaskImplFunction::operator!=(InitOpTaskImplFunction const &other) const {
+  return this->function_ptr != other.function_ptr;
+}
+
+bool InitOpTaskImplFunction::operator<(InitOpTaskImplFunction const &other) const {
+  return this->function_ptr < other.function_ptr;
+}
+
+bool InitOpTaskImplFunction::operator>(InitOpTaskImplFunction const &other) const {
+  return this->function_ptr > other.function_ptr;
+}
+
+bool InitOpTaskImplFunction::operator<=(InitOpTaskImplFunction const &other) const {
+  return this->function_ptr <= other.function_ptr;
+}
+
+bool InitOpTaskImplFunction::operator>=(InitOpTaskImplFunction const &other) const {
+  return this->function_ptr >= other.function_ptr;
+}
+
+std::string format_as(InitOpTaskImplFunction const &x) {
+  std::ostringstream oss;
+  oss << "<InitOpTaskImplFunction>";
+  return oss.str();
+}
+std::ostream &operator<<(std::ostream &s, InitOpTaskImplFunction const &x) {
+  return s << fmt::to_string(x);
+}
+
+} // namespace FlexFlow
+
+namespace std {
+size_t hash<::FlexFlow::InitOpTaskImplFunction>::operator()(
+    ::FlexFlow::InitOpTaskImplFunction const &x) const {
+  return std::hash<decltype(x.function_ptr)>{}(x.function_ptr);
+}
+} // namespace std
diff --git a/lib/local-execution/src/init_task_impl_function.cc b/lib/local-execution/src/init_task_impl_function.cc
deleted file mode 100644
index 9501f72dd6..0000000000
--- a/lib/local-execution/src/init_task_impl_function.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-#include "local-execution/init_task_impl_function.h"
-
-namespace FlexFlow {
-
-bool InitTaskImplFunction::operator==(InitTaskImplFunction const &other) const {
-  return this->function_ptr == other.function_ptr;
-}
-
-bool InitTaskImplFunction::operator!=(InitTaskImplFunction const &other) const {
-  return this->function_ptr != other.function_ptr;
-}
-
-bool InitTaskImplFunction::operator<(InitTaskImplFunction const &other) const {
-  return this->function_ptr < other.function_ptr;
-}
-
-bool InitTaskImplFunction::operator>(InitTaskImplFunction const &other) const {
-  return this->function_ptr > other.function_ptr;
-}
-
-bool InitTaskImplFunction::operator<=(InitTaskImplFunction const &other) const {
-  return this->function_ptr <= other.function_ptr;
-}
-
-bool InitTaskImplFunction::operator>=(InitTaskImplFunction const &other) const {
-  return this->function_ptr >= other.function_ptr;
-}
-
-std::string format_as(InitTaskImplFunction const &x) {
-  std::ostringstream oss;
-  oss << "<InitTaskImplFunction>";
-  return oss.str();
-}
-std::ostream &operator<<(std::ostream &s, InitTaskImplFunction const &x) {
-  return s << fmt::to_string(x);
-}
-
-} // namespace FlexFlow
-
-namespace std {
-size_t hash<::FlexFlow::InitTaskImplFunction>::operator()(
-    ::FlexFlow::InitTaskImplFunction const &x) const {
-  return std::hash<decltype(x.function_ptr)>{}(x.function_ptr);
-}
-} // namespace std
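
Equality, ordering, and hashing for all of these wrapper types are defined purely on the raw function pointer, so two wrappers compare equal exactly when they name the same task function, never by behavioral equivalence. A small illustration (hypothetical; `forward_task_impl` stands in for any registered task impl):

    FwdBwdOpTaskImplFunction f{forward_task_impl};
    FwdBwdOpTaskImplFunction g{forward_task_impl};
    assert(f == g); // pointer identity, not semantic comparison
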
diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc
index 5059f29abd..0a1497b6c8 100644
--- a/lib/local-execution/src/local_slots_backing.cc
+++ b/lib/local-execution/src/local_slots_backing.cc
@@ -1,5 +1,6 @@
 #include "local-execution/local_slots_backing.h"
 #include "utils/containers/contains_key.h"
+#include "utils/containers/map_values.h"
 #include "utils/overload.h"
 
 namespace FlexFlow {
@@ -55,17 +56,17 @@ void LocalSlotsBacking::allocate_optimizer_tensors(
     TaskSignature const &sig) {
   GenericTensorAccessorW weight_backing =
       get_tensor_backing(weight, IsGrad::NO);
-  int num_buffer_tensors =
+  int num_grad_buffer_tensors =
       sig.tensor_guid_slots.size() - 2; // ignore 2 (weight and weight_grad)
-  std::vector<tensor_guid_t> buffer_tensors =
+  std::vector<tensor_guid_t> grad_buffer_tensors =
       get_new_tensor_guids_for_layer_without_graph_insertion(
-          cg, weight_layer, num_buffer_tensors);
-  for (auto const &tensor_guid : buffer_tensors) {
+          cg, weight_layer, num_grad_buffer_tensors);
+  for (tensor_guid_t const &tensor_guid : grad_buffer_tensors) {
     GenericTensorAccessorW buffer_backing = allocator.allocate_tensor(
         get_tensor_shape(weight_backing.shape, weight_backing.data_type));
     this->gradient_tensor_mapping.insert({tensor_guid, buffer_backing});
   }
-  this->weight_optimizer_tensor_guids.insert({weight, buffer_tensors});
+  this->weight_optimizer_tensor_guids.insert({weight, grad_buffer_tensors});
 }
 
 bool LocalSlotsBacking::is_tensor_allocated(
@@ -123,8 +124,7 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
       break;
     default:
       throw mk_runtime_error(
-          fmt::format("Invalid TensorRole")); // inserting role yields
-                                              // "type_is_unformattable" error
+          fmt::format("Invalid TensorRole {}", tensor_spec.role));
   }
 
   IsGrad is_grad = slot_grad_id.is_grad;
@@ -154,41 +154,29 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
 
 ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
     OpTaskBinding const &binding, layer_guid_t const &op_guid) const {
-  ArgSlotsBacking mapping;
-  for (auto const &arg_binding : binding.get_arg_bindings()) {
-    slot_id_t arg_slot = arg_binding.first;
-    OpArgSpec op_arg_spec = arg_binding.second;
-
-    mapping.insert({arg_slot,
-                    op_arg_spec.visit<ConcreteArgSpec>(overload{
-                        [&](OpArgRefSpec const &s) {
-                          return this->resolve_op_arg_ref_spec(s, op_guid);
-                        },
-                        [&](RuntimeArgRefSpec const &s) {
-                          return this->resolve_runtime_arg_ref_spec(s);
-                        },
-                        [](ConcreteArgSpec const &s) { return s; },
-                    })});
-  }
-  return mapping;
+  return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding) {
+    return arg_binding.template visit<ConcreteArgSpec>(overload{
+        [&](OpArgRefSpec const &s) {
+          return this->resolve_op_arg_ref_spec(s, op_guid);
+        },
+        [&](RuntimeArgRefSpec const &s) {
+          return this->resolve_runtime_arg_ref_spec(s);
+        },
+        [](ConcreteArgSpec const &s) { return s; }});
+  });
 }
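
map_values (from utils/containers/map_values.h) rebuilds a map by applying a function to each value while leaving the keys untouched, which is what lets both overloads here collapse their insert loops. Sketched semantics (illustrative; `bindings` and `resolve` are stand-in names, not identifiers from this patch):

    // map_values(m, f) yields { {k, f(v)} : {k, v} in m }
    ArgSlotsBacking out = map_values(bindings, resolve);
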
 ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
     TaskBinding const &binding) const {
-  ArgSlotsBacking mapping;
-  for (auto const &arg_binding : binding.get_arg_bindings()) {
-    slot_id_t arg_slot = arg_binding.first;
-    TaskArgSpec task_arg_spec = arg_binding.second;
-
-    mapping.insert({arg_slot,
-                    task_arg_spec.visit<ConcreteArgSpec>(overload{
-                        [&](RuntimeArgRefSpec const &s) {
-                          return this->resolve_runtime_arg_ref_spec(s);
-                        },
-                        [](ConcreteArgSpec const &s) { return s; },
-                    })});
-  }
-  return mapping;
+  return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding) {
+    return arg_binding.template visit<ConcreteArgSpec>(overload{
+        [&](RuntimeArgRefSpec const &s) {
+          return this->resolve_runtime_arg_ref_spec(s);
+        },
+        [](ConcreteArgSpec const &s) { return s; }});
+  });
 }
 
 ConcreteArgSpec LocalSlotsBacking::resolve_op_arg_ref_spec(
diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc
index eb49f16df1..dff33826b9 100644
--- a/lib/local-execution/src/local_training_backing.cc
+++ b/lib/local-execution/src/local_training_backing.cc
@@ -60,7 +60,7 @@ DeviceSpecificDeviceStates
   TaskSignatureAndImpl task_sig_impl =
       this->task_registry.task_mapping.at(task_id);
   auto fn =
-      task_sig_impl.impl_function.get<InitTaskImplFunction>().function_ptr;
+      task_sig_impl.impl_function.get<InitOpTaskImplFunction>().function_ptr;
   return fn(acc);
 }
 
@@ -70,7 +70,7 @@ std::optional<float>
   TaskSignatureAndImpl task_sig_impl =
       this->task_registry.task_mapping.at(task_id);
   auto fn =
-      task_sig_impl.impl_function.get<FwdBwdTaskImplFunction>().function_ptr;
+      task_sig_impl.impl_function.get<FwdBwdOpTaskImplFunction>().function_ptr;
   return fn(acc);
 }
 
@@ -160,13 +160,13 @@ void LocalTrainingBacking::execute_update() {
     // get tensors
     tensor_guid_t weight_tensor =
         get_only(get_outgoing_tensors(this->computation_graph, node));
-    std::vector<tensor_guid_t> buffer_tensors =
+    std::vector<tensor_guid_t> grad_buffer_tensors =
         this->local_slots_backing.weight_optimizer_tensor_guids.at(
             weight_tensor);
 
     // get invocation
     TaskInvocation invocation =
-        get_update_invocation(attrs, weight_tensor, buffer_tensors);
+        get_update_invocation(attrs, weight_tensor, grad_buffer_tensors);
     assert(is_invocation_valid(get_update_signature(attrs), invocation));
 
     // execute update
diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc
index d34cc5d49a..c626bfc0e0 100644
--- a/lib/local-execution/src/model_training_instance.cc
+++ b/lib/local-execution/src/model_training_instance.cc
@@ -22,8 +22,9 @@ ModelTrainingInstance next(ModelTrainingInstance const &old_training_instance) {
                                  old_training_instance.label_tensor,
                                  old_training_instance.logit_tensor,
                                  new_attrs};
+  } else {
+    return old_training_instance;
   }
-  return old_training_instance;
 }
 
 } // namespace FlexFlow
diff --git a/lib/local-execution/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc
index 8ede2cb38b..5e693d43db 100644
--- a/lib/local-execution/src/ops/attention.cc
+++ b/lib/local-execution/src/ops/attention.cc
@@ -202,13 +202,13 @@ static std::optional<float>
 }
 
 TaskImplFunction get_attention_init_task_impl() {
-  return TaskImplFunction{InitTaskImplFunction{init_task_impl}};
+  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
 TaskImplFunction get_attention_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_attention_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_attention_init_signature() {
diff --git a/lib/local-execution/src/ops/batch_matmul.cc b/lib/local-execution/src/ops/batch_matmul.cc
index 1eae409ae2..d60a003061 100644
--- a/lib/local-execution/src/ops/batch_matmul.cc
+++ 
b/lib/local-execution/src/ops/batch_matmul.cc @@ -153,10 +153,10 @@ static std::optional } TaskImplFunction get_batch_matmul_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_batch_matmul_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_batch_matmul_fwd_signature() { diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index 851566fc02..254d7ef39e 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -144,13 +144,13 @@ static std::optional } TaskImplFunction get_batch_norm_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_batch_norm_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_batch_norm_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_batch_norm_init_signature() { diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 3e7baf49a9..d3e43a46a0 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -79,10 +79,10 @@ static std::optional } TaskImplFunction get_cast_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_cast_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_cast_fwd_signature() { diff --git a/lib/local-execution/src/ops/combine.cc b/lib/local-execution/src/ops/combine.cc index ccc82cce17..92f2931344 100644 --- a/lib/local-execution/src/ops/combine.cc +++ b/lib/local-execution/src/ops/combine.cc @@ -85,10 +85,10 @@ OpTaskSignature get_combine_bwd_signature() { } TaskImplFunction get_combine_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_combine_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } }; // namespace FlexFlow diff --git a/lib/local-execution/src/ops/concat.cc b/lib/local-execution/src/ops/concat.cc index 35f663b1cd..94d8fc6827 100644 --- a/lib/local-execution/src/ops/concat.cc +++ b/lib/local-execution/src/ops/concat.cc @@ -79,10 +79,10 @@ static std::optional } TaskImplFunction get_concat_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_concat_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_concat_fwd_signature() { diff --git a/lib/local-execution/src/ops/conv_2d.cc 
b/lib/local-execution/src/ops/conv_2d.cc index d5c6e7f851..7694a03947 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -132,13 +132,13 @@ static std::optional } TaskImplFunction get_conv_2d_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_conv_2d_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_conv_2d_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_conv_2d_init_signature() { diff --git a/lib/local-execution/src/ops/dropout.cc b/lib/local-execution/src/ops/dropout.cc index cac08866cc..77a2963313 100644 --- a/lib/local-execution/src/ops/dropout.cc +++ b/lib/local-execution/src/ops/dropout.cc @@ -87,13 +87,13 @@ static std::optional } TaskImplFunction get_dropout_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_dropout_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_dropout_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_dropout_init_signature() { diff --git a/lib/local-execution/src/ops/element_binary.cc b/lib/local-execution/src/ops/element_binary.cc index 48c6c699a2..2152b1beea 100644 --- a/lib/local-execution/src/ops/element_binary.cc +++ b/lib/local-execution/src/ops/element_binary.cc @@ -126,15 +126,15 @@ static std::optional } TaskImplFunction get_element_binary_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_element_binary_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_element_binary_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_element_binary_init_signature() { diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc index 502afb5f9f..64a0c5e94e 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -115,13 +115,13 @@ static std::optional } TaskImplFunction get_element_unary_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_element_unary_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_element_unary_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature 
get_element_unary_init_signature() { diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 3fe5029fa1..8df5703f60 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -53,10 +53,10 @@ static std::optional } TaskImplFunction get_flat_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_flat_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_flat_fwd_signature() { diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc index a015c64f4d..558988f9a4 100644 --- a/lib/local-execution/src/ops/gather.cc +++ b/lib/local-execution/src/ops/gather.cc @@ -122,13 +122,13 @@ static std::optional } TaskImplFunction get_gather_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_gather_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_gather_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_gather_init_signature() { diff --git a/lib/local-execution/src/ops/layer_norm.cc b/lib/local-execution/src/ops/layer_norm.cc index e99d27319c..b1f44d69ae 100644 --- a/lib/local-execution/src/ops/layer_norm.cc +++ b/lib/local-execution/src/ops/layer_norm.cc @@ -146,13 +146,13 @@ static DeviceSpecificDeviceStates } TaskImplFunction get_layer_norm_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_layer_norm_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_layer_norm_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_layer_norm_fwd_signature() { diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 9934e2a45c..9e29a0cce0 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -161,13 +161,13 @@ static std::optional } TaskImplFunction get_linear_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_linear_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_linear_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_linear_init_signature() { diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index 789ed2cd63..093a3c1374 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ 
b/lib/local-execution/src/ops/pool_2d.cc @@ -142,13 +142,13 @@ static std::optional } TaskImplFunction get_pool_2d_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_pool_2d_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_pool_2d_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_pool_2d_init_signature() { diff --git a/lib/local-execution/src/ops/reduce.cc b/lib/local-execution/src/ops/reduce.cc index a043d9f847..01d2f0e86f 100644 --- a/lib/local-execution/src/ops/reduce.cc +++ b/lib/local-execution/src/ops/reduce.cc @@ -102,13 +102,13 @@ static std::optional } TaskImplFunction get_reduce_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_reduce_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_reduce_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_reduce_init_signature() { diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc index a58d79a4f8..f946b7d146 100644 --- a/lib/local-execution/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -74,10 +74,10 @@ static std::optional } TaskImplFunction get_reduction_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_reduction_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_reduction_fwd_signature() { diff --git a/lib/local-execution/src/ops/repartition.cc b/lib/local-execution/src/ops/repartition.cc index 73692f4a13..e260fd77f5 100644 --- a/lib/local-execution/src/ops/repartition.cc +++ b/lib/local-execution/src/ops/repartition.cc @@ -98,13 +98,13 @@ static std::optional } TaskImplFunction get_repartition_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_repartition_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_repartition_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_repartition_init_signature() { diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc index 135475a711..10cd80a6d9 100644 --- a/lib/local-execution/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -73,10 +73,10 @@ static std::optional } TaskImplFunction get_replicate_fwd_task_impl() { - return 
TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_replicate_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_replicate_fwd_signature() { diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc index 7584d405eb..433e961a8a 100644 --- a/lib/local-execution/src/ops/reshape.cc +++ b/lib/local-execution/src/ops/reshape.cc @@ -92,13 +92,13 @@ static std::optional } TaskImplFunction get_reshape_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_reshape_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_reshape_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_reshape_init_signature() { diff --git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc index 366a579bea..b767b61b20 100644 --- a/lib/local-execution/src/ops/reverse.cc +++ b/lib/local-execution/src/ops/reverse.cc @@ -103,10 +103,10 @@ static std::optional } TaskImplFunction get_reverse_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_reverse_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_reverse_fwd_signature() { diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc index 4c7979ae9b..36c4afcaf3 100644 --- a/lib/local-execution/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -108,13 +108,13 @@ static std::optional } TaskImplFunction get_softmax_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_softmax_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_softmax_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_softmax_init_signature() { diff --git a/lib/local-execution/src/ops/split.cc b/lib/local-execution/src/ops/split.cc index 9f039d84f8..dc627aae96 100644 --- a/lib/local-execution/src/ops/split.cc +++ b/lib/local-execution/src/ops/split.cc @@ -114,10 +114,10 @@ static std::optional } TaskImplFunction get_split_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_split_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_split_fwd_signature() { diff --git a/lib/local-execution/src/ops/topk.cc 
b/lib/local-execution/src/ops/topk.cc
index 7f3519529a..ea4fc09e19 100644
--- a/lib/local-execution/src/ops/topk.cc
+++ b/lib/local-execution/src/ops/topk.cc
@@ -120,13 +120,13 @@ static std::optional<float>
 }
 
 TaskImplFunction get_topk_init_task_impl() {
-  return TaskImplFunction{InitTaskImplFunction{init_task_impl}};
+  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
 TaskImplFunction get_topk_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_topk_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_topk_init_signature() {
diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc
index 5c3c1dd1ca..099206e372 100644
--- a/lib/local-execution/src/ops/transpose.cc
+++ b/lib/local-execution/src/ops/transpose.cc
@@ -100,13 +100,13 @@ OpTaskInvocation backward(TransposeAttrs const &attrs) {
 }
 
 TaskImplFunction get_transpose_init_task_impl() {
-  return TaskImplFunction{InitTaskImplFunction{init_task_impl}};
+  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
 TaskImplFunction get_transpose_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_transpose_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_transpose_init_signature() {
diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc
index 1b1173c70e..485955a5dc 100644
--- a/lib/local-execution/src/optimizer.cc
+++ b/lib/local-execution/src/optimizer.cc
@@ -35,8 +35,9 @@ TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs,
   if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
     b.bind_arg(HANDLE, ff_handle());
     return {task_id_t::SGD_UPD_NCCL_TASK_ID, b};
+  } else {
+    return {task_id_t::SGD_UPD_PS_TASK_ID, b};
   }
-  return {task_id_t::SGD_UPD_PS_TASK_ID, b};
 }
 
 static void sgd_update_task_impl(TaskArgumentAccessor const &acc) {
@@ -183,8 +184,8 @@ TaskImplFunction get_adam_update_task_impl() {
 
 TaskSignature get_update_signature(OptimizerAttrs const &attrs) {
   return attrs.visit(overload{
-      [&](SGDOptimizerAttrs const &s) { return get_sgd_update_signature(); },
-      [&](AdamOptimizerAttrs const &s) {
+      [&](SGDOptimizerAttrs const &) { return get_sgd_update_signature(); },
+      [&](AdamOptimizerAttrs const &) {
         return get_adam_update_signature();
       }});
 }
@@ -192,21 +193,21 @@ TaskSignature get_update_signature(OptimizerAttrs const &attrs) {
 TaskInvocation
     get_update_invocation(OptimizerAttrs const &attrs,
                           tensor_guid_t const &weight,
-                          std::vector<tensor_guid_t> const &buffer_tensors) {
+                          std::vector<tensor_guid_t> const &grad_buffer_tensors) {
   return attrs.visit(
       overload{[&](SGDOptimizerAttrs const &s) {
-                 return sgd_update(s, weight, buffer_tensors.at(0));
+                 return sgd_update(s, weight, grad_buffer_tensors.at(0));
               },
               [&](AdamOptimizerAttrs const &s) {
                 return adam_update(
-                    s, weight, buffer_tensors.at(0), buffer_tensors.at(1));
+                    s, weight, grad_buffer_tensors.at(0), grad_buffer_tensors.at(1));
               }});
 }
 
 TaskImplFunction get_update_task_impl(OptimizerAttrs const &attrs) {
   return attrs.visit(overload{
-      [&](SGDOptimizerAttrs const &s) { return get_sgd_update_task_impl(); },
-
[&](AdamOptimizerAttrs const &s) { + [&](SGDOptimizerAttrs const &) { return get_sgd_update_task_impl(); }, + [&](AdamOptimizerAttrs const &) { return get_adam_update_task_impl(); }}); } diff --git a/lib/local-execution/src/task_invocation.cc b/lib/local-execution/src/task_invocation.cc index c64af5332e..e15b9ae4ef 100644 --- a/lib/local-execution/src/task_invocation.cc +++ b/lib/local-execution/src/task_invocation.cc @@ -42,8 +42,7 @@ std::unordered_map const & } bool is_invocation_valid(TaskSignature const &sig, TaskInvocation const &inv) { - // TODO: implement signature checking - return true; + NOT_IMPLEMENTED(); } } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc index 15bf089b6b..740c2a7355 100644 --- a/lib/local-execution/test/src/test_loss_e2e.cc +++ b/lib/local-execution/test/src/test_loss_e2e.cc @@ -19,7 +19,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { EnableProfiling::NO, ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}}; - OptimizerAttrs optimizer_attrs = make_empty_sgd_attrs(); + OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{ + /*lr=*/0.0, + /*momentum=*/0.0, + /*nesterov=*/false, + /*weight_decay=*/0.0}}; // construct graph ComputationGraphBuilder cg_builder; @@ -76,7 +80,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") { std::optional model_training_instance = - ModelTrainingInstance{LossAttrs{OtherLossAttrs{ + ModelTrainingInstance{LossAttrs{NonconfigurableLossAttrs{ LossFunction::CATEGORICAL_CROSSENTROPY}}, label_tensor, logit_tensor, @@ -94,7 +98,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") { std::optional model_training_instance = ModelTrainingInstance{ - LossAttrs{OtherLossAttrs{ + LossAttrs{NonconfigurableLossAttrs{ LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, label_tensor, logit_tensor, @@ -112,7 +116,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("LossFunction::IDENTITY") { std::optional model_training_instance = ModelTrainingInstance{ - LossAttrs{OtherLossAttrs{LossFunction::IDENTITY}}, + LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}}, label_tensor, logit_tensor, optimizer_attrs}; diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc index 3899f60b83..6ad59c8286 100644 --- a/lib/local-execution/test/src/test_update_e2e.cc +++ b/lib/local-execution/test/src/test_update_e2e.cc @@ -55,7 +55,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*weight_decay=*/0.001}}; std::optional model_training_instance = ModelTrainingInstance{ - LossAttrs{OtherLossAttrs{ + LossAttrs{NonconfigurableLossAttrs{ LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, label_tensor, logit_tensor, @@ -78,7 +78,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*weight_decay=*/0.001}}; std::optional model_training_instance = ModelTrainingInstance{ - LossAttrs{OtherLossAttrs{ + LossAttrs{NonconfigurableLossAttrs{ LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, label_tensor, logit_tensor, @@ -107,7 +107,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { std::optional model_training_instance = ModelTrainingInstance{ LossAttrs{ - OtherLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, + NonconfigurableLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, label_tensor, logit_tensor, optimizer_attrs}; diff --git a/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml b/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml index 8a4f38839c..d60c6507cf 100644 --- 
a/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml +++ b/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml @@ -10,7 +10,7 @@ features = [ includes = [ "op-attrs/ops/sparse_categorical_ce_loss_attrs.dtg.h", - "op-attrs/ops/other_loss_attrs.dtg.h" + "op-attrs/ops/nonconfigurable_loss_attrs.dtg.h" ] [[values]] @@ -18,5 +18,5 @@ type = "::FlexFlow::SparseCategoricalCrossEntropyLossAttrs" key = "sparse_categorical_ce_loss_attrs" [[values]] -type = "::FlexFlow::OtherLossAttrs" -key = "other_loss_attrs" +type = "::FlexFlow::NonconfigurableLossAttrs" +key = "nonconfigurable_loss_attrs" diff --git a/lib/op-attrs/include/op-attrs/ops/loss_functions.h b/lib/op-attrs/include/op-attrs/ops/loss_functions.h index 9fb0597197..74d2d0a479 100644 --- a/lib/op-attrs/include/op-attrs/ops/loss_functions.h +++ b/lib/op-attrs/include/op-attrs/ops/loss_functions.h @@ -1,11 +1,11 @@ #ifndef _FLEXFLOW_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_LOSS_FUNCTIONS_H #define _FLEXFLOW_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_LOSS_FUNCTIONS_H -#include "core.h" -#include "loss_attrs.dtg.h" -#include "loss_function.dtg.h" -#include "other_loss_attrs.dtg.h" -#include "sparse_categorical_ce_loss_attrs.dtg.h" +#include "op-attrs/ops/core.h" +#include "op-attrs/ops/loss_attrs.dtg.h" +#include "op-attrs/ops/loss_function.dtg.h" +#include "op-attrs/ops/nonconfigurable_loss_attrs.dtg.h" +#include "op-attrs/ops/sparse_categorical_ce_loss_attrs.dtg.h" namespace FlexFlow { diff --git a/lib/op-attrs/include/op-attrs/ops/other_loss_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/nonconfigurable_loss_attrs.struct.toml similarity index 86% rename from lib/op-attrs/include/op-attrs/ops/other_loss_attrs.struct.toml rename to lib/op-attrs/include/op-attrs/ops/nonconfigurable_loss_attrs.struct.toml index 81055f5835..0420e7ef7b 100644 --- a/lib/op-attrs/include/op-attrs/ops/other_loss_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/nonconfigurable_loss_attrs.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "OtherLossAttrs" +name = "NonconfigurableLossAttrs" features = [ "eq", "ord", diff --git a/lib/op-attrs/src/loss_functions.cc b/lib/op-attrs/src/loss_functions.cc index cae88be453..50a26ec792 100644 --- a/lib/op-attrs/src/loss_functions.cc +++ b/lib/op-attrs/src/loss_functions.cc @@ -12,7 +12,7 @@ LossFunction get_loss_function(LossAttrs const &attrs) { overload{[&](SparseCategoricalCrossEntropyLossAttrs const &s) { return LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY; }, - [&](OtherLossAttrs const &s) { return s.loss_type; }}); + [&](NonconfigurableLossAttrs const &s) { return s.loss_type; }}); } LossFunction parse_loss_name(std::string const &raw_name) { diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h deleted file mode 100644 index 550bf12cc8..0000000000 --- a/lib/pcg/include/pcg/optimizer_attrs.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _FLEXFLOW_LIB_PCG_INCLUDE_PCG_OPTIMIZER_ATTRS_H -#define _FLEXFLOW_LIB_PCG_INCLUDE_PCG_OPTIMIZER_ATTRS_H - -#include "pcg/optimizer_attrs.dtg.h" - -namespace FlexFlow { - -OptimizerAttrs make_empty_sgd_attrs(); -OptimizerAttrs make_empty_adam_attrs(); - -} // namespace FlexFlow - -#endif diff --git a/lib/pcg/src/pcg/optimizer_attrs.cc b/lib/pcg/src/pcg/optimizer_attrs.cc deleted file mode 100644 index d51070b10d..0000000000 --- a/lib/pcg/src/pcg/optimizer_attrs.cc +++ /dev/null @@ -1,14 +0,0 @@ -#include "pcg/optimizer_attrs.h" - -namespace FlexFlow { - -OptimizerAttrs make_empty_sgd_attrs() { - return OptimizerAttrs{SGDOptimizerAttrs{0.0, 0.0, 
false, 0.0}}; -} - -OptimizerAttrs make_empty_adam_attrs() { - return OptimizerAttrs{ - AdamOptimizerAttrs{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}}; -} - -} // namespace FlexFlow From 103ef073a4eedd0108ac6537541d5e4d2f6a03d9 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 11 Sep 2024 12:59:33 -0700 Subject: [PATCH 11/22] Format --- .../include/local-execution/optimizer.h | 7 ++-- .../include/local-execution/task_signature.h | 4 +-- .../src/init_op_task_impl_function.cc | 18 ++++++---- .../src/local_slots_backing.cc | 35 +++++++++---------- lib/local-execution/src/optimizer.cc | 34 ++++++++---------- lib/local-execution/test/src/test_loss_e2e.cc | 10 +++--- .../test/src/test_update_e2e.cc | 4 +-- 7 files changed, 57 insertions(+), 55 deletions(-) diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h index e1f11b8a68..a6395a4daa 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -11,9 +11,10 @@ namespace FlexFlow { TaskSignature get_update_signature(OptimizerAttrs const &); -TaskInvocation get_update_invocation(OptimizerAttrs const &, - tensor_guid_t const &weight, - std::vector const &grad_buffer_tensors); +TaskInvocation get_update_invocation( + OptimizerAttrs const &, + tensor_guid_t const &weight, + std::vector const &grad_buffer_tensors); TaskImplFunction get_update_task_impl(OptimizerAttrs const &); TaskSignature get_sgd_update_signature(); diff --git a/lib/local-execution/include/local-execution/task_signature.h b/lib/local-execution/include/local-execution/task_signature.h index ed28f8eaea..6da69f2441 100644 --- a/lib/local-execution/include/local-execution/task_signature.h +++ b/lib/local-execution/include/local-execution/task_signature.h @@ -35,7 +35,7 @@ void add_return_value(TaskSignature &task_signature) { /** * @brief Adds an argument slot without checking if it is serializable. - * + * * This function is used for arguments that are device-specific. */ @@ -46,7 +46,7 @@ void add_unchecked_arg_slot(TaskSignature &task_signature, int name) { /** * @brief Adds an argument slot without checking if it is serializable. - * + * * This function is used for arguments that are device-specific. 
*/ diff --git a/lib/local-execution/src/init_op_task_impl_function.cc b/lib/local-execution/src/init_op_task_impl_function.cc index 1c946982f5..abe84b828e 100644 --- a/lib/local-execution/src/init_op_task_impl_function.cc +++ b/lib/local-execution/src/init_op_task_impl_function.cc @@ -2,27 +2,33 @@ namespace FlexFlow { -bool InitOpTaskImplFunction::operator==(InitOpTaskImplFunction const &other) const { +bool InitOpTaskImplFunction::operator==( + InitOpTaskImplFunction const &other) const { return this->function_ptr == other.function_ptr; } -bool InitOpTaskImplFunction::operator!=(InitOpTaskImplFunction const &other) const { +bool InitOpTaskImplFunction::operator!=( + InitOpTaskImplFunction const &other) const { return this->function_ptr != other.function_ptr; } -bool InitOpTaskImplFunction::operator<(InitOpTaskImplFunction const &other) const { +bool InitOpTaskImplFunction::operator<( + InitOpTaskImplFunction const &other) const { return this->function_ptr < other.function_ptr; } -bool InitOpTaskImplFunction::operator>(InitOpTaskImplFunction const &other) const { +bool InitOpTaskImplFunction::operator>( + InitOpTaskImplFunction const &other) const { return this->function_ptr > other.function_ptr; } -bool InitOpTaskImplFunction::operator<=(InitOpTaskImplFunction const &other) const { +bool InitOpTaskImplFunction::operator<=( + InitOpTaskImplFunction const &other) const { return this->function_ptr <= other.function_ptr; } -bool InitOpTaskImplFunction::operator>=(InitOpTaskImplFunction const &other) const { +bool InitOpTaskImplFunction::operator>=( + InitOpTaskImplFunction const &other) const { return this->function_ptr >= other.function_ptr; } diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index 0a1497b6c8..7050063254 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -154,29 +154,28 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing( OpTaskBinding const &binding, layer_guid_t const &op_guid) const { - return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding){ - return arg_binding.template visit(overload{ - [&](OpArgRefSpec const &s) { - return this->resolve_op_arg_ref_spec(s, op_guid); - }, - [&](RuntimeArgRefSpec const &s) { - return this->resolve_runtime_arg_ref_spec(s); - }, - [](ConcreteArgSpec const &s) { return s; } - }); + return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding) { + return arg_binding.template visit( + overload{[&](OpArgRefSpec const &s) { + return this->resolve_op_arg_ref_spec(s, op_guid); + }, + [&](RuntimeArgRefSpec const &s) { + return this->resolve_runtime_arg_ref_spec(s); + }, + [](ConcreteArgSpec const &s) { return s; }}); }); } ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing( TaskBinding const &binding) const { - return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding){ - return arg_binding.template visit(overload{ - [&](RuntimeArgRefSpec const &s) { - return this->resolve_runtime_arg_ref_spec(s); - }, - [](ConcreteArgSpec const &s) { return s; } - }); - });; + return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding) { + return arg_binding.template visit( + overload{[&](RuntimeArgRefSpec const &s) { + return this->resolve_runtime_arg_ref_spec(s); + }, + [](ConcreteArgSpec const &s) { return s; }}); + }); + ; } ConcreteArgSpec 
LocalSlotsBacking::resolve_op_arg_ref_spec( diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 485955a5dc..29beb15edf 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -35,7 +35,7 @@ TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs, if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { b.bind_arg(HANDLE, ff_handle()); return {task_id_t::SGD_UPD_NCCL_TASK_ID, b}; - } else { + } else { return {task_id_t::SGD_UPD_PS_TASK_ID, b}; } } @@ -185,31 +185,27 @@ TaskImplFunction get_adam_update_task_impl() { TaskSignature get_update_signature(OptimizerAttrs const &attrs) { return attrs.visit(overload{ [&](SGDOptimizerAttrs const &) { return get_sgd_update_signature(); }, - [&](AdamOptimizerAttrs const &) { - return get_adam_update_signature(); - }}); + [&](AdamOptimizerAttrs const &) { return get_adam_update_signature(); }}); } -TaskInvocation - get_update_invocation(OptimizerAttrs const &attrs, - tensor_guid_t const &weight, - std::vector const &grad_buffer_tensors) { - return attrs.visit( - overload{[&](SGDOptimizerAttrs const &s) { - return sgd_update(s, weight, grad_buffer_tensors.at(0)); - }, - [&](AdamOptimizerAttrs const &s) { - return adam_update( - s, weight, grad_buffer_tensors.at(0), grad_buffer_tensors.at(1)); - }}); +TaskInvocation get_update_invocation( + OptimizerAttrs const &attrs, + tensor_guid_t const &weight, + std::vector const &grad_buffer_tensors) { + return attrs.visit(overload{ + [&](SGDOptimizerAttrs const &s) { + return sgd_update(s, weight, grad_buffer_tensors.at(0)); + }, + [&](AdamOptimizerAttrs const &s) { + return adam_update( + s, weight, grad_buffer_tensors.at(0), grad_buffer_tensors.at(1)); + }}); } TaskImplFunction get_update_task_impl(OptimizerAttrs const &attrs) { return attrs.visit(overload{ [&](SGDOptimizerAttrs const &) { return get_sgd_update_task_impl(); }, - [&](AdamOptimizerAttrs const &) { - return get_adam_update_task_impl(); - }}); + [&](AdamOptimizerAttrs const &) { return get_adam_update_task_impl(); }}); } } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc index 740c2a7355..6cc66032ff 100644 --- a/lib/local-execution/test/src/test_loss_e2e.cc +++ b/lib/local-execution/test/src/test_loss_e2e.cc @@ -19,11 +19,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { EnableProfiling::NO, ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}}; - OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{ - /*lr=*/0.0, - /*momentum=*/0.0, - /*nesterov=*/false, - /*weight_decay=*/0.0}}; + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.0, + /*momentum=*/0.0, + /*nesterov=*/false, + /*weight_decay=*/0.0}}; // construct graph ComputationGraphBuilder cg_builder; diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc index 6ad59c8286..f300fe0720 100644 --- a/lib/local-execution/test/src/test_update_e2e.cc +++ b/lib/local-execution/test/src/test_update_e2e.cc @@ -106,8 +106,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*epsilon=*/1e-8}}; std::optional model_training_instance = ModelTrainingInstance{ - LossAttrs{ - NonconfigurableLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, + LossAttrs{NonconfigurableLossAttrs{ + LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, label_tensor, logit_tensor, optimizer_attrs}; From f48f9ff97022910e69e0711b3cc0155db23da5bb Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 17 Sep 2024 
17:43:22 -0700
Subject: [PATCH 12/22] Fix test and small issues

---
 lib/kernels/include/kernels/array_shape.h     |  1 +
 lib/kernels/include/kernels/profiling.h       |  1 +
 lib/kernels/src/array_shape.cc                | 28 +++-----
 .../local-execution/local_slots_backing.h     |  2 +-
 .../include/local-execution/loss_functions.h  |  2 +-
 .../include/local-execution/optimizer.h       |  2 +-
 .../include/local-execution/task_binding.h    | 58 +++++++++++++++++
 .../include/local-execution/task_invocation.h | 65 +------------------
 .../task_invocation.struct.toml               | 19 ++++++
 .../src/local_slots_backing.cc                |  4 +-
 .../src/local_training_backing.cc             |  5 +-
 lib/local-execution/src/loss_functions.cc     |  6 +-
 lib/local-execution/src/optimizer.cc          |  9 +--
 lib/local-execution/src/task_binding.cc       | 44 +++++++++++++
 lib/local-execution/src/task_invocation.cc    | 39 -----------
 lib/local-execution/test/src/test_loss_e2e.cc |  6 +-
 .../test/src/test_update_e2e.cc               |  6 +-
 17 files changed, 157 insertions(+), 140 deletions(-)
 create mode 100644 lib/local-execution/include/local-execution/task_binding.h
 create mode 100644 lib/local-execution/include/local-execution/task_invocation.struct.toml
 create mode 100644 lib/local-execution/src/task_binding.cc

diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h
index 6b0b57b57f..015cacc7cb 100644
--- a/lib/kernels/include/kernels/array_shape.h
+++ b/lib/kernels/include/kernels/array_shape.h
@@ -17,6 +17,7 @@ struct ArrayShape {
   ArrayShape(size_t *dims, size_t num_dims);
   ArrayShape(TensorShape const &shape);
   ArrayShape(std::vector<std::size_t> const &);
+  ArrayShape(LegionTensorDims const &);
 
   /**
    * @brief Alias of ArrayShape::num_elements for compatibility with
diff --git a/lib/kernels/include/kernels/profiling.h b/lib/kernels/include/kernels/profiling.h
index 655d540685..31c70010a0 100644
--- a/lib/kernels/include/kernels/profiling.h
+++ b/lib/kernels/include/kernels/profiling.h
@@ -40,6 +40,7 @@ std::optional<float> profiling_wrapper(F const &f,
   }
 
   float elapsed = 0;
+  std::cout << "hello";
   checkCUDA(ffEventRecord(t_end, stream));
   checkCUDA(ffEventSynchronize(t_end));
   checkCUDA(ffEventElapsedTime(&elapsed, t_start, t_end));
diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
index 054e16e90a..8464212290 100644
--- a/lib/kernels/src/array_shape.cc
+++ b/lib/kernels/src/array_shape.cc
@@ -1,4 +1,5 @@
 #include "kernels/array_shape.h"
+#include "op-attrs/dim_ordered/slice.h"
 #include "utils/containers/product.h"
 
 namespace FlexFlow {
@@ -19,6 +20,9 @@ ArrayShape::ArrayShape(TensorShape const &shape)
 ArrayShape::ArrayShape(std::vector<std::size_t> const &input_dims)
     : dims(input_dims) {}
 
+ArrayShape::ArrayShape(LegionTensorDims const &legion_tensor_dims)
+    : dims(legion_tensor_dims) {}
+
 std::size_t ArrayShape::get_volume() const {
   return this->num_elements();
 }
@@ -51,33 +55,19 @@ std::size_t ArrayShape::at(ff_dim_t idx) const {
 }
 
 ArrayShape ArrayShape::sub_shape(legion_dim_t start, ff_dim_t end) const {
-  NOT_IMPLEMENTED();
+  legion_dim_t legion_end = legion_dim_from_ff_dim(end, num_dims());
+  return this->sub_shape(start, legion_end);
 }
 
 ArrayShape ArrayShape::sub_shape(std::optional<ff_dim_t> start,
                                  std::optional<ff_dim_t> end) const {
-  std::vector<std::size_t> new_shape;
-  ff_dim_t start_idx = start.value_or(ff_dim_t{0});
-  ff_dim_t end_idx = end.value_or(ff_dim_t{this->num_dims()});
-
-  while (start_idx < end_idx) {
-    new_shape.push_back(this->at(start_idx));
-    start_idx = ff_dim_t{start_idx.value + 1};
-  }
-  return ArrayShape{new_shape};
+  return ArrayShape{legion_dims_from_ff_dims(
+      slice(ff_ordered_from_legion_ordered(this->dims), start, end))};
 }
 
 ArrayShape ArrayShape::sub_shape(std::optional<legion_dim_t> start,
                                  std::optional<legion_dim_t> end) const {
-  std::vector<std::size_t> new_shape;
-  legion_dim_t start_idx = start.value_or(legion_dim_t{0});
-  legion_dim_t end_idx = end.value_or(legion_dim_t{this->num_dims()});
-
-  while (start_idx < end_idx) {
-    new_shape.push_back(this->at(start_idx));
-    start_idx = add_to_legion_dim(start_idx, 1);
-  }
-  return ArrayShape{new_shape};
+  return ArrayShape{slice(this->dims, start, end)};
 }
 
 std::optional<std::size_t> ArrayShape::at_maybe(legion_dim_t index) const {
diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h
index 439113c873..678be4c96b 100644
--- a/lib/local-execution/include/local-execution/local_slots_backing.h
+++ b/lib/local-execution/include/local-execution/local_slots_backing.h
@@ -7,7 +7,7 @@
 #include "local-execution/op_task_invocation.h"
 #include "local-execution/per_device_op_state.h"
 #include "local-execution/runtime_arg_config.h"
-#include "local-execution/task_invocation.h"
+#include "local-execution/task_invocation.dtg.h"
 
 namespace FlexFlow {
 
diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h
index 58405536d8..2298115d5d 100644
--- a/lib/local-execution/include/local-execution/loss_functions.h
+++ b/lib/local-execution/include/local-execution/loss_functions.h
@@ -17,7 +17,7 @@
 #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_
 
 #include "local-execution/task_impl_function.dtg.h"
-#include "local-execution/task_invocation.h"
+#include "local-execution/task_invocation.dtg.h"
 #include "local-execution/task_signature.h"
 #include "op-attrs/ops/loss_functions.h"
 
diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h
index a6395a4daa..1e2cd65362 100644
--- a/lib/local-execution/include/local-execution/optimizer.h
+++ b/lib/local-execution/include/local-execution/optimizer.h
@@ -2,7 +2,7 @@
 #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_
 
 #include "local-execution/task_impl_function.dtg.h"
-#include "local-execution/task_invocation.h"
+#include "local-execution/task_invocation.dtg.h"
 #include "local-execution/task_signature.h"
 #include "pcg/optimizer_attrs.dtg.h"
 #include "pcg/optimizers/adam_optimizer_attrs.dtg.h"
diff --git a/lib/local-execution/include/local-execution/task_binding.h b/lib/local-execution/include/local-execution/task_binding.h
new file mode 100644
index 0000000000..cbe210f438
--- /dev/null
+++ b/lib/local-execution/include/local-execution/task_binding.h
@@ -0,0 +1,58 @@
+#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H
+#define _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H
+
+#include "local-execution/slot_grad_id.dtg.h"
+#include "local-execution/slot_id_t.dtg.h"
+#include "local-execution/task_arg_spec.dtg.h"
+#include "local-execution/task_id_t.dtg.h"
+#include "local-execution/task_signature.dtg.h"
+#include "local-execution/tensor_guid_spec.dtg.h"
+
+namespace FlexFlow {
+
+struct TaskBinding {
+  TaskBinding() = default;
+
+  void bind(int, TensorGuidSpec const &);
+  void bind(slot_id_t, TensorGuidSpec const &);
+
+  template <typename T>
+  void bind_arg(int name, T const &t) {
+    this->bind_arg(slot_id_t{name}, t);
+  }
+
+  template <typename T>
+  void bind_arg(slot_id_t name, T const &t) {
+    this->insert_arg_spec(name, TaskArgSpec{ConcreteArgSpec::create(t)});
+  }
+
+  template <typename T>
+  void bind_arg(int name, RuntimeArgRef<T> const &t) {
+    this->bind_arg(slot_id_t{name}, t);
+  }
+
+  template <typename T>
+  void bind_arg(slot_id_t name, RuntimeArgRef<T> const &ref) {
+    this->insert_arg_spec(name, TaskArgSpec{RuntimeArgRefSpec::create(ref)});
+  }
+
+  bool operator==(TaskBinding const &other) const;
+  bool operator!=(TaskBinding const &other) const;
+
+  std::unordered_map<SlotGradId, TensorGuidSpec> const &
+      get_tensor_bindings() const;
+  std::unordered_map<slot_id_t, TaskArgSpec> const &get_arg_bindings() const;
+
+private:
+  std::unordered_map<SlotGradId, TensorGuidSpec> tensor_bindings;
+  std::unordered_map<slot_id_t, TaskArgSpec> arg_bindings;
+
+private:
+  void insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec);
+  std::tuple<std::unordered_map<SlotGradId, TensorGuidSpec> const &,
+             std::unordered_map<slot_id_t, TaskArgSpec> const &>
+      tie() const;
+};
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/local-execution/include/local-execution/task_invocation.h b/lib/local-execution/include/local-execution/task_invocation.h
index 2317c65c02..93b5743a80 100644
--- a/lib/local-execution/include/local-execution/task_invocation.h
+++ b/lib/local-execution/include/local-execution/task_invocation.h
@@ -1,71 +1,12 @@
 #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_INVOCATION_H
 #define _FLEXFLOW_LOCAL_EXECUTION_TASK_INVOCATION_H
 
-#include "local-execution/slot_grad_id.dtg.h"
-#include "local-execution/slot_id_t.dtg.h"
-#include "local-execution/task_arg_spec.dtg.h"
-#include "local-execution/task_id_t.dtg.h"
-#include "local-execution/task_signature.dtg.h"
-#include "local-execution/tensor_guid_spec.dtg.h"
+#include "local-execution/task_invocation.dtg.h"
 
 namespace FlexFlow {
 
-struct TaskBinding {
-  TaskBinding() = default;
-
-  void bind(int, TensorGuidSpec const &);
-  void bind(slot_id_t, TensorGuidSpec const &);
-
-  template <typename T>
-  void bind_arg(int name, T const &t) {
-    this->bind_arg(slot_id_t{name}, t);
-  }
-
-  template <typename T>
-  void bind_arg(slot_id_t name, T const &t) {
-    this->insert_arg_spec(name, TaskArgSpec{ConcreteArgSpec::create(t)});
-  }
-
-  template <typename T>
-  void bind_arg(int name, RuntimeArgRef<T> const &t) {
-    this->bind_arg(slot_id_t{name}, t);
-  }
-
-  template <typename T>
-  void bind_arg(slot_id_t name, RuntimeArgRef<T> const &ref) {
-    this->insert_arg_spec(name, TaskArgSpec{RuntimeArgRefSpec::create(ref)});
-  }
-
-  bool operator==(TaskBinding const &other) const;
-  bool operator!=(TaskBinding const &other) const;
-
-  std::unordered_map<SlotGradId, TensorGuidSpec> const &
-      get_tensor_bindings() const;
-  std::unordered_map<slot_id_t, TaskArgSpec> const &get_arg_bindings() const;
-
-private:
-  std::unordered_map<SlotGradId, TensorGuidSpec> tensor_bindings;
-  std::unordered_map<slot_id_t, TaskArgSpec> arg_bindings;
-
-private:
-  void insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec);
-  std::tuple<std::unordered_map<SlotGradId, TensorGuidSpec> const &,
-             std::unordered_map<slot_id_t, TaskArgSpec> const &>
-      tie() const;
-};
-
-struct TaskInvocation {
-public:
-  TaskInvocation() = delete;
-  TaskInvocation(task_id_t task_id, TaskBinding const &binding)
-      : task_id(task_id), binding(binding) {}
-
-public:
-  task_id_t task_id;
-  TaskBinding binding;
-};
-
 bool is_invocation_valid(TaskSignature const &sig, TaskInvocation const &inv);
-
-} // namespace FlexFlow
+
+}
 
 #endif
diff --git a/lib/local-execution/include/local-execution/task_invocation.struct.toml b/lib/local-execution/include/local-execution/task_invocation.struct.toml
new file mode 100644
index 0000000000..abcaabda93
--- /dev/null
+++ b/lib/local-execution/include/local-execution/task_invocation.struct.toml
@@ -0,0 +1,19 @@
+namespace = "FlexFlow"
+name = "TaskInvocation"
+features = [
+  "eq"
+]
+
+includes = [
+  "local-execution/task_binding.h",
+  "local-execution/task_id_t.dtg.h"
+]
+
+
+[[fields]]
+name = "task_id"
+type = "::FlexFlow::task_id_t"
+
+[[fields]]
+name = "binding"
+type = "::FlexFlow::TaskBinding"
diff --git
a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index 7050063254..194d64c34b 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -154,7 +154,7 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing( OpTaskBinding const &binding, layer_guid_t const &op_guid) const { - return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding) { + return map_values(binding.get_arg_bindings(), [&](OpArgSpec const &arg_binding) { return arg_binding.template visit( overload{[&](OpArgRefSpec const &s) { return this->resolve_op_arg_ref_spec(s, op_guid); @@ -168,7 +168,7 @@ ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing( ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing( TaskBinding const &binding) const { - return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding) { + return map_values(binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) { return arg_binding.template visit( overload{[&](RuntimeArgRefSpec const &s) { return this->resolve_runtime_arg_ref_spec(s); diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index dff33826b9..7f0b179390 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -3,6 +3,7 @@ #include "local-execution/model_training_instance.h" #include "local-execution/optimizer.h" #include "local-execution/task_signature_impl.h" +#include "local-execution/task_invocation.h" #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" @@ -124,7 +125,7 @@ PerLayerElapsedTime LocalTrainingBacking::execute_backward() { backward(unwrapped_training_instance.loss_attrs, unwrapped_training_instance.logit_tensor, unwrapped_training_instance.label_tensor); - assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation)); + // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation)); TaskArgumentAccessor loss_accessor = this->get_task_arg_accessor(loss_invocation); TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl(); @@ -167,7 +168,7 @@ void LocalTrainingBacking::execute_update() { // get invocation TaskInvocation invocation = get_update_invocation(attrs, weight_tensor, grad_buffer_tensors); - assert(is_invocation_valid(get_update_signature(attrs), invocation)); + // assert(is_invocation_valid(get_update_signature(attrs), invocation)); // execute update TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation); diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index 771d175a7d..3a4c616377 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -41,7 +41,7 @@ TaskInvocation b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); - return {task_id_t::LOSS_BWD_TASK_ID, b}; + return TaskInvocation{task_id_t::LOSS_BWD_TASK_ID, b}; } static void backward_task_impl(TaskArgumentAccessor const &acc) { @@ -51,7 +51,7 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { auto logit = acc.get_tensor(LOGIT); auto label = acc.get_tensor(LABEL); int batch_size = logit.shape.at(legion_dim_t{1}); - // assuming logit shape is [parallel dim(?), batch dim, num classes] + // assuming logit shape is [batch dim, 
num classes]
 
   LossFunction loss_type = get_loss_function(attrs);
   float scale_factor = 1.0f / batch_size;
@@ -61,7 +61,7 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) {
   }
 
   if (loss_type == LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY) {
-    // label shape is [parallel dim(?), batch dim, 1]
+    // label shape is [batch dim, 1]
     auto scce_attrs = attrs.get<SparseCategoricalCrossEntropyLossAttrs>();
     size_t ndim = logit.shape.num_dims();
    int num_classes = logit.shape.at(legion_dim_t{0});
diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc
index 29beb15edf..30f20bf8ec 100644
--- a/lib/local-execution/src/optimizer.cc
+++ b/lib/local-execution/src/optimizer.cc
@@ -34,9 +34,9 @@ TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs,
 
   if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
     b.bind_arg(HANDLE, ff_handle());
-    return {task_id_t::SGD_UPD_NCCL_TASK_ID, b};
+    return TaskInvocation{task_id_t::SGD_UPD_NCCL_TASK_ID, b};
   } else {
-    return {task_id_t::SGD_UPD_PS_TASK_ID, b};
+    return TaskInvocation{task_id_t::SGD_UPD_PS_TASK_ID, b};
   }
 }
@@ -123,9 +123,10 @@ TaskInvocation adam_update(AdamOptimizerAttrs const &attrs,
 
   if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
     b.bind_arg(HANDLE, ff_handle());
-    return {task_id_t::ADAM_UPD_NCCL_TASK_ID, b};
+    return TaskInvocation{task_id_t::ADAM_UPD_NCCL_TASK_ID, b};
+  } else {
+    return TaskInvocation{task_id_t::ADAM_UPD_PS_TASK_ID, b};
   }
-  return {task_id_t::ADAM_UPD_PS_TASK_ID, b};
 }
 
 static void adam_update_task_impl(TaskArgumentAccessor const &acc) {
diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc
new file mode 100644
index 0000000000..a5a3b2dc34
--- /dev/null
+++ b/lib/local-execution/src/task_binding.cc
@@ -0,0 +1,44 @@
+#include "local-execution/task_binding.h"
+#include "utils/containers/contains_key.h"
+
+namespace FlexFlow {
+
+void TaskBinding::bind(int name, TensorGuidSpec const &tensor_guid_spec) {
+  this->bind(slot_id_t{name}, tensor_guid_spec);
+}
+
+void TaskBinding::bind(slot_id_t name, TensorGuidSpec const &tensor_guid_spec) {
+  this->tensor_bindings.insert(
+      {SlotGradId{name, tensor_guid_spec.is_grad}, tensor_guid_spec});
+}
+
+void TaskBinding::insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec) {
+  assert(!contains_key(this->arg_bindings, name));
+  this->arg_bindings.insert({name, arg_spec});
+}
+
+bool TaskBinding::operator==(TaskBinding const &other) const {
+  return this->tie() == other.tie();
+}
+
+bool TaskBinding::operator!=(TaskBinding const &other) const {
+  return this->tie() != other.tie();
+}
+
+std::tuple<std::unordered_map<SlotGradId, TensorGuidSpec> const &,
+           std::unordered_map<slot_id_t, TaskArgSpec> const &>
+    TaskBinding::tie() const {
+  return std::tie(this->tensor_bindings, this->arg_bindings);
+}
+
+std::unordered_map<SlotGradId, TensorGuidSpec> const &
+    TaskBinding::get_tensor_bindings() const {
+  return this->tensor_bindings;
+}
+
+std::unordered_map<slot_id_t, TaskArgSpec> const &
+    TaskBinding::get_arg_bindings() const {
+  return this->arg_bindings;
+}
+
+} // namespace FlexFlow
diff --git a/lib/local-execution/src/task_invocation.cc b/lib/local-execution/src/task_invocation.cc
index c64af5332e..e08c1036da 100644
--- a/lib/local-execution/src/task_invocation.cc
+++ b/lib/local-execution/src/task_invocation.cc
@@ -1,46 +1,7 @@
 #include "local-execution/task_invocation.h"
-#include "utils/containers/contains_key.h"
 
 namespace FlexFlow {
 
-void TaskBinding::bind(int name, TensorGuidSpec const &tensor_guid_spec) {
-  this->bind(slot_id_t{name}, tensor_guid_spec);
-}
-
-void TaskBinding::bind(slot_id_t name, TensorGuidSpec const &tensor_guid_spec) {
-  this->tensor_bindings.insert(
-      {SlotGradId{name, tensor_guid_spec.is_grad}, tensor_guid_spec});
-}
-
-void TaskBinding::insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec) {
-  assert(!contains_key(this->arg_bindings, name));
-  this->arg_bindings.insert({name, arg_spec});
-}
-
-bool TaskBinding::operator==(TaskBinding const &other) const {
-  return this->tie() == other.tie();
-}
-
-bool TaskBinding::operator!=(TaskBinding const &other) const {
-  return this->tie() != other.tie();
-}
-
-std::tuple<std::unordered_map<SlotGradId, TensorGuidSpec> const &,
-           std::unordered_map<slot_id_t, TaskArgSpec> const &>
-    TaskBinding::tie() const {
-  return std::tie(this->tensor_bindings, this->arg_bindings);
-}
-
-std::unordered_map<SlotGradId, TensorGuidSpec> const &
-    TaskBinding::get_tensor_bindings() const {
-  return this->tensor_bindings;
-}
-
-std::unordered_map<slot_id_t, TaskArgSpec> const &
-    TaskBinding::get_arg_bindings() const {
-  return this->arg_bindings;
-}
-
 bool is_invocation_valid(TaskSignature const &sig, TaskInvocation const &inv) {
   NOT_IMPLEMENTED();
 }
diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc
index 6cc66032ff..3bc85354a0 100644
--- a/lib/local-execution/test/src/test_loss_e2e.cc
+++ b/lib/local-execution/test/src/test_loss_e2e.cc
@@ -4,7 +4,7 @@
 #include "kernels/managed_per_device_ff_handle.h"
 #include "local-execution/local_training_backing.h"
 #include "pcg/computation_graph_builder.h"
-#include "pcg/optimizer_attrs.h"
+#include "pcg/optimizer_attrs.dtg.h"
 #include "test_utils.h"
 
 namespace FlexFlow {
@@ -16,8 +16,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
 
     RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{
        DeviceSpecific<PerDeviceFFHandle>::create(managed_handle.raw_handle()),
-        EnableProfiling::NO,
-        ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}};
+        EnableProfiling::YES,
+        ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}};
 
     OptimizerAttrs optimizer_attrs =
         OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.0,
diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc
index f300fe0720..b5a503f430 100644
--- a/lib/local-execution/test/src/test_update_e2e.cc
+++ b/lib/local-execution/test/src/test_update_e2e.cc
@@ -4,7 +4,7 @@
 #include "kernels/managed_per_device_ff_handle.h"
 #include "local-execution/local_training_backing.h"
 #include "pcg/computation_graph_builder.h"
-#include "pcg/optimizer_attrs.h"
+#include "pcg/optimizer_attrs.dtg.h"
 #include "test_utils.h"
 
 namespace FlexFlow {
@@ -16,8 +16,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
 
     RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{
        DeviceSpecific<PerDeviceFFHandle>::create(managed_handle.raw_handle()),
-        EnableProfiling::NO,
-        ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}};
+        EnableProfiling::YES,
+        ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}};
 
     // construct graph
     ComputationGraphBuilder cg_builder;

From 189c9c8c034143cd4a5fc4bab0db652444601915 Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Tue, 17 Sep 2024 17:43:37 -0700
Subject: [PATCH 13/22] Format

---
 .../include/local-execution/task_invocation.h |  2 +-
 .../src/local_slots_backing.cc                | 36 ++++++++++---------
 .../src/local_training_backing.cc             |  2 +-
 3 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/lib/local-execution/include/local-execution/task_invocation.h b/lib/local-execution/include/local-execution/task_invocation.h
index 93b5743a80..d03d6ac8e1 100644
--- a/lib/local-execution/include/local-execution/task_invocation.h
+++ b/lib/local-execution/include/local-execution/task_invocation.h
@@ -6,7 +6,7 @@ namespace FlexFlow {
 
 bool is_invocation_valid(TaskSignature const &sig,
TaskInvocation const &inv); - + } #endif diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index 194d64c34b..ff23c269e7 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -154,27 +154,29 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing( OpTaskBinding const &binding, layer_guid_t const &op_guid) const { - return map_values(binding.get_arg_bindings(), [&](OpArgSpec const &arg_binding) { - return arg_binding.template visit( - overload{[&](OpArgRefSpec const &s) { - return this->resolve_op_arg_ref_spec(s, op_guid); - }, - [&](RuntimeArgRefSpec const &s) { - return this->resolve_runtime_arg_ref_spec(s); - }, - [](ConcreteArgSpec const &s) { return s; }}); - }); + return map_values( + binding.get_arg_bindings(), [&](OpArgSpec const &arg_binding) { + return arg_binding.template visit( + overload{[&](OpArgRefSpec const &s) { + return this->resolve_op_arg_ref_spec(s, op_guid); + }, + [&](RuntimeArgRefSpec const &s) { + return this->resolve_runtime_arg_ref_spec(s); + }, + [](ConcreteArgSpec const &s) { return s; }}); + }); } ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing( TaskBinding const &binding) const { - return map_values(binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) { - return arg_binding.template visit( - overload{[&](RuntimeArgRefSpec const &s) { - return this->resolve_runtime_arg_ref_spec(s); - }, - [](ConcreteArgSpec const &s) { return s; }}); - }); + return map_values( + binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) { + return arg_binding.template visit( + overload{[&](RuntimeArgRefSpec const &s) { + return this->resolve_runtime_arg_ref_spec(s); + }, + [](ConcreteArgSpec const &s) { return s; }}); + }); ; } diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 7f0b179390..9c1136f198 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -2,8 +2,8 @@ #include "local-execution/loss_functions.h" #include "local-execution/model_training_instance.h" #include "local-execution/optimizer.h" -#include "local-execution/task_signature_impl.h" #include "local-execution/task_invocation.h" +#include "local-execution/task_signature_impl.h" #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" From b5647c8336848f0030445c9254cfc0e07b88ef4f Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 1 Oct 2024 09:17:46 -0700 Subject: [PATCH 14/22] Pass tests after merge --- lib/kernels/include/kernels/profiling.h | 1 - .../model_training_instance.struct.toml | 2 +- .../src/local_training_backing.cc | 2 +- .../test/src/test_local_cost_estimator.cc | 2 +- lib/local-execution/test/src/test_loss_e2e.cc | 6 ++--- .../test/src/test_update_e2e.cc | 4 ++-- .../op-attrs/ops/loss_attrs.variant.toml | 22 ------------------ .../op-attrs/ops/loss_function.enum.toml | 23 ------------------- .../include/op-attrs/ops/loss_functions.h | 8 +++---- .../loss_functions/loss_attrs.variant.toml | 6 ++--- .../ops/loss_functions/loss_functions.h | 20 ---------------- .../nonconfigurable_loss_attrs.struct.toml | 2 +- .../other_loss_attrs.struct.toml | 18 --------------- ...arse_categorical_ce_loss_attrs.struct.toml | 14 ----------- .../src/op-attrs/ops/loss_functions.cc | 2 +- 
15 files changed, 17 insertions(+), 115 deletions(-) delete mode 100644 lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml delete mode 100644 lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml delete mode 100644 lib/op-attrs/include/op-attrs/ops/loss_functions/loss_functions.h rename lib/op-attrs/include/op-attrs/ops/{ => loss_functions}/nonconfigurable_loss_attrs.struct.toml (80%) delete mode 100644 lib/op-attrs/include/op-attrs/ops/loss_functions/other_loss_attrs.struct.toml delete mode 100644 lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml diff --git a/lib/kernels/include/kernels/profiling.h b/lib/kernels/include/kernels/profiling.h index 31c70010a0..655d540685 100644 --- a/lib/kernels/include/kernels/profiling.h +++ b/lib/kernels/include/kernels/profiling.h @@ -40,7 +40,6 @@ std::optional profiling_wrapper(F const &f, } float elapsed = 0; - std::cout << "hello"; checkCUDA(ffEventRecord(t_end, stream)); checkCUDA(ffEventSynchronize(t_end)); checkCUDA(ffEventElapsedTime(&elapsed, t_start, t_end)); diff --git a/lib/local-execution/include/local-execution/model_training_instance.struct.toml b/lib/local-execution/include/local-execution/model_training_instance.struct.toml index b460d6bd3a..28282e21c0 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.struct.toml +++ b/lib/local-execution/include/local-execution/model_training_instance.struct.toml @@ -8,7 +8,7 @@ features = [ ] includes = [ - "op-attrs/ops/loss_attrs.dtg.h", + "op-attrs/ops/loss_functions/loss_attrs.dtg.h", "pcg/tensor_guid_t.dtg.h", "pcg/optimizer_attrs.dtg.h", ] diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index b794cc6da6..edbb377047 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -4,10 +4,10 @@ #include "local-execution/optimizer.h" #include "local-execution/task_invocation.h" #include "local-execution/task_signature_impl.h" +#include "pcg/computation_graph.h" #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" -#include "pcg/computation_graph.h" #include "utils/containers/reversed.h" #include "utils/exception.h" diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index 4c01df53e9..2b22d64969 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -31,7 +31,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*kdim=*/embed_dim, /*vdim=*/embed_dim, /*dropout=*/0.0, - /*bias=*/true, + /*bias=*/false, /*add_bias_kv=*/false, /*add_zero_attn=*/false, }; diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc index 3bc85354a0..4801aff6a9 100644 --- a/lib/local-execution/test/src/test_loss_e2e.cc +++ b/lib/local-execution/test/src/test_loss_e2e.cc @@ -33,7 +33,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; tensor_guid_t input_tensor = - cg_builder.create_tensor(input_shape, CreateGrad::YES); + cg_builder.create_input(input_shape, CreateGrad::YES); float scalar = 4.0; tensor_guid_t logit_tensor = @@ -50,7 +50,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TensorShape label_shape = TensorShape{ TensorDims{FFOrdered{batch_size, 1}}, DataType::FLOAT}; tensor_guid_t 
label_tensor = - cg_builder.create_tensor(label_shape, CreateGrad::NO); + cg_builder.create_input(label_shape, CreateGrad::NO); GenericTensorAccessorW label_backing = allocator.allocate_tensor(label_shape); tensor_backing_map.insert({label_tensor, label_backing}); @@ -73,7 +73,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("OtherAttrs") { tensor_guid_t label_tensor = - cg_builder.create_tensor(input_shape, CreateGrad::NO); + cg_builder.create_input(input_shape, CreateGrad::NO); GenericTensorAccessorW label_backing = allocator.allocate_tensor(input_shape); tensor_backing_map.insert({label_tensor, label_backing}); diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc index b5a503f430..af4303fab8 100644 --- a/lib/local-execution/test/src/test_update_e2e.cc +++ b/lib/local-execution/test/src/test_update_e2e.cc @@ -27,7 +27,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; tensor_guid_t input_tensor = - cg_builder.create_tensor(input_shape, CreateGrad::YES); + cg_builder.create_input(input_shape, CreateGrad::YES); float scalar = 4.0; tensor_guid_t logit_tensor = @@ -41,7 +41,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { tensor_backing_map.insert({input_tensor, input_backing}); tensor_guid_t label_tensor = - cg_builder.create_tensor(input_shape, CreateGrad::NO); + cg_builder.create_input(input_shape, CreateGrad::NO); GenericTensorAccessorW label_backing = allocator.allocate_tensor(input_shape); tensor_backing_map.insert({label_tensor, label_backing}); diff --git a/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml b/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml deleted file mode 100644 index d60c6507cf..0000000000 --- a/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml +++ /dev/null @@ -1,22 +0,0 @@ -namespace = "FlexFlow" -name = "LossAttrs" -features = [ - "eq", - "ord", - "hash", - "json", - "fmt", -] - -includes = [ - "op-attrs/ops/sparse_categorical_ce_loss_attrs.dtg.h", - "op-attrs/ops/nonconfigurable_loss_attrs.dtg.h" -] - -[[values]] -type = "::FlexFlow::SparseCategoricalCrossEntropyLossAttrs" -key = "sparse_categorical_ce_loss_attrs" - -[[values]] -type = "::FlexFlow::NonconfigurableLossAttrs" -key = "nonconfigurable_loss_attrs" diff --git a/lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml b/lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml deleted file mode 100644 index b9cd13eabf..0000000000 --- a/lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml +++ /dev/null @@ -1,23 +0,0 @@ -namespace = "FlexFlow" -name = "LossFunction" -features = [ - "hash", - "json", - "rapidcheck", - "fmt", -] - -[[values]] -name = "CATEGORICAL_CROSSENTROPY" - -[[values]] -name = "SPARSE_CATEGORICAL_CROSSENTROPY" - -[[values]] -name = "MEAN_SQUARED_ERROR_AVG_REDUCE" - -[[values]] -name = "MEAN_SQUARED_ERROR_SUM_REDUCE" - -[[values]] -name = "IDENTITY" diff --git a/lib/op-attrs/include/op-attrs/ops/loss_functions.h b/lib/op-attrs/include/op-attrs/ops/loss_functions.h index 74d2d0a479..657f8d91dc 100644 --- a/lib/op-attrs/include/op-attrs/ops/loss_functions.h +++ b/lib/op-attrs/include/op-attrs/ops/loss_functions.h @@ -2,10 +2,10 @@ #define _FLEXFLOW_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_LOSS_FUNCTIONS_H #include "op-attrs/ops/core.h" -#include "op-attrs/ops/loss_attrs.dtg.h" -#include "op-attrs/ops/loss_function.dtg.h" -#include "op-attrs/ops/nonconfigurable_loss_attrs.dtg.h" -#include 
"op-attrs/ops/sparse_categorical_ce_loss_attrs.dtg.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" +#include "op-attrs/ops/loss_functions/loss_function.dtg.h" +#include "op-attrs/ops/loss_functions/nonconfigurable_loss_attrs.dtg.h" +#include "op-attrs/ops/loss_functions/sparse_categorical_cross_entropy_loss_attrs.dtg.h" namespace FlexFlow { diff --git a/lib/op-attrs/include/op-attrs/ops/loss_functions/loss_attrs.variant.toml b/lib/op-attrs/include/op-attrs/ops/loss_functions/loss_attrs.variant.toml index 17293095e4..943760d949 100644 --- a/lib/op-attrs/include/op-attrs/ops/loss_functions/loss_attrs.variant.toml +++ b/lib/op-attrs/include/op-attrs/ops/loss_functions/loss_attrs.variant.toml @@ -11,7 +11,7 @@ features = [ includes = [ "op-attrs/ops/loss_functions/sparse_categorical_cross_entropy_loss_attrs.dtg.h", - "op-attrs/ops/loss_functions/other_loss_attrs.dtg.h", + "op-attrs/ops/loss_functions/nonconfigurable_loss_attrs.dtg.h", ] [[values]] @@ -19,5 +19,5 @@ type = "::FlexFlow::SparseCategoricalCrossEntropyLossAttrs" key = "sparse_categorical_cross_entropy_loss" [[values]] -type = "::FlexFlow::OtherLossAttrs" -key = "other_loss" +type = "::FlexFlow::NonconfigurableLossAttrs" +key = "nonconfigurable_loss_attrs" diff --git a/lib/op-attrs/include/op-attrs/ops/loss_functions/loss_functions.h b/lib/op-attrs/include/op-attrs/ops/loss_functions/loss_functions.h deleted file mode 100644 index ca8f3e6602..0000000000 --- a/lib/op-attrs/include/op-attrs/ops/loss_functions/loss_functions.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _FLEXFLOW_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_LOSS_FUNCTIONS_H -#define _FLEXFLOW_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_LOSS_FUNCTIONS_H - -#include "op-attrs/ops/core.h" -#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" -#include "op-attrs/ops/loss_functions/loss_function.dtg.h" - -namespace FlexFlow { - -CHECK_VALID_OP_ATTR(LossAttrs); - -LossFunction parse_loss_function_name(std::string const &); - -LossFunction get_loss_function(OtherLossAttrs const &); -LossFunction get_loss_function(SparseCategoricalCrossEntropyLossAttrs const &); -LossFunction get_loss_function(LossAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/op-attrs/include/op-attrs/ops/nonconfigurable_loss_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/loss_functions/nonconfigurable_loss_attrs.struct.toml similarity index 80% rename from lib/op-attrs/include/op-attrs/ops/nonconfigurable_loss_attrs.struct.toml rename to lib/op-attrs/include/op-attrs/ops/loss_functions/nonconfigurable_loss_attrs.struct.toml index 0420e7ef7b..3fe7ac86c5 100644 --- a/lib/op-attrs/include/op-attrs/ops/nonconfigurable_loss_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/loss_functions/nonconfigurable_loss_attrs.struct.toml @@ -10,7 +10,7 @@ features = [ ] includes = [ - "op-attrs/ops/loss_function.dtg.h" + "op-attrs/ops/loss_functions/loss_function.dtg.h" ] [[fields]] diff --git a/lib/op-attrs/include/op-attrs/ops/loss_functions/other_loss_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/loss_functions/other_loss_attrs.struct.toml deleted file mode 100644 index 284a4b1d7d..0000000000 --- a/lib/op-attrs/include/op-attrs/ops/loss_functions/other_loss_attrs.struct.toml +++ /dev/null @@ -1,18 +0,0 @@ -namespace = "FlexFlow" -name = "OtherLossAttrs" -features = [ - "eq", - "ord", - "hash", - "fmt", - "rapidcheck", - "json", -] - -includes = [ - "op-attrs/ops/loss_functions/loss_function.dtg.h", -] - -[[fields]] -name = "loss_type" -type = "::FlexFlow::LossFunction" diff --git 
a/lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml deleted file mode 100644 index 21378a1154..0000000000 --- a/lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml +++ /dev/null @@ -1,14 +0,0 @@ -namespace = "FlexFlow" -name = "SparseCategoricalCrossEntropyLossAttrs" -features = [ - "eq", - "ord", - "hash", - "json", - "rapidcheck", - "fmt", -] - -[[fields]] -name = "replace_labels" -type = "bool" diff --git a/lib/op-attrs/src/op-attrs/ops/loss_functions.cc b/lib/op-attrs/src/op-attrs/ops/loss_functions.cc index a5c6aeb2a5..2b9a7533f0 100644 --- a/lib/op-attrs/src/op-attrs/ops/loss_functions.cc +++ b/lib/op-attrs/src/op-attrs/ops/loss_functions.cc @@ -1,4 +1,4 @@ -#include "op-attrs/ops/loss_functions/loss_functions.h" +#include "op-attrs/ops/loss_functions.h" #include "utils/containers/transform.h" #include "utils/exception.h" #include "utils/overload.h" From f5ff91e9757a73c94d73dddaec2243b0c46c87ec Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 1 Oct 2024 10:49:02 -0700 Subject: [PATCH 15/22] Fix input/weight differentiation --- .../local-execution/local_slots_backing.h | 2 ++ .../src/local_slots_backing.cc | 24 +++++++++---------- .../test/src/test_local_slots_backing.cc | 12 +++++++--- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h index 4c6dbacfe3..93c534e583 100644 --- a/lib/local-execution/include/local-execution/local_slots_backing.h +++ b/lib/local-execution/include/local-execution/local_slots_backing.h @@ -54,6 +54,8 @@ struct LocalSlotsBacking { TensorBackingMap gradient_tensor_mapping; std::unordered_map> input_tensor_slots; + std::unordered_map> + weight_tensor_slots; std::unordered_map> output_tensor_slots; std::unordered_map> diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index 93cfe4498c..bdbfa4f222 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -22,8 +22,10 @@ void LocalSlotsBacking::allocate_outgoing_tensors( layer_guid_t const &layer_guid, ComputationGraph const &computation_graph, Allocator &allocator) { - std::vector incoming_tensors = - get_incoming_tensors(computation_graph, layer_guid); + std::vector incoming_input_tensors = + get_incoming_inputs(computation_graph, layer_guid); + std::vector incoming_weight_tensors = + get_incoming_weights(computation_graph, layer_guid); std::vector outgoing_tensors = get_outgoing_tensors(computation_graph, layer_guid); for (tensor_guid_t const &output_tensor : outgoing_tensors) { @@ -46,7 +48,8 @@ void LocalSlotsBacking::allocate_outgoing_tensors( } } - this->input_tensor_slots.insert({layer_guid, incoming_tensors}); + this->input_tensor_slots.insert({layer_guid, incoming_input_tensors}); + this->weight_tensor_slots.insert({layer_guid, incoming_weight_tensors}); this->output_tensor_slots.insert({layer_guid, outgoing_tensors}); } @@ -100,13 +103,6 @@ GenericTensorAccessorW const & TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( OpTaskBinding const &binding, layer_guid_t const &op_guid) const { TensorSlotsBacking mapping; - int num_inputs = 0; - for (auto const &tensor_binding : binding.get_tensor_bindings()) { - if (tensor_binding.first.is_grad == IsGrad::NO && - tensor_binding.second.role 
== TensorRole::INPUT) { - num_inputs += 1; - } - } for (auto const &tensor_binding : binding.get_tensor_bindings()) { SlotGradId slot_grad_id = tensor_binding.first; @@ -115,7 +111,9 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( int weight_adjusted_idx = 0; switch (tensor_spec.role) { case TensorRole::WEIGHT: - weight_adjusted_idx = num_inputs; + assert(contains_key(this->weight_tensor_slots, op_guid)); + tensor_guids = this->weight_tensor_slots.at(op_guid); + break; case TensorRole::INPUT: assert(contains_key(this->input_tensor_slots, op_guid)); tensor_guids = this->input_tensor_slots.at(op_guid); @@ -130,8 +128,8 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( } IsGrad is_grad = slot_grad_id.is_grad; - GenericTensorAccessorW tensor_backing = this->get_tensor_backing( - tensor_guids.at(weight_adjusted_idx + tensor_spec.idx), is_grad); + GenericTensorAccessorW tensor_backing = + this->get_tensor_backing(tensor_guids.at(tensor_spec.idx), is_grad); mapping.insert({slot_grad_id, tensor_backing}); } diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc index c18108d6b4..779ba43f26 100644 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ b/lib/local-execution/test/src/test_local_slots_backing.cc @@ -157,11 +157,17 @@ TEST_SUITE(FF_TEST_SUITE) { local_slots_backing.allocate_outgoing_tensors( layer_guid, cg_builder.computation_graph, allocator); SUBCASE("Input tensor slots") { - std::vector correct_incoming_tensors = - get_incoming_tensors(cg_builder.computation_graph, layer_guid); - CHECK(correct_incoming_tensors == + std::vector correct_incoming_input_tensors = + get_incoming_inputs(cg_builder.computation_graph, layer_guid); + CHECK(correct_incoming_input_tensors == local_slots_backing.input_tensor_slots.at(layer_guid)); } + SUBCASE("Weight tensor slots") { + std::vector correct_incoming_weight_tensors = + get_incoming_weights(cg_builder.computation_graph, layer_guid); + CHECK(correct_incoming_weight_tensors == + local_slots_backing.weight_tensor_slots.at(layer_guid)); + } SUBCASE("Output tensor slots") { std::vector correct_outgoing_tensors = get_outgoing_tensors(cg_builder.computation_graph, layer_guid); From 7470e71eaa959f2304fc5e111b18f045473c3364 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 1 Oct 2024 11:53:07 -0700 Subject: [PATCH 16/22] Fix signature to use unified rep --- .../local-execution/local_slots_backing.h | 9 ++- .../non_graph_tensor_guid_t.struct.toml | 17 ++++++ .../include/local-execution/optimizer.h | 9 +-- .../task_signature.struct.toml | 5 +- .../tensor_guid_slot_spec.struct.toml | 5 -- .../tensor_guid_spec.struct.toml | 3 +- .../unified_tensor_guid.variant.toml | 21 +++++++ .../src/local_slots_backing.cc | 56 +++++++++++-------- .../src/local_training_backing.cc | 5 +- lib/local-execution/src/loss_functions.cc | 6 +- lib/local-execution/src/optimizer.cc | 22 ++++---- lib/local-execution/src/task_signature.cc | 4 +- lib/pcg/include/pcg/computation_graph.h | 3 - lib/pcg/src/pcg/computation_graph.cc | 13 ----- 14 files changed, 105 insertions(+), 73 deletions(-) create mode 100644 lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml create mode 100644 lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h index 
93c534e583..d201d3c405 100644 --- a/lib/local-execution/include/local-execution/local_slots_backing.h +++ b/lib/local-execution/include/local-execution/local_slots_backing.h @@ -4,10 +4,12 @@ #include "kernels/accessor.h" #include "local-execution/local_task_argument_accessor.h" +#include "local-execution/non_graph_tensor_guid_t.dtg.h" #include "local-execution/op_task_invocation.h" #include "local-execution/per_device_op_state.h" #include "local-execution/runtime_arg_config.h" #include "local-execution/task_invocation.dtg.h" +#include "local-execution/unified_tensor_guid.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/layer_guid_t.dtg.h" #include "pcg/tensor_guid_t.dtg.h" @@ -16,6 +18,8 @@ namespace FlexFlow { using TensorBackingMap = std::unordered_map; +using NonGraphTensorBackingMap = + std::unordered_map; struct LocalSlotsBacking { LocalSlotsBacking(TensorBackingMap const &, RuntimeArgConfig const &); @@ -42,7 +46,7 @@ struct LocalSlotsBacking { ConcreteArgSpec resolve_op_arg_ref_spec(OpArgRefSpec const &, layer_guid_t const &) const; - GenericTensorAccessorW const &get_tensor_backing(tensor_guid_t const &, + GenericTensorAccessorW const &get_tensor_backing(UnifiedTensorGuid const &, IsGrad) const; bool is_tensor_allocated(tensor_guid_t const &) const; @@ -52,13 +56,14 @@ struct LocalSlotsBacking { // tensors TensorBackingMap tensor_mapping; TensorBackingMap gradient_tensor_mapping; + NonGraphTensorBackingMap optimizer_tensor_mapping; std::unordered_map> input_tensor_slots; std::unordered_map> weight_tensor_slots; std::unordered_map> output_tensor_slots; - std::unordered_map> + std::unordered_map> weight_optimizer_tensor_guids; // arguments diff --git a/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml b/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml new file mode 100644 index 0000000000..8904c232c9 --- /dev/null +++ b/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml @@ -0,0 +1,17 @@ +namespace = "FlexFlow" +name = "non_graph_tensor_guid_t" +features = [ + "eq", + "ord", + "hash", + "fmt", + "json", +] + +includes = [ + "", +] + +[[fields]] +name = "raw_uid" +type = "size_t" diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h index 1e2cd65362..acf9b8a550 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ +#include "local-execution/non_graph_tensor_guid_t.dtg.h" #include "local-execution/task_impl_function.dtg.h" #include "local-execution/task_invocation.dtg.h" #include "local-execution/task_signature.h" @@ -14,20 +15,20 @@ TaskSignature get_update_signature(OptimizerAttrs const &); TaskInvocation get_update_invocation( OptimizerAttrs const &, tensor_guid_t const &weight, - std::vector const &grad_buffer_tensors); + std::vector const &grad_buffer_tensors); TaskImplFunction get_update_task_impl(OptimizerAttrs const &); TaskSignature get_sgd_update_signature(); TaskInvocation sgd_update(SGDOptimizerAttrs const &, tensor_guid_t const &weight, - tensor_guid_t const &sgd_v); + non_graph_tensor_guid_t const &sgd_v); TaskImplFunction get_sgd_update_task_impl(); TaskSignature get_adam_update_signature(); TaskInvocation adam_update(AdamOptimizerAttrs const &, tensor_guid_t const 
&weight, - tensor_guid_t const &adam_v, - tensor_guid_t const &adam_m); + non_graph_tensor_guid_t const &adam_v, + non_graph_tensor_guid_t const &adam_m); TaskImplFunction get_adam_update_task_impl(); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/task_signature.struct.toml b/lib/local-execution/include/local-execution/task_signature.struct.toml index fd15df91d5..ac408a7b68 100644 --- a/lib/local-execution/include/local-execution/task_signature.struct.toml +++ b/lib/local-execution/include/local-execution/task_signature.struct.toml @@ -8,15 +8,14 @@ features = [ includes = [ "local-execution/tensor_guid_slot_spec.dtg.h", + "local-execution/slot_id_t.dtg.h", "", "" ] src_includes = [ "utils/fmt/unordered_map.h", - "utils/fmt/unordered_set.h", "utils/hash/unordered_map.h", - "utils/hash/unordered_set.h", "utils/fmt/optional.h", "utils/type_index.h" ] @@ -31,4 +30,4 @@ type = "std::unordered_map<::FlexFlow::slot_id_t, std::type_index>" [[fields]] name = "tensor_guid_slots" -type = "std::unordered_set<::FlexFlow::TensorGuidSlotSpec>" +type = "std::unordered_map<::FlexFlow::slot_id_t, ::FlexFlow::TensorGuidSlotSpec>" diff --git a/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml b/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml index 4b3e5b2674..9b7e9c14f9 100644 --- a/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml +++ b/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml @@ -8,15 +8,10 @@ features = [ ] includes = [ - "local-execution/slot_id_t.dtg.h", "local-execution/slot_type.dtg.h", "local-execution/is_grad.dtg.h", ] -[[fields]] -name = "name" -type = "::FlexFlow::slot_id_t" - [[fields]] name = "slot_type" type = "::FlexFlow::SlotType" diff --git a/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml b/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml index a51d6ccf1b..1d147f60e5 100644 --- a/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml +++ b/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml @@ -10,11 +10,12 @@ features = [ includes = [ "pcg/tensor_guid_t.dtg.h", "local-execution/is_grad.dtg.h", + "local-execution/unified_tensor_guid.dtg.h" ] [[fields]] name = "tensor_guid" -type = "::FlexFlow::tensor_guid_t" +type = "::FlexFlow::UnifiedTensorGuid" [[fields]] name = "is_grad" diff --git a/lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml b/lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml new file mode 100644 index 0000000000..3d2cd8e45f --- /dev/null +++ b/lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml @@ -0,0 +1,21 @@ +namespace = "FlexFlow" +name = "UnifiedTensorGuid" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "pcg/tensor_guid_t.dtg.h", + "local-execution/non_graph_tensor_guid_t.dtg.h", +] + +[[values]] +type = "::FlexFlow::tensor_guid_t" +key = "tensor_guid" + +[[values]] +type = "::FlexFlow::non_graph_tensor_guid_t" +key = "non_graph_tensor_guid" diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index bdbfa4f222..f10b7c0126 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -60,18 +60,19 @@ void LocalSlotsBacking::allocate_optimizer_tensors( Allocator &allocator, TaskSignature const &sig) { GenericTensorAccessorW 
weight_backing =
-      get_tensor_backing(weight, IsGrad::NO);
+      get_tensor_backing(UnifiedTensorGuid{weight}, IsGrad::NO);
   int num_grad_buffer_tensors =
       sig.tensor_guid_slots.size() - 2; // ignore 2 (weight and weight_grad)
-  std::vector<tensor_guid_t> grad_buffer_tensors =
-      get_new_tensor_guids_for_layer_without_graph_insertion(
-          cg, weight_layer, num_grad_buffer_tensors);
-  for (tensor_guid_t const &tensor_guid : grad_buffer_tensors) {
+  std::vector<non_graph_tensor_guid_t> grad_buffer_tensors;
+  for (int i = 0; i < num_grad_buffer_tensors; ++i) {
+    non_graph_tensor_guid_t buffer_tensor_guid = non_graph_tensor_guid_t{i};
     GenericTensorAccessorW buffer_backing = allocator.allocate_tensor(
         get_tensor_shape(weight_backing.shape, weight_backing.data_type));
-    this->gradient_tensor_mapping.insert({tensor_guid, buffer_backing});
+    this->optimizer_tensor_mapping.insert({buffer_tensor_guid, buffer_backing});
+    grad_buffer_tensors.push_back(buffer_tensor_guid);
   }
-  this->weight_optimizer_tensor_guids.insert({weight, grad_buffer_tensors});
+  this->weight_optimizer_tensor_guids.insert(
+      {weight_layer, grad_buffer_tensors});
 }
 
 bool LocalSlotsBacking::is_tensor_allocated(
@@ -85,18 +86,26 @@ bool LocalSlotsBacking::is_gradient_tensor_allocated(
 }
 
 GenericTensorAccessorW const &
-    LocalSlotsBacking::get_tensor_backing(tensor_guid_t const &tensor_id,
+    LocalSlotsBacking::get_tensor_backing(UnifiedTensorGuid const &tensor_id,
                                           IsGrad is_grad) const {
-  switch (is_grad) {
-    case IsGrad::NO:
-      assert(contains_key(this->tensor_mapping, tensor_id));
-      return this->tensor_mapping.at(tensor_id);
-    case IsGrad::YES:
-      assert(contains_key(this->gradient_tensor_mapping, tensor_id));
-      return this->gradient_tensor_mapping.at(tensor_id);
-    default:
-      throw mk_runtime_error(fmt::format(
-          "IsGrad should only have YES or NO, received {}", is_grad));
+  if (tensor_id.has<tensor_guid_t>()) {
+    tensor_guid_t graph_tensor_guid = tensor_id.get<tensor_guid_t>();
+    switch (is_grad) {
+      case IsGrad::NO:
+        assert(contains_key(this->tensor_mapping, graph_tensor_guid));
+        return this->tensor_mapping.at(graph_tensor_guid);
+      case IsGrad::YES:
+        assert(contains_key(this->gradient_tensor_mapping, graph_tensor_guid));
+        return this->gradient_tensor_mapping.at(graph_tensor_guid);
+      default:
+        throw mk_runtime_error(fmt::format(
+            "IsGrad should only have YES or NO, received {}", is_grad));
+    }
+  } else {
+    non_graph_tensor_guid_t non_graph_tensor_guid =
+        tensor_id.get<non_graph_tensor_guid_t>();
+    assert(contains_key(this->optimizer_tensor_mapping, non_graph_tensor_guid));
+    return this->optimizer_tensor_mapping.at(non_graph_tensor_guid);
   }
 }
 
@@ -128,8 +137,8 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
     }
 
     IsGrad is_grad = slot_grad_id.is_grad;
-    GenericTensorAccessorW tensor_backing =
-        this->get_tensor_backing(tensor_guids.at(tensor_spec.idx), is_grad);
+    GenericTensorAccessorW tensor_backing = this->get_tensor_backing(
+        UnifiedTensorGuid{tensor_guids.at(tensor_spec.idx)}, is_grad);
 
     mapping.insert({slot_grad_id, tensor_backing});
   }
@@ -144,8 +153,8 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
     SlotGradId slot_grad_id = tensor_binding.first;
     TensorGuidSpec tensor_spec = tensor_binding.second;
 
-    GenericTensorAccessorW accessor =
-        this->get_tensor_backing(tensor_spec.tensor_guid, slot_grad_id.is_grad);
+    GenericTensorAccessorW accessor = this->get_tensor_backing(
+        UnifiedTensorGuid{tensor_spec.tensor_guid}, slot_grad_id.is_grad);
 
     mapping.insert({slot_grad_id, accessor});
   }
@@ -199,7 +208,8 @@ ConcreteArgSpec LocalSlotsBacking::resolve_op_arg_ref_spec(
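The has<T>()/get<T>() calls above come from the dtgen-generated variant
wrapper for UnifiedTensorGuid. For readers unfamiliar with that API, here is a
small, self-contained analogue using std::variant; the types are simplified
stand-ins for illustration, not the FlexFlow ones:

    // Editorial sketch (not part of the patch): dispatch over a tagged union
    // of graph-owned and optimizer-owned tensor ids, mirroring
    // get_tensor_backing above.
    #include <cassert>
    #include <variant>

    struct graph_id { int raw; };   // stand-in for tensor_guid_t
    struct buffer_id { int raw; };  // stand-in for non_graph_tensor_guid_t
    using unified_id = std::variant<graph_id, buffer_id>;

    int lookup(unified_id const &id) {
      if (std::holds_alternative<graph_id>(id)) { // ~ id.has<tensor_guid_t>()
        return std::get<graph_id>(id).raw;        // ~ id.get<tensor_guid_t>()
      } else {
        return std::get<buffer_id>(id).raw;       // optimizer-buffer path
      }
    }

    int main() {
      assert(lookup(unified_id{buffer_id{7}}) == 7);
    }
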
assert(input_tensor_guids.size() > index_op_arg_ref.idx); GenericTensorAccessorW tensor_backing = this->get_tensor_backing( - input_tensor_guids.at(index_op_arg_ref.idx), IsGrad::NO); + UnifiedTensorGuid{input_tensor_guids.at(index_op_arg_ref.idx)}, + IsGrad::NO); ParallelTensorShape shape = lift_to_parallel( get_tensor_shape(tensor_backing.shape, tensor_backing.data_type)); return ConcreteArgSpec::create(shape); diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index edbb377047..dafa28a70f 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -162,9 +162,8 @@ void LocalTrainingBacking::execute_update() { // get tensors tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node)); - std::vector grad_buffer_tensors = - this->local_slots_backing.weight_optimizer_tensor_guids.at( - weight_tensor); + std::vector grad_buffer_tensors = + this->local_slots_backing.weight_optimizer_tensor_guids.at(node); // get invocation TaskInvocation invocation = diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index 3a4c616377..a37c1d706b 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -35,9 +35,9 @@ TaskSignature get_loss_bwd_signature() { TaskInvocation backward(LossAttrs const &attrs, tensor_guid_t logit, tensor_guid_t label) { TaskBinding b; - b.bind(LOGIT, TensorGuidSpec{logit, IsGrad::NO}); - b.bind(LABEL, TensorGuidSpec{label, IsGrad::NO}); - b.bind(LOGIT, TensorGuidSpec{logit, IsGrad::YES}); + b.bind(LOGIT, TensorGuidSpec{UnifiedTensorGuid{logit}, IsGrad::NO}); + b.bind(LABEL, TensorGuidSpec{UnifiedTensorGuid{label}, IsGrad::NO}); + b.bind(LOGIT, TensorGuidSpec{UnifiedTensorGuid{logit}, IsGrad::YES}); b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 30f20bf8ec..1e06dee96a 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -22,12 +22,12 @@ TaskSignature get_sgd_update_signature() { TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs, tensor_guid_t const &weight, - tensor_guid_t const &sgd_v) { + non_graph_tensor_guid_t const &sgd_v) { TaskBinding b; - b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::YES}); - b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::NO}); + b.bind(WEIGHT, TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::YES}); + b.bind(WEIGHT, TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::NO}); if (attrs.momentum > 0.0f) { - b.bind(SGD_V, TensorGuidSpec{sgd_v, IsGrad::YES}); + b.bind(SGD_V, TensorGuidSpec{UnifiedTensorGuid{sgd_v}, IsGrad::YES}); } b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); @@ -111,13 +111,13 @@ TaskSignature get_adam_update_signature() { TaskInvocation adam_update(AdamOptimizerAttrs const &attrs, tensor_guid_t const &weight, - tensor_guid_t const &adam_v, - tensor_guid_t const &adam_m) { + non_graph_tensor_guid_t const &adam_v, + non_graph_tensor_guid_t const &adam_m) { TaskBinding b; - b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::YES}); - b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::NO}); - b.bind(ADAM_M, TensorGuidSpec{adam_m, IsGrad::YES}); - b.bind(ADAM_V, TensorGuidSpec{adam_v, IsGrad::YES}); + b.bind(WEIGHT, TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::YES}); + b.bind(WEIGHT, 
TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::NO});
+  b.bind(ADAM_M, TensorGuidSpec{UnifiedTensorGuid{adam_m}, IsGrad::YES});
+  b.bind(ADAM_V, TensorGuidSpec{UnifiedTensorGuid{adam_v}, IsGrad::YES});
 
   b.bind_arg(ATTRS, attrs);
   b.bind_arg(PROFILING, profiling_settings());
@@ -192,7 +192,7 @@ TaskSignature get_update_signature(OptimizerAttrs const &attrs) {
 TaskInvocation get_update_invocation(
     OptimizerAttrs const &attrs,
     tensor_guid_t const &weight,
-    std::vector<tensor_guid_t> const &grad_buffer_tensors) {
+    std::vector<non_graph_tensor_guid_t> const &grad_buffer_tensors) {
   return attrs.visit(overload{
       [&](SGDOptimizerAttrs const &s) {
         return sgd_update(s, weight, grad_buffer_tensors.at(0));
diff --git a/lib/local-execution/src/task_signature.cc b/lib/local-execution/src/task_signature.cc
index 3bba9e2c8a..27bcbcd266 100644
--- a/lib/local-execution/src/task_signature.cc
+++ b/lib/local-execution/src/task_signature.cc
@@ -18,8 +18,8 @@ void add_slot(TaskSignature &task_signature,
               IsGrad is_grad,
               SlotType slot_type) {
   TensorGuidSlotSpec tensor_guid_slot_spec =
-      TensorGuidSlotSpec{name, slot_type, is_grad};
-  task_signature.tensor_guid_slots.insert(tensor_guid_slot_spec);
+      TensorGuidSlotSpec{slot_type, is_grad};
+  task_signature.tensor_guid_slots.insert({name, tensor_guid_slot_spec});
 }
 
 } // namespace FlexFlow
diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h
index 32ed0e3025..f70d9f7404 100644
--- a/lib/pcg/include/pcg/computation_graph.h
+++ b/lib/pcg/include/pcg/computation_graph.h
@@ -52,9 +52,6 @@ LayerAttrs get_layer_attrs(ComputationGraph const &cg, layer_guid_t const &n);
 layer_guid_t get_layer_by_name(ComputationGraph const &cg,
                                std::string const &name);
 
-std::vector<tensor_guid_t>
-    get_new_tensor_guids_for_layer_without_graph_insertion(
-        ComputationGraph const &, layer_guid_t const &n, int num_tensors);
 std::string as_dot(ComputationGraph const &);
 void debug_print_dot(ComputationGraph const &);
 
diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc
index 6f6c10d798..a69e54fd93 100644
--- a/lib/pcg/src/pcg/computation_graph.cc
+++ b/lib/pcg/src/pcg/computation_graph.cc
@@ -175,19 +175,6 @@ layer_guid_t get_layer_by_name(ComputationGraph const &cg,
   return get_only(found);
 }
 
-std::vector<tensor_guid_t>
-    get_new_tensor_guids_for_layer_without_graph_insertion(
-        ComputationGraph const &cg, layer_guid_t const &n, int num_tensors) {
-  std::vector<tensor_guid_t> new_tensor_guids;
-  int num_outgoing_tensors = get_outgoing_tensors(cg, n).size();
-
-  for (int i = 0; i < num_tensors; ++i) {
-    new_tensor_guids.push_back(
-        tensor_guid_t{DataflowOutput{n.raw_node, num_outgoing_tensors + i}});
-  }
-  return new_tensor_guids;
-}
-
 std::string as_dot(ComputationGraph const &cg) {
   std::function<std::string(LayerAttrs const &)> get_node_label =
       [](LayerAttrs const &a) -> std::string {
From deece1be7eae96ef4604679a13c2ec58207632e3 Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Tue, 1 Oct 2024 12:39:29 -0700
Subject: [PATCH 17/22] Fix model training instance abstraction

---
 .../local-execution/local_training_backing.h  |  5 +++-
 .../local-execution/model_training_instance.h | 13 --------
 .../model_training_instance.struct.toml       |  7 +----
 .../non_graph_tensor_guid_t.struct.toml       |  6 +---
 .../src/local_cost_estimator.cc               |  4 ++-
 .../src/local_training_backing.cc             | 22 +++++++-------
 .../src/model_training_instance.cc            | 30 -------------------
 lib/local-execution/test/src/test_loss_e2e.cc | 24 +++++++--------
 .../test/src/test_update_e2e.cc               | 18 +++++------
 lib/pcg/include/pcg/optimizer_attrs.h         | 13 ++++++++
lib/pcg/src/pcg/optimizer_attrs.cc | 24 +++++++++++++++ 11 files changed, 79 insertions(+), 87 deletions(-) delete mode 100644 lib/local-execution/include/local-execution/model_training_instance.h delete mode 100644 lib/local-execution/src/model_training_instance.cc create mode 100644 lib/pcg/include/pcg/optimizer_attrs.h create mode 100644 lib/pcg/src/pcg/optimizer_attrs.cc diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index 08a458cb7f..2313d55732 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -5,6 +5,7 @@ #include "local-execution/model_training_instance.dtg.h" #include "local-execution/task_registry.h" #include "pcg/computation_graph.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" namespace FlexFlow { @@ -16,7 +17,8 @@ struct LocalTrainingBacking { ComputationGraph const &, TensorBackingMap const &, RuntimeArgConfig const &, - std::optional &); + std::optional const &, + std::optional const &); void execute_init(); PerLayerElapsedTime execute_forward(); @@ -38,6 +40,7 @@ struct LocalTrainingBacking { TaskRegistry task_registry; LocalSlotsBacking local_slots_backing; std::optional training_instance; + std::optional optimizer_attrs; }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h deleted file mode 100644 index afc8fa7472..0000000000 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ /dev/null @@ -1,13 +0,0 @@ - -#ifndef _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H -#define _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H - -#include "local-execution/model_training_instance.dtg.h" - -namespace FlexFlow { - -ModelTrainingInstance next(ModelTrainingInstance const &old); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/model_training_instance.struct.toml b/lib/local-execution/include/local-execution/model_training_instance.struct.toml index 28282e21c0..dcfaf2175d 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.struct.toml +++ b/lib/local-execution/include/local-execution/model_training_instance.struct.toml @@ -9,8 +9,7 @@ features = [ includes = [ "op-attrs/ops/loss_functions/loss_attrs.dtg.h", - "pcg/tensor_guid_t.dtg.h", - "pcg/optimizer_attrs.dtg.h", + "pcg/tensor_guid_t.dtg.h" ] [[fields]] @@ -24,7 +23,3 @@ type = "::FlexFlow::tensor_guid_t" [[fields]] name = "logit_tensor" type = "::FlexFlow::tensor_guid_t" - -[[fields]] -name = "optimizer_attrs" -type = "::FlexFlow::OptimizerAttrs" diff --git a/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml b/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml index 8904c232c9..4832ecaafa 100644 --- a/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml +++ b/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml @@ -8,10 +8,6 @@ features = [ "json", ] -includes = [ - "", -] - [[fields]] name = "raw_uid" -type = "size_t" +type = "int" diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index f153db3240..186c2d516a 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ 
b/lib/local-execution/src/local_cost_estimator.cc @@ -76,11 +76,13 @@ CostDetails LocalCostEstimator::estimate_cost( get_vector_piece_attrs(outputs)); std::optional model_training_instance = std::nullopt; + std::optional optimizer_attrs = std::nullopt; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, this->runtime_arg_config, - model_training_instance); + model_training_instance, + optimizer_attrs); local_backing.execute_init(); PerLayerElapsedTime fwd = local_backing.execute_forward(); diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index dafa28a70f..46a8f83709 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -1,10 +1,10 @@ #include "local-execution/local_training_backing.h" #include "local-execution/loss_functions.h" -#include "local-execution/model_training_instance.h" #include "local-execution/optimizer.h" #include "local-execution/task_invocation.h" #include "local-execution/task_signature_impl.h" #include "pcg/computation_graph.h" +#include "pcg/optimizer_attrs.h" #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" @@ -18,11 +18,12 @@ LocalTrainingBacking::LocalTrainingBacking( ComputationGraph const &computation_graph, TensorBackingMap const &tensor_backing_mapping, RuntimeArgConfig const &runtime_arg_config, - std::optional &training_instance) + std::optional const &training_instance, + std::optional const &optimizer_attrs) : allocator(allocator), computation_graph(computation_graph), local_slots_backing(tensor_backing_mapping, runtime_arg_config), task_registry(empty_task_registry()), - training_instance(training_instance) { + training_instance(training_instance), optimizer_attrs(optimizer_attrs) { for (layer_guid_t const &node : topological_ordering(this->computation_graph)) { @@ -38,8 +39,8 @@ LocalTrainingBacking::LocalTrainingBacking( // allocate optimizer buffers if (attrs.has() && this->training_instance.has_value()) { - OptimizerAttrs attrs = this->training_instance.value().optimizer_attrs; - TaskSignature sig = get_update_signature(attrs); + assert(this->optimizer_attrs.has_value()); + TaskSignature sig = get_update_signature(this->optimizer_attrs.value()); tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node)); this->local_slots_backing.allocate_optimizer_tensors( @@ -153,7 +154,7 @@ PerLayerElapsedTime LocalTrainingBacking::execute_backward() { void LocalTrainingBacking::execute_update() { assert(this->training_instance.has_value()); - OptimizerAttrs attrs = this->training_instance.value().optimizer_attrs; + assert(this->optimizer_attrs.has_value()); for (layer_guid_t const &node : topological_ordering(this->computation_graph)) { @@ -166,18 +167,19 @@ void LocalTrainingBacking::execute_update() { this->local_slots_backing.weight_optimizer_tensor_guids.at(node); // get invocation - TaskInvocation invocation = - get_update_invocation(attrs, weight_tensor, grad_buffer_tensors); + TaskInvocation invocation = get_update_invocation( + this->optimizer_attrs.value(), weight_tensor, grad_buffer_tensors); // assert(is_invocation_valid(get_update_signature(attrs), invocation)); // execute update TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation); - TaskImplFunction update_impl_fn = get_update_task_impl(attrs); + TaskImplFunction update_impl_fn = + 
get_update_task_impl(this->optimizer_attrs.value()); update_impl_fn.get().function_ptr(accessor); } } - this->training_instance = next(this->training_instance.value()); + this->optimizer_attrs = next(this->optimizer_attrs.value()); } TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor( diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc deleted file mode 100644 index c626bfc0e0..0000000000 --- a/lib/local-execution/src/model_training_instance.cc +++ /dev/null @@ -1,30 +0,0 @@ -#include "local-execution/model_training_instance.h" - -namespace FlexFlow { - -ModelTrainingInstance next(ModelTrainingInstance const &old_training_instance) { - if (old_training_instance.optimizer_attrs.has()) { - AdamOptimizerAttrs old = - old_training_instance.optimizer_attrs.get(); - double new_beta1_t = old.beta_t * old.beta1; - double new_beta2_t = old.beta2_t * old.beta2; - double new_alpha_t = old.alpha * sqrt(1 - new_beta2_t) / (1 - new_beta1_t); - OptimizerAttrs new_attrs = - OptimizerAttrs{AdamOptimizerAttrs{old.alpha, - old.beta1, - old.beta2, - old.weight_decay, - new_alpha_t, - new_beta1_t, - new_beta2_t, - old.epsilon}}; - return ModelTrainingInstance{old_training_instance.loss_attrs, - old_training_instance.label_tensor, - old_training_instance.logit_tensor, - new_attrs}; - } else { - return old_training_instance; - } -} - -} // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc index 4801aff6a9..72df1a08f1 100644 --- a/lib/local-execution/test/src/test_loss_e2e.cc +++ b/lib/local-execution/test/src/test_loss_e2e.cc @@ -59,13 +59,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LossAttrs{SparseCategoricalCrossEntropyLossAttrs{ /*replace_labels=*/false}}, label_tensor, - logit_tensor, - optimizer_attrs}; + logit_tensor}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, - model_training_instance); + model_training_instance, + optimizer_attrs); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); @@ -83,13 +83,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ModelTrainingInstance{LossAttrs{NonconfigurableLossAttrs{ LossFunction::CATEGORICAL_CROSSENTROPY}}, label_tensor, - logit_tensor, - optimizer_attrs}; + logit_tensor}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, - model_training_instance); + model_training_instance, + optimizer_attrs); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); @@ -101,13 +101,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LossAttrs{NonconfigurableLossAttrs{ LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, label_tensor, - logit_tensor, - optimizer_attrs}; + logit_tensor}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, - model_training_instance); + model_training_instance, + optimizer_attrs); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); @@ -118,13 +118,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ModelTrainingInstance{ LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}}, label_tensor, - logit_tensor, - optimizer_attrs}; + logit_tensor}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, - model_training_instance); + model_training_instance, + 
optimizer_attrs); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc index af4303fab8..96b748806f 100644 --- a/lib/local-execution/test/src/test_update_e2e.cc +++ b/lib/local-execution/test/src/test_update_e2e.cc @@ -58,13 +58,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LossAttrs{NonconfigurableLossAttrs{ LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, label_tensor, - logit_tensor, - optimizer_attrs}; + logit_tensor}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, - model_training_instance); + model_training_instance, + optimizer_attrs); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); @@ -81,13 +81,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LossAttrs{NonconfigurableLossAttrs{ LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, label_tensor, - logit_tensor, - optimizer_attrs}; + logit_tensor}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, - model_training_instance); + model_training_instance, + optimizer_attrs); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); @@ -109,13 +109,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LossAttrs{NonconfigurableLossAttrs{ LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, label_tensor, - logit_tensor, - optimizer_attrs}; + logit_tensor}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, - model_training_instance); + model_training_instance, + optimizer_attrs); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h new file mode 100644 index 0000000000..4b78f66fe4 --- /dev/null +++ b/lib/pcg/include/pcg/optimizer_attrs.h @@ -0,0 +1,13 @@ + +#ifndef _FLEXFLOW_PCG_OPTIMIZER_ATTRS_H +#define _FLEXFLOW_PCG_OPTIMIZER_ATTRS_H + +#include "pcg/optimizer_attrs.dtg.h" + +namespace FlexFlow { + +OptimizerAttrs next(OptimizerAttrs const &old); + +} // namespace FlexFlow + +#endif diff --git a/lib/pcg/src/pcg/optimizer_attrs.cc b/lib/pcg/src/pcg/optimizer_attrs.cc new file mode 100644 index 0000000000..5307450a68 --- /dev/null +++ b/lib/pcg/src/pcg/optimizer_attrs.cc @@ -0,0 +1,24 @@ +#include "pcg/optimizer_attrs.h" + +namespace FlexFlow { + +OptimizerAttrs next(OptimizerAttrs const &old_attrs) { + if (old_attrs.has()) { + AdamOptimizerAttrs old = old_attrs.get(); + double new_beta1_t = old.beta_t * old.beta1; + double new_beta2_t = old.beta2_t * old.beta2; + double new_alpha_t = old.alpha * sqrt(1 - new_beta2_t) / (1 - new_beta1_t); + return OptimizerAttrs{AdamOptimizerAttrs{old.alpha, + old.beta1, + old.beta2, + old.weight_decay, + new_alpha_t, + new_beta1_t, + new_beta2_t, + old.epsilon}}; + } else { + return old_attrs; + } +} + +} // namespace FlexFlow From 1d3cc9498fb5afe5e9f1b0aa1e50260a58e1c424 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 1 Oct 2024 13:10:28 -0700 Subject: [PATCH 18/22] Change subcase test name --- lib/local-execution/test/src/test_loss_e2e.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc index 72df1a08f1..37024adc26 100644 --- 
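The next() function above advances the Adam bias-correction state: beta_t and
beta2_t accumulate powers of beta1 and beta2, and alpha_t folds the correction
sqrt(1 - beta2_t) / (1 - beta1_t) into the step size. A self-contained numeric
check of that recurrence (local variable names, not the FlexFlow fields):

    // Editorial sketch (not part of the patch): the Adam step-size schedule
    // computed by next() / get_next_iteration_optimizer_attrs, in isolation.
    #include <cmath>
    #include <cstdio>

    int main() {
      double const alpha = 0.001, beta1 = 0.9, beta2 = 0.999;
      double beta1_t = beta1, beta2_t = beta2; // running products after step 1
      for (int step = 2; step <= 4; ++step) {
        beta1_t *= beta1; // new_beta1_t = old beta_t * beta1
        beta2_t *= beta2; // new_beta2_t = old beta2_t * beta2
        double alpha_t = alpha * std::sqrt(1 - beta2_t) / (1 - beta1_t);
        std::printf("step %d: alpha_t = %.6f\n", step, alpha_t);
      }
    }
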
a/lib/local-execution/test/src/test_loss_e2e.cc +++ b/lib/local-execution/test/src/test_loss_e2e.cc @@ -71,7 +71,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { local_backing.execute_backward(); } - SUBCASE("OtherAttrs") { + SUBCASE("NonconfigurableLossAttrs") { tensor_guid_t label_tensor = cg_builder.create_input(input_shape, CreateGrad::NO); GenericTensorAccessorW label_backing = From 3cf5d08fb3b56f0e70145179c5dfd72eacd3cc2e Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 16 Oct 2024 12:34:59 -0700 Subject: [PATCH 19/22] Quick fixes --- lib/kernels/include/kernels/array_shape.h | 8 ++++---- lib/kernels/include/kernels/legion_dim.h | 3 +++ lib/kernels/src/allocation.cc | 2 +- lib/kernels/src/array_shape.cc | 7 +++++-- lib/kernels/src/legion_dim.cc | 9 +++++++++ .../include/local-execution/arg_ref.h | 17 ++++++++++++++-- .../include/local-execution/concrete_arg.h | 14 +++++++++++++ .../include/local-execution/runtime_arg_ref.h | 16 +++++++++++++++ .../task_arg_spec.variant.toml | 4 +++- .../include/local-execution/task_binding.h | 18 +++++++++++++++++ .../task_invocation.struct.toml | 4 +++- .../src/local_training_backing.cc | 4 ++-- lib/local-execution/src/ops/element_unary.cc | 6 ++++-- lib/local-execution/src/runtime_arg_ref.cc | 13 ++++++++++++ lib/local-execution/src/task_binding.cc | 13 ++++++++++++ .../test/src/test_local_cost_estimator.cc | 20 ++++++++++++------- lib/pcg/include/pcg/optimizer_attrs.h | 2 +- .../parallel_computation_graph_builder.h | 4 ++++ lib/pcg/src/pcg/optimizer_attrs.cc | 3 ++- .../parallel_computation_graph_builder.cc | 2 +- 20 files changed, 144 insertions(+), 25 deletions(-) diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index e60f0cd9c1..fd66697793 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -14,10 +14,10 @@ namespace FlexFlow { struct ArrayShape { public: ArrayShape() = delete; - ArrayShape(size_t *dims, size_t num_dims); - ArrayShape(TensorShape const &shape); - ArrayShape(std::vector const &); - ArrayShape(LegionTensorDims const &); + explicit ArrayShape(size_t *dims, size_t num_dims); + explicit ArrayShape(TensorShape const &shape); + explicit ArrayShape(std::vector const &); + explicit ArrayShape(LegionTensorDims const &); /** * @brief Alias of ArrayShape::num_elements for compatibility with diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h index e4dd9723b8..29c5e29a93 100644 --- a/lib/kernels/include/kernels/legion_dim.h +++ b/lib/kernels/include/kernels/legion_dim.h @@ -10,6 +10,9 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value); legion_dim_t legion_dim_from_ff_dim(ff_dim_t, int num_dimensions); +std::optional legion_dim_from_ff_dim(std::optional, + int num_dimensions); + template using LegionOrdered = DimOrdered; diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index ccd88580db..b57fbee257 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -14,7 +14,7 @@ void Allocator::deallocate(void *ptr) { GenericTensorAccessorW Allocator::allocate_tensor(TensorShape const &tensor_shape) { void *ptr = this->allocate(get_size_in_bytes(tensor_shape)); - return {tensor_shape.data_type, tensor_shape, ptr}; + return {tensor_shape.data_type, ArrayShape{tensor_shape}, ptr}; } } // namespace FlexFlow diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 8464212290..31ee7b6001 100644 --- 
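The explicit qualifiers added to the ArrayShape constructors above disable
silent conversions such as the TensorShape-to-ArrayShape one patched in
allocation.cc. A minimal stand-alone illustration of the effect, using
simplified stand-in types rather than the kernels library's own:

    // Editorial sketch (not part of the patch): with an implicit constructor,
    // a raw dimension vector converts silently at call sites; once the
    // constructor is explicit, the conversion must be spelled out.
    #include <vector>

    struct Shape {
      explicit Shape(std::vector<int> const &dims) : dims(dims) {}
      std::vector<int> dims;
    };

    int volume(Shape const &s) {
      int v = 1;
      for (int d : s.dims) {
        v *= d;
      }
      return v;
    }

    int main() {
      std::vector<int> dims = {2, 3};
      // volume(dims);          // ill-formed now that the constructor is explicit
      return volume(Shape{dims}); // the conversion is visible at the call site
    }
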
a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -61,8 +61,11 @@ ArrayShape ArrayShape::sub_shape(legion_dim_t start, ff_dim_t end) const { ArrayShape ArrayShape::sub_shape(std::optional start, std::optional end) const { - return ArrayShape{legion_dims_from_ff_dims( - slice(ff_ordered_from_legion_ordered(this->dims), start, end))}; + std::optional legion_start = + legion_dim_from_ff_dim(start, num_dims()); + std::optional legion_end = + legion_dim_from_ff_dim(end, num_dims()); + return this->sub_shape(legion_start, legion_end); } ArrayShape ArrayShape::sub_shape(std::optional start, diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc index 9ef47d40ae..c190a02220 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/legion_dim.cc @@ -10,4 +10,13 @@ legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, int num_dimensions) { return legion_dim_t(num_dimensions - ff_dim.value - 1); } +std::optional + legion_dim_from_ff_dim(std::optional ff_dim, int num_dimensions) { + if (ff_dim.has_value()) { + return legion_dim_from_ff_dim(ff_dim.value(), num_dimensions); + } else { + return std::nullopt; + } +} + } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/arg_ref.h b/lib/local-execution/include/local-execution/arg_ref.h index 30326b0e84..30da405c13 100644 --- a/lib/local-execution/include/local-execution/arg_ref.h +++ b/lib/local-execution/include/local-execution/arg_ref.h @@ -60,6 +60,20 @@ struct ArgRefSpec { friend struct std::hash>; }; +template +std::string format_as(ArgRefSpec const &x) { + std::ostringstream oss; + oss << ""; + return oss.str(); +} + +template +std::ostream &operator<<(std::ostream &s, ArgRefSpec const &x) { + return (s << fmt::to_string(x)); +} + } // namespace FlexFlow namespace std { @@ -68,8 +82,7 @@ template struct hash<::FlexFlow::ArgRefSpec> { size_t operator()(::FlexFlow::ArgRefSpec const &s) const { size_t result = 0; - hash_combine(s.type_idx); - hash_combine(s.ref_type); + ::FlexFlow::hash_combine(result, s.type_idx); return result; } }; diff --git a/lib/local-execution/include/local-execution/concrete_arg.h b/lib/local-execution/include/local-execution/concrete_arg.h index 3bc2714a71..ac5d97f3c4 100644 --- a/lib/local-execution/include/local-execution/concrete_arg.h +++ b/lib/local-execution/include/local-execution/concrete_arg.h @@ -3,6 +3,7 @@ #include "fmt/format.h" #include "local-execution/serialization.h" +#include "utils/hash-utils.h" #include "utils/type_index.h" #include @@ -53,4 +54,17 @@ std::ostream &operator<<(std::ostream &, ConcreteArgSpec const &); } // namespace FlexFlow +namespace std { + +template <> +struct hash<::FlexFlow::ConcreteArgSpec> { + size_t operator()(::FlexFlow::ConcreteArgSpec const &s) const { + size_t result = 0; + ::FlexFlow::hash_combine(result, s.get_type_index()); + return result; + } +}; + +} // namespace std + #endif diff --git a/lib/local-execution/include/local-execution/runtime_arg_ref.h b/lib/local-execution/include/local-execution/runtime_arg_ref.h index 279d854a27..fd79e23126 100644 --- a/lib/local-execution/include/local-execution/runtime_arg_ref.h +++ b/lib/local-execution/include/local-execution/runtime_arg_ref.h @@ -5,6 +5,8 @@ #include "local-execution/config.h" #include "local-execution/device_specific.h" #include "local-execution/profiling.h" +#include "utils/fmt.h" +#include "utils/type_index.h" namespace FlexFlow { @@ -14,6 +16,8 @@ enum class RuntimeArgRefType { FF_ITERATION_CONFIG }; +std::string 
to_string(RuntimeArgRefType const &); + template using RuntimeArgRef = ArgRef; @@ -23,6 +27,18 @@ RuntimeArgRef profiling_settings(); RuntimeArgRef> ff_handle(); RuntimeArgRef iteration_config(); +// std::string format_as(RuntimeArgRefSpec const & x) { +// std::ostringstream oss; +// oss << ""; +// return oss.str(); +// } + +// std::ostream &operator<<(std::ostream & s, RuntimeArgRefSpec const & x) { +// return (s << fmt::to_string(x)); +// } + } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/task_arg_spec.variant.toml b/lib/local-execution/include/local-execution/task_arg_spec.variant.toml index a6df0c8a7d..271e3b73d6 100644 --- a/lib/local-execution/include/local-execution/task_arg_spec.variant.toml +++ b/lib/local-execution/include/local-execution/task_arg_spec.variant.toml @@ -1,7 +1,9 @@ namespace = "FlexFlow" name = "TaskArgSpec" features = [ - "eq" + "eq", + "fmt", + "hash" ] includes = [ diff --git a/lib/local-execution/include/local-execution/task_binding.h b/lib/local-execution/include/local-execution/task_binding.h index cbe210f438..96c96473e4 100644 --- a/lib/local-execution/include/local-execution/task_binding.h +++ b/lib/local-execution/include/local-execution/task_binding.h @@ -7,6 +7,7 @@ #include "local-execution/task_id_t.dtg.h" #include "local-execution/task_signature.dtg.h" #include "local-execution/tensor_guid_spec.dtg.h" +#include "utils/hash/unordered_map.h" namespace FlexFlow { @@ -53,6 +54,23 @@ struct TaskBinding { tie() const; }; +std::string format_as(TaskBinding const &x); +std::ostream &operator<<(std::ostream &s, TaskBinding const &x); + } // namespace FlexFlow +namespace std { + +template <> +struct hash<::FlexFlow::TaskBinding> { + size_t operator()(::FlexFlow::TaskBinding const &s) const { + size_t result = 0; + hash_combine(result, s.get_tensor_bindings()); + hash_combine(result, s.get_arg_bindings()); + return result; + } +}; + +} // namespace std + #endif diff --git a/lib/local-execution/include/local-execution/task_invocation.struct.toml b/lib/local-execution/include/local-execution/task_invocation.struct.toml index abcaabda93..c9e1e22ba1 100644 --- a/lib/local-execution/include/local-execution/task_invocation.struct.toml +++ b/lib/local-execution/include/local-execution/task_invocation.struct.toml @@ -1,7 +1,9 @@ namespace = "FlexFlow" name = "TaskInvocation" features = [ - "eq" + "eq", + "fmt", + "hash" ] includes = [ diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 46a8f83709..b7631470b7 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -39,7 +39,6 @@ LocalTrainingBacking::LocalTrainingBacking( // allocate optimizer buffers if (attrs.has() && this->training_instance.has_value()) { - assert(this->optimizer_attrs.has_value()); TaskSignature sig = get_update_signature(this->optimizer_attrs.value()); tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node)); @@ -179,7 +178,8 @@ void LocalTrainingBacking::execute_update() { } } - this->optimizer_attrs = next(this->optimizer_attrs.value()); + this->optimizer_attrs = + get_next_iteration_optimizer_attrs(this->optimizer_attrs.value()); } TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor( diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc index dbbfad10fb..ccb41d7461 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ 
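The corrected ArgRefSpec hasher earlier in this patch threads an accumulator
through hash_combine, which the old code discarded; the TaskBinding hasher
above follows the same idiom. A self-contained sketch, using a boost-style
combiner that is only assumed to match utils/hash-utils.h in spirit:

    // Editorial sketch (not part of the patch): each hash_combine call mixes
    // one field into the running seed, so field order affects the result.
    #include <cstddef>
    #include <functional>
    #include <string>

    template <typename T>
    void hash_combine(std::size_t &seed, T const &v) {
      seed ^= std::hash<T>{}(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
    }

    std::size_t hash_fields(std::string const &a, int b) {
      std::size_t result = 0;
      hash_combine(result, a); // mixes into `result`, unlike the dropped-return bug
      hash_combine(result, b);
      return result;
    }
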
b/lib/local-execution/src/ops/element_unary.cc @@ -61,8 +61,10 @@ static DeviceSpecificDeviceStates ParallelTensorShape output_shape = throw_if_unexpected(get_output_shape(attrs, input_shape)); - ElementUnaryPerDeviceState per_device_state = init_kernel( - get_piece_shape(input_shape), get_piece_shape(output_shape), attrs); + ElementUnaryPerDeviceState per_device_state = + init_kernel(ArrayShape{get_piece_shape(input_shape)}, + ArrayShape{get_piece_shape(output_shape)}, + attrs); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; diff --git a/lib/local-execution/src/runtime_arg_ref.cc b/lib/local-execution/src/runtime_arg_ref.cc index 56201a5c55..1f591b4d82 100644 --- a/lib/local-execution/src/runtime_arg_ref.cc +++ b/lib/local-execution/src/runtime_arg_ref.cc @@ -3,6 +3,19 @@ namespace FlexFlow { +std::string to_string(RuntimeArgRefType const &runtime_arg_ref_type) { + switch (runtime_arg_ref_type) { + case RuntimeArgRefType::FF_HANDLE: + return "FF_HANDLE"; + case RuntimeArgRefType::PROFILING_SETTINGS: + return "PROFILING_SETTINGS"; + case RuntimeArgRefType::FF_ITERATION_CONFIG: + return "FF_ITERATION_CONFIG"; + default: + return "Unknown"; + } +} + RuntimeArgRef profiling_settings() { return {RuntimeArgRefType::PROFILING_SETTINGS}; } diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc index a5a3b2dc34..45d9d0cdb9 100644 --- a/lib/local-execution/src/task_binding.cc +++ b/lib/local-execution/src/task_binding.cc @@ -1,5 +1,6 @@ #include "local-execution/task_binding.h" #include "utils/containers/contains_key.h" +#include "utils/fmt/unordered_map.h" namespace FlexFlow { @@ -41,4 +42,16 @@ std::unordered_map const & return this->arg_bindings; } +std::string format_as(TaskBinding const &x) { + std::ostringstream oss; + oss << " weights; ParallelTensorShape weights_shape = throw_if_unexpected( get_weights_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); - ParallelTensorAttrs weight_attrs = - ParallelTensorAttrs{weights_shape, - /*sync_type=*/std::nullopt, - /*initializer=*/std::nullopt, - CreateGrad::YES}; + weights.push_back(make_weight_attrs(weights_shape, std::nullopt)); + ParallelTensorShape input_bias_shape = + throw_if_unexpected(get_input_bias_shape( + attrs, inputs_shape, inputs_shape, inputs_shape)); + weights.push_back(make_weight_attrs(input_bias_shape, std::nullopt)); + ParallelTensorShape output_bias_shape = + throw_if_unexpected(get_output_bias_shape( + attrs, inputs_shape, inputs_shape, inputs_shape)); + weights.push_back(make_weight_attrs(output_bias_shape, std::nullopt)); ParallelTensorShape output_shape = throw_if_unexpected( get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); @@ -66,7 +72,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { PCGOperatorAttrs{attrs}, std::vector{ inputs_shape, inputs_shape, inputs_shape}, - std::vector{weight_attrs}, + weights, std::vector{output_attrs}, make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1})); diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h index 4b78f66fe4..d4abd1b52f 100644 --- a/lib/pcg/include/pcg/optimizer_attrs.h +++ b/lib/pcg/include/pcg/optimizer_attrs.h @@ -6,7 +6,7 @@ namespace FlexFlow { -OptimizerAttrs next(OptimizerAttrs const &old); +OptimizerAttrs get_next_iteration_optimizer_attrs(OptimizerAttrs const &old); } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h 
b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h index 019b120936..35113553f2 100644 --- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h +++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h @@ -179,6 +179,10 @@ struct ParallelComputationGraphBuilder { ParallelComputationGraph pcg; }; +ParallelTensorAttrs + make_weight_attrs(ParallelTensorShape const &shape, + std::optional const &initializer_attrs); + } // namespace FlexFlow #endif diff --git a/lib/pcg/src/pcg/optimizer_attrs.cc b/lib/pcg/src/pcg/optimizer_attrs.cc index 5307450a68..8d66f7af7e 100644 --- a/lib/pcg/src/pcg/optimizer_attrs.cc +++ b/lib/pcg/src/pcg/optimizer_attrs.cc @@ -2,7 +2,8 @@ namespace FlexFlow { -OptimizerAttrs next(OptimizerAttrs const &old_attrs) { +OptimizerAttrs + get_next_iteration_optimizer_attrs(OptimizerAttrs const &old_attrs) { if (old_attrs.has()) { AdamOptimizerAttrs old = old_attrs.get(); double new_beta1_t = old.beta_t * old.beta1; diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index ce00ea62f4..b56156fe8a 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -20,7 +20,7 @@ static std::string get_default_name(PCGOperatorAttrs const &attrs) { return get_default_name(get_op_type(attrs)); } -static ParallelTensorAttrs make_weight_attrs( +ParallelTensorAttrs make_weight_attrs( ParallelTensorShape const &shape, std::optional const &initializer_attrs) { return ParallelTensorAttrs{ From 79ef4c964fa4abebf9813166353ecce230b83c75 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 22 Oct 2024 08:55:37 -0700 Subject: [PATCH 20/22] Refactor training backing and instance --- .../local-execution/local_slots_backing.h | 13 +- .../local-execution/local_training_backing.h | 26 +- .../local-execution/model_training_instance.h | 39 +++ .../model_training_instance.struct.toml | 25 -- .../include/local-execution/task_registry.h | 5 + .../src/local_cost_estimator.cc | 39 ++- .../src/local_slots_backing.cc | 64 +++-- .../src/local_training_backing.cc | 224 ++++++++---------- .../src/model_training_instance.cc | 64 +++++ lib/local-execution/src/task_registry.cc | 24 +- .../test/src/test_local_slots_backing.cc | 32 ++- lib/local-execution/test/src/test_loss_e2e.cc | 96 +++----- .../test/src/test_update_e2e.cc | 77 ++---- .../include/pcg/computation_graph_builder.h | 7 + lib/pcg/src/pcg/computation_graph_builder.cc | 14 +- 15 files changed, 402 insertions(+), 347 deletions(-) create mode 100644 lib/local-execution/include/local-execution/model_training_instance.h delete mode 100644 lib/local-execution/include/local-execution/model_training_instance.struct.toml create mode 100644 lib/local-execution/src/model_training_instance.cc diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h index d201d3c405..46e66e97a2 100644 --- a/lib/local-execution/include/local-execution/local_slots_backing.h +++ b/lib/local-execution/include/local-execution/local_slots_backing.h @@ -9,6 +9,7 @@ #include "local-execution/per_device_op_state.h" #include "local-execution/runtime_arg_config.h" #include "local-execution/task_invocation.dtg.h" +#include "local-execution/tensor_role.dtg.h" 
#include "local-execution/unified_tensor_guid.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/layer_guid_t.dtg.h" @@ -27,9 +28,15 @@ struct LocalSlotsBacking { public: void add_per_device_op_state(layer_guid_t const &, DeviceSpecificDeviceStates const &); - void allocate_outgoing_tensors(layer_guid_t const &, - ComputationGraph const &, - Allocator &); + void insert_into_tensor_mapping(tensor_guid_t const &, + GenericTensorAccessorW const &); + void allocate_layer_tensors(layer_guid_t const &, + ComputationGraph const &, + Allocator &); + void allocate_tensors_by_role(TensorRole const &, + layer_guid_t const &, + ComputationGraph const &, + Allocator &); void allocate_optimizer_tensors(layer_guid_t const &weight_layer, tensor_guid_t const &, ComputationGraph const &, diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index 2313d55732..6dfa8ad443 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H #include "local-execution/local_slots_backing.h" -#include "local-execution/model_training_instance.dtg.h" #include "local-execution/task_registry.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/optimizer_attrs.dtg.h" @@ -16,19 +16,25 @@ struct LocalTrainingBacking { LocalTrainingBacking(Allocator const &, ComputationGraph const &, TensorBackingMap const &, - RuntimeArgConfig const &, - std::optional const &, - std::optional const &); - - void execute_init(); - PerLayerElapsedTime execute_forward(); - PerLayerElapsedTime execute_backward(); - void execute_update(); + RuntimeArgConfig const &); + void register_and_allocate_layer(layer_guid_t const &); + void allocate_layer_optimizer_tensors(layer_guid_t const &, + OptimizerAttrs const &); + + void execute_init(layer_guid_t const &); + std::optional execute_forward(layer_guid_t const &); + void compute_loss(LossAttrs const &loss_attrs, + tensor_guid_t const &logit_tensor, + tensor_guid_t const &label_tensor); + std::optional execute_backward(layer_guid_t const &); + void execute_update(layer_guid_t const &, OptimizerAttrs const &); TaskArgumentAccessor get_task_arg_accessor(TaskInvocation const &) const; TaskArgumentAccessor get_op_task_arg_accessor(OpTaskInvocation const &, layer_guid_t const &) const; + void insert_tensor(tensor_guid_t const &, GenericTensorAccessorW const &); + private: DeviceSpecificDeviceStates call_init_task_impl(task_id_t, TaskArgumentAccessor const &); @@ -39,8 +45,6 @@ struct LocalTrainingBacking { ComputationGraph computation_graph; TaskRegistry task_registry; LocalSlotsBacking local_slots_backing; - std::optional training_instance; - std::optional optimizer_attrs; }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h new file mode 100644 index 0000000000..08f373a16f --- /dev/null +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -0,0 +1,39 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H +#define _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H + +#include "local-execution/local_training_backing.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" + +namespace FlexFlow { + +using 
PerLayerElapsedTime = + std::unordered_map>; + +struct ModelTrainingInstance { + ModelTrainingInstance(Allocator const &, + ComputationGraph const &, + TensorBackingMap const &, + RuntimeArgConfig const &, + LossAttrs const &, + tensor_guid_t const & logit_tensor, + tensor_guid_t const & label_tensor, + OptimizerAttrs const &); + + void register_and_allocate_layers(); + void allocate_optimizer_tensors(); + void execute_init(); + PerLayerElapsedTime execute_forward(); + PerLayerElapsedTime execute_backward(); + void execute_update(); + + ComputationGraph computation_graph; + LocalTrainingBacking training_backing; + LossAttrs loss_attrs; + tensor_guid_t logit_tensor; + tensor_guid_t label_tensor; + OptimizerAttrs optimizer_attrs; +}; + +} + +#endif diff --git a/lib/local-execution/include/local-execution/model_training_instance.struct.toml b/lib/local-execution/include/local-execution/model_training_instance.struct.toml deleted file mode 100644 index dcfaf2175d..0000000000 --- a/lib/local-execution/include/local-execution/model_training_instance.struct.toml +++ /dev/null @@ -1,25 +0,0 @@ -namespace = "FlexFlow" -name = "ModelTrainingInstance" -features = [ - "eq", - "ord", - "hash", - "fmt", -] - -includes = [ - "op-attrs/ops/loss_functions/loss_attrs.dtg.h", - "pcg/tensor_guid_t.dtg.h" -] - -[[fields]] -name = "loss_attrs" -type = "::FlexFlow::LossAttrs" - -[[fields]] -name = "label_tensor" -type = "::FlexFlow::tensor_guid_t" - -[[fields]] -name = "logit_tensor" -type = "::FlexFlow::tensor_guid_t" diff --git a/lib/local-execution/include/local-execution/task_registry.h b/lib/local-execution/include/local-execution/task_registry.h index e00cc183da..24790a28e3 100644 --- a/lib/local-execution/include/local-execution/task_registry.h +++ b/lib/local-execution/include/local-execution/task_registry.h @@ -2,6 +2,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H +#include "local-execution/op_task_type.dtg.h" #include "local-execution/task_registry.dtg.h" #include "op-attrs/computation_graph_op_attrs.h" @@ -13,6 +14,10 @@ void register_tasks_for_layer(TaskRegistry &, layer_guid_t const &, ComputationGraphOpAttrs const &attrs); +bool registry_contains_op_task(TaskRegistry const &, + layer_guid_t const &, + OpTaskType const &); + } // namespace FlexFlow #endif diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 186c2d516a..c99a2b154f 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -4,6 +4,7 @@ #include "local-execution/tracked_allocator.h" #include "op-attrs/computation_graph_op_attrs.h" #include "op-attrs/pcg_operator_attrs.h" +#include "pcg/computation_graph/layer_added_result.dtg.h" #include "pcg/computation_graph_builder.h" #include "pcg/parallel_tensor_attrs.h" #include "utils/containers/transform.h" @@ -66,29 +67,27 @@ CostDetails LocalCostEstimator::estimate_cost( }; // add operator to graph - std::vector output_tensor_ids = - cg_builder.add_layer(layer_attrs, - input_tensor_ids, - transform(get_vector_piece_attrs(weights), - [&](TensorAttrs const &a) { - return cg_builder.create_weight(a); - }), - get_vector_piece_attrs(outputs)); - - std::optional model_training_instance = std::nullopt; - std::optional optimizer_attrs = std::nullopt; + LayerAddedResult layer_added_result = + cg_builder.add_layer_and_get_layer_added_result( + layer_attrs, + input_tensor_ids, + transform(get_vector_piece_attrs(weights), + 
diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc
index 186c2d516a..c99a2b154f 100644
--- a/lib/local-execution/src/local_cost_estimator.cc
+++ b/lib/local-execution/src/local_cost_estimator.cc
@@ -4,6 +4,7 @@
 #include "local-execution/tracked_allocator.h"
 #include "op-attrs/computation_graph_op_attrs.h"
 #include "op-attrs/pcg_operator_attrs.h"
+#include "pcg/computation_graph/layer_added_result.dtg.h"
 #include "pcg/computation_graph_builder.h"
 #include "pcg/parallel_tensor_attrs.h"
 #include "utils/containers/transform.h"
@@ -66,29 +67,27 @@ CostDetails LocalCostEstimator::estimate_cost(
   };
 
   // add operator to graph
-  std::vector<tensor_guid_t> output_tensor_ids =
-      cg_builder.add_layer(layer_attrs,
-                           input_tensor_ids,
-                           transform(get_vector_piece_attrs(weights),
-                                     [&](TensorAttrs const &a) {
-                                       return cg_builder.create_weight(a);
-                                     }),
-                           get_vector_piece_attrs(outputs));
-
-  std::optional<ModelTrainingInstance> model_training_instance = std::nullopt;
-  std::optional<OptimizerAttrs> optimizer_attrs = std::nullopt;
+  LayerAddedResult layer_added_result =
+      cg_builder.add_layer_and_get_layer_added_result(
+          layer_attrs,
+          input_tensor_ids,
+          transform(get_vector_piece_attrs(weights),
+                    [&](TensorAttrs const &a) {
+                      return cg_builder.create_weight(a);
+                    }),
+          get_vector_piece_attrs(outputs));
+
   LocalTrainingBacking local_backing(allocator,
                                      cg_builder.computation_graph,
                                      tensor_backing_map,
-                                     this->runtime_arg_config,
-                                     model_training_instance,
-                                     optimizer_attrs);
-
-  local_backing.execute_init();
-  PerLayerElapsedTime fwd = local_backing.execute_forward();
-  PerLayerElapsedTime bwd = local_backing.execute_backward();
-
-  return CostDetails{get_total_elapsed_time(fwd, bwd),
+                                     this->runtime_arg_config);
+  local_backing.register_and_allocate_layer(layer_added_result.layer);
+  local_backing.execute_init(layer_added_result.layer);
+  float fwd = local_backing.execute_forward(layer_added_result.layer).value();
+  float bwd = local_backing.execute_backward(layer_added_result.layer).value();
+  float total_execution_time = fwd + bwd;
+
+  return CostDetails{total_execution_time,
                      tracked_allocator_ptr->get_current_mem_usage()};
 }
 
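Since execute_forward/execute_backward now return std::optional<float>, the .value() calls above assume the single added layer actually registered forward and backward tasks; for a layer without them this would throw std::bad_optional_access. A more defensive reading of the same computation (a sketch, not what the patch does):

    // Treat a missing forward/backward task as zero measured time.
    float fwd =
        local_backing.execute_forward(layer_added_result.layer).value_or(0.0f);
    float bwd =
        local_backing.execute_backward(layer_added_result.layer).value_or(0.0f);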
diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc
index f10b7c0126..25abc72567 100644
--- a/lib/local-execution/src/local_slots_backing.cc
+++ b/lib/local-execution/src/local_slots_backing.cc
@@ -18,39 +18,65 @@ void LocalSlotsBacking::add_per_device_op_state(
   this->per_device_op_states.insert({op_guid, device_state});
 }
 
-void LocalSlotsBacking::allocate_outgoing_tensors(
+void LocalSlotsBacking::insert_into_tensor_mapping(
+    tensor_guid_t const &tensor, GenericTensorAccessorW const &tensor_backing) {
+  if (!contains_key(this->tensor_mapping, tensor)) {
+    this->tensor_mapping.insert({tensor, tensor_backing});
+  }
+}
+
+void LocalSlotsBacking::allocate_layer_tensors(
     layer_guid_t const &layer_guid,
     ComputationGraph const &computation_graph,
    Allocator &allocator) {
-  std::vector<tensor_guid_t> incoming_input_tensors =
-      get_incoming_inputs(computation_graph, layer_guid);
-  std::vector<tensor_guid_t> incoming_weight_tensors =
-      get_incoming_weights(computation_graph, layer_guid);
-  std::vector<tensor_guid_t> outgoing_tensors =
-      get_outgoing_tensors(computation_graph, layer_guid);
-  for (tensor_guid_t const &output_tensor : outgoing_tensors) {
-    TensorAttrs tensor_attrs =
-        get_tensor_attrs(computation_graph, output_tensor);
+  this->allocate_tensors_by_role(
+      TensorRole::INPUT, layer_guid, computation_graph, allocator);
+  this->allocate_tensors_by_role(
+      TensorRole::WEIGHT, layer_guid, computation_graph, allocator);
+  this->allocate_tensors_by_role(
+      TensorRole::OUTPUT, layer_guid, computation_graph, allocator);
+}
+
+void LocalSlotsBacking::allocate_tensors_by_role(
+    TensorRole const &role,
+    layer_guid_t const &layer_guid,
+    ComputationGraph const &computation_graph,
+    Allocator &allocator) {
+  std::vector<tensor_guid_t> tensors;
+  switch (role) {
+    case TensorRole::INPUT:
+      tensors = get_incoming_inputs(computation_graph, layer_guid);
+      this->input_tensor_slots.insert({layer_guid, tensors});
+      break;
+    case TensorRole::WEIGHT:
+      tensors = get_incoming_weights(computation_graph, layer_guid);
+      this->weight_tensor_slots.insert({layer_guid, tensors});
+      break;
+    case TensorRole::OUTPUT:
+      tensors = get_outgoing_tensors(computation_graph, layer_guid);
+      this->output_tensor_slots.insert({layer_guid, tensors});
+      break;
+    default:
+      throw mk_runtime_error("Invalid tensor role, got {}", role);
+  }
+
+  for (tensor_guid_t const &tensor : tensors) {
+    TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor);
     // tensor allocation
-    if (!is_tensor_allocated(output_tensor)) {
+    if (!is_tensor_allocated(tensor)) {
       GenericTensorAccessorW tensor_backing =
           allocator.allocate_tensor(tensor_attrs.shape);
-      this->tensor_mapping.insert({output_tensor, tensor_backing});
+      this->tensor_mapping.insert({tensor, tensor_backing});
     }
 
     // gradient tensor allocation
     if (tensor_attrs.create_gradients == CreateGrad::YES &&
-        !is_gradient_tensor_allocated(output_tensor)) {
+        !is_gradient_tensor_allocated(tensor)) {
       GenericTensorAccessorW gradient_tensor_backing =
           allocator.allocate_tensor(tensor_attrs.shape);
-      this->gradient_tensor_mapping.insert(
-          {output_tensor, gradient_tensor_backing});
+      this->gradient_tensor_mapping.insert({tensor, gradient_tensor_backing});
     }
   }
-
-  this->input_tensor_slots.insert({layer_guid, incoming_input_tensors});
-  this->weight_tensor_slots.insert({layer_guid, incoming_weight_tensors});
-  this->output_tensor_slots.insert({layer_guid, outgoing_tensors});
 }
 
 void LocalSlotsBacking::allocate_optimizer_tensors(
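allocate_layer_tensors is now just the all-roles convenience over allocate_tensors_by_role, which additionally records the role's tensor guids in the matching slot map. Callers can also allocate a single role in isolation, which is what the updated tests below rely on; a sketch:

    // Sketch: allocate only the incoming inputs of one layer (plus their
    // gradient buffers when the tensor was created with CreateGrad::YES).
    local_slots_backing.allocate_tensors_by_role(
        TensorRole::INPUT, layer_guid, computation_graph, allocator);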
diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc
index b7631470b7..0cb8146467 100644
--- a/lib/local-execution/src/local_training_backing.cc
+++ b/lib/local-execution/src/local_training_backing.cc
@@ -8,7 +8,6 @@
 #include "utils/containers/contains.h"
 #include "utils/containers/contains_key.h"
 #include "utils/containers/get_only.h"
-#include "utils/containers/reversed.h"
 #include "utils/exception.h"
 
 namespace FlexFlow {
@@ -17,42 +16,30 @@ LocalTrainingBacking::LocalTrainingBacking(
     Allocator const &allocator,
     ComputationGraph const &computation_graph,
     TensorBackingMap const &tensor_backing_mapping,
-    RuntimeArgConfig const &runtime_arg_config,
-    std::optional<ModelTrainingInstance> const &training_instance,
-    std::optional<OptimizerAttrs> const &optimizer_attrs)
+    RuntimeArgConfig const &runtime_arg_config)
     : allocator(allocator), computation_graph(computation_graph),
       local_slots_backing(tensor_backing_mapping, runtime_arg_config),
-      task_registry(empty_task_registry()),
-      training_instance(training_instance), optimizer_attrs(optimizer_attrs) {
-
-  for (layer_guid_t const &node :
-       topological_ordering(this->computation_graph)) {
-    ComputationGraphOpAttrs attrs =
-        get_layer_attrs(this->computation_graph, node).attrs;
-
-    // allocate outgoing tensors
-    this->local_slots_backing.allocate_outgoing_tensors(
-        node, this->computation_graph, this->allocator);
-
-    // register tasks
-    register_tasks_for_layer(this->task_registry, node, attrs);
-
-    // allocate optimizer buffers
-    if (attrs.has<WeightAttrs>() && this->training_instance.has_value()) {
-      TaskSignature sig = get_update_signature(this->optimizer_attrs.value());
-      tensor_guid_t weight_tensor =
-          get_only(get_outgoing_tensors(this->computation_graph, node));
-      this->local_slots_backing.allocate_optimizer_tensors(
-          node, weight_tensor, this->computation_graph, this->allocator, sig);
-    }
-  }
+      task_registry(empty_task_registry()) {}
+
+void LocalTrainingBacking::register_and_allocate_layer(
+    layer_guid_t const &node) {
+  ComputationGraphOpAttrs attrs =
+      get_layer_attrs(this->computation_graph, node).attrs;
+  this->local_slots_backing.allocate_layer_tensors(
+      node, this->computation_graph, this->allocator);
+  register_tasks_for_layer(this->task_registry, node, attrs);
+}
 
-  if (this->training_instance.has_value()) {
-    // label and logit tensor should be allocated
-    assert(this->local_slots_backing.is_tensor_allocated(
-        this->training_instance.value().label_tensor));
-    assert(this->local_slots_backing.is_tensor_allocated(
-        this->training_instance.value().logit_tensor));
+void LocalTrainingBacking::allocate_layer_optimizer_tensors(
+    layer_guid_t const &node,
+    OptimizerAttrs const &optimizer_attrs) {
+  ComputationGraphOpAttrs attrs =
+      get_layer_attrs(this->computation_graph, node).attrs;
+  if (attrs.has<WeightAttrs>()) {
+    TaskSignature sig = get_update_signature(optimizer_attrs);
+    tensor_guid_t weight_tensor =
+        get_only(get_outgoing_tensors(this->computation_graph, node));
+    this->local_slots_backing.allocate_optimizer_tensors(
+        node, weight_tensor, this->computation_graph, this->allocator, sig);
   }
 }
 
@@ -76,110 +63,88 @@ std::optional<float>
   return fn(acc);
 }
 
-void LocalTrainingBacking::execute_init() {
-  for (layer_guid_t const &operator_node :
-       topological_ordering(this->computation_graph)) {
-    if (this->task_registry.init_task_ids.at(operator_node).has_value()) {
-      ComputationGraphOpAttrs attrs =
-          get_layer_attrs(this->computation_graph, operator_node).attrs;
-
-      OpTaskInvocation invocation = init(attrs);
-      TaskArgumentAccessor accessor =
-          this->get_op_task_arg_accessor(invocation, operator_node);
-      DeviceSpecificDeviceStates device_state =
-          this->call_init_task_impl(invocation.task_id, accessor);
-      this->local_slots_backing.add_per_device_op_state(operator_node,
-                                                        device_state);
-    }
+void LocalTrainingBacking::execute_init(layer_guid_t const &operator_node) {
+  if (registry_contains_op_task(
+          this->task_registry, operator_node, OpTaskType::INIT)) {
+    ComputationGraphOpAttrs attrs =
+        get_layer_attrs(this->computation_graph, operator_node).attrs;
+
+    OpTaskInvocation invocation = init(attrs);
+    TaskArgumentAccessor accessor =
+        this->get_op_task_arg_accessor(invocation, operator_node);
+    DeviceSpecificDeviceStates device_state =
+        this->call_init_task_impl(invocation.task_id, accessor);
+    this->local_slots_backing.add_per_device_op_state(operator_node,
+                                                      device_state);
   }
 }
 
-PerLayerElapsedTime LocalTrainingBacking::execute_forward() {
-  PerLayerElapsedTime per_op_elapsed_time;
-
-  for (layer_guid_t const &operator_node :
-       topological_ordering(this->computation_graph)) {
-    if (this->task_registry.forward_task_ids.at(operator_node).has_value()) {
-      ComputationGraphOpAttrs attrs =
-          get_layer_attrs(this->computation_graph, operator_node).attrs;
-
-      OpTaskInvocation invocation = forward(attrs);
-      TaskArgumentAccessor accessor =
-          this->get_op_task_arg_accessor(invocation, operator_node);
-      std::optional<float> elapsed_time =
-          this->call_task_impl(invocation.task_id, accessor);
-      per_op_elapsed_time.insert({operator_node, elapsed_time});
-    }
+std::optional<float>
+    LocalTrainingBacking::execute_forward(layer_guid_t const &operator_node) {
+  if (registry_contains_op_task(
+          this->task_registry, operator_node, OpTaskType::FWD)) {
+    ComputationGraphOpAttrs attrs =
+        get_layer_attrs(this->computation_graph, operator_node).attrs;
+
+    OpTaskInvocation invocation = forward(attrs);
+    TaskArgumentAccessor accessor =
+        this->get_op_task_arg_accessor(invocation, operator_node);
+    return this->call_task_impl(invocation.task_id, accessor);
+  } else {
+    return std::nullopt;
   }
-
-  return per_op_elapsed_time;
 }
 
-PerLayerElapsedTime LocalTrainingBacking::execute_backward() {
-  PerLayerElapsedTime per_op_elapsed_time;
-
-  // compute loss
-  if (this->training_instance.has_value()) {
-    ModelTrainingInstance unwrapped_training_instance =
-        training_instance.value();
-    TaskInvocation loss_invocation =
-        backward(unwrapped_training_instance.loss_attrs,
-                 unwrapped_training_instance.logit_tensor,
-                 unwrapped_training_instance.label_tensor);
-    // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
-    TaskArgumentAccessor loss_accessor =
-        this->get_task_arg_accessor(loss_invocation);
-    TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
-    loss_impl_fn.get<GenericTaskImplFunction>().function_ptr(loss_accessor);
-  }
+void LocalTrainingBacking::compute_loss(LossAttrs const &loss_attrs,
                                        tensor_guid_t const &logit_tensor,
                                        tensor_guid_t const &label_tensor) {
+  assert(this->local_slots_backing.is_tensor_allocated(logit_tensor) &&
+         this->local_slots_backing.is_tensor_allocated(label_tensor));
+  TaskInvocation loss_invocation =
+      backward(loss_attrs, logit_tensor, label_tensor);
+  // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
+  TaskArgumentAccessor loss_accessor =
+      this->get_task_arg_accessor(loss_invocation);
+  TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
+  loss_impl_fn.get<GenericTaskImplFunction>().function_ptr(loss_accessor);
+}
 
-  // backward through computation graph
-  for (layer_guid_t const &operator_node :
-       reversed(topological_ordering(this->computation_graph))) {
-    if (this->task_registry.backward_task_ids.at(operator_node).has_value()) {
-      ComputationGraphOpAttrs attrs =
-          get_layer_attrs(this->computation_graph, operator_node).attrs;
-
-      OpTaskInvocation invocation = backward(attrs);
-      TaskArgumentAccessor accessor =
-          this->get_op_task_arg_accessor(invocation, operator_node);
-      std::optional<float> elapsed_time =
-          this->call_task_impl(invocation.task_id, accessor);
-      per_op_elapsed_time.insert({operator_node, elapsed_time});
-    }
+std::optional<float>
+    LocalTrainingBacking::execute_backward(layer_guid_t const &operator_node) {
+  if (registry_contains_op_task(
+          this->task_registry, operator_node, OpTaskType::BWD)) {
+    ComputationGraphOpAttrs attrs =
+        get_layer_attrs(this->computation_graph, operator_node).attrs;
+
+    OpTaskInvocation invocation = backward(attrs);
+    TaskArgumentAccessor accessor =
+        this->get_op_task_arg_accessor(invocation, operator_node);
+    return this->call_task_impl(invocation.task_id, accessor);
+  } else {
+    return std::nullopt;
   }
-  return per_op_elapsed_time;
 }
 
-void LocalTrainingBacking::execute_update() {
-  assert(this->training_instance.has_value());
-  assert(this->optimizer_attrs.has_value());
-
-  for (layer_guid_t const &node :
-       topological_ordering(this->computation_graph)) {
-    LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node);
-    if (layer_attrs.attrs.has<WeightAttrs>()) {
-      // get tensors
-      tensor_guid_t weight_tensor =
-          get_only(get_outgoing_tensors(this->computation_graph, node));
-      std::vector<tensor_guid_t> grad_buffer_tensors =
-          this->local_slots_backing.weight_optimizer_tensor_guids.at(node);
-
-      // get invocation
-      TaskInvocation invocation = get_update_invocation(
-          this->optimizer_attrs.value(), weight_tensor, grad_buffer_tensors);
-      // assert(is_invocation_valid(get_update_signature(attrs), invocation));
-
-      // execute update
-      TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
-      TaskImplFunction update_impl_fn =
-          get_update_task_impl(this->optimizer_attrs.value());
-      update_impl_fn.get<GenericTaskImplFunction>().function_ptr(accessor);
-    }
+void LocalTrainingBacking::execute_update(
+    layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) {
+  LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node);
+  if (layer_attrs.attrs.has<WeightAttrs>()) {
+    // get tensors
+    tensor_guid_t weight_tensor =
+        get_only(get_outgoing_tensors(this->computation_graph, node));
+    std::vector<tensor_guid_t> grad_buffer_tensors =
+        this->local_slots_backing.weight_optimizer_tensor_guids.at(node);
+
+    // get invocation
+    TaskInvocation invocation = get_update_invocation(
+        optimizer_attrs, weight_tensor, grad_buffer_tensors);
+    // assert(is_invocation_valid(get_update_signature(attrs), invocation));
+
+    // execute update
+    TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+    TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs);
+    update_impl_fn.get<GenericTaskImplFunction>().function_ptr(accessor);
   }
-
-  this->optimizer_attrs =
-      get_next_iteration_optimizer_attrs(this->optimizer_attrs.value());
 }
 
 TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor(
@@ -206,4 +171,9 @@ TaskArgumentAccessor LocalTrainingBacking::get_op_task_arg_accessor(
       this->allocator, tensor_slots_backing, arg_slots_backing);
 }
 
+void LocalTrainingBacking::insert_tensor(
+    tensor_guid_t const &tensor, GenericTensorAccessorW const &tensor_backing) {
+  this->local_slots_backing.insert_into_tensor_mapping(tensor, tensor_backing);
+}
+
 } // namespace FlexFlow
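Because a label tensor is usually a raw input not produced by any layer, the allocation asserts at the top of compute_loss require inserting its backing explicitly first. A sketch of the expected call sequence, mirroring the loss test further below (label_shape stands in for whatever shape the loss expects):

    // Sketch: make the label backing visible to the slots backing, then
    // run the loss backward task to seed the logit gradient.
    GenericTensorAccessorW label_backing =
        allocator.allocate_tensor(label_shape);
    local_backing.insert_tensor(label_tensor, label_backing);
    local_backing.compute_loss(loss_attrs, logit_tensor, label_tensor);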
diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc
new file mode 100644
index 0000000000..7256a82478
--- /dev/null
+++ b/lib/local-execution/src/model_training_instance.cc
@@ -0,0 +1,64 @@
+#include "local-execution/model_training_instance.h"
+#include "pcg/computation_graph.h"
+#include "utils/containers/reversed.h"
+#include "pcg/optimizer_attrs.h"
+
+namespace FlexFlow {
+
+ModelTrainingInstance::ModelTrainingInstance(Allocator const & allocator,
+    ComputationGraph const & computation_graph,
+    TensorBackingMap const & tensor_backing_map,
+    RuntimeArgConfig const & runtime_arg_config,
+    LossAttrs const & loss_attrs,
+    tensor_guid_t const &logit_tensor,
+    tensor_guid_t const &label_tensor,
+    OptimizerAttrs const & optimizer_attrs)
+    : computation_graph(computation_graph), training_backing(allocator, computation_graph, tensor_backing_map, runtime_arg_config),
+      loss_attrs(loss_attrs), logit_tensor(logit_tensor), label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) {}
+
+void ModelTrainingInstance::register_and_allocate_layers() {
+  for (layer_guid_t const & node: topological_ordering(this->computation_graph)) {
+    this->training_backing.register_and_allocate_layer(node);
+  }
+}
+
+void ModelTrainingInstance::allocate_optimizer_tensors() {
+  for (layer_guid_t const & node: topological_ordering(this->computation_graph)) {
+    this->training_backing.allocate_layer_optimizer_tensors(node, this->optimizer_attrs);
+  }
+}
+
+void ModelTrainingInstance::execute_init() {
+  for (layer_guid_t const & node: topological_ordering(this->computation_graph)) {
+    this->training_backing.execute_init(node);
+  }
+}
+
+PerLayerElapsedTime ModelTrainingInstance::execute_forward() {
+  PerLayerElapsedTime per_layer_elapsed_time;
+  for (layer_guid_t const & node: topological_ordering(this->computation_graph)) {
+    std::optional<float> elapsed_time = this->training_backing.execute_forward(node);
+    per_layer_elapsed_time.insert({node, elapsed_time});
+  }
+  return per_layer_elapsed_time;
+}
+
+PerLayerElapsedTime ModelTrainingInstance::execute_backward() {
+  this->training_backing.compute_loss(this->loss_attrs, this->logit_tensor, this->label_tensor);
+
+  PerLayerElapsedTime per_layer_elapsed_time;
+  for (layer_guid_t const & node: reversed(topological_ordering(this->computation_graph))) {
+    std::optional<float> elapsed_time = this->training_backing.execute_backward(node);
+    per_layer_elapsed_time.insert({node, elapsed_time});
+  }
+  return per_layer_elapsed_time;
+}
+
+void ModelTrainingInstance::execute_update() {
+  for (layer_guid_t const & node: topological_ordering(this->computation_graph)) {
+    this->training_backing.execute_update(node, this->optimizer_attrs);
+  }
+  this->optimizer_attrs = get_next_iteration_optimizer_attrs(this->optimizer_attrs);
+}
+
+}
diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc
index dad5c1fc69..3cd2cccae8 100644
--- a/lib/local-execution/src/task_registry.cc
+++ b/lib/local-execution/src/task_registry.cc
@@ -35,10 +35,32 @@ void register_tasks_for_layer(TaskRegistry &task_registry,
         task_registry.backward_task_ids[op_id] = task_id;
         break;
       default:
-        throw mk_runtime_error("Invalid OpTaskType");
+        throw mk_runtime_error("Invalid OpTaskType, got {}",
+                               task_signature_impl.task_signature.type);
     }
     task_registry.task_mapping.insert({task_id, task_signature_impl});
   }
 }
 
+bool registry_contains_op_task(TaskRegistry const &task_registry,
+                               layer_guid_t const &op,
+                               OpTaskType const &op_task_type) {
+  std::unordered_map<layer_guid_t, std::optional<task_id_t>> task_ids;
+  switch (op_task_type) {
+    case OpTaskType::INIT:
+      task_ids = task_registry.init_task_ids;
+      break;
+    case OpTaskType::FWD:
+      task_ids = task_registry.forward_task_ids;
+      break;
+    case OpTaskType::BWD:
+      task_ids = task_registry.backward_task_ids;
+      break;
+    default:
+      throw mk_runtime_error("Invalid OpTaskType, got {}", op_task_type);
+  }
+
+  return task_ids.at(op).has_value();
+}
+
 } // namespace FlexFlow
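One caveat worth noting about the implementation above: registry_contains_op_task uses task_ids.at(op), which throws std::out_of_range for a layer that register_tasks_for_layer never saw, as opposed to returning false for a registered layer whose slot is std::nullopt. Callers are therefore expected to pair the query with prior registration; a sketch with illustrative names:

    // Sketch: register first; the query then distinguishes "registered but
    // has no task of this type" (false) from "never registered" (throws).
    register_tasks_for_layer(task_registry, layer, attrs);
    bool has_fwd =
        registry_contains_op_task(task_registry, layer, OpTaskType::FWD);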
diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc
index 779ba43f26..5d58e7e757 100644
--- a/lib/local-execution/test/src/test_local_slots_backing.cc
+++ b/lib/local-execution/test/src/test_local_slots_backing.cc
@@ -81,7 +81,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     LocalSlotsBacking local_slots_backing = {tensor_backing_map,
                                              runtime_arg_config};
 
-    SUBCASE("LocalSlotsBacking::allocate_outgoing_tensors") {
+    SUBCASE("LocalSlotsBacking::allocate_tensors_by_role") {
       auto get_result_shape_and_dtype_for_tensor_guid_and_map =
          [&](tensor_guid_t t,
              TensorBackingMap m) -> std::pair {
@@ -92,14 +92,11 @@ TEST_SUITE(FF_TEST_SUITE) {
 
       SUBCASE("Input (QKV) and gradient tensors allocation") {
         // allocate all tensors from input nodes
-        for (layer_guid_t const &node :
-             topological_ordering(cg_builder.computation_graph)) {
-          if (node == layer_guid) {
-            break;
-          }
-          local_slots_backing.allocate_outgoing_tensors(
-              node, cg_builder.computation_graph, allocator);
-        }
+        local_slots_backing.allocate_tensors_by_role(
+            TensorRole::INPUT,
+            layer_guid,
+            cg_builder.computation_graph,
+            allocator);
 
        SUBCASE("Query grad") {
          std::pair result =
@@ -127,8 +124,11 @@ TEST_SUITE(FF_TEST_SUITE) {
        }
      }
      SUBCASE("Output and gradient tensors allocation") {
-        local_slots_backing.allocate_outgoing_tensors(
-            layer_guid, cg_builder.computation_graph, allocator);
+        local_slots_backing.allocate_tensors_by_role(
+            TensorRole::OUTPUT,
+            layer_guid,
+            cg_builder.computation_graph,
+            allocator);
        SUBCASE("Output") {
          std::pair result =
             get_result_shape_and_dtype_for_tensor_guid_and_map(
@@ -154,7 +154,7 @@ TEST_SUITE(FF_TEST_SUITE) {
      }
 
      SUBCASE("Tensor slots") {
-        local_slots_backing.allocate_outgoing_tensors(
+        local_slots_backing.allocate_layer_tensors(
            layer_guid, cg_builder.computation_graph, allocator);
        SUBCASE("Input tensor slots") {
          std::vector<tensor_guid_t> correct_incoming_input_tensors =
@@ -211,12 +211,8 @@ TEST_SUITE(FF_TEST_SUITE) {
           return b;
         }();
 
-    // allocate all incoming and outgoing tensors for graph
-    for (layer_guid_t const &node :
-         topological_ordering(cg_builder.computation_graph)) {
-      local_slots_backing.allocate_outgoing_tensors(
-          node, cg_builder.computation_graph, allocator);
-    }
+    local_slots_backing.allocate_layer_tensors(
+        layer_guid, cg_builder.computation_graph, allocator);
 
     SUBCASE("LocalSlotsBacking::construct_tensor_slots_backing") {
       TensorSlotsBackingWithoutAddresses result =
diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc
index 37024adc26..c4662d624c 100644
--- a/lib/local-execution/test/src/test_loss_e2e.cc
+++ b/lib/local-execution/test/src/test_loss_e2e.cc
@@ -3,6 +3,8 @@
 #include "kernels/managed_ff_stream.h"
 #include "kernels/managed_per_device_ff_handle.h"
 #include "local-execution/local_training_backing.h"
+#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
+#include "pcg/computation_graph.h"
 #include "pcg/computation_graph_builder.h"
 #include "pcg/optimizer_attrs.dtg.h"
 #include "test_utils.h"
@@ -19,12 +21,6 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         EnableProfiling::YES,
         ProfilingSettings{/*warmup_iters=*/0,
                          /*measure_iters=*/1}};
 
-    OptimizerAttrs optimizer_attrs =
-        OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.0,
-                                         /*momentum=*/0.0,
-                                         /*nesterov=*/false,
-                                         /*weight_decay=*/0.0}};
-
     // construct graph
     ComputationGraphBuilder cg_builder;
 
@@ -36,8 +32,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         cg_builder.create_input(input_shape, CreateGrad::YES);
 
     float scalar = 4.0;
+    std::string layer_name = "scalar multiply";
     tensor_guid_t logit_tensor =
-        cg_builder.scalar_multiply(input_tensor, scalar);
+        cg_builder.scalar_multiply(input_tensor, scalar, layer_name);
 
     // allocate memory
     Allocator allocator = create_local_cuda_memory_allocator();
@@ -46,6 +43,17 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         allocator.allocate_tensor(input_shape);
     tensor_backing_map.insert({input_tensor, input_backing});
 
+    LocalTrainingBacking local_backing(allocator,
+                                       cg_builder.computation_graph,
+                                       tensor_backing_map,
+                                       runtime_arg_config);
+    // for (layer_guid_t const & node:
+    //      topological_ordering(cg_builder.computation_graph)) {
+    //   local_backing.register_and_allocate_layer(node);
+    // }
+    local_backing.register_and_allocate_layer(
+        get_layer_by_name(cg_builder.computation_graph, layer_name));
+
     SUBCASE("SparseCategoricalCrossEntropyLossAttrs") {
       TensorShape label_shape = TensorShape{
          TensorDims{FFOrdered<size_t>{batch_size, 1}}, DataType::FLOAT};
       tensor_guid_t label_tensor =
          cg_builder.create_input(label_shape, CreateGrad::NO);
       GenericTensorAccessorW label_backing =
          allocator.allocate_tensor(label_shape);
-      tensor_backing_map.insert({label_tensor, label_backing});
-
-      std::optional<ModelTrainingInstance> model_training_instance =
-          ModelTrainingInstance{
-              LossAttrs{SparseCategoricalCrossEntropyLossAttrs{
-                  /*replace_labels=*/false}},
-              label_tensor,
-              logit_tensor};
-      LocalTrainingBacking local_backing(allocator,
-                                         cg_builder.computation_graph,
-                                         tensor_backing_map,
-                                         runtime_arg_config,
-                                         model_training_instance,
-                                         optimizer_attrs);
-      local_backing.execute_init();
-      local_backing.execute_forward();
-      local_backing.execute_backward();
+      local_backing.insert_tensor(label_tensor, label_backing);
+      LossAttrs loss_attrs = LossAttrs{
+          SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}};
+      local_backing.compute_loss(loss_attrs, logit_tensor, label_tensor);
     }
 
     SUBCASE("NonconfigurableLossAttrs") {
@@ -76,58 +72,24 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
          cg_builder.create_input(input_shape, CreateGrad::NO);
       GenericTensorAccessorW label_backing =
          allocator.allocate_tensor(input_shape);
-      tensor_backing_map.insert({label_tensor, label_backing});
+      local_backing.insert_tensor(label_tensor, label_backing);
 
       SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") {
-        std::optional<ModelTrainingInstance> model_training_instance =
-            ModelTrainingInstance{LossAttrs{NonconfigurableLossAttrs{
-                                      LossFunction::CATEGORICAL_CROSSENTROPY}},
-                                  label_tensor,
-                                  logit_tensor};
-        LocalTrainingBacking local_backing(allocator,
-                                           cg_builder.computation_graph,
-                                           tensor_backing_map,
-                                           runtime_arg_config,
-                                           model_training_instance,
-                                           optimizer_attrs);
-        local_backing.execute_init();
-        local_backing.execute_forward();
-        local_backing.execute_backward();
+        LossAttrs loss_attrs = LossAttrs{
+            NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}};
+        local_backing.compute_loss(loss_attrs, logit_tensor, label_tensor);
       }
 
       SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") {
-        std::optional<ModelTrainingInstance> model_training_instance =
-            ModelTrainingInstance{
-                LossAttrs{NonconfigurableLossAttrs{
-                    LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
-                label_tensor,
-                logit_tensor};
-        LocalTrainingBacking local_backing(allocator,
-                                           cg_builder.computation_graph,
-                                           tensor_backing_map,
-                                           runtime_arg_config,
-                                           model_training_instance,
-                                           optimizer_attrs);
-        local_backing.execute_init();
-        local_backing.execute_forward();
-        local_backing.execute_backward();
+        LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{
+            LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}};
+        local_backing.compute_loss(loss_attrs, logit_tensor, label_tensor);
      }
 
       SUBCASE("LossFunction::IDENTITY") {
-        std::optional<ModelTrainingInstance> model_training_instance =
-            ModelTrainingInstance{
-                LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}},
-                label_tensor,
-                logit_tensor};
-        LocalTrainingBacking local_backing(allocator,
-                                           cg_builder.computation_graph,
-                                           tensor_backing_map,
-                                           runtime_arg_config,
-                                           model_training_instance,
-                                           optimizer_attrs);
-        local_backing.execute_init();
-        local_backing.execute_forward();
-        local_backing.execute_backward();
+        LossAttrs loss_attrs =
+            LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}};
+        local_backing.compute_loss(loss_attrs, logit_tensor, label_tensor);
      }
    }
  }
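The update test that follows exercises the optimizer path in isolation: with the per-layer API, allocating optimizer buffers for a weight layer and stepping it no longer requires running a full forward/backward first. A sketch of the per-layer sequence it drives (optimizer_attrs as constructed in the subcases):

    // Sketch: allocate SGD/Adam buffers for one weight layer, then update it.
    local_backing.allocate_layer_optimizer_tensors(layer_guid, optimizer_attrs);
    local_backing.execute_update(layer_guid, optimizer_attrs);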
diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc
index 96b748806f..b48214d89d 100644
--- a/lib/local-execution/test/src/test_update_e2e.cc
+++ b/lib/local-execution/test/src/test_update_e2e.cc
@@ -3,6 +3,7 @@
 #include "kernels/managed_ff_stream.h"
 #include "kernels/managed_per_device_ff_handle.h"
 #include "local-execution/local_training_backing.h"
+#include "pcg/computation_graph.h"
 #include "pcg/computation_graph_builder.h"
 #include "pcg/optimizer_attrs.dtg.h"
 #include "test_utils.h"
@@ -30,8 +31,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         cg_builder.create_input(input_shape, CreateGrad::YES);
 
     float scalar = 4.0;
+    std::string layer_name = "scalar_multiply";
     tensor_guid_t logit_tensor =
-        cg_builder.scalar_multiply(input_tensor, scalar);
+        cg_builder.scalar_multiply(input_tensor, scalar, layer_name);
 
     // allocate memory
     Allocator allocator = create_local_cuda_memory_allocator();
@@ -40,11 +42,17 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         allocator.allocate_tensor(input_shape);
     tensor_backing_map.insert({input_tensor, input_backing});
 
-    tensor_guid_t label_tensor =
-        cg_builder.create_input(input_shape, CreateGrad::NO);
-    GenericTensorAccessorW label_backing =
-        allocator.allocate_tensor(input_shape);
-    tensor_backing_map.insert({label_tensor, label_backing});
+    LocalTrainingBacking local_backing(allocator,
+                                       cg_builder.computation_graph,
+                                       tensor_backing_map,
+                                       runtime_arg_config);
+    // for (layer_guid_t const & node:
+    //      topological_ordering(cg_builder.computation_graph)) {
+    //   local_backing.register_and_allocate_layer(node);
+    // }
+    layer_guid_t layer_guid =
+        get_layer_by_name(cg_builder.computation_graph, layer_name);
+    local_backing.register_and_allocate_layer(layer_guid);
 
     SUBCASE("SGDOptimizerAttrs") {
       SUBCASE("momentum=0") {
@@ -53,22 +61,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
                                          /*momentum=*/0.0f,
                                          /*nesterov=*/false,
                                          /*weight_decay=*/0.001}};
-        std::optional<ModelTrainingInstance> model_training_instance =
-            ModelTrainingInstance{
-                LossAttrs{NonconfigurableLossAttrs{
-                    LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
-                label_tensor,
-                logit_tensor};
-        LocalTrainingBacking local_backing(allocator,
-                                           cg_builder.computation_graph,
-                                           tensor_backing_map,
-                                           runtime_arg_config,
-                                           model_training_instance,
-                                           optimizer_attrs);
-        local_backing.execute_init();
-        local_backing.execute_forward();
-        local_backing.execute_backward();
-        local_backing.execute_update();
+        local_backing.allocate_layer_optimizer_tensors(layer_guid,
+                                                       optimizer_attrs);
+        local_backing.execute_update(layer_guid, optimizer_attrs);
      }
      SUBCASE("momentum=0.9") {
        OptimizerAttrs optimizer_attrs =
@@ -76,22 +71,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
                                          /*momentum=*/0.9,
                                          /*nesterov=*/false,
                                          /*weight_decay=*/0.001}};
-        std::optional<ModelTrainingInstance> model_training_instance =
-            ModelTrainingInstance{
-                LossAttrs{NonconfigurableLossAttrs{
-                    LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
-                label_tensor,
-                logit_tensor};
-        LocalTrainingBacking local_backing(allocator,
-                                           cg_builder.computation_graph,
-                                           tensor_backing_map,
-                                           runtime_arg_config,
-                                           model_training_instance,
-                                           optimizer_attrs);
-        local_backing.execute_init();
-        local_backing.execute_forward();
-        local_backing.execute_backward();
-        local_backing.execute_update();
+        local_backing.allocate_layer_optimizer_tensors(layer_guid,
+                                                       optimizer_attrs);
+        local_backing.execute_update(layer_guid, optimizer_attrs);
      }
    }
    SUBCASE("AdamOptimizerAttrs") {
@@ -104,22 +86,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
                                           /*beta_t=*/0.9,
                                           /*beta2_t=*/0.999,
                                           /*epsilon=*/1e-8}};
-        std::optional<ModelTrainingInstance> model_training_instance =
-            ModelTrainingInstance{
-                LossAttrs{NonconfigurableLossAttrs{
-                    LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
-                label_tensor,
-                logit_tensor};
-        LocalTrainingBacking local_backing(allocator,
-                                           cg_builder.computation_graph,
-                                           tensor_backing_map,
-                                           runtime_arg_config,
-                                           model_training_instance,
-                                           optimizer_attrs);
-        local_backing.execute_init();
-        local_backing.execute_forward();
-        local_backing.execute_backward();
-        local_backing.execute_update();
+        local_backing.allocate_layer_optimizer_tensors(layer_guid,
+                                                       optimizer_attrs);
+        local_backing.execute_update(layer_guid, optimizer_attrs);
      }
    }
  }
diff --git a/lib/pcg/include/pcg/computation_graph_builder.h b/lib/pcg/include/pcg/computation_graph_builder.h
index 45cde0de57..585399ea1d 100644
--- a/lib/pcg/include/pcg/computation_graph_builder.h
+++ b/lib/pcg/include/pcg/computation_graph_builder.h
@@ -2,6 +2,7 @@
 #define _FLEXFLOW_PCG_INCLUDE_PCG_COMPUTATION_GRAPH_BUILDER_H
 
 #include "pcg/computation_graph.dtg.h"
+#include "pcg/computation_graph/layer_added_result.dtg.h"
 #include "pcg/initializer_attrs.dtg.h"
 #include "pcg/tensor_guid_t.dtg.h"
 
@@ -256,6 +257,12 @@ struct ComputationGraphBuilder {
   std::vector<tensor_guid_t> get_outputs(LayerAttrs const &) const;
   tensor_guid_t get_output(LayerAttrs const &, int idx) const;
 
+  LayerAddedResult add_layer_and_get_layer_added_result(
+      LayerAttrs const &layer,
+      std::vector<tensor_guid_t> const &inputs,
+      std::vector<tensor_guid_t> const &weights,
+      std::vector<TensorAttrs> const &outputs);
+
   std::vector<tensor_guid_t> add_layer(LayerAttrs const &layer,
       std::vector<tensor_guid_t> const &inputs,
diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc
index 4a565476bd..4c619288cb 100644
--- a/lib/pcg/src/pcg/computation_graph_builder.cc
+++ b/lib/pcg/src/pcg/computation_graph_builder.cc
@@ -106,7 +106,7 @@ static void check_incoming_tensor_roles(LayerAttrs const &layer,
   }
 }
 
-std::vector<tensor_guid_t> ComputationGraphBuilder::add_layer(
+LayerAddedResult ComputationGraphBuilder::add_layer_and_get_layer_added_result(
     LayerAttrs const &layer,
     std::vector<tensor_guid_t> const &inputs,
     std::vector<tensor_guid_t> const &weights,
@@ -115,7 +115,17 @@ std::vector<tensor_guid_t> ComputationGraphBuilder::add_layer(
   LayerAddedResult added = ::FlexFlow::add_layer(
       this->computation_graph, layer, concat_vectors(inputs, weights), outputs);
 
-  return added.outputs;
+  return added;
+}
+
+std::vector<tensor_guid_t> ComputationGraphBuilder::add_layer(
+    LayerAttrs const &layer,
+    std::vector<tensor_guid_t> const &inputs,
+    std::vector<tensor_guid_t> const &weights,
+    std::vector<TensorAttrs> const &outputs) {
+  return this
+      ->add_layer_and_get_layer_added_result(layer, inputs, weights, outputs)
+      .outputs;
 }
 
 tensor_guid_t ComputationGraphBuilder::as_type(tensor_guid_t const &x,
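add_layer_and_get_layer_added_result exposes the layer_guid_t that add_layer used to discard, which is what lets the cost estimator above register and time exactly one layer; add_layer itself becomes a thin wrapper that keeps only the outputs. A sketch of the intended call shape (inputs/weights/outputs as in add_layer):

    // Sketch: get both the new layer's guid and its output tensors.
    LayerAddedResult added = cg_builder.add_layer_and_get_layer_added_result(
        layer_attrs, inputs, weights, outputs);
    layer_guid_t new_layer = added.layer;
    std::vector<tensor_guid_t> outs = added.outputs;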
From a73b1c325f819f1ffdcdc0ce38fda1e25fd2eb28 Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Wed, 13 Nov 2024 09:22:43 -0800
Subject: [PATCH 21/22] Expose op folders publicly

---
 .../local-execution/model_training_instance.h |  2 -
 .../local-execution}/ops/attention.h          |  0
 .../local-execution}/ops/batch_matmul.h       |  0
 .../local-execution}/ops/batch_norm.h         |  0
 .../local-execution}/ops/cast.h               |  0
 .../local-execution}/ops/combine.h            |  0
 .../local-execution}/ops/concat.h             |  0
 .../local-execution}/ops/conv_2d.h            |  0
 .../local-execution}/ops/dropout.h            |  0
 .../local-execution}/ops/element_binary.h     |  0
 .../local-execution}/ops/element_unary.h      |  0
 .../local-execution}/ops/embedding.h          |  0
 .../local-execution}/ops/flat.h               |  0
 .../local-execution}/ops/gather.h             |  0
 .../local-execution}/ops/input.h              |  0
 .../local-execution}/ops/layer_norm.h         |  0
 .../local-execution}/ops/linear.h             |  0
 .../local-execution}/ops/noop.h               |  0
 .../local-execution}/ops/parallel_op.h        |  0
 .../local-execution}/ops/pool_2d.h            |  0
 .../local-execution}/ops/reduce.h             |  0
 .../local-execution}/ops/reduction.h          |  0
 .../local-execution}/ops/repartition.h        |  0
 .../local-execution}/ops/replicate.h          |  0
 .../local-execution}/ops/reshape.h            |  0
 .../local-execution}/ops/reverse.h            |  0
 .../local-execution}/ops/softmax.h            |  0
 .../local-execution}/ops/split.h              |  0
 .../local-execution}/ops/topk.h               |  0
 .../local-execution}/ops/transpose.h          |  0
 .../local-execution}/ops/weight.h             |  0
 .../src/model_training_instance.cc            | 11 +---
 lib/local-execution/src/ops/attention.cc      |  2 +-
 lib/local-execution/src/ops/batch_matmul.cc   |  2 +-
 lib/local-execution/src/ops/batch_norm.cc     |  2 +-
 lib/local-execution/src/ops/cast.cc           |  2 +-
 lib/local-execution/src/ops/combine.cc        |  2 +-
 lib/local-execution/src/ops/concat.cc         |  2 +-
 lib/local-execution/src/ops/conv_2d.cc        |  2 +-
 lib/local-execution/src/ops/dropout.cc        |  2 +-
 lib/local-execution/src/ops/element_binary.cc |  2 +-
 lib/local-execution/src/ops/element_unary.cc  |  2 +-
 lib/local-execution/src/ops/flat.cc           |  2 +-
 lib/local-execution/src/ops/gather.cc         |  2 +-
 lib/local-execution/src/ops/input.cc          |  2 +-
 lib/local-execution/src/ops/layer_norm.cc     |  2 +-
 lib/local-execution/src/ops/linear.cc         |  2 +-
 lib/local-execution/src/ops/noop.cc           |  2 +-
 lib/local-execution/src/ops/pool_2d.cc        |  2 +-
 lib/local-execution/src/ops/reduce.cc         |  2 +-
 lib/local-execution/src/ops/reduction.cc      |  2 +-
lib/local-execution/src/ops/repartition.cc | 2 +- lib/local-execution/src/ops/replicate.cc | 2 +- lib/local-execution/src/ops/reshape.cc | 2 +- lib/local-execution/src/ops/reverse.cc | 2 +- lib/local-execution/src/ops/softmax.cc | 2 +- lib/local-execution/src/ops/split.cc | 2 +- lib/local-execution/src/ops/topk.cc | 2 +- lib/local-execution/src/ops/transpose.cc | 2 +- lib/local-execution/src/ops/weight.cc | 2 +- .../src/task_signature_impl.cc | 58 +++++++++---------- .../include/op-attrs/operator_attrs.h | 58 +++++++++---------- 62 files changed, 89 insertions(+), 96 deletions(-) rename lib/local-execution/{src => include/local-execution}/ops/attention.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/batch_matmul.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/batch_norm.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/cast.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/combine.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/concat.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/conv_2d.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/dropout.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/element_binary.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/element_unary.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/embedding.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/flat.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/gather.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/input.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/layer_norm.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/linear.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/noop.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/parallel_op.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/pool_2d.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/reduce.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/reduction.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/repartition.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/replicate.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/reshape.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/reverse.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/softmax.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/split.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/topk.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/transpose.h (100%) rename lib/local-execution/{src => include/local-execution}/ops/weight.h (100%) diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index 08f373a16f..14473ff26e 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -19,8 +19,6 @@ struct ModelTrainingInstance { tensor_guid_t const & label_tensor, OptimizerAttrs const &); - void register_and_allocate_layers(); - void 
allocate_optimizer_tensors(); void execute_init(); PerLayerElapsedTime execute_forward(); PerLayerElapsedTime execute_backward(); diff --git a/lib/local-execution/src/ops/attention.h b/lib/local-execution/include/local-execution/ops/attention.h similarity index 100% rename from lib/local-execution/src/ops/attention.h rename to lib/local-execution/include/local-execution/ops/attention.h diff --git a/lib/local-execution/src/ops/batch_matmul.h b/lib/local-execution/include/local-execution/ops/batch_matmul.h similarity index 100% rename from lib/local-execution/src/ops/batch_matmul.h rename to lib/local-execution/include/local-execution/ops/batch_matmul.h diff --git a/lib/local-execution/src/ops/batch_norm.h b/lib/local-execution/include/local-execution/ops/batch_norm.h similarity index 100% rename from lib/local-execution/src/ops/batch_norm.h rename to lib/local-execution/include/local-execution/ops/batch_norm.h diff --git a/lib/local-execution/src/ops/cast.h b/lib/local-execution/include/local-execution/ops/cast.h similarity index 100% rename from lib/local-execution/src/ops/cast.h rename to lib/local-execution/include/local-execution/ops/cast.h diff --git a/lib/local-execution/src/ops/combine.h b/lib/local-execution/include/local-execution/ops/combine.h similarity index 100% rename from lib/local-execution/src/ops/combine.h rename to lib/local-execution/include/local-execution/ops/combine.h diff --git a/lib/local-execution/src/ops/concat.h b/lib/local-execution/include/local-execution/ops/concat.h similarity index 100% rename from lib/local-execution/src/ops/concat.h rename to lib/local-execution/include/local-execution/ops/concat.h diff --git a/lib/local-execution/src/ops/conv_2d.h b/lib/local-execution/include/local-execution/ops/conv_2d.h similarity index 100% rename from lib/local-execution/src/ops/conv_2d.h rename to lib/local-execution/include/local-execution/ops/conv_2d.h diff --git a/lib/local-execution/src/ops/dropout.h b/lib/local-execution/include/local-execution/ops/dropout.h similarity index 100% rename from lib/local-execution/src/ops/dropout.h rename to lib/local-execution/include/local-execution/ops/dropout.h diff --git a/lib/local-execution/src/ops/element_binary.h b/lib/local-execution/include/local-execution/ops/element_binary.h similarity index 100% rename from lib/local-execution/src/ops/element_binary.h rename to lib/local-execution/include/local-execution/ops/element_binary.h diff --git a/lib/local-execution/src/ops/element_unary.h b/lib/local-execution/include/local-execution/ops/element_unary.h similarity index 100% rename from lib/local-execution/src/ops/element_unary.h rename to lib/local-execution/include/local-execution/ops/element_unary.h diff --git a/lib/local-execution/src/ops/embedding.h b/lib/local-execution/include/local-execution/ops/embedding.h similarity index 100% rename from lib/local-execution/src/ops/embedding.h rename to lib/local-execution/include/local-execution/ops/embedding.h diff --git a/lib/local-execution/src/ops/flat.h b/lib/local-execution/include/local-execution/ops/flat.h similarity index 100% rename from lib/local-execution/src/ops/flat.h rename to lib/local-execution/include/local-execution/ops/flat.h diff --git a/lib/local-execution/src/ops/gather.h b/lib/local-execution/include/local-execution/ops/gather.h similarity index 100% rename from lib/local-execution/src/ops/gather.h rename to lib/local-execution/include/local-execution/ops/gather.h diff --git a/lib/local-execution/src/ops/input.h 
b/lib/local-execution/include/local-execution/ops/input.h similarity index 100% rename from lib/local-execution/src/ops/input.h rename to lib/local-execution/include/local-execution/ops/input.h diff --git a/lib/local-execution/src/ops/layer_norm.h b/lib/local-execution/include/local-execution/ops/layer_norm.h similarity index 100% rename from lib/local-execution/src/ops/layer_norm.h rename to lib/local-execution/include/local-execution/ops/layer_norm.h diff --git a/lib/local-execution/src/ops/linear.h b/lib/local-execution/include/local-execution/ops/linear.h similarity index 100% rename from lib/local-execution/src/ops/linear.h rename to lib/local-execution/include/local-execution/ops/linear.h diff --git a/lib/local-execution/src/ops/noop.h b/lib/local-execution/include/local-execution/ops/noop.h similarity index 100% rename from lib/local-execution/src/ops/noop.h rename to lib/local-execution/include/local-execution/ops/noop.h diff --git a/lib/local-execution/src/ops/parallel_op.h b/lib/local-execution/include/local-execution/ops/parallel_op.h similarity index 100% rename from lib/local-execution/src/ops/parallel_op.h rename to lib/local-execution/include/local-execution/ops/parallel_op.h diff --git a/lib/local-execution/src/ops/pool_2d.h b/lib/local-execution/include/local-execution/ops/pool_2d.h similarity index 100% rename from lib/local-execution/src/ops/pool_2d.h rename to lib/local-execution/include/local-execution/ops/pool_2d.h diff --git a/lib/local-execution/src/ops/reduce.h b/lib/local-execution/include/local-execution/ops/reduce.h similarity index 100% rename from lib/local-execution/src/ops/reduce.h rename to lib/local-execution/include/local-execution/ops/reduce.h diff --git a/lib/local-execution/src/ops/reduction.h b/lib/local-execution/include/local-execution/ops/reduction.h similarity index 100% rename from lib/local-execution/src/ops/reduction.h rename to lib/local-execution/include/local-execution/ops/reduction.h diff --git a/lib/local-execution/src/ops/repartition.h b/lib/local-execution/include/local-execution/ops/repartition.h similarity index 100% rename from lib/local-execution/src/ops/repartition.h rename to lib/local-execution/include/local-execution/ops/repartition.h diff --git a/lib/local-execution/src/ops/replicate.h b/lib/local-execution/include/local-execution/ops/replicate.h similarity index 100% rename from lib/local-execution/src/ops/replicate.h rename to lib/local-execution/include/local-execution/ops/replicate.h diff --git a/lib/local-execution/src/ops/reshape.h b/lib/local-execution/include/local-execution/ops/reshape.h similarity index 100% rename from lib/local-execution/src/ops/reshape.h rename to lib/local-execution/include/local-execution/ops/reshape.h diff --git a/lib/local-execution/src/ops/reverse.h b/lib/local-execution/include/local-execution/ops/reverse.h similarity index 100% rename from lib/local-execution/src/ops/reverse.h rename to lib/local-execution/include/local-execution/ops/reverse.h diff --git a/lib/local-execution/src/ops/softmax.h b/lib/local-execution/include/local-execution/ops/softmax.h similarity index 100% rename from lib/local-execution/src/ops/softmax.h rename to lib/local-execution/include/local-execution/ops/softmax.h diff --git a/lib/local-execution/src/ops/split.h b/lib/local-execution/include/local-execution/ops/split.h similarity index 100% rename from lib/local-execution/src/ops/split.h rename to lib/local-execution/include/local-execution/ops/split.h diff --git a/lib/local-execution/src/ops/topk.h 
b/lib/local-execution/include/local-execution/ops/topk.h similarity index 100% rename from lib/local-execution/src/ops/topk.h rename to lib/local-execution/include/local-execution/ops/topk.h diff --git a/lib/local-execution/src/ops/transpose.h b/lib/local-execution/include/local-execution/ops/transpose.h similarity index 100% rename from lib/local-execution/src/ops/transpose.h rename to lib/local-execution/include/local-execution/ops/transpose.h diff --git a/lib/local-execution/src/ops/weight.h b/lib/local-execution/include/local-execution/ops/weight.h similarity index 100% rename from lib/local-execution/src/ops/weight.h rename to lib/local-execution/include/local-execution/ops/weight.h diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index 7256a82478..abdced1bb5 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -14,16 +14,11 @@ ModelTrainingInstance::ModelTrainingInstance(Allocator const & allocator, tensor_guid_t const &label_tensor, OptimizerAttrs const & optimizer_attrs) : computation_graph(computation_graph), training_backing(allocator, computation_graph, tensor_backing_map, runtime_arg_config), - loss_attrs(loss_attrs), logit_tensor(logit_tensor), label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) {} + loss_attrs(loss_attrs), logit_tensor(logit_tensor), label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) { -void ModelTrainingInstance::register_and_allocate_layers() { + // allocate each layer's tensors for (layer_guid_t const & node: topological_ordering(this->computation_graph)) { this->training_backing.register_and_allocate_layer(node); - } -} - -void ModelTrainingInstance::allocate_optimizer_tensors() { - for (layer_guid_t const & node: topological_ordering(this->computation_graph)) { this->training_backing.allocate_layer_optimizer_tensors(node, this->optimizer_attrs); } } @@ -61,4 +56,4 @@ void ModelTrainingInstance::execute_update() { this->optimizer_attrs = get_next_iteration_optimizer_attrs(this->optimizer_attrs); } -} +} // namespace FlexFlow diff --git a/lib/local-execution/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc index 5e693d43db..b4c5d1ff8a 100644 --- a/lib/local-execution/src/ops/attention.cc +++ b/lib/local-execution/src/ops/attention.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "attention.h" +#include "local-execution/ops/attention.h" #include "kernels/attention_kernels.h" #include "local-execution/op_task_signature.h" #include "op-attrs/ops/attention.h" diff --git a/lib/local-execution/src/ops/batch_matmul.cc b/lib/local-execution/src/ops/batch_matmul.cc index d60a003061..e358e0a645 100644 --- a/lib/local-execution/src/ops/batch_matmul.cc +++ b/lib/local-execution/src/ops/batch_matmul.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "batch_matmul.h" +#include "local-execution/ops/batch_matmul.h" #include "kernels/batch_matmul_kernels.h" #include "local-execution/op_task_signature.h" #include "op-attrs/get_output_shapes.h" diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index 254d7ef39e..62155aa161 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "batch_norm.h" +#include "local-execution/ops/batch_norm.h" #include "kernels/batch_norm_kernels.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index d3e43a46a0..846faa9262 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "cast.h" +#include "local-execution/ops/cast.h" #include "kernels/cast_kernels.h" #include "local-execution/op_task_signature.h" diff --git a/lib/local-execution/src/ops/combine.cc b/lib/local-execution/src/ops/combine.cc index 92f2931344..b7e84878f4 100644 --- a/lib/local-execution/src/ops/combine.cc +++ b/lib/local-execution/src/ops/combine.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "combine.h" +#include "local-execution/ops/combine.h" #include "kernels/combine_kernels.h" #include "local-execution/op_task_invocation.h" #include "utils/hash-utils.h" diff --git a/lib/local-execution/src/ops/concat.cc b/lib/local-execution/src/ops/concat.cc index 42d98c336a..dee1dd08e5 100644 --- a/lib/local-execution/src/ops/concat.cc +++ b/lib/local-execution/src/ops/concat.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "concat.h" +#include "local-execution/ops/concat.h" #include "kernels/concat_kernels.h" #include "local-execution/op_task_signature.h" diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc index 7694a03947..7ae92d70c7 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -1,4 +1,4 @@ -#include "conv_2d.h" +#include "local-execution/ops/conv_2d.h" #include "kernels/conv_2d_kernels.h" #include "op-attrs/get_output_shapes.h" diff --git a/lib/local-execution/src/ops/dropout.cc b/lib/local-execution/src/ops/dropout.cc index 77a2963313..017d023ec4 100644 --- a/lib/local-execution/src/ops/dropout.cc +++ b/lib/local-execution/src/ops/dropout.cc @@ -1,4 +1,4 @@ -#include "dropout.h" +#include "local-execution/ops/dropout.h" #include "kernels/dropout_kernels.h" #include "local-execution/op_task_invocation.h" #include "local-execution/op_task_signature.h" diff --git a/lib/local-execution/src/ops/element_binary.cc b/lib/local-execution/src/ops/element_binary.cc index 2152b1beea..d4c12c7285 100644 --- a/lib/local-execution/src/ops/element_binary.cc +++ b/lib/local-execution/src/ops/element_binary.cc @@ -1,4 +1,4 @@ -#include "element_binary.h" +#include "local-execution/ops/element_binary.h" #include "kernels/element_binary_kernels.h" #include "local-execution/task_signature_impl.h" #include "op-attrs/get_output_shapes.h" diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc index ccb41d7461..85ecf3db23 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -1,4 +1,4 @@ -#include "element_unary.h" +#include "local-execution/ops/element_unary.h" #include "kernels/element_unary_kernels.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/parallel_tensor_shape.h" diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 8df5703f60..ef4dc7ab68 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -1,4 +1,4 @@ -#include "flat.h" +#include "local-execution/ops/flat.h" #include "kernels/flat_kernels.h" #include "op-attrs/get_output_shapes.h" diff --git a/lib/local-execution/src/ops/gather.cc 
b/lib/local-execution/src/ops/gather.cc index 558988f9a4..180026e9ba 100644 --- a/lib/local-execution/src/ops/gather.cc +++ b/lib/local-execution/src/ops/gather.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "gather.h" +#include "local-execution/ops/gather.h" #include "kernels/gather_kernels.h" #include "local-execution/legion_tensor_shape.h" #include "op-attrs/get_output_shapes.h" diff --git a/lib/local-execution/src/ops/input.cc b/lib/local-execution/src/ops/input.cc index 56d19fa1ba..d7a3888220 100644 --- a/lib/local-execution/src/ops/input.cc +++ b/lib/local-execution/src/ops/input.cc @@ -1,4 +1,4 @@ -#include "input.h" +#include "local-execution/ops/input.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/layer_norm.cc b/lib/local-execution/src/ops/layer_norm.cc index b1f44d69ae..c9e2a8d55e 100644 --- a/lib/local-execution/src/ops/layer_norm.cc +++ b/lib/local-execution/src/ops/layer_norm.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "layer_norm.h" +#include "local-execution/ops/layer_norm.h" #include "kernels/layer_norm_kernels.h" #include "local-execution/legion_tensor_shape.h" #include "op-attrs/get_output_shapes.h" diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 9e29a0cce0..075aa1d9e4 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -1,4 +1,4 @@ -#include "linear.h" +#include "local-execution/ops/linear.h" #include "kernels/linear_kernels.h" #include "local-execution/task_argument_accessor.h" #include "op-attrs/ff_dim.h" diff --git a/lib/local-execution/src/ops/noop.cc b/lib/local-execution/src/ops/noop.cc index e35fdec275..7357806880 100644 --- a/lib/local-execution/src/ops/noop.cc +++ b/lib/local-execution/src/ops/noop.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "noop.h" +#include "local-execution/ops/noop.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index 126f57be0d..66f27fa69f 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -1,4 +1,4 @@ -#include "pool_2d.h" +#include "local-execution/ops/pool_2d.h" #include "kernels/pool_2d_kernels.h" #include "op-attrs/get_output_shapes.h" diff --git a/lib/local-execution/src/ops/reduce.cc b/lib/local-execution/src/ops/reduce.cc index 01d2f0e86f..c157a98b36 100644 --- a/lib/local-execution/src/ops/reduce.cc +++ b/lib/local-execution/src/ops/reduce.cc @@ -1,4 +1,4 @@ -#include "reduce.h" +#include "local-execution/ops/reduce.h" #include "kernels/reduce_kernels.h" #include "op-attrs/get_output_shapes.h" diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc index f946b7d146..95962661e2 100644 --- a/lib/local-execution/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "reduction.h" +#include "local-execution/ops/reduction.h" #include "kernels/reduction_kernels.h" #include "op-attrs/get_output_shapes.h" #include "utils/exception.h" diff --git a/lib/local-execution/src/ops/repartition.cc b/lib/local-execution/src/ops/repartition.cc index e260fd77f5..9bba8109f3 100644 --- a/lib/local-execution/src/ops/repartition.cc +++ b/lib/local-execution/src/ops/repartition.cc @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "repartition.h" +#include "local-execution/ops/repartition.h" #include "kernels/partition_kernels.h" #include "op-attrs/get_output_shapes.h" #include "utils/exception.h" diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc index 10cd80a6d9..5ae93c4439 100644 --- a/lib/local-execution/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "replicate.h" +#include "local-execution/ops/replicate.h" #include "kernels/replicate_kernels.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/parallel_tensor_shape.h" diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc index 433e961a8a..838542a8eb 100644 --- a/lib/local-execution/src/ops/reshape.cc +++ b/lib/local-execution/src/ops/reshape.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "reshape.h" +#include "local-execution/ops/reshape.h" #include "kernels/reshape_kernels.h" #include "op-attrs/get_output_shapes.h" diff --git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc index b767b61b20..63032585b8 100644 --- a/lib/local-execution/src/ops/reverse.cc +++ b/lib/local-execution/src/ops/reverse.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "reverse.h" +#include "local-execution/ops/reverse.h" #include "kernels/accessor.h" #include "kernels/reverse_kernels.h" #include "op-attrs/get_output_shapes.h" diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc index 36c4afcaf3..5e78781ddc 100644 --- a/lib/local-execution/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "softmax.h" +#include "local-execution/ops/softmax.h" #include "kernels/softmax_kernels.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/parallel_tensor_shape.h" diff --git a/lib/local-execution/src/ops/split.cc b/lib/local-execution/src/ops/split.cc index dc627aae96..556d30109b 100644 --- a/lib/local-execution/src/ops/split.cc +++ b/lib/local-execution/src/ops/split.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "split.h" +#include "local-execution/ops/split.h" #include "kernels/array_shape.h" #include "kernels/split_kernels.h" #include "op-attrs/get_output_shapes.h" diff --git a/lib/local-execution/src/ops/topk.cc b/lib/local-execution/src/ops/topk.cc index ea4fc09e19..41a28340db 100644 --- a/lib/local-execution/src/ops/topk.cc +++ b/lib/local-execution/src/ops/topk.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "topk.h" +#include "local-execution/ops/topk.h" #include "kernels/topk_kernels.h" #include "op-attrs/get_output_shapes.h" #include "utils/exception.h" diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc index 435df464c0..78e9fbde6f 100644 --- a/lib/local-execution/src/ops/transpose.cc +++ b/lib/local-execution/src/ops/transpose.cc @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "transpose.h" +#include "local-execution/ops/transpose.h" #include "kernels/transpose_kernels.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/transpose.h" diff --git a/lib/local-execution/src/ops/weight.cc b/lib/local-execution/src/ops/weight.cc index 5537163e85..f96c104f33 100644 --- a/lib/local-execution/src/ops/weight.cc +++ b/lib/local-execution/src/ops/weight.cc @@ -1,4 +1,4 @@ -#include "weight.h" +#include "local-execution/ops/weight.h" namespace FlexFlow { diff --git a/lib/local-execution/src/task_signature_impl.cc b/lib/local-execution/src/task_signature_impl.cc index 3072b9a8bd..199e232a6b 100644 --- a/lib/local-execution/src/task_signature_impl.cc +++ b/lib/local-execution/src/task_signature_impl.cc @@ -1,33 +1,33 @@ #include "local-execution/task_signature_impl.h" -#include "ops/attention.h" -#include "ops/batch_matmul.h" -#include "ops/batch_norm.h" -#include "ops/cast.h" -#include "ops/combine.h" -#include "ops/concat.h" -#include "ops/conv_2d.h" -#include "ops/dropout.h" -#include "ops/element_binary.h" -#include "ops/element_unary.h" -#include "ops/embedding.h" -#include "ops/flat.h" -#include "ops/gather.h" -#include "ops/input.h" -#include "ops/layer_norm.h" -#include "ops/linear.h" -#include "ops/noop.h" -#include "ops/pool_2d.h" -#include "ops/reduce.h" -#include "ops/reduction.h" -#include "ops/repartition.h" -#include "ops/replicate.h" -#include "ops/reshape.h" -#include "ops/reverse.h" -#include "ops/softmax.h" -#include "ops/split.h" -#include "ops/topk.h" -#include "ops/transpose.h" -#include "ops/weight.h" +#include "local-execution/ops/attention.h" +#include "local-execution/ops/batch_matmul.h" +#include "local-execution/ops/batch_norm.h" +#include "local-execution/ops/cast.h" +#include "local-execution/ops/combine.h" +#include "local-execution/ops/concat.h" +#include "local-execution/ops/conv_2d.h" +#include "local-execution/ops/dropout.h" +#include "local-execution/ops/element_binary.h" +#include "local-execution/ops/element_unary.h" +#include "local-execution/ops/embedding.h" +#include "local-execution/ops/flat.h" +#include "local-execution/ops/gather.h" +#include "local-execution/ops/input.h" +#include "local-execution/ops/layer_norm.h" +#include "local-execution/ops/linear.h" +#include "local-execution/ops/noop.h" +#include "local-execution/ops/pool_2d.h" +#include "local-execution/ops/reduce.h" +#include "local-execution/ops/reduction.h" +#include "local-execution/ops/repartition.h" +#include "local-execution/ops/replicate.h" +#include "local-execution/ops/reshape.h" +#include "local-execution/ops/reverse.h" +#include "local-execution/ops/softmax.h" +#include "local-execution/ops/split.h" +#include "local-execution/ops/topk.h" +#include "local-execution/ops/transpose.h" +#include "local-execution/ops/weight.h" #include "utils/overload.h" namespace FlexFlow { diff --git a/lib/op-attrs/include/op-attrs/operator_attrs.h b/lib/op-attrs/include/op-attrs/operator_attrs.h index 268554b5be..11afc5b209 100644 --- a/lib/op-attrs/include/op-attrs/operator_attrs.h +++ b/lib/op-attrs/include/op-attrs/operator_attrs.h @@ -3,35 +3,35 @@ #include "op-attrs/ops/core.h" #include "op-attrs/pcg_operator_attrs.dtg.h" -#include "ops/attention.h" -#include "ops/batch_matmul.h" -#include "ops/batch_norm.h" -#include "ops/broadcast.h" -#include "ops/cast.h" -#include "ops/combine.h" -#include "ops/concat.h" -#include "ops/conv_2d.h" -#include "ops/dropout.h" -#include "ops/element_binary.h" -#include "ops/element_unary.h" -#include "ops/embedding.h" 
-#include "ops/flat.h" -#include "ops/gather.h" -#include "ops/input.h" -#include "ops/layer_norm.h" -#include "ops/linear.h" -#include "ops/noop.h" -#include "ops/pool_2d.h" -#include "ops/reduce.h" -#include "ops/reduction.h" -#include "ops/repartition.h" -#include "ops/replicate.h" -#include "ops/reshape.h" -#include "ops/reverse.h" -#include "ops/softmax.h" -#include "ops/split.h" -#include "ops/topk.h" -#include "ops/transpose.h" +#include "local-execution/ops/attention.h" +#include "local-execution/ops/batch_matmul.h" +#include "local-execution/ops/batch_norm.h" +#include "local-execution/ops/broadcast.h" +#include "local-execution/ops/cast.h" +#include "local-execution/ops/combine.h" +#include "local-execution/ops/concat.h" +#include "local-execution/ops/conv_2d.h" +#include "local-execution/ops/dropout.h" +#include "local-execution/ops/element_binary.h" +#include "local-execution/ops/element_unary.h" +#include "local-execution/ops/embedding.h" +#include "local-execution/ops/flat.h" +#include "local-execution/ops/gather.h" +#include "local-execution/ops/input.h" +#include "local-execution/ops/layer_norm.h" +#include "local-execution/ops/linear.h" +#include "local-execution/ops/noop.h" +#include "local-execution/ops/pool_2d.h" +#include "local-execution/ops/reduce.h" +#include "local-execution/ops/reduction.h" +#include "local-execution/ops/repartition.h" +#include "local-execution/ops/replicate.h" +#include "local-execution/ops/reshape.h" +#include "local-execution/ops/reverse.h" +#include "local-execution/ops/softmax.h" +#include "local-execution/ops/split.h" +#include "local-execution/ops/topk.h" +#include "local-execution/ops/transpose.h" #include "utils/record_formatter.h" #include "utils/variant.h" #include From c6fed294c5b31001f978123c43681c0db32b3e0b Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 13 Nov 2024 13:24:19 -0800 Subject: [PATCH 22/22] Add tensor type, operate over reduced tensor --- .../local-execution/itask_argument_accessor.h | 8 +- .../layer_tensor_key.struct.toml | 23 +++ .../local-execution/local_slots_backing.h | 45 +++--- .../local_task_argument_accessor.h | 10 +- .../local-execution/local_training_backing.h | 13 +- .../include/local-execution/loss_functions.h | 2 +- .../local-execution/model_training_instance.h | 13 +- .../local-execution/op_task_invocation.h | 6 +- .../op_tensor_slot_spec.struct.toml | 6 +- .../include/local-execution/optimizer.h | 14 +- .../reduced_tensor_t.struct.toml | 13 ++ ...t.toml => slot_tensor_type_id.struct.toml} | 8 +- .../local-execution/task_argument_accessor.h | 59 +++++++- .../include/local-execution/task_binding.h | 12 +- .../include/local-execution/task_signature.h | 4 +- .../task_signature.struct.toml | 4 +- .../tensor_guid_slot_spec.struct.toml | 22 --- .../tensor_guid_spec.struct.toml | 23 --- .../local-execution/tensor_reduction.h | 15 ++ .../local-execution/tensor_type.enum.toml | 20 +++ .../tensor_type_slot_spec.struct.toml | 26 ++++ .../unified_tensor_guid.variant.toml | 21 --- .../src/local_slots_backing.cc | 137 +++++++++--------- .../src/local_task_argument_accessor.cc | 12 +- .../src/local_training_backing.cc | 40 ++--- lib/local-execution/src/loss_functions.cc | 19 ++- .../src/model_training_instance.cc | 64 +++++--- lib/local-execution/src/op_task_invocation.cc | 14 +- lib/local-execution/src/op_task_signature.cc | 42 ++++-- lib/local-execution/src/optimizer.cc | 45 +++--- lib/local-execution/src/task_binding.cc | 17 ++- lib/local-execution/src/task_signature.cc | 10 +- 
lib/local-execution/src/tensor_reduction.cc | 17 +++ .../include/op-attrs/operator_attrs.h | 4 +- 34 files changed, 471 insertions(+), 317 deletions(-) create mode 100644 lib/local-execution/include/local-execution/layer_tensor_key.struct.toml create mode 100644 lib/local-execution/include/local-execution/reduced_tensor_t.struct.toml rename lib/local-execution/include/local-execution/{slot_grad_id.struct.toml => slot_tensor_type_id.struct.toml} (62%) delete mode 100644 lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml delete mode 100644 lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml create mode 100644 lib/local-execution/include/local-execution/tensor_reduction.h create mode 100644 lib/local-execution/include/local-execution/tensor_type.enum.toml create mode 100644 lib/local-execution/include/local-execution/tensor_type_slot_spec.struct.toml delete mode 100644 lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml create mode 100644 lib/local-execution/src/tensor_reduction.cc diff --git a/lib/local-execution/include/local-execution/itask_argument_accessor.h b/lib/local-execution/include/local-execution/itask_argument_accessor.h index b4d188e4a3..9eff9460c2 100644 --- a/lib/local-execution/include/local-execution/itask_argument_accessor.h +++ b/lib/local-execution/include/local-execution/itask_argument_accessor.h @@ -5,6 +5,7 @@ #include "local-execution/concrete_arg.h" #include "local-execution/op_task_signature.h" #include "local-execution/privilege_tensor_accessor.h" +#include "local-execution/tensor_type.dtg.h" namespace FlexFlow { @@ -15,10 +16,11 @@ struct ITaskArgumentAccessor { virtual ConcreteArgSpec const &get_concrete_arg(slot_id_t) const = 0; - virtual GenericTensorAccessor - get_tensor(slot_id_t slot, Permissions priv, IsGrad is_grad) const = 0; + virtual GenericTensorAccessor get_tensor(slot_id_t slot, + Permissions priv, + TensorType tensor_type) const = 0; virtual VariadicGenericTensorAccessor get_variadic_tensor( - slot_id_t slot, Permissions priv, IsGrad is_grad) const = 0; + slot_id_t slot, Permissions priv, TensorType tensor_type) const = 0; virtual Allocator get_allocator() const = 0; virtual size_t get_device_idx() const = 0; diff --git a/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml b/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml new file mode 100644 index 0000000000..3ec6d7b0f1 --- /dev/null +++ b/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml @@ -0,0 +1,23 @@ +namespace = "FlexFlow" +name = "LayerTensorKey" +features = [ + "eq", + "ord", + "hash", + "json", + "rapidcheck", + "fmt", +] + +includes = [ + "pcg/layer_guid_t.dtg.h", + "local-execution/reduced_tensor_t.dtg.h" +] + +[[fields]] +name = "layer_guid" +type = "::FlexFlow::layer_guid_t" + +[[fields]] +name = "reduced_tensor" +type = "::FlexFlow::reduced_tensor_t" diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h index 46e66e97a2..a632f432cf 100644 --- a/lib/local-execution/include/local-execution/local_slots_backing.h +++ b/lib/local-execution/include/local-execution/local_slots_backing.h @@ -3,6 +3,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_SLOTS_BACKING_H #include "kernels/accessor.h" +#include "local-execution/layer_tensor_key.dtg.h" #include "local-execution/local_task_argument_accessor.h" #include "local-execution/non_graph_tensor_guid_t.dtg.h" #include 
"local-execution/op_task_invocation.h" @@ -10,26 +11,25 @@ #include "local-execution/runtime_arg_config.h" #include "local-execution/task_invocation.dtg.h" #include "local-execution/tensor_role.dtg.h" -#include "local-execution/unified_tensor_guid.dtg.h" #include "pcg/computation_graph.dtg.h" -#include "pcg/layer_guid_t.dtg.h" #include "pcg/tensor_guid_t.dtg.h" namespace FlexFlow { +using LayerTensorBackingMap = + std::unordered_map; + using TensorBackingMap = - std::unordered_map; -using NonGraphTensorBackingMap = - std::unordered_map; + std::unordered_map; struct LocalSlotsBacking { - LocalSlotsBacking(TensorBackingMap const &, RuntimeArgConfig const &); + LocalSlotsBacking(LayerTensorBackingMap const &allocated_forward_tensors, + TensorBackingMap const &allocated_non_graph_tensors, + RuntimeArgConfig const &); public: void add_per_device_op_state(layer_guid_t const &, DeviceSpecificDeviceStates const &); - void insert_into_tensor_mapping(tensor_guid_t const &, - GenericTensorAccessorW const &); void allocate_layer_tensors(layer_guid_t const &, ComputationGraph const &, Allocator &); @@ -44,7 +44,9 @@ struct LocalSlotsBacking { TaskSignature const &); TensorSlotsBacking construct_tensor_slots_backing(OpTaskBinding const &, layer_guid_t const &) const; - TensorSlotsBacking construct_tensor_slots_backing(TaskBinding const &) const; + TensorSlotsBacking + construct_tensor_slots_backing(TaskBinding const &, + std::optional const &) const; ArgSlotsBacking construct_arg_slots_backing(OpTaskBinding const &, layer_guid_t const &) const; ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &) const; @@ -53,24 +55,27 @@ struct LocalSlotsBacking { ConcreteArgSpec resolve_op_arg_ref_spec(OpArgRefSpec const &, layer_guid_t const &) const; - GenericTensorAccessorW const &get_tensor_backing(UnifiedTensorGuid const &, - IsGrad) const; + GenericTensorAccessorW const & + get_tensor_backing(TensorType const &, + reduced_tensor_t const &, + std::optional const &) const; - bool is_tensor_allocated(tensor_guid_t const &) const; - bool is_gradient_tensor_allocated(tensor_guid_t const &) const; + bool is_forward_tensor_allocated(LayerTensorKey const &) const; + bool is_non_graph_tensor_allocated(reduced_tensor_t const &) const; public: // tensors - TensorBackingMap tensor_mapping; - TensorBackingMap gradient_tensor_mapping; - NonGraphTensorBackingMap optimizer_tensor_mapping; - std::unordered_map> + LayerTensorBackingMap tensor_mapping; + LayerTensorBackingMap gradient_tensor_mapping; + LayerTensorBackingMap optimizer_tensor_mapping; + TensorBackingMap non_graph_tensor_mapping; + std::unordered_map> input_tensor_slots; - std::unordered_map> + std::unordered_map> weight_tensor_slots; - std::unordered_map> + std::unordered_map> output_tensor_slots; - std::unordered_map> + std::unordered_map> weight_optimizer_tensor_guids; // arguments diff --git a/lib/local-execution/include/local-execution/local_task_argument_accessor.h b/lib/local-execution/include/local-execution/local_task_argument_accessor.h index 1e1516a0de..db0e98c2b1 100644 --- a/lib/local-execution/include/local-execution/local_task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/local_task_argument_accessor.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H -#include "local-execution/slot_grad_id.dtg.h" +#include "local-execution/slot_tensor_type_id.dtg.h" #include "local-execution/task_argument_accessor.h" #include #include @@ 
-9,7 +9,7 @@ namespace FlexFlow { using TensorSlotsBacking = std::unordered_map< - SlotGradId, + SlotTensorTypeId, std::variant>>; using ArgSlotsBacking = std::unordered_map; @@ -25,9 +25,9 @@ struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { GenericTensorAccessor get_tensor(slot_id_t slot, Permissions priv, - IsGrad is_grad) const override; + TensorType tensor_type) const override; VariadicGenericTensorAccessor get_variadic_tensor( - slot_id_t slot, Permissions priv, IsGrad is_grad) const override; + slot_id_t slot, Permissions priv, TensorType tensor_type) const override; Allocator get_allocator() const override; @@ -40,7 +40,7 @@ struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { }; using TensorSlotsBackingWithoutAddresses = std::unordered_map< - SlotGradId, + SlotTensorTypeId, std::variant, std::vector>>>; diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index 6dfa8ad443..cbab4bf031 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -15,7 +15,8 @@ using PerLayerElapsedTime = struct LocalTrainingBacking { LocalTrainingBacking(Allocator const &, ComputationGraph const &, - TensorBackingMap const &, + LayerTensorBackingMap const &allocated_forward_tensors, + TensorBackingMap const &allocated_non_graph_tensors, RuntimeArgConfig const &); void register_and_allocate_layer(layer_guid_t const &); void allocate_layer_optimizer_tensors(layer_guid_t const &, @@ -24,17 +25,17 @@ struct LocalTrainingBacking { void execute_init(layer_guid_t const &); std::optional execute_forward(layer_guid_t const &); void compute_loss(LossAttrs const &loss_attrs, - tensor_guid_t const &logit_tensor, - tensor_guid_t const &label_tensor); + reduced_tensor_t const &logit_tensor, + reduced_tensor_t const &label_tensor); std::optional execute_backward(layer_guid_t const &); void execute_update(layer_guid_t const &, OptimizerAttrs const &); - TaskArgumentAccessor get_task_arg_accessor(TaskInvocation const &) const; + TaskArgumentAccessor + get_task_arg_accessor(TaskInvocation const &, + std::optional const &) const; TaskArgumentAccessor get_op_task_arg_accessor(OpTaskInvocation const &, layer_guid_t const &) const; - void insert_tensor(tensor_guid_t const &, GenericTensorAccessorW const &); - private: DeviceSpecificDeviceStates call_init_task_impl(task_id_t, TaskArgumentAccessor const &); diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h index 2298115d5d..4ce74da766 100644 --- a/lib/local-execution/include/local-execution/loss_functions.h +++ b/lib/local-execution/include/local-execution/loss_functions.h @@ -26,7 +26,7 @@ namespace FlexFlow { TaskImplFunction get_loss_bwd_task_impl(); TaskSignature get_loss_bwd_signature(); TaskInvocation - backward(LossAttrs const &, tensor_guid_t logit, tensor_guid_t label); + backward(LossAttrs const &, reduced_tensor_t logit, reduced_tensor_t label); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index 14473ff26e..5cc13f0b40 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -12,11 +12,12 @@ using PerLayerElapsedTime = 
struct ModelTrainingInstance { ModelTrainingInstance(Allocator const &, ComputationGraph const &, - TensorBackingMap const &, + LayerTensorBackingMap const &allocated_forward_tensors, + TensorBackingMap const &allocated_non_graph_tensors, RuntimeArgConfig const &, LossAttrs const &, - tensor_guid_t const & logit_tensor, - tensor_guid_t const & label_tensor, + reduced_tensor_t const &logit_tensor, + reduced_tensor_t const &label_tensor, OptimizerAttrs const &); void execute_init(); @@ -27,11 +28,11 @@ struct ModelTrainingInstance { ComputationGraph computation_graph; LocalTrainingBacking training_backing; LossAttrs loss_attrs; - tensor_guid_t logit_tensor; - tensor_guid_t label_tensor; + reduced_tensor_t logit_tensor; + reduced_tensor_t label_tensor; OptimizerAttrs optimizer_attrs; }; -} +} // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/op_task_invocation.h b/lib/local-execution/include/local-execution/op_task_invocation.h index 0f351c3a0e..6484981ebf 100644 --- a/lib/local-execution/include/local-execution/op_task_invocation.h +++ b/lib/local-execution/include/local-execution/op_task_invocation.h @@ -10,7 +10,7 @@ #include "local-execution/op_tensor_spec.h" #include "local-execution/profiling.h" #include "local-execution/runtime_arg_ref.h" -#include "local-execution/slot_grad_id.dtg.h" +#include "local-execution/slot_tensor_type_id.dtg.h" #include "local-execution/task_id_t.dtg.h" #include "local-execution/variadic_tensor_ref.h" #include @@ -84,14 +84,14 @@ struct OpTaskBinding { bool operator==(OpTaskBinding const &other) const; bool operator!=(OpTaskBinding const &other) const; - std::unordered_map const & + std::unordered_map const & get_tensor_bindings() const; std::unordered_map const &get_arg_bindings() const; void bind_from_forward(OpTaskBinding const &fwd); private: - std::unordered_map tensor_bindings; + std::unordered_map tensor_bindings; std::unordered_map arg_bindings; private: diff --git a/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml b/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml index 590dbe6362..54638a7eb6 100644 --- a/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml +++ b/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml @@ -11,7 +11,7 @@ includes = [ "local-execution/slot_id_t.dtg.h", "local-execution/slot_type.dtg.h", "local-execution/tensor_role.dtg.h", - "local-execution/is_grad.dtg.h", + "local-execution/tensor_type.dtg.h", "local-execution/op_slot_options.dtg.h", ] @@ -28,8 +28,8 @@ name = "tensor_role" type = "::FlexFlow::TensorRole" [[fields]] -name = "is_grad" -type = "::FlexFlow::IsGrad" +name = "tensor_type" +type = "::FlexFlow::TensorType" [[fields]] name = "slot_option" diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h index acf9b8a550..2eb480a0c1 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -14,21 +14,21 @@ namespace FlexFlow { TaskSignature get_update_signature(OptimizerAttrs const &); TaskInvocation get_update_invocation( OptimizerAttrs const &, - tensor_guid_t const &weight, - std::vector const &grad_buffer_tensors); + reduced_tensor_t const &weight, + std::vector const &grad_buffer_tensors); TaskImplFunction get_update_task_impl(OptimizerAttrs const &); TaskSignature get_sgd_update_signature(); TaskInvocation sgd_update(SGDOptimizerAttrs 
                          const &,
-                          tensor_guid_t const &weight,
-                          non_graph_tensor_guid_t const &sgd_v);
+                          reduced_tensor_t const &weight,
+                          reduced_tensor_t const &sgd_v);
 TaskImplFunction get_sgd_update_task_impl();

 TaskSignature get_adam_update_signature();
 TaskInvocation adam_update(AdamOptimizerAttrs const &,
-                           tensor_guid_t const &weight,
-                           non_graph_tensor_guid_t const &adam_v,
-                           non_graph_tensor_guid_t const &adam_m);
+                           reduced_tensor_t const &weight,
+                           reduced_tensor_t const &adam_v,
+                           reduced_tensor_t const &adam_m);
 TaskImplFunction get_adam_update_task_impl();

 } // namespace FlexFlow
diff --git a/lib/local-execution/include/local-execution/reduced_tensor_t.struct.toml b/lib/local-execution/include/local-execution/reduced_tensor_t.struct.toml
new file mode 100644
index 0000000000..726249c970
--- /dev/null
+++ b/lib/local-execution/include/local-execution/reduced_tensor_t.struct.toml
@@ -0,0 +1,13 @@
+namespace = "FlexFlow"
+name = "reduced_tensor_t"
+features = [
+  "eq",
+  "ord",
+  "hash",
+  "fmt",
+]
+
+
+[[fields]]
+name = "raw_index"
+type = "int"
diff --git a/lib/local-execution/include/local-execution/slot_grad_id.struct.toml b/lib/local-execution/include/local-execution/slot_tensor_type_id.struct.toml
similarity index 62%
rename from lib/local-execution/include/local-execution/slot_grad_id.struct.toml
rename to lib/local-execution/include/local-execution/slot_tensor_type_id.struct.toml
index 256091d272..b3b3a320c7 100644
--- a/lib/local-execution/include/local-execution/slot_grad_id.struct.toml
+++ b/lib/local-execution/include/local-execution/slot_tensor_type_id.struct.toml
@@ -1,5 +1,5 @@
 namespace = "FlexFlow"
-name = "SlotGradId"
+name = "SlotTensorTypeId"
 features = [
   "eq",
   "ord",
@@ -8,7 +8,7 @@ features = [
 ]

 includes = [
-  "local-execution/is_grad.dtg.h",
+  "local-execution/tensor_type.dtg.h",
   "local-execution/slot_id_t.dtg.h",
 ]

@@ -17,5 +17,5 @@ name = "slot_id"
 type = "::FlexFlow::slot_id_t"

 [[fields]]
-name = "is_grad"
-type = "::FlexFlow::IsGrad"
+name = "tensor_type"
+type = "::FlexFlow::TensorType"
diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h
index 54c8dfc5f1..29d5fb8fbe 100644
--- a/lib/local-execution/include/local-execution/task_argument_accessor.h
+++ b/lib/local-execution/include/local-execution/task_argument_accessor.h
@@ -8,6 +8,7 @@ namespace FlexFlow {

 struct TaskArgumentAccessor {
+  // arguments
   template <typename T>
   T const &get_argument(slot_id_t slot) const {
     if constexpr (PerDeviceOpState::IsPartOfPerDeviceOpState_v<T>) {
@@ -24,6 +25,7 @@
     return this->get_argument<T>(slot_id_t{slot});
   }

+  // tensors
   template <Permissions PRIV>
   privilege_mode_to_accessor<PRIV> get_tensor(int slot) const {
     return this->get_tensor<PRIV>(slot_id_t{slot});
@@ -32,7 +34,7 @@
   template <Permissions PRIV>
   privilege_mode_to_accessor<PRIV> get_tensor(slot_id_t slot) const {
     return std::get<privilege_mode_to_accessor<PRIV>>(
-        this->ptr->get_tensor(slot, PRIV, IsGrad::NO));
+        this->ptr->get_tensor(slot, PRIV, TensorType::FORWARD));
   }

   template <Permissions PRIV>
@@ -43,9 +45,32 @@ struct TaskArgumentAccessor {
   template <Permissions PRIV>
   privilege_mode_to_accessor<PRIV> get_tensor_grad(slot_id_t slot) const {
     return std::get<privilege_mode_to_accessor<PRIV>>(
-        this->ptr->get_tensor(slot, PRIV, IsGrad::YES));
+        this->ptr->get_tensor(slot, PRIV, TensorType::GRADIENT));
   }

+  template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> get_optimizer_tensor(int slot) const {
+    return this->get_optimizer_tensor<PRIV>(slot_id_t{slot});
+  }
+
+  template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> get_optimizer_tensor(slot_id_t slot) const {
+    return std::get<privilege_mode_to_accessor<PRIV>>(
+        this->ptr->get_tensor(slot, PRIV, TensorType::OPTIMIZER));
+  }
+
+  template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> get_non_graph_tensor(int slot) const {
+    return this->get_non_graph_tensor<PRIV>(slot_id_t{slot});
+  }
+
+  template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> get_non_graph_tensor(slot_id_t slot) const {
+    return std::get<privilege_mode_to_accessor<PRIV>>(
+        this->ptr->get_tensor(slot, PRIV, TensorType::NON_GRAPH));
+  }
+
+  // variadic tensors
   template <Permissions PRIV>
   std::vector<privilege_mode_to_accessor<PRIV>>
       get_variadic_tensor(int slot) const {
@@ -56,7 +81,7 @@
   std::vector<privilege_mode_to_accessor<PRIV>>
       get_variadic_tensor(slot_id_t slot) const {
     return std::get<std::vector<privilege_mode_to_accessor<PRIV>>>(
-        this->ptr->get_variadic_tensor(slot, PRIV, IsGrad::NO));
+        this->ptr->get_variadic_tensor(slot, PRIV, TensorType::FORWARD));
   }

   template <Permissions PRIV>
@@ -69,7 +94,33 @@
   std::vector<privilege_mode_to_accessor<PRIV>>
       get_variadic_tensor_grad(slot_id_t slot) const {
     return std::get<std::vector<privilege_mode_to_accessor<PRIV>>>(
-        this->ptr->get_variadic_tensor(slot, PRIV, IsGrad::YES));
+        this->ptr->get_variadic_tensor(slot, PRIV, TensorType::GRADIENT));
+  }
+
+  template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_optimizer_tensor(int slot) const {
+    return this->get_variadic_optimizer_tensor<PRIV>(slot_id_t{slot});
+  }
+
+  template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_optimizer_tensor(slot_id_t slot) const {
+    return std::get<std::vector<privilege_mode_to_accessor<PRIV>>>(
+        this->ptr->get_variadic_tensor(slot, PRIV, TensorType::OPTIMIZER));
+  }
+
+  template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_non_graph_tensor(int slot) const {
+    return this->get_variadic_non_graph_tensor<PRIV>(slot_id_t{slot});
+  }
+
+  template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_non_graph_tensor(slot_id_t slot) const {
+    return std::get<std::vector<privilege_mode_to_accessor<PRIV>>>(
+        this->ptr->get_variadic_tensor(slot, PRIV, TensorType::NON_GRAPH));
   }

   Allocator get_allocator() const {
diff --git a/lib/local-execution/include/local-execution/task_binding.h b/lib/local-execution/include/local-execution/task_binding.h
index 96c96473e4..93461e2e55 100644
--- a/lib/local-execution/include/local-execution/task_binding.h
+++ b/lib/local-execution/include/local-execution/task_binding.h
@@ -1,12 +1,12 @@
 #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H
 #define _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H

-#include "local-execution/slot_grad_id.dtg.h"
+#include "local-execution/reduced_tensor_t.dtg.h"
 #include "local-execution/slot_id_t.dtg.h"
+#include "local-execution/slot_tensor_type_id.dtg.h"
 #include "local-execution/task_arg_spec.dtg.h"
 #include "local-execution/task_id_t.dtg.h"
 #include "local-execution/task_signature.dtg.h"
-#include "local-execution/tensor_guid_spec.dtg.h"
 #include "utils/hash/unordered_map.h"

 namespace FlexFlow {
@@ -14,8 +14,8 @@ namespace FlexFlow {
 struct TaskBinding {
   TaskBinding() = default;

-  void bind(int, TensorGuidSpec const &);
-  void bind(slot_id_t, TensorGuidSpec const &);
+  void bind(int, TensorType const &, reduced_tensor_t const &);
+  void bind(slot_id_t, TensorType const &, reduced_tensor_t const &);

   template <typename T>
   void bind_arg(int name, T const &t) {
@@ -40,12 +40,12 @@ struct TaskBinding {
   bool operator==(TaskBinding const &other) const;
   bool operator!=(TaskBinding const &other) const;

-  std::unordered_map<SlotGradId, TensorGuidSpec> const &
+  std::unordered_map<SlotTensorTypeId, reduced_tensor_t> const &
       get_tensor_bindings() const;
   std::unordered_map<slot_id_t, TaskArgSpec> const &get_arg_bindings() const;

 private:
-  std::unordered_map<SlotGradId, TensorGuidSpec> tensor_bindings;
+  std::unordered_map<SlotTensorTypeId, reduced_tensor_t> tensor_bindings;
   std::unordered_map<slot_id_t, TaskArgSpec> arg_bindings;

 private:
diff --git a/lib/local-execution/include/local-execution/task_signature.h b/lib/local-execution/include/local-execution/task_signature.h
index 6da69f2441..b10edce6d4 100644
--- a/lib/local-execution/include/local-execution/task_signature.h
+++
b/lib/local-execution/include/local-execution/task_signature.h @@ -10,11 +10,11 @@ TaskSignature make_empty_task_signature(); void add_slot(TaskSignature &, int name, - IsGrad, + TensorType, SlotType slot_type = SlotType::TENSOR); void add_slot(TaskSignature &, slot_id_t name, - IsGrad, + TensorType, SlotType slot_type = SlotType::TENSOR); template diff --git a/lib/local-execution/include/local-execution/task_signature.struct.toml b/lib/local-execution/include/local-execution/task_signature.struct.toml index ac408a7b68..7efb0c658a 100644 --- a/lib/local-execution/include/local-execution/task_signature.struct.toml +++ b/lib/local-execution/include/local-execution/task_signature.struct.toml @@ -7,7 +7,7 @@ features = [ ] includes = [ - "local-execution/tensor_guid_slot_spec.dtg.h", + "local-execution/tensor_type_slot_spec.dtg.h", "local-execution/slot_id_t.dtg.h", "", "" @@ -30,4 +30,4 @@ type = "std::unordered_map<::FlexFlow::slot_id_t, std::type_index>" [[fields]] name = "tensor_guid_slots" -type = "std::unordered_map<::FlexFlow::slot_id_t, ::FlexFlow::TensorGuidSlotSpec>" +type = "std::unordered_map<::FlexFlow::slot_id_t, ::FlexFlow::TensorTypeSlotSpec>" diff --git a/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml b/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml deleted file mode 100644 index 9b7e9c14f9..0000000000 --- a/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml +++ /dev/null @@ -1,22 +0,0 @@ -namespace = "FlexFlow" -name = "TensorGuidSlotSpec" -features = [ - "eq", - "fmt", - "hash", - "ord", -] - -includes = [ - "local-execution/slot_type.dtg.h", - "local-execution/is_grad.dtg.h", -] - -[[fields]] -name = "slot_type" -type = "::FlexFlow::SlotType" - -[[fields]] -name = "is_grad" -type = "::FlexFlow::IsGrad" - diff --git a/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml b/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml deleted file mode 100644 index 1d147f60e5..0000000000 --- a/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml +++ /dev/null @@ -1,23 +0,0 @@ -namespace = "FlexFlow" -name = "TensorGuidSpec" -features = [ - "eq", - "fmt", - "hash", - "ord" -] - -includes = [ - "pcg/tensor_guid_t.dtg.h", - "local-execution/is_grad.dtg.h", - "local-execution/unified_tensor_guid.dtg.h" -] - -[[fields]] -name = "tensor_guid" -type = "::FlexFlow::UnifiedTensorGuid" - -[[fields]] -name = "is_grad" -type = "::FlexFlow::IsGrad" - diff --git a/lib/local-execution/include/local-execution/tensor_reduction.h b/lib/local-execution/include/local-execution/tensor_reduction.h new file mode 100644 index 0000000000..eb55b07ee4 --- /dev/null +++ b/lib/local-execution/include/local-execution/tensor_reduction.h @@ -0,0 +1,15 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TENSOR_REDUCTION_H +#define _FLEXFLOW_LOCAL_EXECUTION_TENSOR_REDUCTION_H + +#include "local-execution/reduced_tensor_t.dtg.h" +#include "pcg/tensor_guid_t.dtg.h" + +namespace FlexFlow { + +reduced_tensor_t lower(tensor_guid_t const &); + +std::vector lower(std::vector const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/tensor_type.enum.toml b/lib/local-execution/include/local-execution/tensor_type.enum.toml new file mode 100644 index 0000000000..31ce5ba83a --- /dev/null +++ b/lib/local-execution/include/local-execution/tensor_type.enum.toml @@ -0,0 +1,20 @@ +namespace = "FlexFlow" +name = "TensorType" +features = [ + "hash", + "fmt", + 
"rapidcheck", + "json", +] + +[[values]] +name = "NON_GRAPH" + +[[values]] +name = "FORWARD" + +[[values]] +name = "GRADIENT" + +[[values]] +name = "OPTIMIZER" diff --git a/lib/local-execution/include/local-execution/tensor_type_slot_spec.struct.toml b/lib/local-execution/include/local-execution/tensor_type_slot_spec.struct.toml new file mode 100644 index 0000000000..ceba809474 --- /dev/null +++ b/lib/local-execution/include/local-execution/tensor_type_slot_spec.struct.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "TensorTypeSlotSpec" +features = [ + "eq", + "fmt", + "hash", + "ord", +] + +includes = [ + "local-execution/slot_type.dtg.h", + "local-execution/slot_id_t.dtg.h", + "local-execution/tensor_type.dtg.h", +] + +[[fields]] +name = "slot_id" +type = "::FlexFlow::slot_id_t" + +[[fields]] +name = "tensor_type" +type = "::FlexFlow::TensorType" + +[[fields]] +name = "slot_type" +type = "::FlexFlow::SlotType" diff --git a/lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml b/lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml deleted file mode 100644 index 3d2cd8e45f..0000000000 --- a/lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml +++ /dev/null @@ -1,21 +0,0 @@ -namespace = "FlexFlow" -name = "UnifiedTensorGuid" -features = [ - "eq", - "ord", - "hash", - "fmt", -] - -includes = [ - "pcg/tensor_guid_t.dtg.h", - "local-execution/non_graph_tensor_guid_t.dtg.h", -] - -[[values]] -type = "::FlexFlow::tensor_guid_t" -key = "tensor_guid" - -[[values]] -type = "::FlexFlow::non_graph_tensor_guid_t" -key = "non_graph_tensor_guid" diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index 25abc72567..f1bb5a9a5b 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -1,4 +1,5 @@ #include "local-execution/local_slots_backing.h" +#include "local-execution/tensor_reduction.h" #include "op-attrs/parallel_tensor_shape.h" #include "pcg/computation_graph.h" #include "utils/containers/contains_key.h" @@ -7,9 +8,12 @@ namespace FlexFlow { -LocalSlotsBacking::LocalSlotsBacking(TensorBackingMap const &allocated_tensors, - RuntimeArgConfig const &runtime_arg_config) - : tensor_mapping(allocated_tensors), +LocalSlotsBacking::LocalSlotsBacking( + LayerTensorBackingMap const &allocated_forward_tensors, + TensorBackingMap const &allocated_non_graph_tensors, + RuntimeArgConfig const &runtime_arg_config) + : tensor_mapping(allocated_forward_tensors), + non_graph_tensor_mapping(allocated_non_graph_tensors), runtime_arg_config(runtime_arg_config){}; void LocalSlotsBacking::add_per_device_op_state( @@ -18,13 +22,6 @@ void LocalSlotsBacking::add_per_device_op_state( this->per_device_op_states.insert({op_guid, device_state}); } -void LocalSlotsBacking::insert_into_tensor_mapping( - tensor_guid_t const &tensor, GenericTensorAccessorW const &tensor_backing) { - if (!contains_key(this->tensor_mapping, tensor)) { - this->tensor_mapping.insert({tensor, tensor_backing}); - } -} - void LocalSlotsBacking::allocate_layer_tensors( layer_guid_t const &layer_guid, ComputationGraph const &computation_graph, @@ -46,15 +43,15 @@ void LocalSlotsBacking::allocate_tensors_by_role( switch (role) { case TensorRole::INPUT: tensors = get_incoming_inputs(computation_graph, layer_guid); - this->input_tensor_slots.insert({layer_guid, tensors}); + this->input_tensor_slots.insert({layer_guid, lower(tensors)}); break; case TensorRole::WEIGHT: 
tensors = get_incoming_weights(computation_graph, layer_guid); - this->weight_tensor_slots.insert({layer_guid, tensors}); + this->weight_tensor_slots.insert({layer_guid, lower(tensors)}); break; case TensorRole::OUTPUT: tensors = get_outgoing_tensors(computation_graph, layer_guid); - this->output_tensor_slots.insert({layer_guid, tensors}); + this->output_tensor_slots.insert({layer_guid, lower(tensors)}); break; default: throw mk_runtime_error("Invalid tensor role, got {}", role); @@ -62,19 +59,22 @@ void LocalSlotsBacking::allocate_tensors_by_role( for (tensor_guid_t const &tensor : tensors) { TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor); + reduced_tensor_t reduced_tensor = lower(tensor); + LayerTensorKey layer_tensor_key = + LayerTensorKey{layer_guid, reduced_tensor}; // tensor allocation - if (!is_tensor_allocated(tensor)) { + if (!is_forward_tensor_allocated(layer_tensor_key)) { GenericTensorAccessorW tensor_backing = allocator.allocate_tensor(tensor_attrs.shape); - this->tensor_mapping.insert({tensor, tensor_backing}); + this->tensor_mapping.insert({layer_tensor_key, tensor_backing}); } // gradient tensor allocation - if (tensor_attrs.create_gradients == CreateGrad::YES && - !is_gradient_tensor_allocated(tensor)) { + if (tensor_attrs.create_gradients == CreateGrad::YES) { GenericTensorAccessorW gradient_tensor_backing = allocator.allocate_tensor(tensor_attrs.shape); - this->gradient_tensor_mapping.insert({tensor, gradient_tensor_backing}); + this->gradient_tensor_mapping.insert( + {layer_tensor_key, gradient_tensor_backing}); } } } @@ -85,53 +85,52 @@ void LocalSlotsBacking::allocate_optimizer_tensors( ComputationGraph const &cg, Allocator &allocator, TaskSignature const &sig) { - GenericTensorAccessorW weight_backing = - get_tensor_backing(UnifiedTensorGuid{weight}, IsGrad::NO); + GenericTensorAccessorW weight_backing = this->get_tensor_backing( + TensorType::FORWARD, lower(weight), weight_layer); int num_grad_buffer_tensors = sig.tensor_guid_slots.size() - 2; // ignore 2 (weight and weight_grad) - std::vector grad_buffer_tensors; + std::vector optimizer_buffer_tensors; for (int i = 0; i < num_grad_buffer_tensors; ++i) { - non_graph_tensor_guid_t buffer_tensor_guid = non_graph_tensor_guid_t{i}; + reduced_tensor_t buffer_tensor = reduced_tensor_t{i}; GenericTensorAccessorW buffer_backing = allocator.allocate_tensor( get_tensor_shape(weight_backing.shape, weight_backing.data_type)); - this->optimizer_tensor_mapping.insert({buffer_tensor_guid, buffer_backing}); - grad_buffer_tensors.push_back(buffer_tensor_guid); + this->optimizer_tensor_mapping.insert( + {LayerTensorKey{weight_layer, buffer_tensor}, buffer_backing}); + optimizer_buffer_tensors.push_back(buffer_tensor); } this->weight_optimizer_tensor_guids.insert( - {weight_layer, grad_buffer_tensors}); + {weight_layer, optimizer_buffer_tensors}); } -bool LocalSlotsBacking::is_tensor_allocated( - tensor_guid_t const &tensor_id) const { - return contains_key(this->tensor_mapping, tensor_id); +bool LocalSlotsBacking::is_forward_tensor_allocated( + LayerTensorKey const &layer_tensor_id) const { + return contains_key(this->tensor_mapping, layer_tensor_id); } -bool LocalSlotsBacking::is_gradient_tensor_allocated( - tensor_guid_t const &tensor_id) const { - return contains_key(this->gradient_tensor_mapping, tensor_id); +bool LocalSlotsBacking::is_non_graph_tensor_allocated( + reduced_tensor_t const &tensor_id) const { + return contains_key(this->non_graph_tensor_mapping, tensor_id); } -GenericTensorAccessorW const & 
- LocalSlotsBacking::get_tensor_backing(UnifiedTensorGuid const &tensor_id, - IsGrad is_grad) const { - if (tensor_id.has()) { - tensor_guid_t graph_tensor_guid = tensor_id.get(); - switch (is_grad) { - case IsGrad::NO: - assert(contains_key(this->tensor_mapping, graph_tensor_guid)); - return this->tensor_mapping.at(graph_tensor_guid); - case IsGrad::YES: - assert(contains_key(this->gradient_tensor_mapping, graph_tensor_guid)); - return this->gradient_tensor_mapping.at(graph_tensor_guid); - default: - throw mk_runtime_error(fmt::format( - "IsGrad should only have YES or NO, received {}", is_grad)); - } - } else { - non_graph_tensor_guid_t non_graph_tensor_guid = - tensor_id.get(); - assert(contains_key(this->optimizer_tensor_mapping, non_graph_tensor_guid)); - return this->optimizer_tensor_mapping.at(non_graph_tensor_guid); +GenericTensorAccessorW const &LocalSlotsBacking::get_tensor_backing( + TensorType const &tensor_type, + reduced_tensor_t const &tensor_id, + std::optional const &layer_guid) const { + switch (tensor_type) { + case TensorType::FORWARD: + return this->tensor_mapping.at( + LayerTensorKey{layer_guid.value(), tensor_id}); + case TensorType::NON_GRAPH: + return this->non_graph_tensor_mapping.at(tensor_id); + case TensorType::GRADIENT: + return this->gradient_tensor_mapping.at( + LayerTensorKey{layer_guid.value(), tensor_id}); + case TensorType::OPTIMIZER: + return this->optimizer_tensor_mapping.at( + LayerTensorKey{layer_guid.value(), tensor_id}); + default: + throw mk_runtime_error( + fmt::format("Invalid tensor type {}", tensor_type)); } } @@ -140,9 +139,9 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( TensorSlotsBacking mapping; for (auto const &tensor_binding : binding.get_tensor_bindings()) { - SlotGradId slot_grad_id = tensor_binding.first; + SlotTensorTypeId slot_grad_id = tensor_binding.first; OpTensorSpec tensor_spec = tensor_binding.second; - std::vector tensor_guids; + std::vector tensor_guids; int weight_adjusted_idx = 0; switch (tensor_spec.role) { case TensorRole::WEIGHT: @@ -162,26 +161,25 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( fmt::format("Invalid TensorRole {}", tensor_spec.role)); } - IsGrad is_grad = slot_grad_id.is_grad; - GenericTensorAccessorW tensor_backing = this->get_tensor_backing( - UnifiedTensorGuid{tensor_guids.at(tensor_spec.idx)}, is_grad); - - mapping.insert({slot_grad_id, tensor_backing}); + mapping.insert({slot_grad_id, + this->get_tensor_backing(slot_grad_id.tensor_type, + tensor_guids.at(tensor_spec.idx), + op_guid)}); } return mapping; } TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( - TaskBinding const &binding) const { + TaskBinding const &binding, + std::optional const &layer_guid) const { TensorSlotsBacking mapping; for (auto const &tensor_binding : binding.get_tensor_bindings()) { - SlotGradId slot_grad_id = tensor_binding.first; - TensorGuidSpec tensor_spec = tensor_binding.second; - + reduced_tensor_t tensor_id = tensor_binding.second; + SlotTensorTypeId slot_tensor_type_id = tensor_binding.first; GenericTensorAccessorW accessor = this->get_tensor_backing( - UnifiedTensorGuid{tensor_spec.tensor_guid}, slot_grad_id.is_grad); - mapping.insert({slot_grad_id, accessor}); + slot_tensor_type_id.tensor_type, tensor_id, layer_guid); + mapping.insert({slot_tensor_type_id, accessor}); } return mapping; @@ -229,13 +227,14 @@ ConcreteArgSpec LocalSlotsBacking::resolve_op_arg_ref_spec( op_arg_ref_spec.get_ref_type().get(); 
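// Usage sketch for the TensorType-dispatched lookup defined above
// (illustrative only; `backing`, `layer`, and `t` stand for a
// LocalSlotsBacking, a layer_guid_t, and a reduced_tensor_t registered
// during allocation):
//
//   GenericTensorAccessorW const &fwd =
//       backing.get_tensor_backing(TensorType::FORWARD, t, layer);
//   GenericTensorAccessorW const &grad =
//       backing.get_tensor_backing(TensorType::GRADIENT, t, layer);
//   // NON_GRAPH tensors (e.g. loss labels) are not layer-scoped, so the
//   // layer argument may be std::nullopt:
//   GenericTensorAccessorW const &label =
//       backing.get_tensor_backing(TensorType::NON_GRAPH, t, std::nullopt);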
assert(contains_key(this->input_tensor_slots, op_guid)); - std::vector input_tensor_guids = + std::vector input_tensor_guids = this->input_tensor_slots.at(op_guid); assert(input_tensor_guids.size() > index_op_arg_ref.idx); - GenericTensorAccessorW tensor_backing = this->get_tensor_backing( - UnifiedTensorGuid{input_tensor_guids.at(index_op_arg_ref.idx)}, - IsGrad::NO); + GenericTensorAccessorW tensor_backing = + this->get_tensor_backing(TensorType::FORWARD, + input_tensor_guids.at(index_op_arg_ref.idx), + op_guid); ParallelTensorShape shape = lift_to_parallel( get_tensor_shape(tensor_backing.shape, tensor_backing.data_type)); return ConcreteArgSpec::create(shape); diff --git a/lib/local-execution/src/local_task_argument_accessor.cc b/lib/local-execution/src/local_task_argument_accessor.cc index 5d0156201e..75479a1f88 100644 --- a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -19,10 +19,10 @@ ConcreteArgSpec const & } GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( - slot_id_t slot, Permissions priv, IsGrad is_grad) const { - SlotGradId slot_grad_pair = SlotGradId{slot, is_grad}; + slot_id_t slot, Permissions priv, TensorType tensor_type) const { + SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type}; auto tensor_backing = std::get( - this->tensor_slots_backing.at(slot_grad_pair)); + this->tensor_slots_backing.at(slot_tensor_type)); if (priv == Permissions::RO) { GenericTensorAccessorR readonly_tensor_backing = { tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}; @@ -34,10 +34,10 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( } } VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( - slot_id_t slot, Permissions priv, IsGrad is_grad) const { - SlotGradId slot_grad_pair = SlotGradId{slot, is_grad}; + slot_id_t slot, Permissions priv, TensorType tensor_type) const { + SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type}; auto variadic_tensor_backing = std::get>( - this->tensor_slots_backing.at(slot_grad_pair)); + this->tensor_slots_backing.at(slot_tensor_type)); if (priv == Permissions::RO) { std::vector readonly_variadic_tensor_backing = {}; for (GenericTensorAccessorW const &tensor_backing : diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 0cb8146467..e432b1afe9 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -3,6 +3,7 @@ #include "local-execution/optimizer.h" #include "local-execution/task_invocation.h" #include "local-execution/task_signature_impl.h" +#include "local-execution/tensor_reduction.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" #include "utils/containers/contains.h" @@ -15,10 +16,13 @@ namespace FlexFlow { LocalTrainingBacking::LocalTrainingBacking( Allocator const &allocator, ComputationGraph const &computation_graph, - TensorBackingMap const &tensor_backing_mapping, + LayerTensorBackingMap const &allocated_forward_tensors, + TensorBackingMap const &allocated_non_graph_tensors, RuntimeArgConfig const &runtime_arg_config) : allocator(allocator), computation_graph(computation_graph), - local_slots_backing(tensor_backing_mapping, runtime_arg_config), + local_slots_backing(allocated_forward_tensors, + allocated_non_graph_tensors, + runtime_arg_config), task_registry(empty_task_registry()) {} void 
LocalTrainingBacking::register_and_allocate_layer( @@ -96,15 +100,16 @@ std::optional } void LocalTrainingBacking::compute_loss(LossAttrs const &loss_attrs, - tensor_guid_t const &logit_tensor, - tensor_guid_t const &label_tensor) { - assert(this->local_slots_backing.is_tensor_allocated(logit_tensor) && - this->local_slots_backing.is_tensor_allocated(label_tensor)); + reduced_tensor_t const &logit_tensor, + reduced_tensor_t const &label_tensor) { + assert( + this->local_slots_backing.is_non_graph_tensor_allocated(logit_tensor) && + this->local_slots_backing.is_non_graph_tensor_allocated(label_tensor)); TaskInvocation loss_invocation = backward(loss_attrs, logit_tensor, label_tensor); // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation)); TaskArgumentAccessor loss_accessor = - this->get_task_arg_accessor(loss_invocation); + this->get_task_arg_accessor(loss_invocation, std::nullopt); TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl(); loss_impl_fn.get().function_ptr(loss_accessor); } @@ -130,28 +135,30 @@ void LocalTrainingBacking::execute_update( LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node); if (layer_attrs.attrs.has()) { // get tensors - tensor_guid_t weight_tensor = - get_only(get_outgoing_tensors(this->computation_graph, node)); - std::vector grad_buffer_tensors = + reduced_tensor_t weight_tensor = + lower(get_only(get_outgoing_tensors(this->computation_graph, node))); + std::vector optimizer_buffer_tensors = this->local_slots_backing.weight_optimizer_tensor_guids.at(node); // get invocation TaskInvocation invocation = get_update_invocation( - optimizer_attrs, weight_tensor, grad_buffer_tensors); + optimizer_attrs, weight_tensor, optimizer_buffer_tensors); // assert(is_invocation_valid(get_update_signature(attrs), invocation)); // execute update - TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation); + TaskArgumentAccessor accessor = + this->get_task_arg_accessor(invocation, node); TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs); update_impl_fn.get().function_ptr(accessor); } } TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor( - TaskInvocation const &invocation) const { + TaskInvocation const &invocation, + std::optional const &layer_guid) const { TensorSlotsBacking tensor_slots_backing = this->local_slots_backing.construct_tensor_slots_backing( - invocation.binding); + invocation.binding, layer_guid); ArgSlotsBacking arg_slots_backing = this->local_slots_backing.construct_arg_slots_backing(invocation.binding); return TaskArgumentAccessor::create( @@ -171,9 +178,4 @@ TaskArgumentAccessor LocalTrainingBacking::get_op_task_arg_accessor( this->allocator, tensor_slots_backing, arg_slots_backing); } -void LocalTrainingBacking::insert_tensor( - tensor_guid_t const &tensor, GenericTensorAccessorW const &tensor_backing) { - this->local_slots_backing.insert_into_tensor_mapping(tensor, tensor_backing); -} - } // namespace FlexFlow diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index a37c1d706b..e54841acb5 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -24,20 +24,23 @@ enum Slots { LOGIT, LABEL, ATTRS, PROFILING }; TaskSignature get_loss_bwd_signature() { TaskSignature sig = make_empty_task_signature(); - add_slot(sig, LOGIT, IsGrad::NO); - add_slot(sig, LABEL, IsGrad::NO); - add_slot(sig, LOGIT, IsGrad::YES); + add_slot(sig, LOGIT, TensorType::NON_GRAPH); + add_slot(sig, LABEL, 
TensorType::NON_GRAPH); + add_slot(sig, LOGIT, TensorType::GRADIENT); + add_arg_slot(sig, ATTRS); add_arg_slot(sig, PROFILING); return sig; } -TaskInvocation - backward(LossAttrs const &attrs, tensor_guid_t logit, tensor_guid_t label) { +TaskInvocation backward(LossAttrs const &attrs, + reduced_tensor_t logit, + reduced_tensor_t label) { TaskBinding b; - b.bind(LOGIT, TensorGuidSpec{UnifiedTensorGuid{logit}, IsGrad::NO}); - b.bind(LABEL, TensorGuidSpec{UnifiedTensorGuid{label}, IsGrad::NO}); - b.bind(LOGIT, TensorGuidSpec{UnifiedTensorGuid{logit}, IsGrad::YES}); + b.bind(LOGIT, TensorType::NON_GRAPH, logit); + b.bind(LABEL, TensorType::NON_GRAPH, label); + b.bind(LOGIT, TensorType::GRADIENT, logit); + b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index abdced1bb5..5a58e4c524 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -1,59 +1,77 @@ #include "local-execution/model_training_instance.h" #include "pcg/computation_graph.h" -#include "utils/containers/reversed.h" #include "pcg/optimizer_attrs.h" +#include "utils/containers/reversed.h" namespace FlexFlow { - -ModelTrainingInstance::ModelTrainingInstance(Allocator const & allocator, - ComputationGraph const & computation_graph, - TensorBackingMap const & tensor_backing_map, - RuntimeArgConfig const & runtime_arg_config, - LossAttrs const & loss_attrs, - tensor_guid_t const &logit_tensor, - tensor_guid_t const &label_tensor, - OptimizerAttrs const & optimizer_attrs) - : computation_graph(computation_graph), training_backing(allocator, computation_graph, tensor_backing_map, runtime_arg_config), - loss_attrs(loss_attrs), logit_tensor(logit_tensor), label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) { + +ModelTrainingInstance::ModelTrainingInstance( + Allocator const &allocator, + ComputationGraph const &computation_graph, + LayerTensorBackingMap const &allocated_forward_tensors, + TensorBackingMap const &allocated_non_graph_tensors, + RuntimeArgConfig const &runtime_arg_config, + LossAttrs const &loss_attrs, + reduced_tensor_t const &logit_tensor, + reduced_tensor_t const &label_tensor, + OptimizerAttrs const &optimizer_attrs) + : computation_graph(computation_graph), + training_backing(allocator, + computation_graph, + allocated_forward_tensors, + allocated_non_graph_tensors, + runtime_arg_config), + loss_attrs(loss_attrs), logit_tensor(logit_tensor), + label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) { // allocate each layer's tensors - for (layer_guid_t const & node: topological_ordering(this->computation_graph)) { + for (layer_guid_t const &node : + topological_ordering(this->computation_graph)) { this->training_backing.register_and_allocate_layer(node); - this->training_backing.allocate_layer_optimizer_tensors(node, this->optimizer_attrs); + this->training_backing.allocate_layer_optimizer_tensors( + node, this->optimizer_attrs); } } void ModelTrainingInstance::execute_init() { - for (layer_guid_t const & node: topological_ordering(this->computation_graph)) { + for (layer_guid_t const &node : + topological_ordering(this->computation_graph)) { this->training_backing.execute_init(node); } } PerLayerElapsedTime ModelTrainingInstance::execute_forward() { PerLayerElapsedTime per_layer_elapsed_time; - for (layer_guid_t const & node: topological_ordering(this->computation_graph)) { - std::optional elapsed_time = 
this->training_backing.execute_forward(node); + for (layer_guid_t const &node : + topological_ordering(this->computation_graph)) { + std::optional elapsed_time = + this->training_backing.execute_forward(node); per_layer_elapsed_time.insert({node, elapsed_time}); } return per_layer_elapsed_time; } PerLayerElapsedTime ModelTrainingInstance::execute_backward() { - this->training_backing.compute_loss(this->loss_attrs, this->logit_tensor, this->label_tensor); - + this->training_backing.compute_loss( + this->loss_attrs, this->logit_tensor, this->label_tensor); + PerLayerElapsedTime per_layer_elapsed_time; - for (layer_guid_t const & node: reversed(topological_ordering(this->computation_graph))) { - std::optional elapsed_time = this->training_backing.execute_backward(node); + for (layer_guid_t const &node : + reversed(topological_ordering(this->computation_graph))) { + std::optional elapsed_time = + this->training_backing.execute_backward(node); per_layer_elapsed_time.insert({node, elapsed_time}); } return per_layer_elapsed_time; } void ModelTrainingInstance::execute_update() { - for (layer_guid_t const & node: topological_ordering(this->computation_graph)) { + for (layer_guid_t const &node : + topological_ordering(this->computation_graph)) { this->training_backing.execute_update(node, this->optimizer_attrs); } - this->optimizer_attrs = get_next_iteration_optimizer_attrs(this->optimizer_attrs); + this->optimizer_attrs = + get_next_iteration_optimizer_attrs(this->optimizer_attrs); } } // namespace FlexFlow diff --git a/lib/local-execution/src/op_task_invocation.cc b/lib/local-execution/src/op_task_invocation.cc index 19c8894b05..81bf185911 100644 --- a/lib/local-execution/src/op_task_invocation.cc +++ b/lib/local-execution/src/op_task_invocation.cc @@ -20,7 +20,8 @@ void OpTaskBinding::bind(int slot, OpTensorSpec const &tensor_spec) { } void OpTaskBinding::bind(slot_id_t slot, OpTensorSpec const &tensor_spec) { - this->tensor_bindings.insert({SlotGradId{slot, IsGrad::NO}, tensor_spec}); + this->tensor_bindings.insert( + {SlotTensorTypeId{slot, TensorType::FORWARD}, tensor_spec}); } void OpTaskBinding::bind_grad(int slot, OpTensorSpec const &tensor_spec) { @@ -28,7 +29,8 @@ void OpTaskBinding::bind_grad(int slot, OpTensorSpec const &tensor_spec) { } void OpTaskBinding::bind_grad(slot_id_t slot, OpTensorSpec const &tensor_spec) { - this->tensor_bindings.insert({SlotGradId{slot, IsGrad::YES}, tensor_spec}); + this->tensor_bindings.insert( + {SlotTensorTypeId{slot, TensorType::GRADIENT}, tensor_spec}); } void OpTaskBinding::insert_arg_spec(slot_id_t name, OpArgSpec const &arg_spec) { @@ -44,13 +46,13 @@ bool OpTaskBinding::operator!=(OpTaskBinding const &other) const { return this->tie() != other.tie(); } -std::tuple const &, +std::tuple const &, std::unordered_map const &> OpTaskBinding::tie() const { return std::tie(this->tensor_bindings, this->arg_bindings); } -std::unordered_map const & +std::unordered_map const & OpTaskBinding::get_tensor_bindings() const { return this->tensor_bindings; } @@ -89,8 +91,8 @@ bool is_tensor_invocation_valid(OpTaskSignature const &sig, OpTaskInvocation const &inv) { auto tensor_bindings = inv.binding.get_tensor_bindings(); for (OpTensorSlotSpec const &op_tensor_slot_spec : sig.get_tensor_slots()) { - SlotGradId tensor_key = - SlotGradId{op_tensor_slot_spec.name, op_tensor_slot_spec.is_grad}; + SlotTensorTypeId tensor_key = SlotTensorTypeId{ + op_tensor_slot_spec.name, op_tensor_slot_spec.tensor_type}; OpTensorSpec op_tensor_spec = tensor_bindings.at(tensor_key); if 
(is_op_tensor_spec_invalid(op_tensor_slot_spec, op_tensor_spec)) { return false; diff --git a/lib/local-execution/src/op_task_signature.cc b/lib/local-execution/src/op_task_signature.cc index 932b330453..5c8b19265a 100644 --- a/lib/local-execution/src/op_task_signature.cc +++ b/lib/local-execution/src/op_task_signature.cc @@ -12,8 +12,12 @@ void OpTaskSignature::add_input_slot(int name, SlotType slot_type) { } void OpTaskSignature::add_input_slot(slot_id_t name, SlotType slot_type) { - OpTensorSlotSpec op_tensor_slot_spec = OpTensorSlotSpec{ - name, slot_type, TensorRole::INPUT, IsGrad::NO, OpSlotOptions::NECESSARY}; + OpTensorSlotSpec op_tensor_slot_spec = + OpTensorSlotSpec{name, + slot_type, + TensorRole::INPUT, + TensorType::FORWARD, + OpSlotOptions::NECESSARY}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -23,8 +27,12 @@ void OpTaskSignature::add_optional_input_slot(int name, SlotType slot_type) { void OpTaskSignature::add_optional_input_slot(slot_id_t name, SlotType slot_type) { - OpTensorSlotSpec op_tensor_slot_spec = OpTensorSlotSpec{ - name, slot_type, TensorRole::INPUT, IsGrad::NO, OpSlotOptions::OPTIONAL}; + OpTensorSlotSpec op_tensor_slot_spec = + OpTensorSlotSpec{name, + slot_type, + TensorRole::INPUT, + TensorType::FORWARD, + OpSlotOptions::OPTIONAL}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -38,7 +46,7 @@ void OpTaskSignature::add_untrainable_input_slot(slot_id_t name, OpTensorSlotSpec{name, slot_type, TensorRole::INPUT, - IsGrad::NO, + TensorType::FORWARD, OpSlotOptions::UNTRAINABLE}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -54,7 +62,7 @@ void OpTaskSignature::add_optional_untrainable_input_slot(slot_id_t name, OpTensorSlotSpec{name, slot_type, TensorRole::INPUT, - IsGrad::NO, + TensorType::FORWARD, OpSlotOptions::OPTIONAL_UNTRAINABLE}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -68,7 +76,7 @@ void OpTaskSignature::add_output_slot(slot_id_t name, SlotType slot_type) { OpTensorSlotSpec{name, slot_type, TensorRole::OUTPUT, - IsGrad::NO, + TensorType::FORWARD, OpSlotOptions::NECESSARY}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -80,8 +88,12 @@ void OpTaskSignature::add_bwd_optional_output_slot(int name, void OpTaskSignature::add_bwd_optional_output_slot(slot_id_t name, SlotType slot_type) { - OpTensorSlotSpec op_tensor_slot_spec = OpTensorSlotSpec{ - name, slot_type, TensorRole::OUTPUT, IsGrad::NO, OpSlotOptions::OPTIONAL}; + OpTensorSlotSpec op_tensor_slot_spec = + OpTensorSlotSpec{name, + slot_type, + TensorRole::OUTPUT, + TensorType::FORWARD, + OpSlotOptions::OPTIONAL}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -94,7 +106,7 @@ void OpTaskSignature::add_weight_slot(slot_id_t name, SlotType slot_type) { OpTensorSlotSpec{name, slot_type, TensorRole::WEIGHT, - IsGrad::NO, + TensorType::FORWARD, OpSlotOptions::NECESSARY}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -105,8 +117,12 @@ void OpTaskSignature::add_optional_weight_slot(int name, SlotType slot_type) { void OpTaskSignature::add_optional_weight_slot(slot_id_t name, SlotType slot_type) { - OpTensorSlotSpec op_tensor_slot_spec = OpTensorSlotSpec{ - name, slot_type, TensorRole::WEIGHT, IsGrad::NO, OpSlotOptions::OPTIONAL}; + OpTensorSlotSpec op_tensor_slot_spec = + OpTensorSlotSpec{name, + slot_type, + TensorRole::WEIGHT, + TensorType::FORWARD, + OpSlotOptions::OPTIONAL}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -130,7 +146,7 @@ OpTaskSignature infer_bwd_signature(OpTaskSignature const &fwd) { 
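// What the loop below produces, as a usage sketch (illustrative, not patch
// code): every forward tensor slot of the signature is re-registered with
// TensorType::GRADIENT, so a backward task body can request both views of
// the same slot:
//
//   auto input      = acc.get_tensor<Permissions::RO>(INPUT);      // FORWARD
//   auto input_grad = acc.get_tensor_grad<Permissions::RW>(INPUT); // GRADIENT
//
// (INPUT is an assumed slot name; the getters are the TaskArgumentAccessor
// helpers from the earlier hunk in this patch.)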
diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc
index 1e06dee96a..5c0d6c54f2 100644
--- a/lib/local-execution/src/optimizer.cc
+++ b/lib/local-execution/src/optimizer.cc
@@ -9,9 +9,10 @@ enum Slots { ATTRS, WEIGHT, SGD_V, PROFILING, ADAM_M, ADAM_V, HANDLE };
 
 TaskSignature get_sgd_update_signature() {
   TaskSignature sig = make_empty_task_signature();
-  add_slot(sig, WEIGHT, IsGrad::YES);
-  add_slot(sig, WEIGHT, IsGrad::NO);
-  add_slot(sig, SGD_V, IsGrad::YES);
+  add_slot(sig, WEIGHT, TensorType::FORWARD);
+  add_slot(sig, WEIGHT, TensorType::GRADIENT);
+  add_slot(sig, SGD_V, TensorType::OPTIMIZER);
+
   add_arg_slot(sig, ATTRS);
   add_arg_slot(sig, PROFILING);
   if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
@@ -21,13 +22,14 @@
 }
 
 TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs,
-                          tensor_guid_t const &weight,
-                          non_graph_tensor_guid_t const &sgd_v) {
+                          reduced_tensor_t const &weight,
+                          reduced_tensor_t const &sgd_v) {
   TaskBinding b;
-  b.bind(WEIGHT, TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::YES});
-  b.bind(WEIGHT, TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::NO});
+  b.bind(WEIGHT, TensorType::FORWARD, weight);
+  b.bind(WEIGHT, TensorType::GRADIENT, weight);
+
   if (attrs.momentum > 0.0f) {
-    b.bind(SGD_V, TensorGuidSpec{UnifiedTensorGuid{sgd_v}, IsGrad::YES});
+    b.bind(SGD_V, TensorType::OPTIMIZER, sgd_v);
   }
   b.bind_arg(ATTRS, attrs);
   b.bind_arg(PROFILING, profiling_settings());
@@ -97,10 +99,11 @@ TaskImplFunction get_sgd_update_task_impl() {
 
 TaskSignature get_adam_update_signature() {
   TaskSignature sig = make_empty_task_signature();
-  add_slot(sig, WEIGHT, IsGrad::YES);
-  add_slot(sig, WEIGHT, IsGrad::NO);
-  add_slot(sig, ADAM_V, IsGrad::YES);
-  add_slot(sig, ADAM_M, IsGrad::YES);
+  add_slot(sig, WEIGHT, TensorType::FORWARD);
+  add_slot(sig, WEIGHT, TensorType::GRADIENT);
+  add_slot(sig, ADAM_V, TensorType::OPTIMIZER);
+  add_slot(sig, ADAM_M, TensorType::OPTIMIZER);
+
   add_arg_slot(sig, ATTRS);
   add_arg_slot(sig, PROFILING);
   if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
@@ -110,14 +113,14 @@
 }
 
 TaskInvocation adam_update(AdamOptimizerAttrs const &attrs,
-                           tensor_guid_t const &weight,
-                           non_graph_tensor_guid_t const &adam_v,
-                           non_graph_tensor_guid_t const &adam_m) {
+                           reduced_tensor_t const &weight,
+                           reduced_tensor_t const &adam_v,
+                           reduced_tensor_t const &adam_m) {
   TaskBinding b;
-  b.bind(WEIGHT, TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::YES});
-  b.bind(WEIGHT, TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::NO});
-  b.bind(ADAM_M, TensorGuidSpec{UnifiedTensorGuid{adam_m}, IsGrad::YES});
-  b.bind(ADAM_V, TensorGuidSpec{UnifiedTensorGuid{adam_v}, IsGrad::YES});
+  b.bind(WEIGHT, TensorType::FORWARD, weight);
+  b.bind(WEIGHT, TensorType::GRADIENT, weight);
+  b.bind(ADAM_M, TensorType::OPTIMIZER, adam_m);
+  b.bind(ADAM_V, TensorType::OPTIMIZER, adam_v);
   b.bind_arg(ATTRS, attrs);
   b.bind_arg(PROFILING, profiling_settings());
@@ -191,8 +194,8 @@ TaskSignature get_update_signature(OptimizerAttrs const &attrs) {
 
 TaskInvocation get_update_invocation(
     OptimizerAttrs const &attrs,
-    tensor_guid_t const &weight,
-    std::vector<non_graph_tensor_guid_t> const &grad_buffer_tensors) {
+    reduced_tensor_t const &weight,
+    std::vector<reduced_tensor_t> const &grad_buffer_tensors) {
   return attrs.visit(overload{
       [&](SGDOptimizerAttrs const &s) {
         return sgd_update(s, weight, grad_buffer_tensors.at(0));
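Note the semantic fix alongside the rename: SGD_V, ADAM_M, and ADAM_V used to be bound as IsGrad::YES, indistinguishable from real gradients; they now carry their own TensorType::OPTIMIZER key. A hedged usage sketch (the reduced_tensor_t handles and the SGDOptimizerAttrs field order are assumptions, not confirmed by this patch):

    // Hypothetical call site; `weight` and `sgd_v` are handles produced by
    // the training backing for the weight tensor and its momentum buffer.
    SGDOptimizerAttrs attrs = SGDOptimizerAttrs{
        /*lr=*/0.01f, /*momentum=*/0.9f, /*nesterov=*/false,
        /*weight_decay=*/0.0f};
    TaskInvocation inv = sgd_update(attrs, weight, sgd_v);
    // With momentum == 0.0f, the SGD_V slot is simply left unbound.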
diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc
index 45d9d0cdb9..5261eec217 100644
--- a/lib/local-execution/src/task_binding.cc
+++ b/lib/local-execution/src/task_binding.cc
@@ -4,13 +4,16 @@
 
 namespace FlexFlow {
 
-void TaskBinding::bind(int name, TensorGuidSpec const &tensor_guid_spec) {
-  this->bind(slot_id_t{name}, tensor_guid_spec);
+void TaskBinding::bind(int name,
+                       TensorType const &tensor_type,
+                       reduced_tensor_t const &binding) {
+  this->bind(slot_id_t{name}, tensor_type, binding);
 }
 
-void TaskBinding::bind(slot_id_t name, TensorGuidSpec const &tensor_guid_spec) {
-  this->tensor_bindings.insert(
-      {SlotGradId{name, tensor_guid_spec.is_grad}, tensor_guid_spec});
+void TaskBinding::bind(slot_id_t name,
+                       TensorType const &tensor_type,
+                       reduced_tensor_t const &binding) {
+  this->tensor_bindings.insert({SlotTensorTypeId{name, tensor_type}, binding});
 }
 
 void TaskBinding::insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec) {
@@ -26,13 +29,13 @@ bool TaskBinding::operator!=(TaskBinding const &other) const {
   return this->tie() != other.tie();
 }
 
-std::tuple<std::unordered_map<SlotGradId, TensorGuidSpec> const &,
+std::tuple<std::unordered_map<SlotTensorTypeId, reduced_tensor_t> const &,
            std::unordered_map<slot_id_t, TaskArgSpec> const &>
     TaskBinding::tie() const {
   return std::tie(this->tensor_bindings, this->arg_bindings);
 }
 
-std::unordered_map<SlotGradId, TensorGuidSpec> const &
+std::unordered_map<SlotTensorTypeId, reduced_tensor_t> const &
     TaskBinding::get_tensor_bindings() const {
   return this->tensor_bindings;
 }
diff --git a/lib/local-execution/src/task_signature.cc b/lib/local-execution/src/task_signature.cc
index 27bcbcd266..a608ab8ab8 100644
--- a/lib/local-execution/src/task_signature.cc
+++ b/lib/local-execution/src/task_signature.cc
@@ -8,17 +8,17 @@ TaskSignature make_empty_task_signature() {
 
 void add_slot(TaskSignature &task_signature,
               int name,
-              IsGrad is_grad,
+              TensorType tensor_type,
               SlotType slot_type) {
-  add_slot(task_signature, slot_id_t{name}, is_grad, slot_type);
+  add_slot(task_signature, slot_id_t{name}, tensor_type, slot_type);
 }
 
 void add_slot(TaskSignature &task_signature,
              slot_id_t name,
-              IsGrad is_grad,
+              TensorType tensor_type,
              SlotType slot_type) {
-  TensorGuidSlotSpec tensor_guid_slot_spec =
-      TensorGuidSlotSpec{slot_type, is_grad};
+  TensorTypeSlotSpec tensor_guid_slot_spec =
+      TensorTypeSlotSpec{slot_type, tensor_type};
   task_signature.tensor_guid_slots.insert({name, tensor_guid_slot_spec});
 }
diff --git a/lib/local-execution/src/tensor_reduction.cc b/lib/local-execution/src/tensor_reduction.cc
new file mode 100644
index 0000000000..19324509bb
--- /dev/null
+++ b/lib/local-execution/src/tensor_reduction.cc
@@ -0,0 +1,17 @@
+#include "local-execution/tensor_reduction.h"
+#include "utils/containers/transform.h"
+
+namespace FlexFlow {
+
+reduced_tensor_t lower(tensor_guid_t const &tensor_guid) {
+  return reduced_tensor_t{tensor_guid.raw_graph_output.idx};
+}
+
+std::vector<reduced_tensor_t>
+    lower(std::vector<tensor_guid_t> const &tensor_guids) {
+  return transform(tensor_guids, [&](tensor_guid_t const &tensor_guid) {
+    return lower(tensor_guid);
+  });
+}
+
+} // namespace FlexFlow
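lower() is the bridge between graph-level tensor ids and the task layer: it keeps only the raw graph-output index and discards the graph reference, which also means handles from different graphs must not be mixed. A small sketch (the tensor_guid_t value `t` is assumed to come from a computation graph):

    // The vector overload is just transform() over the scalar overload.
    reduced_tensor_t r = lower(t);
    std::vector<reduced_tensor_t> rs = lower(std::vector<tensor_guid_t>{t});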
"local-execution/ops/attention.h" #include "local-execution/ops/batch_matmul.h" #include "local-execution/ops/batch_norm.h" @@ -32,6 +30,8 @@ #include "local-execution/ops/split.h" #include "local-execution/ops/topk.h" #include "local-execution/ops/transpose.h" +#include "op-attrs/ops/core.h" +#include "op-attrs/pcg_operator_attrs.dtg.h" #include "utils/record_formatter.h" #include "utils/variant.h" #include