Commit
Merge branch 'master' into nn_int
lockshaw authored Jan 15, 2025
2 parents 7881a82 + 5e7f207 commit 76cc39b
Showing 32 changed files with 1,716 additions and 203 deletions.
40 changes: 40 additions & 0 deletions .flake/pkgs/ffdb/default.nix
@@ -0,0 +1,40 @@
{ lib
, stdenv
, makeWrapper
, gdb
, python3
, proj
}:

stdenv.mkDerivation rec {
pname = "ffdb";
version = "0.1";

pythonPath = with python3.pkgs; makePythonPath [
proj
];

dontBuild = true;

nativeBuildInputs = [ makeWrapper ];

src = ./.;

installPhase = ''
mkdir -p $out/share/ffdb
cp ffdb.py $out/share/ffdb
makeWrapper ${gdb}/bin/gdb $out/bin/gdb \
--add-flags "-q -x $out/share/ffdb/ffdb.py" \
--set NIX_PYTHONPATH ${pythonPath} \
--prefix PATH : ${lib.makeBinPath [
python3
]}
cp $out/bin/gdb $out/bin/ffdb
'';

nativeCheckInputs = [
gdb
python3
proj
];
}
7 changes: 7 additions & 0 deletions .flake/pkgs/ffdb/ffdb.py
@@ -0,0 +1,7 @@
from proj.config_file import get_config_root
from pathlib import Path
import gdb

gdb.execute(f'directory {get_config_root(Path.cwd())}')
gdb.prompt_hook = lambda x: '(ffdb) '
gdb.execute('set history save on')
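ffdb.py only runs inside gdb's embedded Python (the `gdb` module does not exist in a standalone interpreter), and it leans on proj's `get_config_root` to resolve source directories. As a rough sketch of what a config-root lookup like that typically does — walking upward until a marker file is found — under the assumption of a hypothetical `.proj.toml` marker name:

```python
from pathlib import Path

def find_config_root(start: Path, marker: str = ".proj.toml") -> Path:
    """Walk upward from `start` until a directory containing `marker` is found.

    Hypothetical stand-in for proj's get_config_root; the real marker file
    name and error handling may differ.
    """
    for candidate in [start, *start.parents]:
        if (candidate / marker).exists():
            return candidate
    raise FileNotFoundError(f"no {marker} found at or above {start}")
```

Resolving the root from the current working directory, as ffdb.py does with `Path.cwd()`, means the wrapper works from any subdirectory of the checkout.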
11 changes: 11 additions & 0 deletions .github/runs-on.yml
@@ -0,0 +1,11 @@
images:
dlami-x64:
platform: "linux"
arch: "x64"
owner: "898082745236" # AWS
name: "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*"

runners:
gpu-nvidia:
family: ["g4dn.xlarge"]
image: dlami-x64
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
# gdb history
.gdb_history

# dtgen files
*.dtg.cc
*.dtg.h
22 changes: 18 additions & 4 deletions README.md
@@ -1,8 +1,21 @@
# FlexFlow
![build](https://github.com/flexflow/flexflow/workflows/build/badge.svg?branch=master) ![gpu tests](https://github.com/flexflow/flexflow/workflows/gpu-ci/badge.svg?branch=master) ![multinode gpu tests](https://github.com/flexflow/flexflow/workflows/multinode-test/badge.svg?branch=master) ![docker](https://github.com/flexflow/flexflow/workflows/docker-build/badge.svg?branch=master) ![pip](https://github.com/flexflow/flexflow/workflows/pip-install/badge.svg?branch=master) ![shell-check](https://github.com/flexflow/flexflow/workflows/Shell%20Check/badge.svg?branch=master) ![clang-format](https://github.com/flexflow/flexflow/workflows/clang-format%20Check/badge.svg?branch=master) [![Documentation Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest)
# flexflow-train
[![clang-format Check](https://github.com/flexflow/flexflow-train/actions/workflows/clang-format-check.yml/badge.svg?branch=master)](https://github.com/flexflow/flexflow-train/actions/workflows/clang-format-check.yml)
[![per-lib-checks](https://github.com/flexflow/flexflow-train/actions/workflows/per-lib-check.yml/badge.svg)](https://github.com/flexflow/flexflow-train/actions/workflows/per-lib-check.yml)
[![shell-check](https://github.com/flexflow/flexflow-train/actions/workflows/shell-check.yml/badge.svg)](https://github.com/flexflow/flexflow-train/actions/workflows/shell-check.yml)
[![Documentation Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest)

FlexFlow is a deep learning framework that accelerates distributed DNN training by automatically searching for efficient parallelization strategies. FlexFlow provides a drop-in replacement for PyTorch and TensorFlow Keras. Running existing PyTorch and Keras programs in FlexFlow only requires [a few lines of changes to the program](https://flexflow.ai/keras).
> [!WARNING]
> The FlexFlow repository has been split into separate [flexflow-train](https://github.com/flexflow/flexflow-train) and [flexflow-serve](https://github.com/flexflow/flexflow-serve) repositories.
> You are currently viewing [flexflow-train](https://github.com/flexflow/flexflow-train).
> For anything inference/serving-related, go to [flexflow-serve](https://github.com/flexflow/flexflow-serve).
FlexFlow is a deep learning framework that accelerates distributed DNN training by automatically searching for efficient parallelization strategies.

<!--
FlexFlow provides a drop-in replacement for PyTorch and TensorFlow Keras. Running existing PyTorch and Keras programs in FlexFlow only requires [a few lines of changes to the program](https://flexflow.ai/keras).
-->

<!--
## Install FlexFlow
To install FlexFlow from source code, please read the [instructions](INSTALL.md). If you would like to quickly try FlexFlow, we also provide pre-built Docker packages ([flexflow-cuda](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-cuda) with a CUDA backend, [flexflow-hip_rocm](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-hip_rocm) with a HIP-ROCM backend) with all dependencies pre-installed (N.B.: currently, the CUDA pre-built containers are only fully compatible with host machines that have CUDA 11.7 installed), together with [Dockerfiles](./docker) if you wish to build the containers manually. You can also use `conda` to install the FlexFlow Python package (coming soon).
@@ -67,10 +80,11 @@ Performance auto-tuning flags:
* `--enable-parameter-parallel`: allow FlexFlow to explore parameter parallelism for performance auto-tuning. (By default FlexFlow only considers data and model parallelism.)
* `--enable-attribute-parallel`: allow FlexFlow to explore attribute parallelism for performance auto-tuning. (By default FlexFlow only considers data and model parallelism.)
For performance tuning related flags: see [performance autotuning](https://flexflow.ai/search).
-->

## Contributing

Please let us know if you encounter any bugs or have any suggestions by [submitting an issue](https://github.com/flexflow/flexflow/issues).
Please let us know if you encounter any bugs or have any suggestions by [submitting an issue](https://github.com/flexflow/flexflow-train/issues).

We welcome all contributions to FlexFlow from bug fixes to new features and extensions.

2 changes: 1 addition & 1 deletion cmake/flexflow-utils.cmake
@@ -39,7 +39,7 @@ function(ff_set_cxx_properties target)
CXX_EXTENSIONS NO
)
target_compile_options(${target}
PRIVATE $<$<COMPILE_LANGUAGE:CXX>:> # add C++ compile flags here
PRIVATE $<$<COMPILE_LANGUAGE:CXX>:> "-ffile-prefix-map=${CMAKE_SOURCE_DIR}=." # add C++ compile flags here
)
endfunction()

6 changes: 3 additions & 3 deletions flake.lock


11 changes: 7 additions & 4 deletions flake.nix
@@ -35,10 +35,13 @@
mkShell = pkgs.mkShell.override {
stdenv = pkgs.cudaPackages.backendStdenv;
};

proj = proj-repo.packages.${system}.proj;
in
{
packages = {
legion = pkgs.callPackage ./.flake/pkgs/legion.nix { };
ffdb = pkgs.callPackage ./.flake/pkgs/ffdb { inherit proj; };
hpp2plantuml = pkgs.python3Packages.callPackage ./.flake/pkgs/hpp2plantuml.nix { };
rapidcheckFull = pkgs.symlinkJoin {
name = "rapidcheckFull";
@@ -102,9 +105,7 @@
doxygen
lcov # for code coverage
])
(with proj-repo.packages.${system}; [
proj
])
[ proj ]
(with self.packages.${system}; [
legion
hpp2plantuml
@@ -128,7 +129,6 @@
gh-markdown-preview
shellcheck
plantuml
gdb
ruff
compdb
jq
@@ -148,6 +148,9 @@
black
toml
])
(with self.packages.${system}; [
ffdb
])
];
};
};
5 changes: 3 additions & 2 deletions lib/compiler/include/compiler/cost_estimator/cost_estimator.h
@@ -2,6 +2,7 @@
#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_ESTIMATOR_H

#include "compiler/cost_estimator/op_cost_estimate_key.dtg.h"
#include "compiler/cost_estimator/op_cost_metrics.dtg.h"
#include "compiler/cost_estimator/tensor_set_movement.dtg.h"
#include "op-attrs/parallel_tensor_shape.dtg.h"
#include "op-attrs/pcg_operator_attrs.dtg.h"
@@ -11,7 +12,7 @@
namespace FlexFlow {

struct ICostEstimator {
virtual float estimate_cost(OpCostEstimateKey const &) const = 0;
virtual OpCostMetrics estimate_cost(OpCostEstimateKey const &) const = 0;
virtual float estimate_cost(TensorSetMovement const &) const = 0;

ICostEstimator() = default;
@@ -23,7 +24,7 @@ struct ICostEstimator {
CHECK_RC_COPY_VIRTUAL_COMPLIANT(ICostEstimator);

struct CostEstimator {
float estimate_cost(OpCostEstimateKey const &k) const;
OpCostMetrics estimate_cost(OpCostEstimateKey const &) const;
float estimate_cost(TensorSetMovement const &m) const;

template <typename T, typename... Args>
@@ -0,0 +1,18 @@
namespace = "FlexFlow"
name = "OpCostMetrics"
features = [
"eq",
"fmt",
"hash",
]

includes = [
]

[[fields]]
name = "runtime"
type = "float"

[[fields]]
name = "memory"
type = "size_t"
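The dtgen spec above generates a plain product type with equality, hashing, and formatting. In Python terms the generated `OpCostMetrics` is roughly a frozen dataclass — a sketch of the semantics, not the generated code:

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class OpCostMetrics:
    """Rough Python analogue of the generated OpCostMetrics struct:
    per-operator runtime (float) plus memory use (size_t -> int).
    frozen=True supplies __eq__ and __hash__, mirroring the spec's
    "eq" and "hash" features; repr mirrors "fmt"."""
    runtime: float
    memory: int
```

Carrying memory alongside runtime is what lets the new `estimate_cost(OpCostEstimateKey const &)` overload in `cost_estimator.h` return both dimensions instead of a bare float.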
@@ -0,0 +1,48 @@
#ifndef _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_GET_OPTIMAL_MACHINE_MAPPING_WITH_MEMORY_H
#define _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_GET_OPTIMAL_MACHINE_MAPPING_WITH_MEMORY_H

#include "compiler/machine_mapping/machine_mapping_cache.dtg.h"
#include "compiler/machine_mapping/machine_mapping_constraints.dtg.h"
#include "compiler/machine_mapping/machine_mapping_context.dtg.h"
#include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h"
#include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_parallel_split.dtg.h"
#include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_series_split.dtg.h"
#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.dtg.h"
#include "compiler/machine_mapping/parallel_split_transformation.dtg.h"
#include "pcg/machine_specification.dtg.h"

namespace FlexFlow {

MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory(
MachineMappingWithMemoryCache &result_cache,
MachineMappingContext const &context,
MachineMappingProblemTree const &problem_tree,
MachineSpecification const &resources,
MachineMappingConstraints const &constraints);

MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory(
MachineMappingWithMemoryCache &result_cache,
MachineMappingContext const &context,
MMProblemTreeSeriesSplit const &series_split,
MachineSpecification const &resources,
MachineMappingConstraints const &constraints,
std::optional<ParallelSplitTransformation> const
&parallel_split_transformation);

MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory(
MachineMappingWithMemoryCache &result_cache,
MachineMappingContext const &context,
MMProblemTreeParallelSplit const &parallel_split,
MachineSpecification const &resources,
MachineMappingConstraints const &constraints);

MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory(
MachineMappingWithMemoryCache &result_cache,
MachineMappingContext const &,
UnmappedOpCostEstimateKey const &leaf,
MachineSpecification const &resources,
MachineMappingConstraints const &constraints);

} // namespace FlexFlow

#endif
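The four overloads above dispatch on the shape of the problem tree (series split, parallel split, or leaf) and share a result cache. A toy sketch of that recursion over runtime only — the real functions also thread machine resources, constraints, split transformations, and memory through every call:

```python
def optimal_runtime(tree, cache=None):
    """Miniature of the series/parallel recursion. Trees are tuples
    (hypothetical encoding, chosen here only for hashability):
      ("leaf", runtime)
      ("series", comm_cost, left, right)   # left, then communication, then right
      ("parallel", left, right)            # branches run concurrently
    Results are memoized in `cache`, playing the role of
    MachineMappingWithMemoryCache."""
    if cache is None:
        cache = {}
    if tree in cache:
        return cache[tree]
    kind = tree[0]
    if kind == "leaf":
        result = tree[1]
    elif kind == "series":
        _, comm, left, right = tree
        result = optimal_runtime(left, cache) + comm + optimal_runtime(right, cache)
    else:  # parallel: total time is the slower branch
        _, left, right = tree
        result = max(optimal_runtime(left, cache),
                     optimal_runtime(right, cache))
    cache[tree] = result
    return result
```

Memoization matters because the same subtree (same state, resources, constraints) recurs across different split orderings of the search.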
@@ -0,0 +1,20 @@
namespace = "FlexFlow"
name = "MachineMappingForSingleLayer"
features = [
"eq",
"hash",
"fmt",
]

includes = [
"compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.dtg.h",
"compiler/cost_estimator/op_cost_metrics.dtg.h",
]

[[fields]]
name = "cost"
type = "::FlexFlow::OpCostMetrics"

[[fields]]
name = "machine_mapping"
type = "::FlexFlow::ParallelLayerGuidObliviousMachineMapping"
@@ -0,0 +1,19 @@
#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_CACHE_WITH_MEMORY_H
#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_CACHE_WITH_MEMORY_H

#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.dtg.h"

namespace FlexFlow {

MachineMappingWithMemoryCache empty_machine_mapping_with_memory_cache();
std::optional<MachineMappingWithMemoryResult>
machine_mapping_with_memory_cache_load(
MachineMappingWithMemoryCache const &, MachineMappingState const &);
void machine_mapping_with_memory_cache_save(
MachineMappingWithMemoryCache &,
MachineMappingState const &,
MachineMappingWithMemoryResult const &);

} // namespace FlexFlow

#endif
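The cache API above is a thin functional wrapper over the `raw_map` field declared in the accompanying dtgen spec. A minimal sketch of the same load/save contract, with `None` standing in for `std::nullopt`:

```python
class MachineMappingCache:
    """Sketch of the MachineMappingWithMemoryCache helpers: an
    unordered_map from mapping state to result, with an optional-returning
    load. The key must be hashable, as MachineMappingState is in C++."""

    def __init__(self):
        self.raw_map = {}   # empty_machine_mapping_with_memory_cache()

    def load(self, state):
        # machine_mapping_with_memory_cache_load: None on a cache miss
        return self.raw_map.get(state)

    def save(self, state, result):
        # machine_mapping_with_memory_cache_save
        self.raw_map[state] = result
```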
@@ -0,0 +1,22 @@
namespace = "FlexFlow"
name = "MachineMappingWithMemoryCache"
features = [
"eq",
"hash",
"fmt",
]

includes = [
"<unordered_map>",
"compiler/machine_mapping/machine_mapping_state.dtg.h",
"compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.dtg.h",
]

src_includes = [
"utils/fmt/unordered_map.h",
"utils/hash/unordered_map.h",
]

[[fields]]
name = "raw_map"
type = "std::unordered_map<::FlexFlow::MachineMappingState, ::FlexFlow::MachineMappingWithMemoryResult>"
@@ -0,0 +1,41 @@
#ifndef _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_RESULT_WITH_MEMORY_H
#define _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_RESULT_WITH_MEMORY_H

#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.dtg.h"
#include "compiler/machine_mapping/parallel_split_transformation.dtg.h"
#include <optional>

namespace FlexFlow {

[[nodiscard]] MachineMappingWithMemoryResult
empty_machine_mapping_with_memory_result();
[[nodiscard]] bool is_empty(MachineMappingWithMemoryResult const &);

[[nodiscard]] MachineMappingWithMemoryResult get_mapping_with_minimal_runtime(
std::unordered_set<MachineMappingWithMemoryResult> const &);

[[nodiscard]] MachineMappingWithMemoryResult
remove_non_pareto_optimal_machine_mapping_result(
MachineMappingWithMemoryResult const &);

[[nodiscard]] MachineMappingWithMemoryResult
series_combine(float comm_cost,
MachineMappingWithMemoryResult const &pre_result,
MachineMappingWithMemoryResult const &post_result,
std::optional<ParallelSplitTransformation> const
&parallel_split_transformation);
[[nodiscard]] MachineMappingWithMemoryResult
parallel_combine(MachineMappingWithMemoryResult const &lhs_result,
MachineMappingWithMemoryResult const &rhs_result);

[[nodiscard]] MachineMappingWithMemoryResult
minimize_runtime(MachineMappingWithMemoryResult const &m1,
MachineMappingWithMemoryResult const &m2);

[[nodiscard]] MachineMappingWithMemoryResult
make_singleton_machine_mapping_with_memory_result(
OpCostMetrics cost, MachineView const &machine_view);

} // namespace FlexFlow

#endif
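The key new operation here is `remove_non_pareto_optimal_machine_mapping_result`: because each candidate now carries both runtime and memory, results form a Pareto front rather than a single minimum. A sketch over bare `(runtime, memory)` tuples, assuming lower is better on both axes:

```python
def pareto_front(points):
    """Keep each (runtime, memory) point unless some other point is at
    least as good on both axes (and is a different point). Sketch of the
    pruning in remove_non_pareto_optimal_machine_mapping_result; the real
    version operates on MachineMappingForSingleLayer sets."""
    def dominated(p, q):
        return q != p and q[0] <= p[0] and q[1] <= p[1]
    return {p for p in points if not any(dominated(p, q) for q in points)}
```

Pruning dominated candidates after every `series_combine`/`parallel_combine` keeps the per-state result sets small without discarding any mapping that could be optimal under some memory budget.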
@@ -0,0 +1,20 @@
namespace = "FlexFlow"
name = "MachineMappingWithMemoryResult"
features = [
"eq",
"hash",
"fmt",
]

includes = [
"compiler/machine_mapping/memory_optimization/machine_mapping_for_single_layer.dtg.h",
]

src_includes = [
"utils/hash/unordered_set.h",
"utils/fmt/unordered_set.h",
]

[[fields]]
name = "machine_mappings"
type = "std::unordered_set<::FlexFlow::MachineMappingForSingleLayer>"