flexflow · lockshaw · Oct 9, 2024 · Aug 6, 2024 · Aug 6, 2024 · Aug 6, 2024
diff --git a/lib/compiler/include/compiler/machine_mapping.h b/lib/compiler/include/compiler/machine_mapping.h
@@ -8,6 +8,7 @@
 #include "pcg/machine_specification.h"
 #include "pcg/machine_view.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "pcg/start_invariant_machine_view.h"
 #include "substitutions/sub_parallel_computation_graph.h"
 #include "utils/graph/serial_parallel/serial_parallel_decomposition.dtg.h"
 
@@ -53,17 +54,15 @@ OptimalCostResult optimal_cost(
     MachineSpecification const &resources,
     OptimalCostCache &cached_subgraph_costs);
 
-} // namespace FlexFlow
+// Returns the machine views on which a tensor with the given shape may be
+// placed on the given machine. A view is kept only if it passes both
+// is_valid_machine_view checks: its per-dimension device counts must match
+// (via without_order) the parallel degrees of `shape` that differ from 1, and
+// the raw id of its last device must be smaller than the machine's total
+// device count. Candidate views are currently generated for GPUs only.
+std::unordered_set<MachineView>
+    get_allowed_machine_views(MachineSpecification const &machinespec,
+                              ParallelTensorShape const &shape);
 
-// namespace std {
-//
-// template <>
-// struct hash<std::unordered_map<FlexFlow::Node, FlexFlow::MachineMapping>> {
-//   size_t operator()(
-//       std::unordered_map<FlexFlow::Node, FlexFlow::MachineMapping> const &g)
-//       const;
-// };
+// Start-invariant variant of get_allowed_machine_views: each allowed machine
+// view is converted with to_start_invariant and the results are returned as a
+// set, so views that convert to the same StartInvariantMachineView appear
+// only once.
+std::unordered_set<StartInvariantMachineView>
+    get_allowed_start_invariant_machine_views(
+        MachineSpecification const &machinespec,
+        ParallelTensorShape const &shape);
 
-// }; // namespace std
+} // namespace FlexFlow
 
 #endif
diff --git a/lib/compiler/src/machine_mapping.cc b/lib/compiler/src/machine_mapping.cc
@@ -6,12 +6,20 @@
 #include "pcg/machine_view.dtg.h"
 #include "pcg/machine_view.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "pcg/start_invariant_machine_view.h"
 #include "utils/containers.h"
 #include "utils/containers/are_disjoint.h"
 #include "utils/containers/as_vector.h"
+#include "utils/containers/cartesian_product.h"
 #include "utils/containers/contains_key.h"
 #include "utils/containers/get_only.h"
 #include "utils/containers/keys.h"
+#include "utils/containers/product.h"
+#include "utils/containers/range.h"
+#include "utils/containers/replicate.h"
+#include "utils/containers/transform.h"
+#include "utils/containers/without_order.h"
+#include "utils/containers/zip.h"
 #include "utils/exception.h"
 #include "utils/graph/graph_split.dtg.h"
 #include "utils/graph/node/algorithms.h"
@@ -358,4 +366,93 @@
   return searcher.optimal_cost(subpcg, resources, sp_decomposition);
 }
 
+// Checks that `mv` fits on the machine described by `machinespec`: the raw id
+// of the view's last device must be strictly smaller than the total number of
+// devices of the view's device type (num_nodes * devices per node).
+bool is_valid_machine_view(MachineView const &mv,
+                           MachineSpecification const &machinespec) {
+  bool const uses_gpus = (get_device_type(mv) == DeviceType::GPU);
+  // Every device type other than GPU is counted against the per-node CPUs.
+  int devices_per_node = machinespec.num_cpus_per_node;
+  if (uses_gpus) {
+    devices_per_node = machinespec.num_gpus_per_node;
+  }
+  int const total_devices = machinespec.num_nodes * devices_per_node;
+  int const last_device = get_raw_id(get_last_device_id(mv));
+  return last_device < total_devices;
+}
+
+// Gathers all parallel degrees of `shape` into a single list: the shard
+// degrees (as returned by ff_ordered_shard_degrees), followed by the sum
+// degree and finally the discard-copy degree.
+std::vector<int> get_tensor_parallel_degrees(ParallelTensorShape const &shape) {
+  std::vector<int> all_degrees = as_vector(ff_ordered_shard_degrees(shape));
+  int const sum_degree = get_sum_degree(shape);
+  int const discard_copy_degree = get_discard_copy_degree(shape);
+  all_degrees.push_back(sum_degree);
+  all_degrees.push_back(discard_copy_degree);
+  return all_degrees;
+}
+
+// Checks that `mv` matches the parallelism of `shape`: the number of devices
+// along each dimension of the view must equal, once both sides are passed
+// through without_order, the parallel degrees of the tensor that differ
+// from 1.
+bool is_valid_machine_view(MachineView const &mv,
+                           ParallelTensorShape const &shape) {
+  auto unwrap_points = [](num_points_t degree) { return degree.unwrapped; };
+  auto is_nontrivial = [](int degree) { return degree != 1; };
+
+  std::vector<int> view_degrees =
+      transform(get_num_devices_per_dim(mv), unwrap_points);
+  // Degrees equal to 1 introduce no parallelism and have no view dimension.
+  std::vector<int> shape_degrees =
+      filter(get_tensor_parallel_degrees(shape), is_nontrivial);
+  return without_order(view_degrees) == without_order(shape_degrees);
+}
+
+// TODO(@pietro): add support for both CPU and GPU
+//
+// Enumerates the machine views considered for a tensor of the given shape.
+// Only GPU devices are used: the device count is
+// num_nodes * num_gpus_per_node and every start device is a gpu_id_t.
+//
+// The view dimensions are the parallel degrees of `shape` that differ from 1.
+// For every stride combination produced by candidate_strides and every start
+// id in [0, total_devices), one MachineView is emitted. The result may contain
+// views that do not fit on the machine; get_allowed_machine_views filters
+// those out afterwards.
+static std::unordered_set<MachineView>
+    get_candidate_machine_views(MachineSpecification const &machinespec,
+                                ParallelTensorShape const &shape) {
+
+  // Builds all per-dimension stride combinations to try. Each stride is drawn
+  // from range(1, max_stride_upper_bound + 1), where the bound is
+  // (total_devices + 1) divided by the product of (degree - 1) over all dims.
+  auto candidate_strides = [](std::vector<int> const &tensor_dims,
+                              int total_devices) {
+    int max_stride_upper_bound =
+        (total_devices + 1) /
+        product(transform(tensor_dims, [](int degree) { return degree - 1; }));
+    std::unordered_multiset<std::vector<int>> strides = cartesian_product(
+        replicate(tensor_dims.size(), range(1, max_stride_upper_bound + 1)));
+    return strides;
+  };
+
+  std::vector<int> tensor_dims = filter(get_tensor_parallel_degrees(shape),
+                                        [](int degree) { return degree != 1; });
+  std::unordered_set<MachineView> machine_views;
+  int total_devices = machinespec.num_nodes * machinespec.num_gpus_per_node;
+  for (std::vector<int> const &stride :
+       candidate_strides(tensor_dims, total_devices)) {
+    // The sides depend only on the stride combination, not on the start
+    // device, so they are built once per stride rather than once per view.
+    std::vector<StridedRectangleSide> sides =
+        transform(zip(tensor_dims, stride), [](auto const &pair) {
+          return StridedRectangleSide(num_points_t(pair.first),
+                                      stride_t(pair.second));
+        });
+    for (int start_id = 0; start_id < total_devices; start_id++) {
+      MachineView mv =
+          MachineView{device_id_t(gpu_id_t(start_id)), StridedRectangle{sides}};
+      machine_views.insert(mv);
+    }
+  }
+  return machine_views;
+}
+
+// Returns the candidate machine views that are valid both for the parallel
+// degrees of `shape` and for the device count of `machinespec`.
+std::unordered_set<MachineView>
+    get_allowed_machine_views(MachineSpecification const &machinespec,
+                              ParallelTensorShape const &shape) {
+
+  auto is_allowed = [&](MachineView const &view) {
+    // The shape check runs first, so the machine check only ever sees views
+    // that already match the tensor's parallel degrees.
+    return is_valid_machine_view(view, shape) &&
+           is_valid_machine_view(view, machinespec);
+  };
+  std::unordered_set<MachineView> candidates =
+      get_candidate_machine_views(machinespec, shape);
+  return filter(candidates, is_allowed);
+}
+
+// Start-invariant counterpart of get_allowed_machine_views: every allowed
+// machine view is mapped through to_start_invariant and the results are
+// gathered into a set.
+std::unordered_set<StartInvariantMachineView>
+    get_allowed_start_invariant_machine_views(
+        MachineSpecification const &machinespec,
+        ParallelTensorShape const &shape) {
+  std::unordered_set<MachineView> allowed_views =
+      get_allowed_machine_views(machinespec, shape);
+  return transform(allowed_views, to_start_invariant);
+}
+
+// Placeholder, not implemented yet: the body only invokes NOT_IMPLEMENTED()
+// and neither parameter is used. Judging by the name it is meant to enumerate
+// the bijections between the dimensions of `mv` and the parallel dimensions of
+// `shape` — TODO confirm the intended result type, which `auto` leaves
+// unspecified.
+auto get_all_machine_views_to_tensor_dim_bijections(
+    MachineView const &mv, ParallelTensorShape const &shape) {
+  NOT_IMPLEMENTED();
+}
+
 } // namespace FlexFlow
diff --git a/lib/compiler/test/src/machine_mapping.cc b/lib/compiler/test/src/machine_mapping.cc
@@ -0,0 +1,160 @@
+#include "compiler/machine_mapping.h"
+#include "doctest/doctest.h"
+#include "pcg/machine_specification.dtg.h"
+#include "test_generator.h"
+#include "utils/containers/extend.h"
+
+TEST_SUITE(FF_TEST_SUITE) {
+
+  TEST_CASE("get_allowed_machine_view") {
+
+    // Builds the tensor shape shared by both subcases: one shard dimension
+    // ShardParallelDim{10, 3}, a discard-copy degree of 1, and the given sum
+    // degree.
+    auto make_shape = [](SumDegree const &sum_degree) {
+      return ParallelTensorShape{
+          ParallelTensorDims{
+              FFOrdered<ShardParallelDim>{
+                  ShardParallelDim{10, 3},
+              },
+              ReplicaParallelDimSet{
+                  sum_degree,
+                  DiscardCopyDegree{1},
+              },
+          },
+          DataType::FLOAT,
+      };
+    };
+
+    SUBCASE("1 degree of parallelism") {
+      // Only the shard degree (3) differs from 1, so every view is 1-d.
+      MachineSpecification ms = MachineSpecification(5, 1, 1, 0, 0);
+      ParallelTensorShape shape = make_shape(SumDegree{1});
+
+      std::unordered_set<MachineView> correct = {
+          make_1d_machine_view(gpu_id_t(0), gpu_id_t(3), stride_t(1)),
+          make_1d_machine_view(gpu_id_t(1), gpu_id_t(4), stride_t(1)),
+          make_1d_machine_view(gpu_id_t(2), gpu_id_t(5), stride_t(1)),
+          make_1d_machine_view(gpu_id_t(0), gpu_id_t(6), stride_t(2))};
+      std::unordered_set<MachineView> result =
+          get_allowed_machine_views(ms, shape);
+      CHECK(correct == result);
+    }
+
+    SUBCASE("2 degrees of parallelism") {
+      // Shard degree 3 and sum degree 2 both differ from 1, so views are 2-d.
+      MachineSpecification ms = MachineSpecification(18, 1, 1, 0, 0);
+      ParallelTensorShape shape = make_shape(SumDegree{2});
+
+      // Produces the 2x3 views with the given strides for start devices
+      // 0 .. num_starts - 1.
+      auto make_2d_views = [](int num_starts, int stride1, int stride2) {
+        std::unordered_set<MachineView> generated;
+        for (int start = 0; start < num_starts; start++) {
+          StridedRectangle rect = StridedRectangle{
+              {StridedRectangleSide{num_points_t(2), stride_t(stride1)},
+               StridedRectangleSide{num_points_t(3), stride_t(stride2)}}};
+          generated.insert(MachineView{device_id_t(gpu_id_t(start)), rect});
+        }
+        return generated;
+      };
+
+      // One group of expected views per admissible stride combination.
+      std::unordered_set<MachineView> correct;
+      extend(correct,
+             make_2d_views(/*num_starts=*/13, /*stride1=*/1, /*stride2=*/1));
+      extend(correct,
+             make_2d_views(/*num_starts=*/8, /*stride1=*/2, /*stride2=*/1));
+      extend(correct,
+             make_2d_views(/*num_starts=*/9, /*stride1=*/1, /*stride2=*/2));
+      extend(correct,
+             make_2d_views(/*num_starts=*/3, /*stride1=*/3, /*stride2=*/1));
+      extend(correct,
+             make_2d_views(/*num_starts=*/5, /*stride1=*/1, /*stride2=*/3));
+      extend(correct,
+             make_2d_views(/*num_starts=*/1, /*stride1=*/1, /*stride2=*/4));
+
+      std::unordered_set<MachineView> result =
+          get_allowed_machine_views(ms, shape);
+      CHECK(result == correct);
+    }
+  }
+
+  TEST_CASE("get_allowed_start_invariant_machine_views") {
+
+    // Builds the tensor shape shared by both subcases: one shard dimension
+    // ShardParallelDim{10, 3}, a discard-copy degree of 1, and the given sum
+    // degree.
+    auto make_shape = [](SumDegree const &sum_degree) {
+      return ParallelTensorShape{
+          ParallelTensorDims{
+              FFOrdered<ShardParallelDim>{
+                  ShardParallelDim{10, 3},
+              },
+              ReplicaParallelDimSet{
+                  sum_degree,
+                  DiscardCopyDegree{1},
+              },
+          },
+          DataType::FLOAT,
+      };
+    };
+
+    SUBCASE("1 degree of parallelism") {
+      // Only the shard degree (3) differs from 1, so every view is 1-d.
+      MachineSpecification ms = MachineSpecification(5, 1, 1, 0, 0);
+      ParallelTensorShape shape = make_shape(SumDegree{1});
+
+      std::unordered_set<StartInvariantMachineView> correct = {
+          make_1d_start_invariant_machine_view(num_points_t(3), stride_t(1)),
+          make_1d_start_invariant_machine_view(num_points_t(3), stride_t(2))};
+      std::unordered_set<StartInvariantMachineView> result =
+          get_allowed_start_invariant_machine_views(ms, shape);
+      CHECK(correct == result);
+    }
+
+    SUBCASE("2 degrees of parallelism") {
+      // Shard degree 3 and sum degree 2 both differ from 1, so views are 2-d.
+      MachineSpecification ms = MachineSpecification(18, 1, 1, 0, 0);
+      ParallelTensorShape shape = make_shape(SumDegree{2});
+
+      // Produces the start-invariant 2x3 view with the given strides.
+      auto make_2d_view = [](int stride1, int stride2) {
+        StridedRectangle rect = StridedRectangle{
+            {StridedRectangleSide{num_points_t(2), stride_t(stride1)},
+             StridedRectangleSide{num_points_t(3), stride_t(stride2)}}};
+        return StartInvariantMachineView{rect};
+      };
+
+      // One expected view per admissible stride combination.
+      std::unordered_set<StartInvariantMachineView> correct = {
+          make_2d_view(/*stride1=*/1, /*stride2=*/1),
+          make_2d_view(/*stride1=*/2, /*stride2=*/1),
+          make_2d_view(/*stride1=*/1, /*stride2=*/2),
+          make_2d_view(/*stride1=*/3, /*stride2=*/1),
+          make_2d_view(/*stride1=*/1, /*stride2=*/3),
+          make_2d_view(/*stride1=*/1, /*stride2=*/4)};
+
+      std::unordered_set<StartInvariantMachineView> result =
+          get_allowed_start_invariant_machine_views(ms, shape);
+      CHECK(result == correct);
+    }
+  }
+
+  // TEST_CASE("MachineMapping::combine") {
+  //   RC_SUBCASE([](MachineMapping const &m0, MachineMapping const &m1) {
+  //     RC_PRE(MachineMapping::nodes_are_disjoint(m0, m1));
+
+  //     MachineMapping comb = MachineMapping::combine(m0, m1);
+
+  //     RC_ASSERT(comb.machine_views.size() ==
+  //               m0.machine_views.size() + m1.machine_views.size());
+  //     RC_ASSERT(is_submap(comb.machine_views, m0.machine_views));
+  //     RC_ASSERT(is_submap(comb.machine_views, m1.machine_views));
+  //   });
+  // }
+
+  // TEST_CASE("OptimalCostResult::infinity") {
+  //   RC_SUBCASE([](OptimalCostResult const &c) {
+  //     RC_ASSERT(c.runtime <= OptimalCostResult::infinity().runtime);
+  //   });
+  // }
+}
diff --git a/lib/compiler/test/src/test_machine_mapping.cc b/lib/compiler/test/src/test_machine_mapping.cc
diff --git a/lib/pcg/include/pcg/device_coordinates.struct.toml b/lib/pcg/include/pcg/device_coordinates.struct.toml
@@ -0,0 +1,18 @@
+# Struct specification for FlexFlow::DeviceCoordinates: a single field,
+# `coords`, holding an FFOrdered<int>. The "rapidcheck" feature is listed but
+# commented out, i.e. currently not requested.
+namespace = "FlexFlow"
+name = "DeviceCoordinates"
+features = [
+  "eq",
+  "ord",
+  "hash",
+  "json",
+  # "rapidcheck",
+  "fmt",
+]
+
+includes = [ 
+  "op-attrs/dim_ordered.h",
+]
+
+[[fields]]
+name = "coords"
+type = "::FlexFlow::FFOrdered<int>"
diff --git a/lib/pcg/include/pcg/device_id.h b/lib/pcg/include/pcg/device_id.h
@@ -13,6 +13,7 @@ device_id_t operator+(device_id_t, size_t);
 DeviceType get_device_type(device_id_t const &device_id);
 gpu_id_t unwrap_gpu(device_id_t);
 cpu_id_t unwrap_cpu(device_id_t);
+// Returns an int for the given device id; presumably the underlying index of
+// the wrapped gpu_id_t / cpu_id_t — TODO confirm against the definition.
+int get_raw_id(device_id_t);
 
 device_id_t device_id_from_index(int, DeviceType);