flexflow · Marsella8 · Oct 16, 2023 · Oct 18, 2023 · Nov 8, 2023 · Nov 15, 2023
diff --git a/lib/compiler/include/compiler/cost_estimator.h b/lib/compiler/include/compiler/cost_estimator.h
@@ -0,0 +1,32 @@
+#ifndef _FLEXFLOW_COMPILER_COST_ESTIMATOR_H
+#define _FLEXFLOW_COMPILER_COST_ESTIMATOR_H
+
+#include "cost_estimate.h"
+#include "pcg/machine_specification.h"
+#include "pcg/machine_view.h"
+#include "pcg/parallel_computation_graph.h"
+#include "substitutions/sub_parallel_computation_graph.h"
+#include <unordered_map>
+
+namespace FlexFlow {
+// Simulates `g` under `device_mapping` and returns the time at which the last
+// node finishes; the destinations of `frontier_machine_views` edges run first.
+float parallel_estimate_cost(
+    SubParallelComputationGraphView const &g,
+    CostEstimator const &estimator,
+    MachineMapping const &device_mapping,
+    std::unordered_map<InputMultiDiEdge, MachineView> const
+        &frontier_machine_views);
+} // namespace FlexFlow
+
+namespace std {
+// NOTE(review): confirm compiler/machine_mapping.h does not define this too.
+template <>
+struct hash<std::unordered_map<FlexFlow::Node, FlexFlow::MachineMapping>> {
+  size_t operator()(
+      std::unordered_map<FlexFlow::Node, FlexFlow::MachineMapping> const &g)
+      const;
+};
+} // namespace std
+
+#endif
diff --git a/lib/compiler/include/compiler/machine_mapping.h b/lib/compiler/include/compiler/machine_mapping.h
@@ -69,6 +69,7 @@ OptimalCostResult
                  MachineSpecification const &resources,
                  OptimalCostCache &cached_subgraph_costs);
 
+
 } // namespace FlexFlow
 
 namespace std {

diff --git a/lib/compiler/src/cost_estimator.cc b/lib/compiler/src/cost_estimator.cc
@@ -0,0 +1,103 @@
+
+#include "compiler/machine_mapping.h"
+#include "pcg/parallel_computation_graph.h"
+#include "utils/exception.h"
+#include "utils/graph/serialparallel.h"
+#include "utils/deduplicated_priority_queue.h"
+#include <algorithm>
+
+namespace FlexFlow {
+
+// Estimates the cost of `node` alone: collects the shapes on its incoming
+// edges and queries `estimator` with the node's attrs and MachineView.
+float node_estimate_cost(Node node,
+                         SubParallelComputationGraphView const &g,
+                         CostEstimator const &estimator,
+                         MachineMapping const &device_mapping) {
+  // Query the incoming edges once and reuse the result below.
+  std::unordered_set<UpwardOpenMultiDiEdge> incoming_edges =
+      get_incoming_edges(g, node);
+  std::vector<ParallelTensorShape> inputs = transform(
+      as_vector(incoming_edges), [&](UpwardOpenMultiDiEdge const &input_edge) {
+        return g.at(input_edge).get_shape();
+      });
+  return estimator.estimate_cost(
+      g.at(node).attrs, inputs, device_mapping.machine_views.at(node));
+}
+
+struct TimedNode { // A node paired with the time at which it finishes running
+  Node node;
+  req<float> endtime;
+};
+FF_VISITABLE_STRUCT(TimedNode, node, endtime);
+
+struct TimeComparison { // ranks later finishes lower: earliest is on top
+  bool operator()(TimedNode const &lhs, TimedNode const &rhs) const {
+    return (rhs.endtime < lhs.endtime); // assumes std::priority_queue order
+  }
+};
+
+// Simulates running `g` under `device_mapping`; returns the finish time of
+// the last node. A node starts once it is in the frontier and all devices of
+// its MachineView are free, and it is started at most once.
+// NOTE(review): seed nodes are not checked for unfinished predecessors.
+float parallel_estimate_cost(
+    SubParallelComputationGraphView const &g,
+    CostEstimator const &estimator,
+    MachineMapping const &device_mapping,
+    std::unordered_map<InputMultiDiEdge, MachineView> const
+        &frontier_machine_views) {
+  float current_time = 0;
+  std::unordered_set<Node> frontier;  // ready to run, waiting for free devices
+  std::unordered_set<Node> scheduled; // every node that has ever been started
+  std::unordered_set<Node> processed; // nodes that have finished
+  DeduplicatedPriorityQueue<TimedNode, std::vector<TimedNode>, TimeComparison>
+      processing; // nodes currently running
+  std::unordered_map<device_id_t, bool> occupied; // device -> currently busy
+  for (auto const &[edge, _] : frontier_machine_views) {
+    frontier.insert(get_dst_node(edge));
+  }
+
+  while (!frontier.empty() || !processing.empty()) {
+    // Start every frontier node whose devices are free (iterating a snapshot).
+    std::unordered_set<Node> const ready(frontier);
+    for (Node const &node : ready) {
+      std::vector<device_id_t> devices =
+          device_mapping.machine_views.at(node).device_ids();
+      if (std::all_of(devices.begin(), devices.end(), [&](device_id_t d) {
+            return !occupied[d]; // operator[] default-inserts `false`
+          })) {
+        float cost = node_estimate_cost(node, g, estimator, device_mapping);
+        processing.push({node, current_time + cost});
+        for (device_id_t d : devices) {
+          occupied[d] = true;
+        }
+        scheduled.insert(node);
+        frontier.erase(node);
+      }
+    }
+    // Advance the clock to the next completion and release its devices.
+    TimedNode finished = processing.top();
+    processing.pop();
+    for (device_id_t d :
+         device_mapping.machine_views.at(finished.node).device_ids()) {
+      occupied[d] = false;
+    }
+    processed.insert(finished.node);
+    current_time = finished.endtime;
+
+    // A successor becomes ready once all of its predecessors have finished;
+    // skip ones already started so that no node's cost is counted twice.
+    for (Node const &successor : get_successors(g, finished.node)) {
+      std::unordered_set<Node> predecessors = get_predecessors(g, successor);
+      if (scheduled.count(successor) == 0 &&
+          std::all_of(predecessors.begin(), predecessors.end(), [&](Node p) {
+            return processed.count(p) != 0;
+          })) {
+        frontier.insert(successor);
+      }
+    }
+  }
+  return current_time;
+}
+} // namespace FlexFlow
diff --git a/lib/compiler/src/machine_mapping.cc b/lib/compiler/src/machine_mapping.cc
@@ -5,6 +5,9 @@
 #include "utils/exception.h"
 #include "utils/graph/serialparallel.h"
 
+#include "utils/deduplicated_priority_queue.h"
+#include <algorithm>
+
 namespace FlexFlow {
 
 MachineMapping MachineMapping::combine(MachineMapping const &s1,
@@ -110,6 +113,7 @@ float estimate_cost(SubParallelComputationGraphView const &g,
   return cost;
 }
 
+
 void minimize_runtime(OptimalCostResult &m1, OptimalCostResult const &m2) {
   minimize(m1, m2, OptimalCostRuntimeCmp{});
 }

diff --git a/lib/compiler/test/src/test_parallel_cost_estimator.cpp b/lib/compiler/test/src/test_parallel_cost_estimator.cpp
@@ -0,0 +1,132 @@
+#include "compiler/cost_estimate.h"
+#include "compiler/cost_estimator.h"
+#include "doctest/doctest.h"
+#include "test_cost_estimator.h"
+
+
+
+using namespace FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+
+  TEST_CASE("parallel_estimate_cost: linear graph") {
+    // Linear chain n1 -> n2 -> n3, with the open input edge e0 feeding n1.
+    auto g =
+        OutputLabelledOpenMultiDiGraph<Operator, ParallelTensor>::template create<
+            UnorderedOutputLabelledOpenMultiDiGraph<Operator, ParallelTensor>>();
+
+    Node n1 = g.add_node(Operator{InputAttrs{}, "n1"});
+    Node n2 = g.add_node(Operator{InputAttrs{}, "n2"});
+    Node n3 = g.add_node(Operator{InputAttrs{}, "n3"});
+
+    NodePort p1 = g.add_node_port();
+    NodePort p2 = g.add_node_port();
+    NodePort p3 = g.add_node_port();
+
+    // InputMultiDiEdge: dst, dstport, uid
+    InputMultiDiEdge e0{n1, p1, {1,1}};
+    // MultiDiEdge: dst, dstport, src, srcport
+    MultiDiEdge e1{n2, p2, n1, p1};
+    MultiDiEdge e2{n3, p3, n2, p2};
+
+    g.add_edge(e0);
+    g.add_edge(e1);
+    g.add_edge(e2);
+
+    g.add_label(e0,
+                ParallelTensor(ParallelTensorDims({2, 1}),
+                               DataType::FLOAT,
+                               CreateGrad::YES));
+    g.add_label(e1,
+                ParallelTensor(ParallelTensorDims({2, 1}),
+                               DataType::FLOAT,
+                               CreateGrad::YES));
+
+    g.add_label(e2,
+                ParallelTensor(ParallelTensorDims({2, 1}),
+                               DataType::FLOAT,
+                               CreateGrad::YES));
+
+    CostEstimator estimator = CostEstimator::create<TestCostEstimator>(); // expected to return 0.1 for any node
+    std::unordered_map<Node, MachineView> devices = { // the same machine view for every node
+        {n1, make_1d_machine_view(gpu_id_t(1), gpu_id_t(2))},
+        {n2, make_1d_machine_view(gpu_id_t(1), gpu_id_t(2))},
+        {n3, make_1d_machine_view(gpu_id_t(1), gpu_id_t(2))}};
+
+    MachineMapping device_mapping = {devices};
+    auto frontier_machine_views =
+        std::unordered_map<InputMultiDiEdge, MachineView> {
+      {e0, make_1d_machine_view(gpu_id_t(1), gpu_id_t(2))},
+    };
+
+    float result = parallel_estimate_cost(
+        g, estimator, device_mapping, frontier_machine_views);
+    CHECK(std::abs(result-.3) < 1e-7); // three nodes run back to back
+  }
+
+
+  TEST_CASE("parallel_estimate_cost: non-linear graph") {
+    // Diamond n0 -> {n1, n2} -> n3. n1 and n2 are mapped to different devices
+    // so they can overlap, which makes the critical path three nodes long.
+    auto g =
+        OutputLabelledOpenMultiDiGraph<Operator, ParallelTensor>::template create<
+            UnorderedOutputLabelledOpenMultiDiGraph<Operator, ParallelTensor>>();
+
+    Node n0 = g.add_node(Operator{InputAttrs{}, "n0"});
+    Node n1 = g.add_node(Operator{InputAttrs{}, "n1"});
+    Node n2 = g.add_node(Operator{InputAttrs{}, "n2"});
+    Node n3 = g.add_node(Operator{InputAttrs{}, "n3"});
+
+    NodePort p0 = g.add_node_port();
+    NodePort p1 = g.add_node_port();
+    NodePort p2 = g.add_node_port();
+    NodePort p3 = g.add_node_port();
+
+    // InputMultiDiEdge: dst, dstport, uid
+    InputMultiDiEdge e0{n0, p0, {1, 1}};
+    // MultiDiEdge: dst, dstport, src, srcport
+    MultiDiEdge e1{n1, p1, n0, p0};
+    MultiDiEdge e2{n2, p2, n0, p0};
+    MultiDiEdge e3{n3, p3, n1, p1};
+    MultiDiEdge e4{n3, p3, n2, p2};
+
+    g.add_edge(e0);
+    g.add_edge(e1);
+    g.add_edge(e2);
+    g.add_edge(e3);
+    g.add_edge(e4);
+
+    // Every edge needs a label: a node's cost is computed from the shapes on
+    // all of its incoming edges, which for n3 includes e4.
+    auto make_tensor = [] {
+      return ParallelTensor(
+          ParallelTensorDims({2, 1}), DataType::FLOAT, CreateGrad::YES);
+    };
+    g.add_label(e0, make_tensor());
+    g.add_label(e1, make_tensor());
+    g.add_label(e2, make_tensor());
+    g.add_label(e3, make_tensor());
+    g.add_label(e4, make_tensor());
+
+    // TestCostEstimator is expected to return 0.1 for every node.
+    CostEstimator estimator = CostEstimator::create<TestCostEstimator>();
+    // n2 gets its own device range so that it does not have to wait for n1;
+    // with a shared range the four nodes would run back to back (0.4).
+    std::unordered_map<Node, MachineView> devices = {
+        {n0, make_1d_machine_view(gpu_id_t(1), gpu_id_t(2))},
+        {n1, make_1d_machine_view(gpu_id_t(1), gpu_id_t(2))},
+        {n2, make_1d_machine_view(gpu_id_t(3), gpu_id_t(4))},
+        {n3, make_1d_machine_view(gpu_id_t(1), gpu_id_t(2))}};
+
+    MachineMapping device_mapping = {devices};
+    auto frontier_machine_views =
+        std::unordered_map<InputMultiDiEdge, MachineView> {
+      {e0, make_1d_machine_view(gpu_id_t(1), gpu_id_t(2))},
+    };
+
+    float result = parallel_estimate_cost(
+        g, estimator, device_mapping, frontier_machine_views);
+    // n0 (0.1), then n1 and n2 side by side (0.1), then n3 (0.1).
+    CHECK(std::abs(result - 0.3) < 1e-7);
+  }
+}
diff --git a/lib/pcg/include/pcg/device_id.h b/lib/pcg/include/pcg/device_id.h
@@ -18,7 +18,7 @@ struct cpu_id_t : strong_typedef<cpu_id_t, int> {
 using device_id_t = std::variant<gpu_id_t, cpu_id_t>;
 device_id_t operator+(device_id_t, size_t);
 
-DeviceType get_device_type(device_id_t);
+DeviceType get_device_type(device_id_t const &id);
 gpu_id_t unwrap_gpu(device_id_t);
 cpu_id_t unwrap_cpu(device_id_t);
 

diff --git a/lib/pcg/include/pcg/machine_view.h b/lib/pcg/include/pcg/machine_view.h
@@ -13,7 +13,7 @@
 namespace FlexFlow {
 
 struct MachineView {
-  std::vector<int> device_ids() const;
+  std::vector<device_id_t> device_ids() const;
 
   device_id_t at(FFOrdered<num_points_t> const &coord) const;
   StridedRectangleSide at(size_t) const;

diff --git a/lib/pcg/src/device_id.cc b/lib/pcg/src/device_id.cc
@@ -13,7 +13,16 @@ DeviceType get_device_type(device_id_t const &id) {
   }
 }
 
-device_id_t operator+(device_id_t, size_t) {
-  NOT_IMPLEMENTED();
+// Offsets a device id by `increment`, keeping its kind: a GPU id stays a GPU
+// id and a CPU id stays a CPU id.
+device_id_t operator+(device_id_t device, size_t increment) {
+  // Dispatch on whichever alternative the variant currently holds; the
+  // addition itself is delegated to that id type's own operator+.
+  return std::visit(
+      [increment](auto id) -> device_id_t {
+        return id + increment;
+      },
+      device);
 }
+
 } // namespace FlexFlow
diff --git a/lib/pcg/src/machine_view.cc b/lib/pcg/src/machine_view.cc
@@ -1,5 +1,6 @@
 #include "pcg/machine_view.h"
 #include "utils/utils.h"
+#include <vector>
 
 namespace FlexFlow {
 
@@ -21,6 +22,20 @@ MachineView make_1d_machine_view(cpu_id_t start, cpu_id_t stop, int stride) {
   return {start, rect};
 }
 
+// Lists the devices of a 1-D view, from `start` for get_num_points() ids.
+// NOTE(review): steps by 1 and never reads the side's stride -- confirm.
+std::vector<device_id_t> MachineView::device_ids() const {
+  if (this->rect.num_dims() != 1) {
+    NOT_IMPLEMENTED();
+  }
+  StridedRectangleSide side = this->rect.at(ff_dim_t{0});
+  std::vector<device_id_t> ids;
+  for (device_id_t id = this->start; id < this->start+side.get_num_points(); id = id+1) {
+    ids.push_back(id);
+  }
+  return ids;
+}
+
 device_id_t MachineView::at(FFOrdered<num_points_t> const &coord) const {
   size_t offset = this->rect.at(coord);
   return this->start + offset;

diff --git a/lib/pcg/src/strided_rectangle.cc b/lib/pcg/src/strided_rectangle.cc
@@ -15,6 +15,11 @@ size_t StridedRectangle::at(FFOrdered<num_points_t> const &coord) const {
   return idx;
 }
 
+// Returns the side of this rectangle along dimension `dim`.
+StridedRectangleSide StridedRectangle::at(ff_dim_t const &dim) const {
+  return this->sides.at(dim);
+}
+
 StridedRectangleSide::StridedRectangleSide(side_size_t const &num, int stride)
     : num_points(num.value()), stride(stride) {}
 
@@ -31,7 +36,7 @@ side_size_t StridedRectangleSide::get_size() const {
 }
 
 size_t StridedRectangle::num_dims() const {
-  NOT_IMPLEMENTED();
+  return this->sides.size(); // `sides` is indexed by ff_dim_t: one per dim
 }
 
 } // namespace FlexFlow
diff --git a/lib/utils/include/utils/graph/algorithms.h b/lib/utils/include/utils/graph/algorithms.h
@@ -160,6 +160,10 @@ std::unordered_set<Node> get_predecessors(DiGraphView const &, Node const &);
 std::unordered_map<Node, std::unordered_set<Node>>
     get_predecessors(DiGraphView const &, std::unordered_set<Node> const &);
 
+std::unordered_set<Node> get_successors(DiGraphView const &, Node const &);
+std::unordered_map<Node, std::unordered_set<Node>>
+    get_successors(DiGraphView const &, std::unordered_set<Node> const &);
+
 Node get_src_node(MultiDiEdge const &);
 Node get_dst_node(MultiDiEdge const &);
 Node get_dst_node(InputMultiDiEdge const &);