From 8ff02615570ee31a38122b6876212cb4a80872b7 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shivsundram@berkeley.edu>
Date: Mon, 27 Sep 2021 15:06:00 -0700
Subject: [PATCH 01/44] task fusion and legality constraints

---
 install.py                    |   2 +-
 legate/core/launcher.py       |  15 ++-
 legate/core/legion.py         |   2 +
 legate/core/operation.py      |  16 ++-
 legate/core/runtime.py        | 214 ++++++++++++++++++++++++++++++++--
 src/core.mk                   |   4 +-
 src/data/scalar.h             |   3 +
 src/data/scalar.inl           |   1 +
 src/data/store.cc             |  14 ++-
 src/data/store.h              |  15 ++-
 src/mapping/mapper.cc         |   3 +-
 src/runtime/context.h         |   9 +-
 src/utilities/deserializer.cc |   6 +-
 src/utilities/span.h          |   2 +
 14 files changed, 279 insertions(+), 27 deletions(-)

diff --git a/install.py b/install.py
index bfc6211fa..ac8ce559e 100755
--- a/install.py
+++ b/install.py
@@ -887,7 +887,7 @@ def driver():
         "--clean",
         dest="clean_first",
         action=BooleanFlag,
-        default=True,
+        default=False,
         help="Clean before build, and pull latest Legion.",
     )
     parser.add_argument(
diff --git a/legate/core/launcher.py b/legate/core/launcher.py
index 10486d629..bd09acf62 100644
--- a/legate/core/launcher.py
+++ b/legate/core/launcher.py
@@ -364,6 +364,7 @@ def coalesce(self):
         # promote them to read write permission.
         if len(all_perms - set([Permission.NO_ACCESS])) > 1:
             perm = Permission.READ_WRITE
+            #perm = Permission.WRITE
 
             # When the field requires read write permission,
             # all projections must be the same
@@ -440,6 +441,7 @@ def insert(self, req, field_id):
         field_set.insert(field_id, req.permission, proj_info)
 
     def analyze_requirements(self):
+        #import pdb; pdb.set_trace()
         for region, field_set in self._field_sets.items():
             perm_map = field_set.coalesce()
             for key, fields in perm_map.items():
@@ -568,7 +570,6 @@ def add_store(self, args, store, proj, perm, tag, flags):
         else:
             region = store.storage.region
             field_id = store.storage.field.field_id
-
             req = RegionReq(region, perm, proj, tag, flags)
 
             self._req_analyzer.insert(req, field_id)
@@ -591,6 +592,12 @@ def add_output(self, store, proj, tag=0, flags=0):
             self._outputs, store, proj, Permission.WRITE, tag, flags
         )
 
+    # currently this is adding to outputs but we can have a separate "temps" array in the core
+    def add_temp(self, store, proj, tag=0, flags=0):
+        self.add_store(
+            self._outputs, store, proj, Permission.WRITE, tag, flags
+        )
+
     def add_reduction(self, store, proj, tag=0, flags=0, read_write=False):
         if read_write:
             self.add_store(
@@ -642,13 +649,17 @@ def pack_args(argbuf, args):
 
     def build_task(self, launch_domain, argbuf):
         self._req_analyzer.analyze_requirements()
+        #print("building task id", self._task_id)
+        for req in self._req_analyzer._requirements:
+            print(req)
+            print(req[0].__dict__)
+            print()
         self._out_analyzer.analyze_requirements()
 
         self.pack_args(argbuf, self._inputs)
         self.pack_args(argbuf, self._outputs)
         self.pack_args(argbuf, self._reductions)
         self.pack_args(argbuf, self._scalars)
-
         task = IndexTask(
             self.legion_task_id,
             launch_domain,
diff --git a/legate/core/legion.py b/legate/core/legion.py
index 5d1211099..ebadb5e65 100644
--- a/legate/core/legion.py
+++ b/legate/core/legion.py
@@ -155,6 +155,7 @@ def legate_task_postamble(runtime, context):
 # This is a decorator for wrapping the launch method on launchers
 # to dispatch any unordered deletions while the task is live
 def dispatch(func):
+    #print("dispatching")
     def launch(launcher, runtime, context, *args):
         # This context should always be in the dictionary
         legate_task_progress(runtime, context)
@@ -4744,6 +4745,7 @@ def get_string(self):
         if self.string is None or self.arglen != len(self.args):
             fmtstr = "".join(self.fmt)
             assert len(fmtstr) == len(self.args) + 1
+            print(self.args)
             self.string = struct.pack(fmtstr, *self.args)
             self.arglen = len(self.args)
         return self.string
diff --git a/legate/core/operation.py b/legate/core/operation.py
index e8f093eb5..52bd14a12 100644
--- a/legate/core/operation.py
+++ b/legate/core/operation.py
@@ -28,6 +28,7 @@ def __init__(self, context, mapper_id=0):
         self._inputs = []
         self._outputs = []
         self._reductions = []
+        self._temps = []
         self._future_output = None
         self._future_reduction = None
         self._constraints = EqClass()
@@ -99,6 +100,14 @@ def add_output(self, store):
         else:
             self._outputs.append(store)
 
+    def add_temp(self, store):
+        self._check_store(store)
+        self._temps.append(store) #this may not be necessary
+
+    def add_output(self, store):
+        self._check_store(store)
+        self._outputs.append(store)
+
     def add_reduction(self, store, redop):
         self._check_store(store)
         if store.kind is Future:
@@ -143,10 +152,15 @@ def add_future(self, future):
 
     def launch(self, strategy):
         launcher = TaskLauncher(self.context, self._task_id, self.mapper_id)
-
         for input in self._inputs:
             proj = strategy.get_projection(input)
             launcher.add_input(input, proj)
+        for temp in self._temps:
+            proj = strategy.get_projection(temp)
+            launcher.add_temp(temp, proj)
+            partition = strategy.get_partition(temp)
+            # We update the key partition of a store only when it gets updated
+            temp.set_key_partition(partition)
         for output in self._outputs:
             if output.unbound:
                 continue
diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index 05a96d446..a85913f2b 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -41,7 +41,7 @@
 from .shape import Shape
 from .solver import Partitioner
 from .store import RegionField, Store
-
+import numpy as npo
 
 # A Field holds a reference to a field in a region tree
 # that can be used by many different RegionField objects
@@ -711,6 +711,110 @@ def record_partition(self, index_space, functor, index_partition):
         self._index_partitions[key] = index_partition
 
 
+class FusionChecker(object):
+    def __init__(self, ops, contexts, runtime):
+        """
+        This is a class containing a list of constraints for fusing ops
+        It emits whether or not a given list of ops can be fused
+        """
+        self.constraints = []
+        self.ops = ops
+        self.contexts = contexts
+        self.runtime=runtime
+
+    def register_constraint(self, fusion_constraint_rule):
+        self.constraints.append(fusion_constraint_rule)
+
+    def can_fuse(self):
+        results = [constraint.apply(self.contexts, self.runtime, self.ops) for constraint in self.constraints]
+        print(results)
+        return reduce(lambda x,y: x and y, results)
+
+class FusionConstraint(object):
+    def apply(self, contexts, runtime, ops):
+        """
+        Abstract method for a rule that constrains which legate
+        operations can be fused; derived classes must override it
+        """
+        raise NotImplementedError("Implement in derived classes")
+
+
+class NumpyContextExists(FusionConstraint):
+    def apply(self, contexts, runtime, ops):
+        return "legate.numpy" in contexts
+
+
+class AllBinaryOps(FusionConstraint):
+    """Temporary class for only fusing Binary Ops (task id 400000).
+       This constraint will be removed"""
+    def apply(self, contexts, runtime, ops):
+        allBinary = reduce(lambda x,y: x and y,[int(op._task_id)==400000 for op in ops])
+        return allBinary
+
+
+class IdenticalProjection(FusionConstraint):
+    """Fusion rule that only ops with identical
+       projection functors can be fused"""
+    def apply(self, contexts, runtime, ops):
+        partitioners = []
+        strategies = []
+        must_be_single = any(op._future_output is not None for op in ops)
+        for op in ops:
+            partitioner = Partitioner(runtime, ops, must_be_single=must_be_single)
+            strategy = partitioner.partition_stores()
+
+        store_to_ops = {}
+        for op in ops:
+            bufferSet = {}
+            for input in op._inputs:
+                if input not in bufferSet:
+                    proj = strategy.get_projection(input)
+                    if hasattr(proj, 'part'):
+                        bufferSet[input]=proj
+
+            for output in op._outputs:
+                if output not in bufferSet:
+                    proj = strategy.get_projection(output)
+                    if hasattr(proj, 'part'):
+                        bufferSet[output]=proj
+
+            for buffer in bufferSet.keys():
+                proj = bufferSet[buffer]
+                matrix = proj.part.index_partition.functor.transform.trans
+                if buffer not in store_to_ops:
+                    store_to_ops[buffer] = [matrix]
+                else:
+                    store_to_ops[buffer].append(matrix)
+        for store, matrices in store_to_ops.items():
+            if len(matrices)>1:
+                allEqual = reduce(lambda x,y: x==y, matrices)
+                if not allEqual:
+                    return False
+        print(store_to_ops)
+        return True
+
+
+class IdenticalLaunchShapes(FusionConstraint):
+    """Fusion rule that only ops with identical
+       launch shapes can be fused"""
+    def apply(self, contexts, runtime, ops):
+        partitioners = []
+        strategies = []
+        launch_shapes = []
+        must_be_single = any(op._future_output is not None for op in ops)
+        for op in ops:
+            partitioner = Partitioner(runtime, ops, must_be_single=must_be_single)
+            strategy = partitioner.partition_stores()
+            launch_shapes.append(strategy._launch_shape)
+        first_shape = launch_shapes[0]
+        print(launch_shapes)
+        for launch_shape in launch_shapes:
+            if launch_shape!=first_shape:
+                return False
+        return True
+
+
+   
 class Runtime(object):
     def __init__(self, core_library):
         """
@@ -753,7 +857,7 @@ def __init__(self, core_library):
         # to be dispatched. This list allows cross library introspection for
         # Legate operations.
         self._outstanding_ops = []
-        self._window_size = 1
+        self._window_size = 2
 
         # Now we initialize managers
         self._attachment_manager = AttachmentManager(self)
@@ -857,20 +961,111 @@ def dispatch(self, op, redop=None):
         else:
             return op.launch(self.legion_runtime, self.legion_context)
 
-    def _schedule(self, ops):
+
+    def build_fused_binary_numpy_op(self,ops):
+        fusion_checker = FusionChecker(ops, self._contexts, self)
+        fusion_checker.register_constraint(NumpyContextExists())
+        fusion_checker.register_constraint(AllBinaryOps())
+        fusion_checker.register_constraint(IdenticalLaunchShapes())
+        fusion_checker.register_constraint(IdenticalProjection())
+        can_fuse = fusion_checker.can_fuse()
+        
+        if not can_fuse:
+            return None
+
+        #hacky way to get numpy context and designated fused task id
+        fused_id = self._contexts["legate.numpy"].fused_id
+        numpy_context = self._contexts["legate.numpy"]
+        numpy_runtime = numpy_context._library.runtime
+        #initialize fused task
+        fused_task = numpy_context.create_task(fused_id)
+         
+        #generate offset maps for all inputs
+        input_starts, output_starts, offset_starts, offsets= [],[],[],[]
+        input_start, output_start, offset_start = 0,0,0
+
+        for op in ops:
+            input_starts.append(input_start)
+            output_starts.append(output_start)
+            offset_starts.append(offset_start)
+
+            for i,input in enumerate(op._inputs):
+                offsets.append(i+1)
+            for o,output in enumerate(op._outputs):
+                offsets.append(-(o+1))
+
+            offset_start+=(len(op._inputs)+len(op._outputs))
+            input_start+=len(op._inputs)
+            output_start+=len(op._outputs)
+
+        #terminators
+        input_starts.append(input_start)
+        output_starts.append(output_start)
+        offset_starts.append(offset_start)
+
+        #turn offset maps into deferred arrays
+        #then load them into the task as the initial inputs
+       	#print(input_starts, output_starts, offset_starts, offsets)
+        inst, oust, offst, offs = map(npo.array, (input_starts, output_starts, offset_starts, offsets))
+        def make_deferred(inst):
+            return numpy_runtime.find_or_create_array_thunk(inst, stacklevel=0, defer=True) 
+        offset_maps = map(make_deferred, (inst, oust, offst, offs))
+        
+	#add offset maps to task
+        for offset_map in offset_maps:
+            fused_task.add_input(offset_map.base)
+            fused_task.add_broadcast(offset_map.base)
+
+        #add typical inputs and outputs to task
+        for op in ops:
+            for scalar in op._scalar_args:
+                fused_task.add_scalar_arg(scalar[0], ty.int32)
+            for input in op._inputs:
+                fused_task.add_input(input)   
+            for output in op._outputs:
+                fused_task.add_output(output)   
+
+        return fused_task
+
+    def _launch_outstanding(self):
+        print("launching final outstanding ops")
+        if len(self._outstanding_ops):
+            ops = self._outstanding_ops
+            self._outstanding_ops = []
+            self._schedule(ops, force_eval=True)
+               
+   
+    def _schedule(self, ops, force_eval=False):
+        ids = [op._task_id for op in ops]
+        #print("current ops", ids)
+        #try fusing tasks
+        if len(ops)>=2 and (not force_eval):
+            fused_task = self.build_fused_binary_numpy_op(ops)
+            if fused_task:
+                fused_task.execute() 
+                return
+
+        #if we can't fuse the ops, launch them individually
         must_be_single = any(op._future_output is not None for op in ops)
         partitioner = Partitioner(self, ops, must_be_single=must_be_single)
         strategy = partitioner.partition_stores()
-
         for op in ops:
+            #print("task_id", op._task_id, int(op._task_id))
+            #print("inputs", op._inputs)
+            #print("outputs", op._outputs)
             op.launch(strategy)
 
     def submit(self, op):
-        self._outstanding_ops.append(op)
-        if len(self._outstanding_ops) >= self._window_size:
-            ops = self._outstanding_ops
-            self._outstanding_ops = []
-            self._schedule(ops)
+        #always launch a fused op, dont add it to the window
+        #as the encapsulated ops already waited in the window
+        if int(op._task_id)==400028:
+            self._schedule([op])
+        else:
+            self._outstanding_ops.append(op)
+            if len(self._outstanding_ops) >= self._window_size:
+                ops = self._outstanding_ops
+                self._outstanding_ops = []
+                self._schedule(ops)
 
     def _progress_unordered_operations(self):
         legion.legion_context_progress_unordered_operations(
@@ -1065,6 +1260,7 @@ def record_partition(self, index_space, functor, index_partition):
 
 def _cleanup_legate_runtime():
     global _runtime
+    _runtime._launch_outstanding()
     _runtime.destroy()
     del _runtime
     gc.collect()
diff --git a/src/core.mk b/src/core.mk
index deedb74eb..8765eca36 100644
--- a/src/core.mk
+++ b/src/core.mk
@@ -25,7 +25,8 @@ GEN_CPU_SRC	= legate_c.cc               \
 							runtime/runtime.cc        \
 							runtime/shard.cc          \
 							task/task.cc              \
-							utilities/deserializer.cc
+							utilities/deserializer.cc \
+							utilities/makeshift_serializer.cc
 
 ifeq ($(strip $(USE_CUDA)),1)
 GEN_CPU_SRC	+= gpu/cudalibs.cc
@@ -55,4 +56,5 @@ INSTALL_HEADERS = legate.h                 \
 									utilities/dispatch.h     \
 									utilities/span.h         \
 									utilities/type_traits.h  \
+									utilities/makeshift_serializer.h  \
 									utilities/typedefs.h
diff --git a/src/data/scalar.h b/src/data/scalar.h
index 18d58c45f..fc1c1ede4 100644
--- a/src/data/scalar.h
+++ b/src/data/scalar.h
@@ -18,6 +18,7 @@
 
 #include "utilities/span.h"
 #include "utilities/typedefs.h"
+#include "utilities/makeshift_serializer.h"
 
 namespace legate {
 
@@ -44,6 +45,8 @@ class Scalar {
   bool tuple_{false};
   LegateTypeCode code_{MAX_TYPE_NUMBER};
   const void* data_;
+
+  friend class MakeshiftSerializer;
 };
 
 }  // namespace legate
diff --git a/src/data/scalar.inl b/src/data/scalar.inl
index bb9cfa5cb..3e49fe08b 100644
--- a/src/data/scalar.inl
+++ b/src/data/scalar.inl
@@ -25,6 +25,7 @@ VAL Scalar::value() const
 template <typename VAL>
 Span<const VAL> Scalar::values() const
 {
+
   if (tuple_) {
     auto size = *static_cast<const uint32_t*>(data_);
     auto data = static_cast<const uint8_t*>(data_) + sizeof(uint32_t);
diff --git a/src/data/store.cc b/src/data/store.cc
index e8a70ca67..d9accf3a4 100644
--- a/src/data/store.cc
+++ b/src/data/store.cc
@@ -21,8 +21,8 @@ namespace legate {
 
 using namespace Legion;
 
-RegionField::RegionField(int32_t dim, const PhysicalRegion& pr, FieldID fid)
-  : dim_(dim), pr_(pr), fid_(fid)
+RegionField::RegionField(int32_t dim, const PhysicalRegion& pr, FieldID fid, unsigned reqIdx)
+  : dim_(dim), pr_(pr), fid_(fid), reqIdx_(reqIdx)
 {
   auto priv  = pr.get_privilege();
   readable_  = static_cast<bool>(priv & LEGION_READ_PRIV);
@@ -34,6 +34,7 @@ RegionField::RegionField(RegionField&& other) noexcept
   : dim_(other.dim_),
     pr_(other.pr_),
     fid_(other.fid_),
+    reqIdx_(other.reqIdx_),
     readable_(other.readable_),
     writable_(other.writable_),
     reducible_(other.reducible_)
@@ -45,6 +46,8 @@ RegionField& RegionField::operator=(RegionField&& other) noexcept
   dim_       = other.dim_;
   pr_        = other.pr_;
   fid_       = other.fid_;
+  reqIdx_    = other.reqIdx_; 
+
   readable_  = other.readable_;
   writable_  = other.writable_;
   reducible_ = other.reducible_;
@@ -53,14 +56,15 @@ RegionField& RegionField::operator=(RegionField&& other) noexcept
 
 Domain RegionField::domain() const { return dim_dispatch(dim_, get_domain_fn{}, pr_); }
 
-OutputRegionField::OutputRegionField(const OutputRegion& out, FieldID fid) : out_(out), fid_(fid) {}
+OutputRegionField::OutputRegionField(const OutputRegion& out, FieldID fid, unsigned reqIdx) : out_(out), fid_(fid), reqIdx_(reqIdx) {}
 
 OutputRegionField::OutputRegionField(OutputRegionField&& other) noexcept
-  : bound_(other.bound_), out_(other.out_), fid_(other.fid_)
+  : bound_(other.bound_), out_(other.out_), fid_(other.fid_), reqIdx_(other.reqIdx_)
 {
   other.bound_ = false;
   other.out_   = OutputRegion();
   other.fid_   = -1;
+  //TODO, how should we invalidate reqIdx
 }
 
 OutputRegionField& OutputRegionField::operator=(OutputRegionField&& other) noexcept
@@ -68,10 +72,12 @@ OutputRegionField& OutputRegionField::operator=(OutputRegionField&& other) noexc
   bound_ = other.bound_;
   out_   = other.out_;
   fid_   = other.fid_;
+  reqIdx_= other.reqIdx_;
 
   other.bound_ = false;
   other.out_   = OutputRegion();
   other.fid_   = -1;
+  //TODO, how should we invalidate reqIdx
 
   return *this;
 }
diff --git a/src/data/store.h b/src/data/store.h
index 10c4428ff..4b31fd049 100644
--- a/src/data/store.h
+++ b/src/data/store.h
@@ -21,13 +21,14 @@
 #include "data/buffer.h"
 #include "data/transform.h"
 #include "utilities/typedefs.h"
+#include "utilities/makeshift_serializer.h"
 
 namespace legate {
 
 class RegionField {
  public:
   RegionField() {}
-  RegionField(int32_t dim, const Legion::PhysicalRegion& pr, Legion::FieldID fid);
+  RegionField(int32_t dim, const Legion::PhysicalRegion& pr, Legion::FieldID fid, unsigned reqIdx);
 
  public:
   RegionField(RegionField&& other) noexcept;
@@ -147,17 +148,20 @@ class RegionField {
   int32_t dim_{-1};
   Legion::PhysicalRegion pr_{};
   Legion::FieldID fid_{-1U};
+  unsigned reqIdx_; //this gets packed as an unsigned
 
  private:
   bool readable_{false};
   bool writable_{false};
   bool reducible_{false};
+
+  friend class MakeshiftSerializer;
 };
 
 class OutputRegionField {
  public:
   OutputRegionField() {}
-  OutputRegionField(const Legion::OutputRegion& out, Legion::FieldID fid);
+  OutputRegionField(const Legion::OutputRegion& out, Legion::FieldID fid, unsigned reqIdx);
 
  public:
   OutputRegionField(OutputRegionField&& other) noexcept;
@@ -175,6 +179,9 @@ class OutputRegionField {
   bool bound_{false};
   Legion::OutputRegion out_{};
   Legion::FieldID fid_{-1U};
+  unsigned reqIdx_;  //this gets packed as an unsigned
+
+  friend class MakeshiftSerializer;
 };
 
 class FutureWrapper {
@@ -237,6 +244,7 @@ class Store {
 
  public:
   int32_t dim() const { return dim_; }
+  bool is_future() const { return is_future_; }
   LegateTypeCode code() const { return code_; }
 
  public:
@@ -296,8 +304,9 @@ class Store {
   bool readable_{false};
   bool writable_{false};
   bool reducible_{false};
-};
 
+ friend class MakeshiftSerializer;
+};
 }  // namespace legate
 
 #include "data/store.inl"
diff --git a/src/mapping/mapper.cc b/src/mapping/mapper.cc
index 1e43b11ed..546097a09 100644
--- a/src/mapping/mapper.cc
+++ b/src/mapping/mapper.cc
@@ -195,7 +195,8 @@ Mapper::MapperSyncModel CoreMapper::get_mapper_sync_model(void) const
 
 void CoreMapper::select_task_options(const MapperContext ctx, const Task& task, TaskOptions& output)
 {
-  assert(context.valid_task_id(task.task_id));
+  std::cout<<"task_id "<<task.task_id<<std::endl;
+  //assert(context.valid_task_id(task.task_id));
   if (task.tag == LEGATE_CPU_VARIANT) {
     assert(!local_cpus.empty());
     output.initial_proc = local_cpus.front();
diff --git a/src/runtime/context.h b/src/runtime/context.h
index 627103d3d..f2bd32553 100644
--- a/src/runtime/context.h
+++ b/src/runtime/context.h
@@ -49,7 +49,10 @@ class ResourceScope {
 
  public:
   bool valid() const { return base_ != -1; }
-  bool in_scope(int64_t resource_id) const { return base_ <= resource_id && resource_id < max_; }
+  bool in_scope(int64_t resource_id) const { 
+  std::cout<<"resource_id "<<resource_id<<std::endl;  
+  std::cout<<"max "<<max_<<std::endl;  
+  return base_ <= resource_id && resource_id < max_; }
 
  private:
   int64_t base_{-1};
@@ -108,8 +111,10 @@ class TaskContext {
   std::vector<Store>& outputs() { return outputs_; }
   std::vector<Store>& reductions() { return reductions_; }
   std::vector<Scalar>& scalars() { return scalars_; }
+  //Deserializer dez;
+  //Serializer dez;
 
- private:
+ public:
   const Legion::Task* task_;
   const std::vector<Legion::PhysicalRegion>& regions_;
   Legion::Context context_;
diff --git a/src/utilities/deserializer.cc b/src/utilities/deserializer.cc
index 466ef03f8..c4d63733a 100644
--- a/src/utilities/deserializer.cc
+++ b/src/utilities/deserializer.cc
@@ -91,7 +91,7 @@ void Deserializer::_unpack(RegionField& value)
   auto idx = unpack<uint32_t>();
   auto fid = unpack<int32_t>();
 
-  value = RegionField(dim, regions_[idx], fid);
+  value = RegionField(dim, regions_[idx], fid, idx);
 }
 
 void Deserializer::_unpack(OutputRegionField& value)
@@ -101,12 +101,12 @@ void Deserializer::_unpack(OutputRegionField& value)
   auto idx = unpack<uint32_t>();
   auto fid = unpack<int32_t>();
 
-  value = OutputRegionField(outputs_[idx], fid);
+  value = OutputRegionField(outputs_[idx], fid, idx);
 }
 
 std::unique_ptr<StoreTransform> Deserializer::unpack_transform()
 {
-  auto code = unpack<int32_t>();
+  int32_t code = unpack<int32_t>();
   switch (code) {
     case -1: {
       return nullptr;
diff --git a/src/utilities/span.h b/src/utilities/span.h
index c0a20c5a8..c839bb365 100644
--- a/src/utilities/span.h
+++ b/src/utilities/span.h
@@ -35,6 +35,7 @@ struct Span {
  public:
   decltype(auto) operator[](size_t pos)
   {
+    //std::cout<<"pos "<<pos<<" "<<size_<<std::endl;
     assert(pos < size_);
     return data_[pos];
   }
@@ -42,6 +43,7 @@ struct Span {
  public:
   decltype(auto) subspan(size_t off)
   {
+    //std::cout<<"size "<<size_<<" off "<<off<<std::endl;
     assert(off <= size_);
     return Span(data_ + off, size_ - off);
   }

From 4a9248fa0f013a35e6d8f9d924b1fb018b1e385b Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shivsundram@berkeley.edu>
Date: Mon, 27 Sep 2021 15:08:51 -0700
Subject: [PATCH 02/44] makeshift serializer for inline ops

---
 src/utilities/makeshift_serializer.cc |  58 ++++++++++++
 src/utilities/makeshift_serializer.h  | 121 ++++++++++++++++++++++++++
 2 files changed, 179 insertions(+)
 create mode 100644 src/utilities/makeshift_serializer.cc
 create mode 100644 src/utilities/makeshift_serializer.h

diff --git a/src/utilities/makeshift_serializer.cc b/src/utilities/makeshift_serializer.cc
new file mode 100644
index 000000000..fe96a2820
--- /dev/null
+++ b/src/utilities/makeshift_serializer.cc
@@ -0,0 +1,58 @@
+#include "utilities/makeshift_serializer.h"
+
+namespace legate{
+
+    void MakeshiftSerializer::packScalar(const Scalar& scalar){
+        pack((bool) scalar.is_tuple()); 
+        pack((LegateTypeCode) scalar.code_); 
+        int32_t size = scalar.size();
+        packWithoutType(scalar.data_, size);    
+    }
+
+    void MakeshiftSerializer::packBuffer(const Store& buffer)
+    {
+        pack((bool) buffer.is_future()); //is_future
+        pack((int32_t) buffer.dim());
+        //int32_t code = buffer.code();
+        pack((int32_t)  buffer.code());
+        //pack transform:
+        //pack transform code
+        int32_t neg= -1;
+        pack((int32_t) neg);
+        //skip the rest for now, assume no transform, for now pack -1
+        // no need to implement this for benchmarking purposes 
+        // TODO: implement transform packing
+        // TODO: add "code" to transform object
+        //if _isfuture
+        if(buffer.is_future_)
+        {   
+            //pack future_wrapper
+        }   
+        //elif dim>=0
+        else if (buffer.dim()>=0){
+            pack((int32_t) buffer.redop_id_);
+            //pack region field
+                //pack dim
+                pack((int32_t) buffer.region_field_.dim()); 
+                //pack idx (req idx) //need to map regions to idx
+                pack((uint32_t) buffer.region_field_.reqIdx_); 
+                //pack fid (field id)
+                pack((int32_t) buffer.region_field_.fid_); 
+        }
+        else
+        {   
+            //pack redop_id
+            pack((int32_t) buffer.redop_id_);
+            //pack region field
+                //pack dim; always 1 in a buffer
+                pack((int32_t) 1); 
+                //pack idx (req idx) //need to map regions to idx
+                pack((uint32_t) buffer.region_field_.reqIdx_); 
+                //pack fid (field id)
+                pack((int32_t) buffer.region_field_.fid_); 
+        }   
+   }
+
+
+
+}
diff --git a/src/utilities/makeshift_serializer.h b/src/utilities/makeshift_serializer.h
new file mode 100644
index 000000000..a4eec808f
--- /dev/null
+++ b/src/utilities/makeshift_serializer.h
@@ -0,0 +1,121 @@
+
+#pragma once
+#include <iostream>
+#include <vector>
+#include "data/store.h"
+#include "data/scalar.h"
+namespace legate {
+
+class Scalar;
+class Store;
+class MakeshiftSerializer{
+    
+    public:
+    MakeshiftSerializer(){
+        size=128;
+        raw.resize(size); 
+        write_offset=0;
+        read_offset=0;
+    }
+/*
+    template <typename T> void pack(T&& arg) 
+    {
+        T copy = arg;
+        pack(copy); //call l-value version
+    }
+*/
+    template <typename T> void pack(T arg) 
+    {
+        int8_t * argAddr = (int8_t*) &arg;
+        //std::cout<<arg<<std::endl;
+        if (size<=write_offset+sizeof(T))
+        {
+            resize(sizeof(T));
+        }
+        for (int i=0; i<sizeof(T); i++)
+        {
+           raw[write_offset+i] = *reinterpret_cast<const int8_t*>((argAddr)+i);
+        }
+        //std::cout<<"reint "<<*reinterpret_cast<T*>(raw.data()+write_offset)<<std::endl;;
+        write_offset+=sizeof(T);
+        //std::cout<<"    "<<write_offset<<std::endl;
+    }
+ 
+    void packWithoutType(const void* arg, int argSize) 
+    {
+        const int8_t* argByte =(int8_t*) arg;
+        //std::cout<<"data of size: "<<argSize<<std::endl;
+        if (size<=write_offset+argSize)
+        {
+            resize(argSize);
+        }
+        for (int i=0; i<argSize; i++){
+            raw[write_offset+i] = *reinterpret_cast<const int8_t*>(argByte+i);
+        }
+        write_offset+=argSize;
+        //std::cout<<"    "<<write_offset<<std::endl;
+    }
+
+    void packScalar(const Scalar& scalar);
+
+    void packBuffer(const Store& input);
+    
+    // Returns the next T from the buffer and advances the read cursor.
+    // If fewer than sizeof(T) written bytes remain, reports it on stdout and
+    // returns a value-initialized T (NULL does not convert to an arbitrary T).
+    template <typename T> T read()
+    {
+        if (static_cast<size_t>(read_offset) + sizeof(T) <= static_cast<size_t>(write_offset)) {
+            T datum = *reinterpret_cast<T*>(raw.data() + read_offset);
+            read_offset += sizeof(T);
+            return datum;
+        }
+        std::cout<<"finished reading buffer"<<std::endl;
+        return T{};
+    }
+
+    void resize(size_t argSize){
+        while(size<=write_offset+argSize)
+        {
+            //std::cout<<"resizing from "<<size<<" to "<<2*size<<std::endl; 
+            size=2*size;
+            raw.resize(size);
+        }
+    }
+
+    void reset_reader(){
+        read_offset=0;
+    }
+
+    int8_t* ptr(){
+        return raw.data();
+    }
+
+    int buffSize(){
+        return write_offset;
+    }
+    private: 
+    size_t size;
+    int read_offset;
+    int write_offset;
+    std::vector<int8_t> raw;
+};
+/*
+int main(){
+    MakeshiftSerializer ms;
+    int a=3; 
+    char g='a'; 
+    ms.pack<int>(a);
+    ms.pack<char>(g);
+    ms.pack<int>(a);
+    ms.pack<char>(g);
+    std::cout<<ms.read<int>()<<std::endl;;
+    std::cout<<ms.read<char>()<<std::endl;;
+    std::cout<<ms.read<int>()<<std::endl;;
+    std::cout<<ms.read<char>()<<std::endl;;
+    std::cout<<ms.read<int>()<<std::endl;;
+    ms.reset_reader();
+    std::cout<<ms.read<int>()<<std::endl;;
+     
+}*/
+}

From 99cd51c6fb3dc9ab2f3aa5c40ea33c2d2d9eb564 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shivsundram@berkeley.edu>
Date: Wed, 29 Sep 2021 07:35:17 -0700
Subject: [PATCH 03/44] reductions scalars, opids, need to remove dynamic
 allocations

---
 legate/core/launcher.py |  8 ++--
 legate/core/legion.py   |  2 +-
 legate/core/runtime.py  | 93 +++++++++++++++++++++++++----------------
 3 files changed, 61 insertions(+), 42 deletions(-)

diff --git a/legate/core/launcher.py b/legate/core/launcher.py
index bd09acf62..8894bc0ee 100644
--- a/legate/core/launcher.py
+++ b/legate/core/launcher.py
@@ -650,10 +650,10 @@ def pack_args(argbuf, args):
     def build_task(self, launch_domain, argbuf):
         self._req_analyzer.analyze_requirements()
         #print("building task id", self._task_id)
-        for req in self._req_analyzer._requirements:
-            print(req)
-            print(req[0].__dict__)
-            print()
+        #for req in self._req_analyzer._requirements:
+            #print(req)
+            #print(req[0].__dict__)
+            #print()
         self._out_analyzer.analyze_requirements()
 
         self.pack_args(argbuf, self._inputs)
diff --git a/legate/core/legion.py b/legate/core/legion.py
index ebadb5e65..ff5c97dd1 100644
--- a/legate/core/legion.py
+++ b/legate/core/legion.py
@@ -4745,7 +4745,7 @@ def get_string(self):
         if self.string is None or self.arglen != len(self.args):
             fmtstr = "".join(self.fmt)
             assert len(fmtstr) == len(self.args) + 1
-            print(self.args)
+            #print(self.args)
             self.string = struct.pack(fmtstr, *self.args)
             self.arglen = len(self.args)
         return self.string
diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index a85913f2b..0e6bda47f 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -727,7 +727,7 @@ def register_constraint(self, fusion_constraint_rule):
 
     def can_fuse(self):
         results = [constraint.apply(self.contexts, self.runtime, self.ops) for constraint in self.constraints]
-        print(results)
+        #print(results)
         return reduce(lambda x,y: x and y, results)
 
 class FusionConstraint(object):
@@ -790,7 +790,7 @@ def apply(self, contexts, runtime, ops):
                 allEqual = reduce(lambda x,y: x==y, matrices)
                 if not allEqual:
                     return False
-        print(store_to_ops)
+        #print(store_to_ops)
         return True
 
 
@@ -807,7 +807,7 @@ def apply(self, contexts, runtime, ops):
             strategy = partitioner.partition_stores()
             launch_shapes.append(strategy._launch_shape)
         first_shape = launch_shapes[0]
-        print(launch_shapes)
+        #print(launch_shapes)
         for launch_shape in launch_shapes:
             if launch_shape!=first_shape:
                 return False
@@ -857,7 +857,7 @@ def __init__(self, core_library):
         # to be dispatched. This list allows cross library introspection for
         # Legate operations.
         self._outstanding_ops = []
-        self._window_size = 2
+        self._window_size = 1
 
         # Now we initialize managers
         self._attachment_manager = AttachmentManager(self)
@@ -962,64 +962,86 @@ def dispatch(self, op, redop=None):
             return op.launch(self.legion_runtime, self.legion_context)
 
 
-    def build_fused_binary_numpy_op(self,ops):
-        fusion_checker = FusionChecker(ops, self._contexts, self)
-        fusion_checker.register_constraint(NumpyContextExists())
-        fusion_checker.register_constraint(AllBinaryOps())
-        fusion_checker.register_constraint(IdenticalLaunchShapes())
-        fusion_checker.register_constraint(IdenticalProjection())
-        can_fuse = fusion_checker.can_fuse()
-        
-        if not can_fuse:
-            return None
-
-        #hacky way to get numpy context and designated fused task id
-        fused_id = self._contexts["legate.numpy"].fused_id
-        numpy_context = self._contexts["legate.numpy"]
-        numpy_runtime = numpy_context._library.runtime
-        #initialize fused task
-        fused_task = numpy_context.create_task(fused_id)
-         
-        #generate offset maps for all inputs
+    def serialize_multiop_metadata(self, numpy_runtime, ops):
+        """creates a 'header' for a fused op that denotes metadata
+        on each ops inputs, outputs, reductions and scalars
+        """
+        #generate offset maps for all inputs to serialize metadata
         input_starts, output_starts, offset_starts, offsets= [],[],[],[]
+        reduction_starts, scalar_starts,op_ids = [], [], [] 
         input_start, output_start, offset_start = 0,0,0
+        reduction_start, scalar_start = 0,0
 
         for op in ops:
             input_starts.append(input_start)
             output_starts.append(output_start)
             offset_starts.append(offset_start)
+            reduction_starts.append(reduction_start)
+            scalar_starts.append(scalar_start)
 
             for i,input in enumerate(op._inputs):
                 offsets.append(i+1)
             for o,output in enumerate(op._outputs):
-                offsets.append(-(o+1))
+                offsets.append(-(o+1)) 
+            op_ids.append(op._task_id._value_)
 
             offset_start+=(len(op._inputs)+len(op._outputs))
             input_start+=len(op._inputs)
             output_start+=len(op._outputs)
+            reduction_start+=len(op._reductions)
+            scalar_start+=len(op._scalar_args)
 
         #terminators
         input_starts.append(input_start)
         output_starts.append(output_start)
         offset_starts.append(offset_start)
+        reduction_starts.append(reduction_start)
+        scalar_starts.append(scalar_start)
 
-        #turn offset maps into deferred arrays
+        #turn metadata maps into deferred arrays
         #then load them into the task as the initial inputs
-       	#print(input_starts, output_starts, offset_starts, offsets)
-        inst, oust, offst, offs = map(npo.array, (input_starts, output_starts, offset_starts, offsets))
+        meta_arrs =  (input_starts, output_starts, offset_starts, offsets, reduction_starts, scalar_starts, op_ids)
+        #inst, oust, offst, offs = map(npo.array, (input_starts, output_starts, offset_starts, offsets))
+        meta_arrs_np =  map(npo.array, meta_arrs)
         def make_deferred(inst):
             return numpy_runtime.find_or_create_array_thunk(inst, stacklevel=0, defer=True) 
-        offset_maps = map(make_deferred, (inst, oust, offst, offs))
+        meta_maps = map(make_deferred, meta_arrs_np)
+        return meta_maps
+   
+
+    def build_fused_op(self,ops):
+        fusion_checker = FusionChecker(ops, self._contexts, self)
+        fusion_checker.register_constraint(NumpyContextExists())
+        fusion_checker.register_constraint(AllBinaryOps())
+        fusion_checker.register_constraint(IdenticalLaunchShapes())
+        fusion_checker.register_constraint(IdenticalProjection())
+        can_fuse = fusion_checker.can_fuse()
         
-	#add offset maps to task
-        for offset_map in offset_maps:
-            fused_task.add_input(offset_map.base)
-            fused_task.add_broadcast(offset_map.base)
+        if not can_fuse:
+            return None
+
+        #hacky way to get numpy context and designated fused task id
+        fused_id = self._contexts["legate.numpy"].fused_id
+        numpy_context = self._contexts["legate.numpy"]
+        numpy_runtime = numpy_context._library.runtime
+        #initialize fused task
+        fused_task = numpy_context.create_task(fused_id)
+         
+        #serialize necessary metadata on all encapsulated ops 
+        #this metadata will be fed into the fused op as inputs
+        meta_maps = self.serialize_multiop_metadata(numpy_runtime, ops)
+
+	#add metadata maps to task as inputs
+        for meta_map in meta_maps:
+            fused_task.add_input(meta_map.base)
+            fused_task.add_broadcast(meta_map.base)
 
-        #add typical inputs and outputs to task
+        #add typical inputs and outputs of all subtasks to fused task
         for op in ops:
             for scalar in op._scalar_args:
                 fused_task.add_scalar_arg(scalar[0], ty.int32)
+            for reduction in op._reductions:
+                fused_task.add_reduction(reduction)
             for input in op._inputs:
                 fused_task.add_input(input)   
             for output in op._outputs:
@@ -1040,7 +1062,7 @@ def _schedule(self, ops, force_eval=False):
         #print("current ops", ids)
         #try fusing tasks
         if len(ops)>=2 and (not force_eval):
-            fused_task = self.build_fused_binary_numpy_op(ops)
+            fused_task = self.build_fused_op(ops)
             if fused_task:
                 fused_task.execute() 
                 return
@@ -1050,9 +1072,6 @@ def _schedule(self, ops, force_eval=False):
         partitioner = Partitioner(self, ops, must_be_single=must_be_single)
         strategy = partitioner.partition_stores()
         for op in ops:
-            #print("task_id", op._task_id, int(op._task_id))
-            #print("inputs", op._inputs)
-            #print("outputs", op._outputs)
             op.launch(strategy)
 
     def submit(self, op):

From a4e21d8af18d2c094d67e7a01aa43ddc6d1f772a Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shivsundram@berkeley.edu>
Date: Wed, 29 Sep 2021 15:59:45 -0700
Subject: [PATCH 04/44] fusion metadata passed via serialization now

---
 legate/core/launcher.py       | 21 ++++++++++++++-
 legate/core/legion.py         |  1 -
 legate/core/operation.py      | 10 +++++++
 legate/core/runtime.py        | 41 ++++++++++++++++++----------
 legate/core/store.py          | 38 ++++++++++++++++++++++++++
 src/data/store.h              | 19 +++++++++++++
 src/runtime/context.cc        |  1 +
 src/runtime/context.h         |  6 +++--
 src/utilities/deserializer.cc | 50 +++++++++++++++++++++++++++++++++++
 src/utilities/deserializer.h  |  4 +++
 10 files changed, 173 insertions(+), 18 deletions(-)

diff --git a/legate/core/launcher.py b/legate/core/launcher.py
index 8894bc0ee..06d275a47 100644
--- a/legate/core/launcher.py
+++ b/legate/core/launcher.py
@@ -441,7 +441,6 @@ def insert(self, req, field_id):
         field_set.insert(field_id, req.permission, proj_info)
 
     def analyze_requirements(self):
-        #import pdb; pdb.set_trace()
         for region, field_set in self._field_sets.items():
             perm_map = field_set.coalesce()
             for key, fields in perm_map.items():
@@ -531,6 +530,8 @@ def __init__(self, context, task_id, mapper_id=0, tag=0):
         self._sharding_space = None
         self._point = None
         self._output_regions = list()
+        self._is_fused = False
+        self._fusion_metadata = None
 
     @property
     def library_task_id(self):
@@ -641,12 +642,24 @@ def set_sharding_space(self, space):
     def set_point(self, point):
         self._point = point
 
+    def add_fusion_metadata(self, is_fused, fusion_metadata):
+        self._is_fused = is_fused
+        self._fusion_metadata = fusion_metadata
+
     @staticmethod
     def pack_args(argbuf, args):
         argbuf.pack_32bit_uint(len(args))
         for arg in args:
             arg.pack(argbuf)
 
+
+    @staticmethod
+    def pack_fusion_metadata(argbuf, is_fused, fusion_metadata):
+        argbuf.pack_bool(is_fused)
+        if is_fused:
+            fusion_metadata.pack(argbuf)
+
+
     def build_task(self, launch_domain, argbuf):
         self._req_analyzer.analyze_requirements()
         #print("building task id", self._task_id)
@@ -656,6 +669,9 @@ def build_task(self, launch_domain, argbuf):
             #print()
         self._out_analyzer.analyze_requirements()
 
+        #pack fusion metadata
+        self.pack_fusion_metadata(argbuf, self._is_fused, self._fusion_metadata)        
+
         self.pack_args(argbuf, self._inputs)
         self.pack_args(argbuf, self._outputs)
         self.pack_args(argbuf, self._reductions)
@@ -685,6 +701,9 @@ def build_task(self, launch_domain, argbuf):
     def build_single_task(self, argbuf):
         self._req_analyzer.analyze_requirements()
         self._out_analyzer.analyze_requirements()
+ 
+        #pack fusion metadata
+        self.pack_fusion_metadata(argbuf, self._is_fused, self._fusion_metadata)        
 
         self.pack_args(argbuf, self._inputs)
         self.pack_args(argbuf, self._outputs)
diff --git a/legate/core/legion.py b/legate/core/legion.py
index ff5c97dd1..3d6fa5299 100644
--- a/legate/core/legion.py
+++ b/legate/core/legion.py
@@ -4745,7 +4745,6 @@ def get_string(self):
         if self.string is None or self.arglen != len(self.args):
             fmtstr = "".join(self.fmt)
             assert len(fmtstr) == len(self.args) + 1
-            #print(self.args)
             self.string = struct.pack(fmtstr, *self.args)
             self.arglen = len(self.args)
         return self.string
diff --git a/legate/core/operation.py b/legate/core/operation.py
index 52bd14a12..e8a6ad952 100644
--- a/legate/core/operation.py
+++ b/legate/core/operation.py
@@ -33,6 +33,7 @@ def __init__(self, context, mapper_id=0):
         self._future_reduction = None
         self._constraints = EqClass()
         self._broadcasts = set()
+        self._is_fused = False
 
     @property
     def context(self):
@@ -139,6 +140,7 @@ def __init__(self, context, task_id, mapper_id=0):
         self._task_id = task_id
         self._scalar_args = []
         self._futures = []
+        self._fusion_metadata = None
 
     def add_scalar_arg(self, value, dtype):
         self._scalar_args.append((value, dtype))
@@ -150,8 +152,16 @@ def add_dtype_arg(self, dtype):
     def add_future(self, future):
         self._futures.append(future)
 
+    def add_fusion_metadata(self, fusion_metadata):
+        self._is_fused = True
+        self._fusion_metadata = fusion_metadata
+
     def launch(self, strategy):
         launcher = TaskLauncher(self.context, self._task_id, self.mapper_id)
+
+        if self._is_fused:
+            launcher.add_fusion_metadata(self._is_fused, self._fusion_metadata)
+ 
         for input in self._inputs:
             proj = strategy.get_projection(input)
             launcher.add_input(input, proj)
diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index 0e6bda47f..c35e19388 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -40,7 +40,7 @@
 from .partition import Restriction
 from .shape import Shape
 from .solver import Partitioner
-from .store import RegionField, Store
+from .store import RegionField, Store, FusionMetadata
 import numpy as npo
 
 # A Field holds a reference to a field in a region tree
@@ -759,6 +759,9 @@ def apply(self, contexts, runtime, ops):
         partitioners = []
         strategies = []
         must_be_single = any(op._future_output is not None for op in ops)
+
+        # TODO: cache as much as of the partitioner results as possible
+        # so the calls to Partitioner() and partition_stores done kill perf
         for op in ops:
             partitioner = Partitioner(runtime, ops, must_be_single=must_be_single)
             strategy = partitioner.partition_stores()
@@ -766,6 +769,8 @@ def apply(self, contexts, runtime, ops):
         store_to_ops = {}
         for op in ops:
             bufferSet = {}
+ 
+            # find the set union of input and output buffers for the op
             for input in op._inputs:
                 if input not in bufferSet:
                     proj = strategy.get_projection(input)
@@ -778,6 +783,7 @@ def apply(self, contexts, runtime, ops):
                     if hasattr(proj, 'part'):
                         bufferSet[output]=proj
 
+            # for each op in the union, record its associated transform
             for buffer in bufferSet.keys():
                 proj = bufferSet[buffer]
                 matrix = proj.part.index_partition.functor.transform.trans
@@ -785,12 +791,14 @@ def apply(self, contexts, runtime, ops):
                     store_to_ops[buffer] = [matrix]
                 else:
                     store_to_ops[buffer].append(matrix)
+
+        # for each buffer, check that all its associated transforms/partitions
+        # across ops are equivalent 
         for store, matrices in store_to_ops.items():
             if len(matrices)>1:
                 allEqual = reduce(lambda x,y: x==y, matrices)
                 if not allEqual:
                     return False
-        #print(store_to_ops)
         return True
 
 
@@ -803,6 +811,8 @@ def apply(self, contexts, runtime, ops):
         launch_shapes = []
         must_be_single = any(op._future_output is not None for op in ops)
         for op in ops:
+            # TODO: cache as much as of the partitioner results as possible
+            # so the calls to Partitioner() and partition_stores done kill perf
             partitioner = Partitioner(runtime, ops, must_be_single=must_be_single)
             strategy = partitioner.partition_stores()
             launch_shapes.append(strategy._launch_shape)
@@ -821,8 +831,8 @@ def __init__(self, core_library):
         This is a class that implements the Legate runtime.
         The Runtime object provides high-level APIs for Legate libraries
         to use services in the Legion runtime. The Runtime centralizes
-        resource management for all the libraries so that they can
-        focus on implementing their domain logic.
+	resource management for all the libraries so that they can
+	       focus on implementing their domain logic.
         """
 
         try:
@@ -857,7 +867,7 @@ def __init__(self, core_library):
         # to be dispatched. This list allows cross library introspection for
         # Legate operations.
         self._outstanding_ops = []
-        self._window_size = 1
+        self._window_size = 5
 
         # Now we initialize managers
         self._attachment_manager = AttachmentManager(self)
@@ -1001,20 +1011,22 @@ def serialize_multiop_metadata(self, numpy_runtime, ops):
         #turn metadata maps into deferred arrays
         #then load them into the task as the initial inputs
         meta_arrs =  (input_starts, output_starts, offset_starts, offsets, reduction_starts, scalar_starts, op_ids)
+        fusion_metadata = FusionMetadata(*meta_arrs)
+
         #inst, oust, offst, offs = map(npo.array, (input_starts, output_starts, offset_starts, offsets))
         meta_arrs_np =  map(npo.array, meta_arrs)
         def make_deferred(inst):
             return numpy_runtime.find_or_create_array_thunk(inst, stacklevel=0, defer=True) 
         meta_maps = map(make_deferred, meta_arrs_np)
-        return meta_maps
+        return meta_maps, fusion_metadata
    
 
     def build_fused_op(self,ops):
         fusion_checker = FusionChecker(ops, self._contexts, self)
         fusion_checker.register_constraint(NumpyContextExists())
         fusion_checker.register_constraint(AllBinaryOps())
-        fusion_checker.register_constraint(IdenticalLaunchShapes())
-        fusion_checker.register_constraint(IdenticalProjection())
+        #fusion_checker.register_constraint(IdenticalLaunchShapes())
+        #fusion_checker.register_constraint(IdenticalProjection())
         can_fuse = fusion_checker.can_fuse()
         
         if not can_fuse:
@@ -1026,15 +1038,16 @@ def build_fused_op(self,ops):
         numpy_runtime = numpy_context._library.runtime
         #initialize fused task
         fused_task = numpy_context.create_task(fused_id)
-         
+        
         #serialize necessary metadata on all encapsulated ops 
         #this metadata will be fed into the fused op as inputs
-        meta_maps = self.serialize_multiop_metadata(numpy_runtime, ops)
+        meta_maps, fusion_metadata = self.serialize_multiop_metadata(numpy_runtime, ops)
+        fused_task.add_fusion_metadata(fusion_metadata) #sets fused_task._is_fused to true
 
 	#add metadata maps to task as inputs
-        for meta_map in meta_maps:
-            fused_task.add_input(meta_map.base)
-            fused_task.add_broadcast(meta_map.base)
+        #for meta_map in meta_maps:
+        #    fused_task.add_input(meta_map.base)
+        #    fused_task.add_broadcast(meta_map.base)
 
         #add typical inputs and outputs of all subtasks to fused task
         for op in ops:
@@ -1059,7 +1072,7 @@ def _launch_outstanding(self):
    
     def _schedule(self, ops, force_eval=False):
         ids = [op._task_id for op in ops]
-        #print("current ops", ids)
+
         #try fusing tasks
         if len(ops)>=2 and (not force_eval):
             fused_task = self.build_fused_op(ops)
diff --git a/legate/core/store.py b/legate/core/store.py
index 3b9297a74..517c8def2 100644
--- a/legate/core/store.py
+++ b/legate/core/store.py
@@ -754,3 +754,41 @@ def find_or_create_partition(self, functor):
         part = converted.construct(self.storage.region, complete=complete)
         self._partitions[functor] = (part, proj)
         return part, proj
+
+
+class FusionMetadata(object):
+    def __init__(
+                 self,
+                 input_starts,
+                 output_starts,
+                 offset_starts,
+                 buffer_offsets,
+                 reduction_starts,
+                 scalar_starts,
+                 opIDs
+                 ):
+        self._input_starts = input_starts
+        self._output_starts = output_starts
+        self._offset_starts = offset_starts
+        self._buffer_offsets = buffer_offsets
+        self._reduction_starts = reduction_starts
+        self._scalar_starts = scalar_starts
+        self._opIDs = opIDs 
+
+    def packList(self, meta_list, buf):
+        for elem in meta_list: 
+            buf.pack_32bit_int(elem)
+
+    def pack(self, buf):
+       
+        buf.pack_32bit_int(len(self._opIDs)) #nOps
+        buf.pack_32bit_int(len(self._buffer_offsets)) #total number of input/output buffer offsets across all ops
+
+        self.packList(self._input_starts, buf)
+        self.packList(self._output_starts, buf)
+        self.packList(self._offset_starts, buf)
+        self.packList(self._buffer_offsets, buf)
+        self.packList(self._reduction_starts, buf)
+        self.packList(self._scalar_starts, buf)
+        self.packList(self._opIDs, buf)
+
diff --git a/src/data/store.h b/src/data/store.h
index 4b31fd049..f862c8959 100644
--- a/src/data/store.h
+++ b/src/data/store.h
@@ -307,6 +307,25 @@ class Store {
 
  friend class MakeshiftSerializer;
 };
+
+//contains prefix sums for a sub-op
+//to index into its own data
+struct FusionMetadata {
+    public:
+    bool isFused;
+    int32_t nOps;
+    int32_t nBuffers;
+    std::vector<int32_t> inputStarts;
+    std::vector<int32_t> outputStarts;
+    std::vector<int32_t> offsetStarts;
+    std::vector<int32_t> offsets; // can contain negative elements
+    std::vector<int32_t> reductionStarts;
+    std::vector<int32_t> scalarStarts;
+    std::vector<int32_t> opIDs;
+};
+
+
+
 }  // namespace legate
 
 #include "data/store.inl"
diff --git a/src/runtime/context.cc b/src/runtime/context.cc
index 1a2a4d39d..bd1e316c2 100644
--- a/src/runtime/context.cc
+++ b/src/runtime/context.cc
@@ -143,6 +143,7 @@ TaskContext::TaskContext(const Legion::Task* task,
   : task_(task), regions_(regions), context_(context), runtime_(runtime)
 {
   Deserializer dez(task, regions);
+  fusionMetadata = dez.unpack<FusionMetadata>(); 
   inputs_     = dez.unpack<std::vector<Store>>();
   outputs_    = dez.unpack<std::vector<Store>>();
   reductions_ = dez.unpack<std::vector<Store>>();
diff --git a/src/runtime/context.h b/src/runtime/context.h
index f2bd32553..c92ce8a14 100644
--- a/src/runtime/context.h
+++ b/src/runtime/context.h
@@ -17,11 +17,13 @@
 #pragma once
 
 #include "legion.h"
+#include "data/scalar.h"
 
 namespace legate {
 
 class Store;
 class Scalar;
+struct FusionMetadata;
 
 struct ResourceConfig {
   int64_t max_tasks{1000000};
@@ -31,6 +33,7 @@ struct ResourceConfig {
   int64_t max_shardings{0};
 };
 
+
 class ResourceScope {
  public:
   ResourceScope() = default;
@@ -111,14 +114,13 @@ class TaskContext {
   std::vector<Store>& outputs() { return outputs_; }
   std::vector<Store>& reductions() { return reductions_; }
   std::vector<Scalar>& scalars() { return scalars_; }
-  //Deserializer dez;
-  //Serializer dez;
 
  public:
   const Legion::Task* task_;
   const std::vector<Legion::PhysicalRegion>& regions_;
   Legion::Context context_;
   Legion::Runtime* runtime_;
+  FusionMetadata fusionMetadata;
 
  private:
   std::vector<Store> inputs_, outputs_, reductions_;
diff --git a/src/utilities/deserializer.cc b/src/utilities/deserializer.cc
index c4d63733a..d4d4c92ca 100644
--- a/src/utilities/deserializer.cc
+++ b/src/utilities/deserializer.cc
@@ -38,6 +38,56 @@ void Deserializer::_unpack(LegateTypeCode& value)
   value = static_cast<LegateTypeCode>(unpack<int32_t>());
 }
 
+
+void Deserializer::_unpack(FusionMetadata& metadata){
+    metadata.isFused = unpack<bool>();
+    if (!metadata.isFused){
+        return;
+    }
+    //past this point the op is fused, so the metadata arrays follow
+    metadata.nOps = unpack<int32_t>();
+    metadata.nBuffers = unpack<int32_t>();
+    int nOps = metadata.nOps;
+    int nBuffers = metadata.nBuffers; 
+
+    metadata.inputStarts.resize(nOps+1);
+    metadata.outputStarts.resize(nOps+1);
+    metadata.offsetStarts.resize(nOps+1);
+    metadata.offsets.resize(nBuffers+1);
+    metadata.reductionStarts.resize(nOps+1);
+    metadata.scalarStarts.resize(nOps+1);
+    metadata.opIDs.resize(nOps);
+    //TODO: wrap this up to reuse code
+    for (int i=0; i<nOps+1; i++)
+    {
+        metadata.inputStarts[i] = unpack<int32_t>();
+    }   
+    for (int i=0; i<nOps+1; i++)
+    {
+        metadata.outputStarts[i] = unpack<int32_t>();
+    }   
+    for (int i=0; i<nOps+1; i++)
+    {
+        metadata.offsetStarts[i] = unpack<int32_t>();
+    }   
+    for (int i=0; i<nBuffers; i++)
+    {
+        metadata.offsets[i] = unpack<int32_t>();
+    }   
+    for (int i=0; i<nOps+1; i++)
+    {
+        metadata.reductionStarts[i] = unpack<int32_t>();
+    }   
+    for (int i=0; i<nOps+1; i++)
+    {
+        metadata.scalarStarts[i] = unpack<int32_t>();
+    }   
+    for (int i=0; i<nOps; i++)
+    {
+        metadata.opIDs[i] = unpack<int32_t>();
+    }   
+}
+
 void Deserializer::_unpack(Store& value)
 {
   auto is_future = unpack<bool>();
diff --git a/src/utilities/deserializer.h b/src/utilities/deserializer.h
index 7d739aa6e..d142ab1c9 100644
--- a/src/utilities/deserializer.h
+++ b/src/utilities/deserializer.h
@@ -33,6 +33,7 @@ class Scalar;
 class FutureWrapper;
 class RegionField;
 class OutputRegionField;
+struct FusionMetadata;
 
 class Deserializer {
  public:
@@ -47,6 +48,8 @@ class Deserializer {
     return std::move(value);
   }
 
+ //void unpackFusionMetadata(bool& isFused);
+
  private:
   template <typename T, std::enable_if_t<legate_type_code_of<T> != MAX_TYPE_NUMBER>* = nullptr>
   void _unpack(T& value)
@@ -70,6 +73,7 @@ class Deserializer {
   void _unpack(FutureWrapper& value);
   void _unpack(RegionField& value);
   void _unpack(OutputRegionField& value);
+  void _unpack(FusionMetadata& value);
 
  private:
   std::unique_ptr<StoreTransform> unpack_transform();

From 9b57fc863951ea9ab2d6b4d476d7120ab99f3401 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shivsundram@berkeley.edu>
Date: Wed, 29 Sep 2021 21:31:19 -0700
Subject: [PATCH 05/44] remove redundant store partitioning

---
 legate/core/runtime.py | 82 ++++++++++++++++++++++--------------------
 1 file changed, 43 insertions(+), 39 deletions(-)

diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index c35e19388..67d89ba17 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -39,7 +39,7 @@
 )
 from .partition import Restriction
 from .shape import Shape
-from .solver import Partitioner
+from .solver import Partitioner, Strategy
 from .store import RegionField, Store, FusionMetadata
 import numpy as npo
 
@@ -721,17 +721,26 @@ def __init__(self, ops, contexts, runtime):
         self.ops = ops
         self.contexts = contexts
         self.runtime=runtime
+        self.partitioners = []
+        self.strategies = []
 
     def register_constraint(self, fusion_constraint_rule):
         self.constraints.append(fusion_constraint_rule)
 
     def can_fuse(self):
-        results = [constraint.apply(self.contexts, self.runtime, self.ops) for constraint in self.constraints]
-        #print(results)
-        return reduce(lambda x,y: x and y, results)
+        must_be_single = any(op._future_output is not None for op in self.ops)
+        for op in self.ops:
+            # TODO: cache as much of the partitioner results as possible
+            # so the calls to Partitioner() and partition_stores don't kill perf
+            partitioner = Partitioner(self.runtime, [op], must_be_single=must_be_single)
+            self.partitioners.append( partitioner )
+            strategy = partitioner.partition_stores()
+            self.strategies.append(strategy)
+        results = [constraint.apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) for constraint in self.constraints]
+        return reduce(lambda x,y: x and y, results), self.strategies
 
 class FusionConstraint(object):
-    def apply(self, contexts, runtime, ops):
+    def apply(self, contexts, runtime, ops, partitioners, strategies):
         """"
          Abstract class for determining a rule that constrains
          which legate operations can be fused
@@ -740,14 +749,14 @@ def apply(self, contexts, runtime, ops):
 
 
 class NumpyContextExists(FusionConstraint):
-    def apply(self, contexts, runtime, ops):
+    def apply(self, contexts, runtime, ops, partitioners, strategies):
         return "legate.numpy" in contexts
 
 
 class AllBinaryOps(FusionConstraint):
     """Temporary class for only fusing Binary Ops. 
        This constrains will be removed"""
-    def apply(self, contexts, runtime, ops):
+    def apply(self, contexts, runtime, ops, partitioners, strategies):
         allBinary = reduce(lambda x,y: x and y,[int(op._task_id)==400000 for op in ops])
         return allBinary
 
@@ -755,31 +764,22 @@ def apply(self, contexts, runtime, ops):
 class IdenticalProjection(FusionConstraint):
     """Fusion rule that only ops with identical
        projection functors can be fused"""
-    def apply(self, contexts, runtime, ops):
-        partitioners = []
-        strategies = []
-        must_be_single = any(op._future_output is not None for op in ops)
-
-        # TODO: cache as much as of the partitioner results as possible
-        # so the calls to Partitioner() and partition_stores done kill perf
-        for op in ops:
-            partitioner = Partitioner(runtime, ops, must_be_single=must_be_single)
-            strategy = partitioner.partition_stores()
+    def apply(self, contexts, runtime, ops, partitioners, strategies):
 
         store_to_ops = {}
-        for op in ops:
+        for i, op in enumerate(ops):
             bufferSet = {}
  
             # find the set union of input and output buffers for the op
             for input in op._inputs:
                 if input not in bufferSet:
-                    proj = strategy.get_projection(input)
+                    proj = strategies[i].get_projection(input)
                     if hasattr(proj, 'part'):
                         bufferSet[input]=proj
 
             for output in op._outputs:
                 if output not in bufferSet:
-                    proj = strategy.get_projection(output)
+                    proj = strategies[i].get_projection(output)
                     if hasattr(proj, 'part'):
                         bufferSet[output]=proj
 
@@ -805,19 +805,11 @@ def apply(self, contexts, runtime, ops):
 class IdenticalLaunchShapes(FusionConstraint):
     """Fusion rule that only ops with identical
        launch shapes can be fused"""
-    def apply(self, contexts, runtime, ops):
-        partitioners = []
-        strategies = []
+    def apply(self, contexts, runtime, ops, partitioners, strategies):
         launch_shapes = []
-        must_be_single = any(op._future_output is not None for op in ops)
-        for op in ops:
-            # TODO: cache as much as of the partitioner results as possible
-            # so the calls to Partitioner() and partition_stores done kill perf
-            partitioner = Partitioner(runtime, ops, must_be_single=must_be_single)
-            strategy = partitioner.partition_stores()
-            launch_shapes.append(strategy._launch_shape)
+        for i in range(len(ops)):
+            launch_shapes.append(strategies[i]._launch_shape)
         first_shape = launch_shapes[0]
-        #print(launch_shapes)
         for launch_shape in launch_shapes:
             if launch_shape!=first_shape:
                 return False
@@ -867,7 +859,7 @@ def __init__(self, core_library):
         # to be dispatched. This list allows cross library introspection for
         # Legate operations.
         self._outstanding_ops = []
-        self._window_size = 5
+        self._window_size =1
 
         # Now we initialize managers
         self._attachment_manager = AttachmentManager(self)
@@ -1025,20 +1017,27 @@ def build_fused_op(self,ops):
         fusion_checker = FusionChecker(ops, self._contexts, self)
         fusion_checker.register_constraint(NumpyContextExists())
         fusion_checker.register_constraint(AllBinaryOps())
-        #fusion_checker.register_constraint(IdenticalLaunchShapes())
-        #fusion_checker.register_constraint(IdenticalProjection())
-        can_fuse = fusion_checker.can_fuse()
+        fusion_checker.register_constraint(IdenticalLaunchShapes())
+        fusion_checker.register_constraint(IdenticalProjection())
+        can_fuse, partitions = fusion_checker.can_fuse()
         
         if not can_fuse:
             return None
+        super_strat = {}
+        super_fspace = {}
+        for partition in partitions:
+            super_strat = {**(super_strat.copy()), **partition._strategy}  
+            super_fspace = {**(super_fspace.copy()), **partition._fspaces}
 
+        super_strategy = Strategy(partitions[0]._launch_shape, super_strat, super_fspace)
         #hacky way to get numpy context and designated fused task id
         fused_id = self._contexts["legate.numpy"].fused_id
         numpy_context = self._contexts["legate.numpy"]
         numpy_runtime = numpy_context._library.runtime
         #initialize fused task
         fused_task = numpy_context.create_task(fused_id)
-        
+        fused_task.strategy = super_strategy
+       
         #serialize necessary metadata on all encapsulated ops 
         #this metadata will be fed into the fused op as inputs
         meta_maps, fusion_metadata = self.serialize_multiop_metadata(numpy_runtime, ops)
@@ -1081,9 +1080,14 @@ def _schedule(self, ops, force_eval=False):
                 return
 
         #if we cann't fuse op launch them individually
-        must_be_single = any(op._future_output is not None for op in ops)
-        partitioner = Partitioner(self, ops, must_be_single=must_be_single)
-        strategy = partitioner.partition_stores()
+        #fused tasks already have their strategy
+        if len(ops)==1 and ops[0]._task_id==400028:
+            strategy = ops[0].strategy
+            
+        else:
+            must_be_single = any(op._future_output is not None for op in ops)
+            partitioner = Partitioner(self, ops, must_be_single=must_be_single)
+            strategy = partitioner.partition_stores()
         for op in ops:
             op.launch(strategy)
 

From bf7973ab677e8711944d0f89d1949babd28bcb3d Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shivsundram@berkeley.edu>
Date: Thu, 30 Sep 2021 17:41:24 -0700
Subject: [PATCH 06/44] remove creation of deferred arrays

---
 legate/core/runtime.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index 67d89ba17..705fde3b4 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -859,7 +859,7 @@ def __init__(self, core_library):
         # to be dispatched. This list allows cross library introspection for
         # Legate operations.
         self._outstanding_ops = []
-        self._window_size =1
+        self._window_size =10
 
         # Now we initialize managers
         self._attachment_manager = AttachmentManager(self)
@@ -1006,10 +1006,11 @@ def serialize_multiop_metadata(self, numpy_runtime, ops):
         fusion_metadata = FusionMetadata(*meta_arrs)
 
         #inst, oust, offst, offs = map(npo.array, (input_starts, output_starts, offset_starts, offsets))
-        meta_arrs_np =  map(npo.array, meta_arrs)
+        #meta_arrs_np =  map(npo.array, meta_arrs)
         def make_deferred(inst):
             return numpy_runtime.find_or_create_array_thunk(inst, stacklevel=0, defer=True) 
-        meta_maps = map(make_deferred, meta_arrs_np)
+        #meta_maps = map(make_deferred, meta_arrs_np)
+        meta_maps=None
         return meta_maps, fusion_metadata
    
 

From b6121e7539e769a046668719f72aa9827a269d02 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shivsundram@berkeley.edu>
Date: Fri, 1 Oct 2021 14:34:37 -0700
Subject: [PATCH 07/44] optimized packing, some transform packing

---
 legate/core/legion.py                 |  8 ++-
 legate/core/runtime.py                |  5 +-
 legate/core/store.py                  | 33 +++++++----
 src/data/transform.cc                 | 36 ++++++++++++
 src/data/transform.h                  | 18 +++++-
 src/utilities/makeshift_serializer.cc | 82 ++++++++++++++++++++++++++-
 src/utilities/makeshift_serializer.h  | 19 +++++--
 7 files changed, 175 insertions(+), 26 deletions(-)

diff --git a/legate/core/legion.py b/legate/core/legion.py
index 3d6fa5299..b498bfb8e 100644
--- a/legate/core/legion.py
+++ b/legate/core/legion.py
@@ -4560,6 +4560,12 @@ def pack_32bit_int(self, arg):
         self.size += 4
         self.add_arg(arg, legion.LEGION_TYPE_INT32)
 
+    def pack_32bit_int_arr(self, arg):
+        self.fmt.append(str(len(arg))+"i")
+        size = len(arg)
+        self.size += 4*size
+        self.args += arg
+
     def pack_64bit_int(self, arg):
         self.fmt.append("q")
         self.size += 8
@@ -4744,7 +4750,7 @@ def pack_dtype(self, dtype):
     def get_string(self):
         if self.string is None or self.arglen != len(self.args):
             fmtstr = "".join(self.fmt)
-            assert len(fmtstr) == len(self.args) + 1
+            #assert len(fmtstr) == len(self.args) + 1
             self.string = struct.pack(fmtstr, *self.args)
             self.arglen = len(self.args)
         return self.string
diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index 705fde3b4..3a8fa1e9b 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -41,7 +41,6 @@
 from .shape import Shape
 from .solver import Partitioner, Strategy
 from .store import RegionField, Store, FusionMetadata
-import numpy as npo
 
 # A Field holds a reference to a field in a region tree
 # that can be used by many different RegionField objects
@@ -1007,8 +1006,8 @@ def serialize_multiop_metadata(self, numpy_runtime, ops):
 
         #inst, oust, offst, offs = map(npo.array, (input_starts, output_starts, offset_starts, offsets))
         #meta_arrs_np =  map(npo.array, meta_arrs)
-        def make_deferred(inst):
-            return numpy_runtime.find_or_create_array_thunk(inst, stacklevel=0, defer=True) 
+        #def make_deferred(inst):
+        #    return numpy_runtime.find_or_create_array_thunk(inst, stacklevel=0, defer=True) 
         #meta_maps = map(make_deferred, meta_arrs_np)
         meta_maps=None
         return meta_maps, fusion_metadata
diff --git a/legate/core/store.py b/legate/core/store.py
index 517c8def2..fafdc941c 100644
--- a/legate/core/store.py
+++ b/legate/core/store.py
@@ -776,19 +776,28 @@ def __init__(
         self._opIDs = opIDs 
 
     def packList(self, meta_list, buf):
-        for elem in meta_list: 
-            buf.pack_32bit_int(elem)
+        buf.pack_32bit_int_arr(meta_list)
+        #for elem in meta_list: 
+        #    buf.pack_32bit_int(elem)
 
     def pack(self, buf):
        
-        buf.pack_32bit_int(len(self._opIDs)) #nOps
-        buf.pack_32bit_int(len(self._buffer_offsets)) #nIOBuffers+1
-
-        self.packList(self._input_starts, buf)
-        self.packList(self._output_starts, buf)
-        self.packList(self._offset_starts, buf)
-        self.packList(self._buffer_offsets, buf)
-        self.packList(self._reduction_starts, buf)
-        self.packList(self._scalar_starts, buf)
-        self.packList(self._opIDs, buf)
+        #buf.pack_32bit_int(len(self._opIDs)) #nOps
+        #buf.pack_32bit_int(len(self._buffer_offsets)) #nIOBuffers+1
+        superbuff = [len(self._opIDs)]+[len(self._buffer_offsets)]
+        superbuff += self._input_starts
+        superbuff += self._output_starts
+        superbuff += self._offset_starts
+        superbuff += self._buffer_offsets
+        superbuff += self._reduction_starts
+        superbuff += self._scalar_starts
+        superbuff += self._opIDs
+        self.packList(superbuff, buf)
+        #self.packList(self._input_starts, buf)
+        #self.packList(self._output_starts, buf)
+        #self.packList(self._offset_starts, buf)
+        #self.packList(self._buffer_offsets, buf)
+        #self.packList(self._reduction_starts, buf)
+        #self.packList(self._scalar_starts, buf)
+        #self.packList(self._opIDs, buf)
 
diff --git a/src/data/transform.cc b/src/data/transform.cc
index c9860fc65..0c117883b 100644
--- a/src/data/transform.cc
+++ b/src/data/transform.cc
@@ -22,6 +22,17 @@ using namespace Legion;
 
 using StoreTransformP = std::unique_ptr<StoreTransform>;
 
+/*
+typedef enum legate_core_transform_t {
+  LEGATE_CORE_TRANSFORM_SHIFT = 100,
+  LEGATE_CORE_TRANSFORM_PROMOTE,
+  LEGATE_CORE_TRANSFORM_PROJECT,
+  LEGATE_CORE_TRANSFORM_TRANSPOSE,
+  LEGATE_CORE_TRANSFORM_DELINEARIZE,
+} legate_core_transform_t;
+*/
+
+
 DomainAffineTransform combine(const DomainAffineTransform& lhs, const DomainAffineTransform& rhs)
 {
   DomainAffineTransform result;
@@ -39,6 +50,11 @@ Shift::Shift(int32_t dim, int64_t offset, StoreTransformP&& parent)
 {
 }
 
+int32_t Shift::getTransformCode() const
+{
+    return LEGATE_CORE_TRANSFORM_SHIFT;
+}
+
 Domain Shift::transform(const Domain& input) const
 {
   auto result = nullptr != parent_ ? parent_->transform(input) : input;
@@ -81,6 +97,11 @@ Promote::Promote(int32_t extra_dim, int64_t dim_size, StoreTransformP&& parent)
 {
 }
 
+int32_t Promote::getTransformCode() const
+{
+    return LEGATE_CORE_TRANSFORM_PROMOTE;
+}
+
 Domain Promote::transform(const Domain& input) const
 {
   auto promote = [](int32_t extra_dim, int64_t dim_size, const Domain& input) {
@@ -136,6 +157,11 @@ Project::Project(int32_t dim, int64_t coord, StoreTransformP&& parent)
 {
 }
 
+int32_t Project::getTransformCode() const
+{
+    return LEGATE_CORE_TRANSFORM_PROJECT;
+}
+
 Domain Project::transform(const Domain& input) const
 {
   auto project = [](int32_t collapsed_dim, const Domain& input) {
@@ -193,6 +219,11 @@ Transpose::Transpose(std::vector<int32_t>&& axes, StoreTransformP&& parent)
 {
 }
 
+int32_t Transpose::getTransformCode() const
+{
+    return LEGATE_CORE_TRANSFORM_TRANSPOSE;
+}
+
 Domain Transpose::transform(const Domain& input) const
 {
   auto transpose = [](const auto& axes, const Domain& input) {
@@ -246,6 +277,11 @@ Delinearize::Delinearize(int32_t dim, std::vector<int64_t>&& sizes, StoreTransfo
   for (auto size : sizes_) volume_ *= size;
 }
 
+int32_t Delinearize::getTransformCode() const
+{
+    return LEGATE_CORE_TRANSFORM_DELINEARIZE;
+}
+
 Domain Delinearize::transform(const Domain& input) const
 {
   Domain output;
diff --git a/src/data/transform.h b/src/data/transform.h
index 1f2453df8..cc1433968 100644
--- a/src/data/transform.h
+++ b/src/data/transform.h
@@ -19,7 +19,9 @@
 #include <memory>
 
 #include "legion.h"
+#include "legate_c.h"
 
+class MakeshiftSerializer;
 namespace legate {
 
 class StoreTransform {
@@ -31,9 +33,10 @@ class StoreTransform {
  public:
   virtual Legion::Domain transform(const Legion::Domain& input) const           = 0;
   virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const = 0;
-
+  virtual int32_t getTransformCode() const =0;
  protected:
   std::unique_ptr<StoreTransform> parent_{nullptr};
+ friend class MakeshiftSerializer;
 };
 
 class Shift : public StoreTransform {
@@ -44,10 +47,11 @@ class Shift : public StoreTransform {
  public:
   virtual Legion::Domain transform(const Legion::Domain& input) const override;
   virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override;
-
+  virtual int32_t getTransformCode() const override;
  private:
   int32_t dim_;
-  int64_t offset_;
+  int64_t offset_; 
+ friend class MakeshiftSerializer;
 };
 
 class Promote : public StoreTransform {
@@ -58,10 +62,12 @@ class Promote : public StoreTransform {
  public:
   virtual Legion::Domain transform(const Legion::Domain& input) const override;
   virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override;
+  virtual int32_t getTransformCode() const override;
 
  private:
   int32_t extra_dim_;
   int64_t dim_size_;
+ friend class MakeshiftSerializer;
 };
 
 class Project : public StoreTransform {
@@ -72,10 +78,12 @@ class Project : public StoreTransform {
  public:
   virtual Legion::Domain transform(const Legion::Domain& domain) const override;
   virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override;
+  virtual int32_t getTransformCode() const override;
 
  private:
   int32_t dim_;
   int64_t coord_;
+ friend class MakeshiftSerializer;
 };
 
 class Transpose : public StoreTransform {
@@ -86,9 +94,11 @@ class Transpose : public StoreTransform {
  public:
   virtual Legion::Domain transform(const Legion::Domain& domain) const override;
   virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override;
+  virtual int32_t getTransformCode() const override;
 
  private:
   std::vector<int32_t> axes_;
+ friend class MakeshiftSerializer;
 };
 
 class Delinearize : public StoreTransform {
@@ -101,12 +111,14 @@ class Delinearize : public StoreTransform {
  public:
   virtual Legion::Domain transform(const Legion::Domain& domain) const override;
   virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override;
+  virtual int32_t getTransformCode() const override;
 
  private:
   int32_t dim_;
   std::vector<int64_t> sizes_;
   std::vector<int64_t> strides_;
   int64_t volume_;
+ friend class MakeshiftSerializer;
 };
 
 }  // namespace legate
diff --git a/src/utilities/makeshift_serializer.cc b/src/utilities/makeshift_serializer.cc
index fe96a2820..b04ee4836 100644
--- a/src/utilities/makeshift_serializer.cc
+++ b/src/utilities/makeshift_serializer.cc
@@ -9,6 +9,84 @@ namespace legate{
         packWithoutType(scalar.data_, size);    
     }
 
+    void MakeshiftSerializer::packTransform(const StoreTransform* trans){
+
+        if (trans==nullptr){
+            int32_t neg= -1;
+            pack((int32_t) neg);
+        }
+        else{
+            int32_t code = trans->getTransformCode();
+            switch (code) {
+                case -1: {
+                }
+                case LEGATE_CORE_TRANSFORM_SHIFT: {
+                    Shift * shifter = (Shift*) trans;
+                    pack((int32_t) shifter->dim_);
+                    pack((int64_t) shifter->offset_);
+                    packTransform(trans->parent_.get());
+                }
+                case LEGATE_CORE_TRANSFORM_PROMOTE: {
+                    Promote * promoter = (Promote*) trans;
+                    pack((int32_t) promoter->extra_dim_);
+                    pack((int64_t) promoter->dim_size_);
+                    packTransform(trans->parent_.get());
+                }
+                case LEGATE_CORE_TRANSFORM_PROJECT: {
+                    Project * projector = (Project*) trans;
+                    pack((int32_t) projector->dim_);
+                    pack((int64_t) projector->coord_);
+                    packTransform(trans->parent_.get());
+                }
+                case LEGATE_CORE_TRANSFORM_TRANSPOSE: {
+                    Transpose * projector = (Transpose*) trans;
+                    packTransform(trans->parent_.get());
+                }
+                case LEGATE_CORE_TRANSFORM_DELINEARIZE: {
+                    Delinearize * projector = (Delinearize*) trans;
+                    packTransform(trans->parent_.get());
+                }
+            }
+        }
+    }
+/*
+    case LEGATE_CORE_TRANSFORM_SHIFT: {
+      auto dim    = unpack<int32_t>();
+      auto offset = unpack<int64_t>();
+      auto parent = unpack_transform();
+      return std::make_unique<Shift>(dim, offset, std::move(parent));
+    }
+    case LEGATE_CORE_TRANSFORM_PROMOTE: {
+      auto extra_dim = unpack<int32_t>();
+      auto dim_size  = unpack<int64_t>();
+      auto parent    = unpack_transform();
+      return std::make_unique<Promote>(extra_dim, dim_size, std::move(parent));
+    }
+    case LEGATE_CORE_TRANSFORM_PROJECT: {
+      auto dim    = unpack<int32_t>();
+      auto coord  = unpack<int64_t>();
+      auto parent = unpack_transform();
+      return std::make_unique<Project>(dim, coord, std::move(parent));
+    }
+    case LEGATE_CORE_TRANSFORM_TRANSPOSE: {
+      auto axes   = unpack<std::vector<int32_t>>();
+      auto parent = unpack_transform();
+      return std::make_unique<Transpose>(std::move(axes), std::move(parent));
+    }
+    case LEGATE_CORE_TRANSFORM_DELINEARIZE: {
+      auto dim    = unpack<int32_t>();
+      auto sizes  = unpack<std::vector<int64_t>>();
+      auto parent = unpack_transform();
+      return std::make_unique<Delinearize>(dim, std::move(sizes), std::move(parent));
+    }
+
+    def _serialize_transform(self, buf):
+        if self._parent is not None:
+            self._transform.serialize(buf)
+            self._parent._serialize_transform(buf)
+        else:
+            buf.pack_32bit_int(-1)
+*/
     void MakeshiftSerializer::packBuffer(const Store& buffer)
     {
         pack((bool) buffer.is_future()); //is_future
@@ -17,8 +95,8 @@ namespace legate{
         pack((int32_t)  buffer.code());
         //pack transform:
         //pack trasnform code
-        int32_t neg= -1;
-        pack((int32_t) neg);
+        packTransform(buffer.transform_.get());
+
         //skip the rest for now, assume no transform, for now pack -1
         // no need to implement this for benchmarking purposes 
         // TODO: implement transform packing
diff --git a/src/utilities/makeshift_serializer.h b/src/utilities/makeshift_serializer.h
index a4eec808f..ecbf98101 100644
--- a/src/utilities/makeshift_serializer.h
+++ b/src/utilities/makeshift_serializer.h
@@ -4,6 +4,8 @@
 #include <vector>
 #include "data/store.h"
 #include "data/scalar.h"
+#include "data/transform.h"
+
 namespace legate {
 
 class Scalar;
@@ -12,11 +14,15 @@ class MakeshiftSerializer{
     
     public:
     MakeshiftSerializer(){
-        size=128;
+        size=512;
         raw.resize(size); 
         write_offset=0;
         read_offset=0;
     }
+    void zero(){
+        //memset ((void*)raw.data(),0,raw.size());
+        write_offset=0;
+    }
 /*
     template <typename T> void pack(T&& arg) 
     {
@@ -32,10 +38,11 @@ class MakeshiftSerializer{
         {
             resize(sizeof(T));
         }
-        for (int i=0; i<sizeof(T); i++)
-        {
-           raw[write_offset+i] = *reinterpret_cast<const int8_t*>((argAddr)+i);
-        }
+        //for (int i=0; i<sizeof(T); i++)
+        //{
+        //   raw[write_offset+i] = *reinterpret_cast<const int8_t*>((argAddr)+i);
+        //}
+        memcpy(raw.data()+write_offset, argAddr, sizeof(T));
         //std::cout<<"reint "<<*reinterpret_cast<T*>(raw.data()+write_offset)<<std::endl;;
         write_offset+=sizeof(T);
         //std::cout<<"    "<<write_offset<<std::endl;
@@ -59,6 +66,8 @@ class MakeshiftSerializer{
     void packScalar(const Scalar& scalar);
 
     void packBuffer(const Store& input);
+
+    void packTransform(const StoreTransform* trans);
     
     template <typename T> T read() 
     {

From e3708bfde139b4c735f2ee1b7cfbac9f3b3703ea Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shivsundram@berkeley.edu>
Date: Sun, 3 Oct 2021 20:30:20 -0700
Subject: [PATCH 08/44] remap region requirement indices in serializer, fix projection comparison

---
 legate/core/launcher.py               |  8 +++---
 legate/core/legion.py                 |  1 +
 legate/core/runtime.py                |  9 ++++---
 src/data/store.h                      |  2 ++
 src/runtime/context.cc                |  3 +++
 src/utilities/deserializer.cc         |  1 -
 src/utilities/makeshift_serializer.cc |  7 +++--
 src/utilities/makeshift_serializer.h  | 39 +++++++++++++++++++++++++++
 8 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/legate/core/launcher.py b/legate/core/launcher.py
index 06d275a47..525163bf3 100644
--- a/legate/core/launcher.py
+++ b/legate/core/launcher.py
@@ -663,10 +663,10 @@ def pack_fusion_metadata(argbuf, is_fused, fusion_metadata):
     def build_task(self, launch_domain, argbuf):
         self._req_analyzer.analyze_requirements()
         #print("building task id", self._task_id)
-        #for req in self._req_analyzer._requirements:
-            #print(req)
-            #print(req[0].__dict__)
-            #print()
+        for req in self._req_analyzer._requirements:
+            print(req)
+            print(req[0].__dict__)
+            print()
         self._out_analyzer.analyze_requirements()
 
         #pack fusion metadata
diff --git a/legate/core/legion.py b/legate/core/legion.py
index b498bfb8e..26e7395f2 100644
--- a/legate/core/legion.py
+++ b/legate/core/legion.py
@@ -3624,6 +3624,7 @@ def launch(self, runtime, context):
         """
         num_outputs = len(self.outputs)
         if num_outputs == 0:
+            return
             return Future(
                 legion.legion_task_launcher_execute(
                     runtime, context, self.launcher
diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index 3a8fa1e9b..35a31a197 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -794,10 +794,11 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
         # for each buffer, check all it's associated transforms/partitions
         # across ops are equivalent 
         for store, matrices in store_to_ops.items():
-            if len(matrices)>1:
-                allEqual = reduce(lambda x,y: x==y, matrices)
-                if not allEqual:
-                    return False
+            if len(matrices)>1: 
+                first = matrices[0]
+                for matrix in matrices:
+                    if not (matrix==first).all():
+                        return False
         return True
 
 
diff --git a/src/data/store.h b/src/data/store.h
index f862c8959..b2c6b7e91 100644
--- a/src/data/store.h
+++ b/src/data/store.h
@@ -138,6 +138,7 @@ class RegionField {
   template <int32_t DIM>
   Legion::Rect<DIM> shape() const;
   Legion::Domain domain() const;
+  unsigned getReqIdx() const {return reqIdx_; }
 
  public:
   bool is_readable() const { return readable_; }
@@ -271,6 +272,7 @@ class Store {
   template <int32_t DIM>
   Legion::Rect<DIM> shape() const;
   Legion::Domain domain() const;
+  unsigned getReqIdx() const {return region_field_.getReqIdx(); }
 
  public:
   bool is_readable() const { return readable_; }
diff --git a/src/runtime/context.cc b/src/runtime/context.cc
index bd1e316c2..038d2f85f 100644
--- a/src/runtime/context.cc
+++ b/src/runtime/context.cc
@@ -20,6 +20,8 @@
 #include "data/store.h"
 #include "runtime/context.h"
 #include "utilities/deserializer.h"
+#include <time.h>
+#include <sys/time.h>
 
 namespace legate {
 
@@ -148,6 +150,7 @@ TaskContext::TaskContext(const Legion::Task* task,
   outputs_    = dez.unpack<std::vector<Store>>();
   reductions_ = dez.unpack<std::vector<Store>>();
   scalars_    = dez.unpack<std::vector<Scalar>>();
+
 }
 
 }  // namespace legate
diff --git a/src/utilities/deserializer.cc b/src/utilities/deserializer.cc
index d4d4c92ca..77f28bf92 100644
--- a/src/utilities/deserializer.cc
+++ b/src/utilities/deserializer.cc
@@ -140,7 +140,6 @@ void Deserializer::_unpack(RegionField& value)
   auto dim = unpack<int32_t>();
   auto idx = unpack<uint32_t>();
   auto fid = unpack<int32_t>();
-
   value = RegionField(dim, regions_[idx], fid, idx);
 }
 
diff --git a/src/utilities/makeshift_serializer.cc b/src/utilities/makeshift_serializer.cc
index b04ee4836..83ed87ce2 100644
--- a/src/utilities/makeshift_serializer.cc
+++ b/src/utilities/makeshift_serializer.cc
@@ -113,7 +113,9 @@ namespace legate{
                 //pack dim
                 pack((int32_t) buffer.region_field_.dim()); 
                 //pack idx (req idx) //need to map regions to idx
-                pack((uint32_t) buffer.region_field_.reqIdx_); 
+                unsigned newID = getNewReqID(buffer.region_field_.reqIdx_);
+                //pack((uint32_t) buffer.region_field_.reqIdx_); 
+                pack((uint32_t) newID); 
                 //pack fid (field id)
                 pack((int32_t) buffer.region_field_.fid_); 
         }
@@ -125,7 +127,8 @@ namespace legate{
                 //pack dim; always 1 in an buffer
                 pack((int32_t) 1); 
                 //pack idx (req idx) //need to map regions to idx
-                pack((uint32_t) buffer.region_field_.reqIdx_); 
+                unsigned newID = getNewReqID(buffer.region_field_.reqIdx_);
+                pack((uint32_t) newID); 
                 //pack fid (field id)
                 pack((int32_t) buffer.region_field_.fid_); 
         }   
diff --git a/src/utilities/makeshift_serializer.h b/src/utilities/makeshift_serializer.h
index ecbf98101..94eefc684 100644
--- a/src/utilities/makeshift_serializer.h
+++ b/src/utilities/makeshift_serializer.h
@@ -5,6 +5,7 @@
 #include "data/store.h"
 #include "data/scalar.h"
 #include "data/transform.h"
+#include <map>
 
 namespace legate {
 
@@ -18,10 +19,14 @@ class MakeshiftSerializer{
         raw.resize(size); 
         write_offset=0;
         read_offset=0;
+        buffer_counter=0;
     }
     void zero(){
         //memset ((void*)raw.data(),0,raw.size());
         write_offset=0;
+        buffer_counter=0;
+        neededReqIds.clear();
+        regionReqIdMap.clear();
     }
 /*
     template <typename T> void pack(T&& arg) 
@@ -103,11 +108,45 @@ class MakeshiftSerializer{
     int buffSize(){
         return write_offset;
     }
+
+    int32_t returnAndIncrCounter(){
+        int32_t old = buffer_counter;
+        buffer_counter++;
+        return old;
+    }
+    
+    //map old reqIdx to new reqIdx
+    void addReqID(int32_t id){
+        //register the region reqID if it hasn't been seen yet for this op
+        if (regionReqIdMap.find(id)==regionReqIdMap.end())
+        {
+            regionReqIdMap.insert(std::pair<int32_t, int32_t>(id, returnAndIncrCounter()));
+            neededReqIds.push_back(id);
+        }
+    }
+
+    int32_t getNewReqID(int32_t oldID)
+    {
+        return regionReqIdMap.find(oldID)->second;
+    }
+
+    std::vector<int32_t> getReqIds (){
+        //could use move semantics here
+        std::vector<int32_t> reqIdsCopy(neededReqIds);
+        return reqIdsCopy;
+    } 
+
     private: 
     size_t size;
     int read_offset;
     int write_offset;
+    int buffer_counter;
     std::vector<int8_t> raw;
+
+    private:
+    std::map<int32_t, int32_t> regionReqIdMap; //maps old reqids to new ones
+    std::vector<int32_t> neededReqIds; //list of old reqIds needed in child op
+
 };
 /*
 int main(){

From 8eca06fb33d068be6be342237541b54550deff69 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@c0001.stanford.edu>
Date: Thu, 14 Oct 2021 15:04:26 -0700
Subject: [PATCH 09/44] partial fusion

---
 install.py                            |   1 +
 legate/core/launcher.py               |   8 +-
 legate/core/legion.py                 |   1 -
 legate/core/runtime.py                | 309 ++++++++++++++++++++------
 legate/core/store.py                  |   5 +
 src/data/store.h                      |   1 +
 src/utilities/deserializer.cc         |   6 +
 src/utilities/makeshift_serializer.cc |  18 +-
 src/utilities/makeshift_serializer.h  |   1 -
 9 files changed, 278 insertions(+), 72 deletions(-)

diff --git a/install.py b/install.py
index ac8ce559e..aa7ee4a0d 100755
--- a/install.py
+++ b/install.py
@@ -887,6 +887,7 @@ def driver():
         "--clean",
         dest="clean_first",
         action=BooleanFlag,
+        #default=False,
         default=False,
         help="Clean before build, and pull latest Legion.",
     )
diff --git a/legate/core/launcher.py b/legate/core/launcher.py
index 525163bf3..094a7b95c 100644
--- a/legate/core/launcher.py
+++ b/legate/core/launcher.py
@@ -663,10 +663,10 @@ def pack_fusion_metadata(argbuf, is_fused, fusion_metadata):
     def build_task(self, launch_domain, argbuf):
         self._req_analyzer.analyze_requirements()
         #print("building task id", self._task_id)
-        for req in self._req_analyzer._requirements:
-            print(req)
-            print(req[0].__dict__)
-            print()
+        #for req in self._req_analyzer._requirements:
+        #    print(req)
+        #    print(req[0].__dict__)
+        #    print()
         self._out_analyzer.analyze_requirements()
 
         #pack fusion metadata
diff --git a/legate/core/legion.py b/legate/core/legion.py
index 26e7395f2..b498bfb8e 100644
--- a/legate/core/legion.py
+++ b/legate/core/legion.py
@@ -3624,7 +3624,6 @@ def launch(self, runtime, context):
         """
         num_outputs = len(self.outputs)
         if num_outputs == 0:
-            return
             return Future(
                 legion.legion_task_launcher_execute(
                     runtime, context, self.launcher
diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index 35a31a197..d609b5af5 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -42,6 +42,17 @@
 from .solver import Partitioner, Strategy
 from .store import RegionField, Store, FusionMetadata
 
+debugPrint = False
+
+#debug printing
+def zprint(*args):
+    return
+if debugPrint:
+    dprint = print
+else:
+    dprint = zprint
+
+
 # A Field holds a reference to a field in a region tree
 # that can be used by many different RegionField objects
 class Field(object):
@@ -521,7 +532,6 @@ def compute_launch_shape(self, store, restrictions):
         for dim, restriction in enumerate(restrictions):
             if restriction != Restriction.RESTRICTED:
                 to_partition += (shape[dim],)
-
         launch_shape = self._compute_launch_shape(to_partition)
         if launch_shape is None:
             return None
@@ -726,6 +736,20 @@ def __init__(self, ops, contexts, runtime):
     def register_constraint(self, fusion_constraint_rule):
         self.constraints.append(fusion_constraint_rule)
 
+    def supress_small_fusions(self, intervals, threshold):
+        #keep intervals of length >= threshold as fusable sub-windows;
+        #split shorter intervals into single-op intervals (i, i+1)
+        final_set = []
+        fusable=False
+        for interval in intervals:
+            if interval[1] - interval[0]  >=threshold:
+                final_set.append(interval)
+                fusable = True
+            else:
+                for i in range(interval[0], interval[1]):
+                    final_set.append((i, i+1))
+        return fusable, final_set
+
     def can_fuse(self):
         must_be_single = any(op._future_output is not None for op in self.ops)
         for op in self.ops:
@@ -735,8 +759,33 @@ def can_fuse(self):
             self.partitioners.append( partitioner )
             strategy = partitioner.partition_stores()
             self.strategies.append(strategy)
+
         results = [constraint.apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) for constraint in self.constraints]
-        return reduce(lambda x,y: x and y, results), self.strategies
+        dprint("fuse results", results)
+        all_fusable = [result[0] for result in results]
+        interval_sets = [result[1] for result in results]
+  
+        #intersect intervals
+        #this is a very, very bad way of doing this,
+        # in the future I'll just "intersect" in place
+        # as we apply constraints
+        curr_set = interval_sets[0]
+        for interval_set in interval_sets[1:]:
+            newset = []
+            for aset in curr_set:
+                for bset in interval_set:
+                    if not (aset[0] > bset[1] or bset[0] > aset[1]): 
+                        news = max(aset[0], bset[0])
+                        newe = min(aset[1], bset[1])
+                        newset.append((news, newe))
+            curr_set=newset
+        fusable,final_set = self.supress_small_fusions(curr_set, self.runtime._fusion_threshold)
+        dprint("curset", curr_set)
+
+        dprint("final_set", final_set)
+        dprint("all fusable", fusable)
+        dprint("intervals", interval_sets)
+        return reduce(lambda x,y: x and y, all_fusable), final_set, self.strategies
 
 class FusionConstraint(object):
     def apply(self, contexts, runtime, ops, partitioners, strategies):
@@ -749,16 +798,106 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
 
 class NumpyContextExists(FusionConstraint):
     def apply(self, contexts, runtime, ops, partitioners, strategies):
-        return "legate.numpy" in contexts
+        if "legate.numpy" in contexts:
+            return True, [(0, len(ops))]
+        else:
+           return False, [(0,0)]
+"""
+  NUMPY_BINARY_OP        = 400000,
+  NUMPY_SCALAR_BINARY_OP = 400002,
+  NUMPY_FILL             = 400003,
+  NUMPY_SCALAR_UNARY_RED = 400004,
+  NUMPY_UNARY_RED        = 400005,
+  NUMPY_UNARY_OP         = 400006,
+  NUMPY_SCALAR_UNARY_OP  = 400007,
+  NUMPY_BINARY_RED       = 400008,
+  NUMPY_CONVERT          = 400010,
+  NUMPY_SCALAR_CONVERT   = 400011,
+  NUMPY_WHERE            = 400012,
+  NUMPY_SCALAR_WHERE     = 400013,
+  NUMPY_READ             = 400014,
+  NUMPY_WRITE            = 400015,
+  NUMPY_DIAG             = 400016,
+  NUMPY_MATMUL           = 400017,
+  NUMPY_MATVECMUL        = 400018,
+  NUMPY_DOT              = 400019,
+  NUMPY_BINCOUNT         = 400020,
+  NUMPY_EYE              = 400021,
+  NUMPY_RAND             = 400022,
+  NUMPY_ARANGE           = 400023,
+  NUMPY_TRANSPOSE        = 400024,
+  NUMPY_TILE             = 400025,
+  NUMPY_NONZERO          = 400026,
+  NUMPY_DOUBLE_BINARY_OP = 400027,
+  NUMPY_FUSED_OP         = 400028,
+"""
+class AllValidOps(FusionConstraint):
+    """
+    Class for fusing only potentially fusable ops.
+    This class performs the first pass of legality filtering
+    """
+    def __init__(self):
+        self.validIDs = set()
 
+        #these ops are always fusable
+        self.validIDs.add(400000) #Binary op
+        self.validIDs.add(400006) #Unary op
 
-class AllBinaryOps(FusionConstraint):
-    """Temporary class for only fusing Binary Ops. 
-       This constrains will be removed"""
-    def apply(self, contexts, runtime, ops, partitioners, strategies):
-        allBinary = reduce(lambda x,y: x and y,[int(op._task_id)==400000 for op in ops])
-        return allBinary
+        # the following are conditionally fusable
+        # they will be processed in the a subsequent level of filtering
+ 
+        # scalar producing ops are valid if the scalars they produce
+        # are NOT consumed by a subsequent op in the window
+        # however they can be printed, which we cannot detect in the runtime
+        # without static analysis, so consider these terminal fusable
+        self.validIDs.add(400004) #Scalar unary red      
+        self.validIDs.add(400005) #Unary red      
+
+        # as all scalars are futures,
+        # so we can just check if both Futures are "ready"
+        # more powerfully, we can also create a dependency tree
+        # of ops, and assuming they're all scalar ops, 
+        # and the "roots" are ready, we can fuse
+        self.validIDs.add(400002) #Scalar Binary op
+        self.validIDs.add(400007) #Scalar Unary op
+        self.validIDs.add(400008) #Scalar binary red     
+
+        #a matmul is valid if it is the last op in the sequence
+        #unless if it followed by a matmul of the exact same size 
+        #so it is terminal fusable
+        #self.validIDs.add(400017) #Matmul
+
+        #vector dot is binary op + scalar producing reduction
+        #it is thus terminal fusable
+        #self.validIDs.add(400019) #dot
 
+    def apply(self, contexts, runtime, ops, partitioners, strategies):
+        results = [int(op._task_id) in self.validIDs for op in ops]
+        fusable_intervals = []
+        start, end =0,0
+        rolling=False
+        while end<len(results):
+            result = results[end]
+            if result:
+                end=end+1
+            else:
+                if start<end:
+                    fusable_intervals.append((start,end))
+                    start=end 
+                    end=start
+                else:
+                    fusable_intervals.append((start, start+1))
+                    start=start+1
+                    end = start
+        if start<end:
+            fusable_intervals.append((start,end))
+        dprint(fusable_intervals)   
+        dprint("allFusableOps", results)
+        fusability_exists = reduce(lambda x,y: x or y,[int(op._task_id) in self.validIDs for op in ops])
+        return (fusability_exists, fusable_intervals)
+
+class ValidScalarProducers(FusionConstraint):
+   """Checks that all scalar-producing ops are terminal ops"""
 
 class IdenticalProjection(FusionConstraint):
     """Fusion rule that only ops with identical
@@ -798,8 +937,8 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
                 first = matrices[0]
                 for matrix in matrices:
                     if not (matrix==first).all():
-                        return False
-        return True
+                        return False, [(0,0)]
+        return True, [(0,len(ops))]
 
 
 class IdenticalLaunchShapes(FusionConstraint):
@@ -809,11 +948,13 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
         launch_shapes = []
         for i in range(len(ops)):
             launch_shapes.append(strategies[i]._launch_shape)
+        dprint(strategies[3].__dict__)
+        dprint('launch shapes', launch_shapes)
         first_shape = launch_shapes[0]
         for launch_shape in launch_shapes:
             if launch_shape!=first_shape:
-                return False
-        return True
+                return False, [(0,0)]
+        return True, [(0,len(ops))]
 
 
    
@@ -859,7 +1000,9 @@ def __init__(self, core_library):
         # to be dispatched. This list allows cross library introspection for
         # Legate operations.
         self._outstanding_ops = []
-        self._window_size =10
+        self._window_size =1
+        self._fusion_threshold =10
+        self._clearing_pipe = False
 
         # Now we initialize managers
         self._attachment_manager = AttachmentManager(self)
@@ -964,28 +1107,31 @@ def dispatch(self, op, redop=None):
             return op.launch(self.legion_runtime, self.legion_context)
 
 
-    def serialize_multiop_metadata(self, numpy_runtime, ops):
+    def serialize_multiop_metadata(self, numpy_context, ops):
         """creates a 'header' for a fused op that denotes metadata
         on each ops inputs, outputs, reductions and scalars
         """
         #generate offset maps for all inputs to serialize metadata
         input_starts, output_starts, offset_starts, offsets= [],[],[],[]
-        reduction_starts, scalar_starts,op_ids = [], [], [] 
+        reduction_starts, scalar_starts, future_starts, op_ids = [], [], [], []
         input_start, output_start, offset_start = 0,0,0
-        reduction_start, scalar_start = 0,0
-
+        reduction_start, scalar_start, future_start = 0,0,0
+ 
         for op in ops:
             input_starts.append(input_start)
             output_starts.append(output_start)
             offset_starts.append(offset_start)
             reduction_starts.append(reduction_start)
             scalar_starts.append(scalar_start)
+            future_starts.append(future_start)
 
             for i,input in enumerate(op._inputs):
                 offsets.append(i+1)
+                if input.kind is Future:
+                    future_start+=1
             for o,output in enumerate(op._outputs):
                 offsets.append(-(o+1)) 
-            op_ids.append(op._task_id._value_)
+            op_ids.append(numpy_context.get_task_id(op._task_id._value_))
 
             offset_start+=(len(op._inputs)+len(op._outputs))
             input_start+=len(op._inputs)
@@ -999,12 +1145,15 @@ def serialize_multiop_metadata(self, numpy_runtime, ops):
         offset_starts.append(offset_start)
         reduction_starts.append(reduction_start)
         scalar_starts.append(scalar_start)
+        future_starts.append(future_start)
 
         #turn metadata maps into deferred arrays
         #then load them into the task as the initial inputs
-        meta_arrs =  (input_starts, output_starts, offset_starts, offsets, reduction_starts, scalar_starts, op_ids)
+        meta_arrs =  (input_starts, output_starts, offset_starts, offsets, reduction_starts,  scalar_starts, 
+                      future_starts, op_ids)
         fusion_metadata = FusionMetadata(*meta_arrs)
 
+        #TODO: remove me
         #inst, oust, offst, offs = map(npo.array, (input_starts, output_starts, offset_starts, offsets))
         #meta_arrs_np =  map(npo.array, meta_arrs)
         #def make_deferred(inst):
@@ -1017,53 +1166,82 @@ def serialize_multiop_metadata(self, numpy_runtime, ops):
     def build_fused_op(self,ops):
         fusion_checker = FusionChecker(ops, self._contexts, self)
         fusion_checker.register_constraint(NumpyContextExists())
-        fusion_checker.register_constraint(AllBinaryOps())
+        fusion_checker.register_constraint(AllValidOps())
         fusion_checker.register_constraint(IdenticalLaunchShapes())
         fusion_checker.register_constraint(IdenticalProjection())
-        can_fuse, partitions = fusion_checker.can_fuse()
-        
+        can_fuse,fusable_sets, partitions = fusion_checker.can_fuse()
+
+        #short circuit         
         if not can_fuse:
+            dprint("CANNOT FUSE!")
             return None
+
+        super_strats = []
+        super_fspaces = []
+        super_strategies = []
+        for fusable_set in fusable_sets:   
+            #create super strategy for this fusable set
+            super_strat = {}
+            super_fspace = {}
+            start,end = fusable_set
+            dprint("creating fusable set for", start, end)
+            for j in range(start,end):
+                super_strat = {**(super_strat.copy()), **partitions[j]._strategy}
+                super_fspace = {**(super_fspace.copy()), **partitions[j]._fspaces}
+            super_strats.append(super_strat)
+            super_fspaces.append(super_fspace)
+            super_strategies.append(Strategy(partitions[start]._launch_shape, super_strat, super_fspace))
+        dprint("lens", len(super_strats), len(super_fspaces), len(super_strategies))
+        """
         super_strat = {}
         super_fspace = {}
         for partition in partitions:
             super_strat = {**(super_strat.copy()), **partition._strategy}  
             super_fspace = {**(super_fspace.copy()), **partition._fspaces}
-
-        super_strategy = Strategy(partitions[0]._launch_shape, super_strat, super_fspace)
+        """
+        #super_strategy = Strategy(partitions[0]._launch_shape, super_strat, super_fspace)
         #hacky way to get numpy context and designated fused task id
         fused_id = self._contexts["legate.numpy"].fused_id
         numpy_context = self._contexts["legate.numpy"]
         numpy_runtime = numpy_context._library.runtime
-        #initialize fused task
-        fused_task = numpy_context.create_task(fused_id)
-        fused_task.strategy = super_strategy
-       
-        #serialize necessary metadata on all encapsulated ops 
-        #this metadata will be fed into the fused op as inputs
-        meta_maps, fusion_metadata = self.serialize_multiop_metadata(numpy_runtime, ops)
-        fused_task.add_fusion_metadata(fusion_metadata) #sets fused_task._is_fused to true
-
-	#add metadata maps to task as inputs
-        #for meta_map in meta_maps:
-        #    fused_task.add_input(meta_map.base)
-        #    fused_task.add_broadcast(meta_map.base)
 
-        #add typical inputs and outputs of all subtasks to fused task
-        for op in ops:
-            for scalar in op._scalar_args:
-                fused_task.add_scalar_arg(scalar[0], ty.int32)
-            for reduction in op._reductions:
-                fused_task.add_reduction(reduction)
-            for input in op._inputs:
-                fused_task.add_input(input)   
-            for output in op._outputs:
-                fused_task.add_output(output)   
-
-        return fused_task
+        new_op_list = []
+        for i,fusable_set in enumerate(fusable_sets):
+            start, end = fusable_set
+            op_subset = ops[start:end]
+            #if nothing to fuse, just use the original op
+            if end-start==1:
+                normal_op = ops[start]
+                normal_op.strategy =  super_strategies[i]
+                new_op_list.append(normal_op)
+            elif end-start > 1:
+                #initialize fused task
+                fused_task = numpy_context.create_task(fused_id)
+                fused_task.strategy = super_strategies[i]
+       
+                #serialize necessary metadata on all encapsulated ops 
+                #this metadata will be fed into the fused op as inputs
+                meta_maps, fusion_metadata = self.serialize_multiop_metadata(numpy_context, op_subset)
+                fused_task.add_fusion_metadata(fusion_metadata) #sets fused_task._is_fused to true
+
+                #add typical inputs and outputs of all subtasks to fused task
+                for op in op_subset:
+                    for scalar in op._scalar_args:
+                        fused_task.add_scalar_arg(scalar[0], ty.int32)
+                    for reduction in op._reductions:
+                        fused_task.add_reduction(reduction)
+                    for input in op._inputs:
+                        fused_task.add_input(input)   
+                    for output in op._outputs:
+                        fused_task.add_output(output)   
+                    for future in op._futures:
+                        fused_task.add_future(future)
+                new_op_list.append(fused_task)
+        dprint("new op list", new_op_list)
+        return new_op_list        
 
     def _launch_outstanding(self):
-        print("launching final outstanding ops")
+        dprint("launching final outstanding ops")
         if len(self._outstanding_ops):
             ops = self._outstanding_ops
             self._outstanding_ops = []
@@ -1072,20 +1250,26 @@ def _launch_outstanding(self):
    
     def _schedule(self, ops, force_eval=False):
         ids = [op._task_id for op in ops]
-
+        dprint("ids", ids)
         #try fusing tasks
         if len(ops)>=2 and (not force_eval):
-            fused_task = self.build_fused_op(ops)
-            if fused_task:
-                fused_task.execute() 
+            fused_task_list = self.build_fused_op(ops)
+            if fused_task_list:
+                dprint("start clearing pipe")
+                self._clearing_pipe = True
+                for task in fused_task_list:
+                    task.execute() 
+                self._clearing_pipe = False
+                dprint("stop clearing pipe")
                 return
 
         #if we cann't fuse op launch them individually
-        #fused tasks already have their strategy
-        if len(ops)==1 and ops[0]._task_id==400028:
+
+        # tasks processed for fusion already have  
+        # their strategy "baked in"
+        if len(ops)==1 and self._clearing_pipe:
             strategy = ops[0].strategy
-            
-        else:
+        else: #else do to the partition
             must_be_single = any(op._future_output is not None for op in ops)
             partitioner = Partitioner(self, ops, must_be_single=must_be_single)
             strategy = partitioner.partition_stores()
@@ -1093,9 +1277,10 @@ def _schedule(self, ops, force_eval=False):
             op.launch(strategy)
 
     def submit(self, op):
-        #always launch a fused op, dont add it to the window
-        #as the encapsulated ops already waited in the window
-        if int(op._task_id)==400028:
+        #always launch ops that have already been processed for fusion;
+        #do not re-add them to the window,
+        #as these ops have already waited in the window
+        if self._clearing_pipe:
             self._schedule([op])
         else:
             self._outstanding_ops.append(op)
diff --git a/legate/core/store.py b/legate/core/store.py
index fafdc941c..3c8220cfc 100644
--- a/legate/core/store.py
+++ b/legate/core/store.py
@@ -765,6 +765,7 @@ def __init__(
                  buffer_offsets,
                  reduction_starts,
                  scalar_starts,
+                 future_starts,
                  opIDs
                  ):
         self._input_starts = input_starts
@@ -773,9 +774,12 @@ def __init__(
         self._buffer_offsets = buffer_offsets
         self._reduction_starts = reduction_starts
         self._scalar_starts = scalar_starts
+        self._future_starts = future_starts
         self._opIDs = opIDs 
 
     def packList(self, meta_list, buf):
+        # aggregate the ints when packing
+        # much faster than individually packing each int
         buf.pack_32bit_int_arr(meta_list)
         #for elem in meta_list: 
         #    buf.pack_32bit_int(elem)
@@ -791,6 +795,7 @@ def pack(self, buf):
         superbuff += self._buffer_offsets
         superbuff += self._reduction_starts
         superbuff += self._scalar_starts
+        superbuff += self._future_starts
         superbuff += self._opIDs
         self.packList(superbuff, buf)
         #self.packList(self._input_starts, buf)
diff --git a/src/data/store.h b/src/data/store.h
index b2c6b7e91..8b93f83cf 100644
--- a/src/data/store.h
+++ b/src/data/store.h
@@ -323,6 +323,7 @@ struct FusionMetadata {
     std::vector<int32_t> offsets; // can contain negative elements
     std::vector<int32_t> reductionStarts;
     std::vector<int32_t> scalarStarts;
+    std::vector<int32_t> futureStarts;
     std::vector<int32_t> opIDs;
 };
 
diff --git a/src/utilities/deserializer.cc b/src/utilities/deserializer.cc
index 77f28bf92..061d4d3cf 100644
--- a/src/utilities/deserializer.cc
+++ b/src/utilities/deserializer.cc
@@ -56,6 +56,7 @@ void Deserializer::_unpack(FusionMetadata& metadata){
     metadata.offsets.resize(nBuffers+1);
     metadata.reductionStarts.resize(nOps+1);
     metadata.scalarStarts.resize(nOps+1);
+    metadata.futureStarts.resize(nOps+1);
     metadata.opIDs.resize(nOps);
     //TODO: wrap this up to reuse code`
     for (int i=0; i<nOps+1; i++)
@@ -82,6 +83,10 @@ void Deserializer::_unpack(FusionMetadata& metadata){
     {
         metadata.scalarStarts[i] = unpack<int32_t>();
     }   
+    for (int i=0; i<nOps+1; i++)
+    {
+        metadata.futureStarts[i] = unpack<int32_t>();
+    }   
     for (int i=0; i<nOps; i++)
     {
         metadata.opIDs[i] = unpack<int32_t>();
@@ -125,6 +130,7 @@ void Deserializer::_unpack(FutureWrapper& value)
   futures_    = futures_.subspan(1);
 
   auto point = unpack<std::vector<int64_t>>();
+  
   Domain domain;
   domain.dim = static_cast<int32_t>(point.size());
   for (int32_t idx = 0; idx < domain.dim; ++idx) {
diff --git a/src/utilities/makeshift_serializer.cc b/src/utilities/makeshift_serializer.cc
index 83ed87ce2..bc5c5a02f 100644
--- a/src/utilities/makeshift_serializer.cc
+++ b/src/utilities/makeshift_serializer.cc
@@ -17,34 +17,42 @@ namespace legate{
         }
         else{
             int32_t code = trans->getTransformCode();
+            pack((int32_t) code);
             switch (code) {
                 case -1: {
+                    break;
+  
                 }
                 case LEGATE_CORE_TRANSFORM_SHIFT: {
                     Shift * shifter = (Shift*) trans;
                     pack((int32_t) shifter->dim_);
                     pack((int64_t) shifter->offset_);
                     packTransform(trans->parent_.get());
+                    break;
                 }
                 case LEGATE_CORE_TRANSFORM_PROMOTE: {
                     Promote * promoter = (Promote*) trans;
                     pack((int32_t) promoter->extra_dim_);
                     pack((int64_t) promoter->dim_size_);
                     packTransform(trans->parent_.get());
+                    break;
                 }
                 case LEGATE_CORE_TRANSFORM_PROJECT: {
                     Project * projector = (Project*) trans;
                     pack((int32_t) projector->dim_);
                     pack((int64_t) projector->coord_);
                     packTransform(trans->parent_.get());
+                    break;
                 }
                 case LEGATE_CORE_TRANSFORM_TRANSPOSE: {
                     Transpose * projector = (Transpose*) trans;
                     packTransform(trans->parent_.get());
+                    break;
                 }
                 case LEGATE_CORE_TRANSFORM_DELINEARIZE: {
                     Delinearize * projector = (Delinearize*) trans;
                     packTransform(trans->parent_.get());
+                    break;
                 }
             }
         }
@@ -97,14 +105,16 @@ namespace legate{
         //pack trasnform code
         packTransform(buffer.transform_.get());
 
-        //skip the rest for now, assume no transform, for now pack -1
-        // no need to implement this for benchmarking purposes 
-        // TODO: implement transform packing
-        // TODO: add "code" to transform object
         //if _isfuture
         if(buffer.is_future_)
         {   
             //pack future_wrapper
+            auto dom = buffer.future_.domain();
+            pack((uint32_t) dom.dim);
+            for (int32_t i =0; i<dom.dim; i++)
+            {
+                pack((int64_t) dom.rect_data[i + dom.dim] + 1);
+            }
         }   
         //elif dim>=0
         else if (buffer.dim()>=0){
diff --git a/src/utilities/makeshift_serializer.h b/src/utilities/makeshift_serializer.h
index 94eefc684..56b409e69 100644
--- a/src/utilities/makeshift_serializer.h
+++ b/src/utilities/makeshift_serializer.h
@@ -38,7 +38,6 @@ class MakeshiftSerializer{
     template <typename T> void pack(T arg) 
     {
         int8_t * argAddr = (int8_t*) &arg;
-        //std::cout<<arg<<std::endl;
         if (size<=write_offset+sizeof(T))
         {
             resize(sizeof(T));

From 303866b39d6aad2135e393e0c03621a8e8301644 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@c0001.stanford.edu>
Date: Thu, 14 Oct 2021 18:47:02 -0700
Subject: [PATCH 10/44] finishing merge

---
 legate.py                             |   2 +-
 legate/core/runtime.py                |  48 ++++++--
 legate/core/solver.py                 |   1 +
 src/core/data/store.h                 |   2 +-
 src/core/data/transform.h             |   5 +-
 src/core/mapping/task.cc              |   1 +
 src/core/mapping/task.h               |   4 +-
 src/core/runtime/context.h            |   2 +-
 src/core/utilities/deserializer.cc    |  62 +++++++++-
 src/core/utilities/deserializer.h     |   2 +-
 src/utilities/makeshift_serializer.cc | 149 -----------------------
 src/utilities/makeshift_serializer.h  | 168 --------------------------
 12 files changed, 108 insertions(+), 338 deletions(-)
 delete mode 100644 src/utilities/makeshift_serializer.cc
 delete mode 100644 src/utilities/makeshift_serializer.h

diff --git a/legate.py b/legate.py
index 95c21ce72..71c8c6699 100755
--- a/legate.py
+++ b/legate.py
@@ -830,7 +830,7 @@ def driver():
         args.cores_per_node,
         args.launcher,
         args.verbose,
-        args.interpreter,
+        #args.interpreter,
         args.gasnet_trace,
         args.eager_alloc,
         args.launcher_extra,
diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index ac5f702c3..409dfadb6 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -43,7 +43,7 @@
 from .solver import Partitioner, Strategy
 from .store import RegionField, Store, FusionMetadata
 
-debugPrint = False
+debugPrint = True
 
 #debug printing
 def zprint(*args):
@@ -703,7 +703,7 @@ def supress_small_fusions(self, intervals, threshold):
         return fusable, final_set
 
     def can_fuse(self):
-        must_be_single = any(op._future_output is not None for op in self.ops)
+        must_be_single = any(len(op.scalar_outputs) > 0 for op in self.ops)
         for op in self.ops:
             # TODO: cache as much as of the partitioner results as possible
             # so the calls to Partitioner() and partition_stores done kill perf
@@ -782,6 +782,32 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
   NUMPY_NONZERO          = 400026,
   NUMPY_DOUBLE_BINARY_OP = 400027,
   NUMPY_FUSED_OP         = 400028,
+enum NumPyOpCode {
+  NUMPY_ARANGE           = 1,
+  NUMPY_BINARY_OP        = 2,
+  NUMPY_BINARY_RED       = 3,
+  NUMPY_BINCOUNT         = 4,
+  NUMPY_CONVERT          = 5,
+  NUMPY_DIAG             = 6,
+  NUMPY_DOT              = 7,
+  NUMPY_EYE              = 8,
+  NUMPY_FILL             = 9,
+  NUMPY_MATMUL           = 10,
+  NUMPY_MATVECMUL        = 11,
+  NUMPY_NONZERO          = 12,
+  NUMPY_RAND             = 13,
+  NUMPY_READ             = 14,
+  NUMPY_SCALAR_UNARY_RED = 15,
+  NUMPY_TILE             = 16,
+  NUMPY_TRANSPOSE        = 17,
+  NUMPY_UNARY_OP         = 18,
+  NUMPY_UNARY_RED        = 19,
+  NUMPY_WHERE            = 20,
+  NUMPY_WRITE            = 21,
+  NUMPY_DOUBLE_BINARY_OP  = 23,
+  NUMPY_FUSED_OP            = 24,
+}
+
 """
 class AllValidOps(FusionConstraint):
     """
@@ -792,8 +818,8 @@ def __init__(self):
         self.validIDs = set()
 
         #these ops are always fusable
-        self.validIDs.add(400000) #Binary op
-        self.validIDs.add(400006) #Unary op
+        self.validIDs.add(2) #Binary op
+        self.validIDs.add(18) #Unary op
 
         # the following are conditionally fusable
         # they will be processed in the a subsequent level of filtering
@@ -952,8 +978,8 @@ def __init__(self, core_library):
         # to be dispatched. This list allows cross library introspection for
         # Legate operations.
         self._outstanding_ops = []
-        self._window_size =1
-        self._fusion_threshold =10
+        self._window_size =10
+        self._fusion_threshold =4
         self._clearing_pipe = False
 
         # Now we initialize managers
@@ -1135,19 +1161,23 @@ def build_fused_op(self,ops):
         super_strats = []
         super_fspaces = []
         super_strategies = []
+        super_keystores = []
         for fusable_set in fusable_sets:   
             #create super strategy for this fusable set
             super_strat = {}
             super_fspace = {}
+            super_keystore = set()
             start,end = fusable_set
             dprint("creating fusable set for", start, end)
             for j in range(start,end):
                 super_strat = {**(super_strat.copy()), **partitions[j]._strategy}
                 super_fspace = {**(super_fspace.copy()), **partitions[j]._fspaces}
+                super_keystore = super_keystore.union(partitions[j]._key_stores)
             super_strats.append(super_strat)
             super_fspaces.append(super_fspace)
-            super_strategies.append(Strategy(partitions[start]._launch_shape, super_strat, super_fspace))
-        dprint("lens", len(super_strats), len(super_fspaces), len(super_strategies))
+            super_keystores.append(super_keystore)
+            super_strategies.append(Strategy(partitions[start]._launch_shape, super_strat, super_fspace, super_keystore))
+        dprint("lens", len(super_strats), len(super_fspaces), len(super_strategies), len(super_keystore))
         """
         super_strat = {}
         super_fspace = {}
@@ -1226,7 +1256,7 @@ def _schedule(self, ops, force_eval=False):
         if len(ops)==1 and self._clearing_pipe:
             strategy = ops[0].strategy
         else: #else do to the partition
-            must_be_single = any(op._future_output is not None for op in ops)
+            must_be_single = any(len(op.scalar_outputs) > 0 for op in ops)
             partitioner = Partitioner(self, ops, must_be_single=must_be_single)
             strategy = partitioner.partition_stores()
         for op in ops:
diff --git a/legate/core/solver.py b/legate/core/solver.py
index d0510e5ec..e9bb18a6b 100644
--- a/legate/core/solver.py
+++ b/legate/core/solver.py
@@ -288,4 +288,5 @@ def partition_stores(self):
         if must_be_1d_launch and color_shape is not None:
             color_shape = Shape((color_shape.volume(),))
 
+        print("key_stores", key_stores)    
         return Strategy(color_shape, partitions, fspaces, key_stores)
diff --git a/src/core/data/store.h b/src/core/data/store.h
index 76e11e599..effa17a8c 100644
--- a/src/core/data/store.h
+++ b/src/core/data/store.h
@@ -273,7 +273,7 @@ class Store {
 
  public:
   int32_t dim() const { return dim_; }
-  bool is_future() const { return is_future_; }
+  bool is_future2() const { return is_future_; }
   LegateTypeCode code() const { return code_; }
 
  public:
diff --git a/src/core/data/transform.h b/src/core/data/transform.h
index 680b03cc5..6c272b735 100644
--- a/src/core/data/transform.h
+++ b/src/core/data/transform.h
@@ -19,7 +19,7 @@
 #include <memory>
 
 #include "legion.h"
-#include "legate_c.h"
+#include "core/legate_c.h"
 
 class MakeshiftSerializer;
 namespace legate {
@@ -85,11 +85,8 @@ class Project : public StoreTransform {
  public:
   virtual Legion::Domain transform(const Legion::Domain& domain) const override;
   virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override;
-<<<<<<< HEAD:src/data/transform.h
   virtual int32_t getTransformCode() const override;
-=======
   virtual void print(std::ostream& out) const override;
->>>>>>> source/branch-21.10:src/core/data/transform.h
 
  private:
   int32_t dim_;
diff --git a/src/core/mapping/task.cc b/src/core/mapping/task.cc
index 1f39a696a..404cc94f4 100644
--- a/src/core/mapping/task.cc
+++ b/src/core/mapping/task.cc
@@ -121,6 +121,7 @@ Task::Task(const LegionTask* task,
   : task_(task), library_(library)
 {
   MapperDeserializer dez(task, runtime, context);
+  fusionMetadata = dez.unpack<FusionMetadata>(); 
   inputs_     = dez.unpack<std::vector<Store>>();
   outputs_    = dez.unpack<std::vector<Store>>();
   reductions_ = dez.unpack<std::vector<Store>>();
diff --git a/src/core/mapping/task.h b/src/core/mapping/task.h
index 69efdc034..cf2533688 100644
--- a/src/core/mapping/task.h
+++ b/src/core/mapping/task.h
@@ -20,6 +20,7 @@
 #include <tuple>
 
 #include "core/data/scalar.h"
+#include "core/data/store.h"
 #include "core/data/transform.h"
 #include "core/runtime/context.h"
 
@@ -176,7 +177,8 @@ class Task {
   const LibraryContext& library_;
   const Legion::Task* task_;
 
- private:
+ public:
+  FusionMetadata fusionMetadata;
   std::vector<Store> inputs_, outputs_, reductions_;
   std::vector<Scalar> scalars_;
 };
diff --git a/src/core/runtime/context.h b/src/core/runtime/context.h
index 92152e769..891452cc9 100644
--- a/src/core/runtime/context.h
+++ b/src/core/runtime/context.h
@@ -17,7 +17,7 @@
 #pragma once
 
 #include "legion.h"
-#include "data/scalar.h"
+#include "core/data/scalar.h"
 
 #include "core/task/return.h"
 
diff --git a/src/core/utilities/deserializer.cc b/src/core/utilities/deserializer.cc
index e784ad4d1..fcc22c57c 100644
--- a/src/core/utilities/deserializer.cc
+++ b/src/core/utilities/deserializer.cc
@@ -40,8 +40,8 @@ TaskDeserializer::TaskDeserializer(const LegionTask* task,
   first_task_ = !task->is_index_space || (task->index_point == task->index_domain.lo());
 }
 
-/*
-void Deserializer::_unpack(FusionMetadata& metadata){
+
+void TaskDeserializer::_unpack(FusionMetadata& metadata){
     metadata.isFused = unpack<bool>();
     if (!metadata.isFused){
         return;
@@ -94,7 +94,7 @@ void Deserializer::_unpack(FusionMetadata& metadata){
         metadata.opIDs[i] = unpack<int32_t>();
     }   
 }
-*/
+
 void TaskDeserializer::_unpack(Store& value)
 {
   auto is_future = unpack<bool>();
@@ -190,6 +190,62 @@ void MapperDeserializer::_unpack(Store& value)
   }
 }
 
+void MapperDeserializer::_unpack(FusionMetadata& metadata){
+    metadata.isFused = unpack<bool>();
+    if (!metadata.isFused){
+        return;
+    }
+    //exit out if this is not a fused op
+    metadata.nOps = unpack<int32_t>();
+    metadata.nBuffers = unpack<int32_t>();
+    int nOps = metadata.nOps;
+    int nBuffers = metadata.nBuffers; 
+
+    metadata.inputStarts.resize(nOps+1);
+    metadata.outputStarts.resize(nOps+1);
+    metadata.offsetStarts.resize(nOps+1);
+    metadata.offsets.resize(nBuffers+1);
+    metadata.reductionStarts.resize(nOps+1);
+    metadata.scalarStarts.resize(nOps+1);
+    metadata.futureStarts.resize(nOps+1);
+    metadata.opIDs.resize(nOps);
+    //TODO: wrap this up to reuse code
+    for (int i=0; i<nOps+1; i++)
+    {
+        metadata.inputStarts[i] = unpack<int32_t>();
+    }   
+    for (int i=0; i<nOps+1; i++)
+    {
+        metadata.outputStarts[i] = unpack<int32_t>();
+    }   
+    for (int i=0; i<nOps+1; i++)
+    {
+        metadata.offsetStarts[i] = unpack<int32_t>();
+    }   
+    for (int i=0; i<nBuffers; i++)
+    {
+        metadata.offsets[i] = unpack<int32_t>();
+    }   
+    for (int i=0; i<nOps+1; i++)
+    {
+        metadata.reductionStarts[i] = unpack<int32_t>();
+    }   
+    for (int i=0; i<nOps+1; i++)
+    {
+        metadata.scalarStarts[i] = unpack<int32_t>();
+    }   
+    for (int i=0; i<nOps+1; i++)
+    {
+        metadata.futureStarts[i] = unpack<int32_t>();
+    }   
+    for (int i=0; i<nOps; i++)
+    {
+        metadata.opIDs[i] = unpack<int32_t>();
+    }   
+}
+
+
+
 void MapperDeserializer::_unpack(FutureWrapper& value)
 {
   // We still need to deserialize these fields to get to the domain
diff --git a/src/core/utilities/deserializer.h b/src/core/utilities/deserializer.h
index 92f9d50dc..df1fe4f47 100644
--- a/src/core/utilities/deserializer.h
+++ b/src/core/utilities/deserializer.h
@@ -30,7 +30,6 @@
 
 namespace legate {
 
-struct FusionMetadata;
 template <typename Deserializer>
 class BaseDeserializer {
  public:
@@ -111,6 +110,7 @@ class MapperDeserializer : public BaseDeserializer<MapperDeserializer> {
   void _unpack(Store& value);
   void _unpack(FutureWrapper& value);
   void _unpack(RegionField& value, bool is_output_region);
+  void _unpack(FusionMetadata& value);
 
  private:
   Legion::Mapping::MapperRuntime* runtime_;
diff --git a/src/utilities/makeshift_serializer.cc b/src/utilities/makeshift_serializer.cc
deleted file mode 100644
index bc5c5a02f..000000000
--- a/src/utilities/makeshift_serializer.cc
+++ /dev/null
@@ -1,149 +0,0 @@
-#include "utilities/makeshift_serializer.h"
-
-namespace legate{
-
-    void MakeshiftSerializer::packScalar(const Scalar& scalar){
-        pack((bool) scalar.is_tuple()); 
-        pack((LegateTypeCode) scalar.code_); 
-        int32_t size = scalar.size();
-        packWithoutType(scalar.data_, size);    
-    }
-
-    void MakeshiftSerializer::packTransform(const StoreTransform* trans){
-
-        if (trans==nullptr){
-            int32_t neg= -1;
-            pack((int32_t) neg);
-        }
-        else{
-            int32_t code = trans->getTransformCode();
-            pack((int32_t) code);
-            switch (code) {
-                case -1: {
-                    break;
-  
-                }
-                case LEGATE_CORE_TRANSFORM_SHIFT: {
-                    Shift * shifter = (Shift*) trans;
-                    pack((int32_t) shifter->dim_);
-                    pack((int64_t) shifter->offset_);
-                    packTransform(trans->parent_.get());
-                    break;
-                }
-                case LEGATE_CORE_TRANSFORM_PROMOTE: {
-                    Promote * promoter = (Promote*) trans;
-                    pack((int32_t) promoter->extra_dim_);
-                    pack((int64_t) promoter->dim_size_);
-                    packTransform(trans->parent_.get());
-                    break;
-                }
-                case LEGATE_CORE_TRANSFORM_PROJECT: {
-                    Project * projector = (Project*) trans;
-                    pack((int32_t) projector->dim_);
-                    pack((int64_t) projector->coord_);
-                    packTransform(trans->parent_.get());
-                    break;
-                }
-                case LEGATE_CORE_TRANSFORM_TRANSPOSE: {
-                    Transpose * projector = (Transpose*) trans;
-                    packTransform(trans->parent_.get());
-                    break;
-                }
-                case LEGATE_CORE_TRANSFORM_DELINEARIZE: {
-                    Delinearize * projector = (Delinearize*) trans;
-                    packTransform(trans->parent_.get());
-                    break;
-                }
-            }
-        }
-    }
-/*
-    case LEGATE_CORE_TRANSFORM_SHIFT: {
-      auto dim    = unpack<int32_t>();
-      auto offset = unpack<int64_t>();
-      auto parent = unpack_transform();
-      return std::make_unique<Shift>(dim, offset, std::move(parent));
-    }
-    case LEGATE_CORE_TRANSFORM_PROMOTE: {
-      auto extra_dim = unpack<int32_t>();
-      auto dim_size  = unpack<int64_t>();
-      auto parent    = unpack_transform();
-      return std::make_unique<Promote>(extra_dim, dim_size, std::move(parent));
-    }
-    case LEGATE_CORE_TRANSFORM_PROJECT: {
-      auto dim    = unpack<int32_t>();
-      auto coord  = unpack<int64_t>();
-      auto parent = unpack_transform();
-      return std::make_unique<Project>(dim, coord, std::move(parent));
-    }
-    case LEGATE_CORE_TRANSFORM_TRANSPOSE: {
-      auto axes   = unpack<std::vector<int32_t>>();
-      auto parent = unpack_transform();
-      return std::make_unique<Transpose>(std::move(axes), std::move(parent));
-    }
-    case LEGATE_CORE_TRANSFORM_DELINEARIZE: {
-      auto dim    = unpack<int32_t>();
-      auto sizes  = unpack<std::vector<int64_t>>();
-      auto parent = unpack_transform();
-      return std::make_unique<Delinearize>(dim, std::move(sizes), std::move(parent));
-    }
-
-    def _serialize_transform(self, buf):
-        if self._parent is not None:
-            self._transform.serialize(buf)
-            self._parent._serialize_transform(buf)
-        else:
-            buf.pack_32bit_int(-1)
-*/
-    void MakeshiftSerializer::packBuffer(const Store& buffer)
-    {
-        pack((bool) buffer.is_future()); //is_future
-        pack((int32_t) buffer.dim());
-        //int32_t code = buffer.code();
-        pack((int32_t)  buffer.code());
-        //pack transform:
-        //pack trasnform code
-        packTransform(buffer.transform_.get());
-
-        //if _isfuture
-        if(buffer.is_future_)
-        {   
-            //pack future_wrapper
-            auto dom = buffer.future_.domain();
-            pack((uint32_t) dom.dim);
-            for (int32_t i =0; i<dom.dim; i++)
-            {
-                pack((int64_t) dom.rect_data[i + dom.dim] + 1);
-            }
-        }   
-        //elif dim>=0
-        else if (buffer.dim()>=0){
-            pack((int32_t) buffer.redop_id_);
-            //pack reigon field
-                //pack dim
-                pack((int32_t) buffer.region_field_.dim()); 
-                //pack idx (req idx) //need to map regions to idx
-                unsigned newID = getNewReqID(buffer.region_field_.reqIdx_);
-                //pack((uint32_t) buffer.region_field_.reqIdx_); 
-                pack((uint32_t) newID); 
-                //pack fid (field id)
-                pack((int32_t) buffer.region_field_.fid_); 
-        }
-        else
-        {   
-            //pack redop_id
-            pack((int32_t) buffer.redop_id_);
-            //pack reigon field
-                //pack dim; always 1 in an buffer
-                pack((int32_t) 1); 
-                //pack idx (req idx) //need to map regions to idx
-                unsigned newID = getNewReqID(buffer.region_field_.reqIdx_);
-                pack((uint32_t) newID); 
-                //pack fid (field id)
-                pack((int32_t) buffer.region_field_.fid_); 
-        }   
-   }
-
-
-
-}
diff --git a/src/utilities/makeshift_serializer.h b/src/utilities/makeshift_serializer.h
deleted file mode 100644
index 56b409e69..000000000
--- a/src/utilities/makeshift_serializer.h
+++ /dev/null
@@ -1,168 +0,0 @@
-
-#pragma once
-#include <iostream>
-#include <vector>
-#include "data/store.h"
-#include "data/scalar.h"
-#include "data/transform.h"
-#include <map>
-
-namespace legate {
-
-class Scalar;
-class Store;
-class MakeshiftSerializer{
-    
-    public:
-    MakeshiftSerializer(){
-        size=512;
-        raw.resize(size); 
-        write_offset=0;
-        read_offset=0;
-        buffer_counter=0;
-    }
-    void zero(){
-        //memset ((void*)raw.data(),0,raw.size());
-        write_offset=0;
-        buffer_counter=0;
-        neededReqIds.clear();
-        regionReqIdMap.clear();
-    }
-/*
-    template <typename T> void pack(T&& arg) 
-    {
-        T copy = arg;
-        pack(copy); //call l-value version
-    }
-*/
-    template <typename T> void pack(T arg) 
-    {
-        int8_t * argAddr = (int8_t*) &arg;
-        if (size<=write_offset+sizeof(T))
-        {
-            resize(sizeof(T));
-        }
-        //for (int i=0; i<sizeof(T); i++)
-        //{
-        //   raw[write_offset+i] = *reinterpret_cast<const int8_t*>((argAddr)+i);
-        //}
-        memcpy(raw.data()+write_offset, argAddr, sizeof(T));
-        //std::cout<<"reint "<<*reinterpret_cast<T*>(raw.data()+write_offset)<<std::endl;;
-        write_offset+=sizeof(T);
-        //std::cout<<"    "<<write_offset<<std::endl;
-    }
- 
-    void packWithoutType(const void* arg, int argSize) 
-    {
-        const int8_t* argByte =(int8_t*) arg;
-        //std::cout<<"data of size: "<<argSize<<std::endl;
-        if (size<=write_offset+argSize)
-        {
-            resize(argSize);
-        }
-        for (int i=0; i<argSize; i++){
-            raw[write_offset+i] = *reinterpret_cast<const int8_t*>(argByte+i);
-        }
-        write_offset+=argSize;
-        //std::cout<<"    "<<write_offset<<std::endl;
-    }
-
-    void packScalar(const Scalar& scalar);
-
-    void packBuffer(const Store& input);
-
-    void packTransform(const StoreTransform* trans);
-    
-    template <typename T> T read() 
-    {
-        if (read_offset<write_offset)
-        {
-            T datum = *reinterpret_cast<T*>(raw.data()+read_offset);
-            read_offset+=sizeof(T);
-            return datum;
-        }
-        else{
-            std::cout<<"finished reading buffer"<<std::endl;
-            return NULL;
-        }
-    }
-
-    void resize(size_t argSize){
-        while(size<=write_offset+argSize)
-        {
-            //std::cout<<"resizing from "<<size<<" to "<<2*size<<std::endl; 
-            size=2*size;
-            raw.resize(size);
-        }
-    }
-
-    void reset_reader(){
-        read_offset=0;
-    }
-
-    int8_t* ptr(){
-        return raw.data();
-    }
-
-    int buffSize(){
-        return write_offset;
-    }
-
-    int32_t returnAndIncrCounter(){
-        int32_t old = buffer_counter;
-        buffer_counter++;
-        return old;
-    }
-    
-    //map old reqIdx to new reqIdx
-    void addReqID(int32_t id){
-        //register the region reqID if it hasn't been seen yet for this op
-        if (regionReqIdMap.find(id)==regionReqIdMap.end())
-        {
-            regionReqIdMap.insert(std::pair<int32_t, int32_t>(id, returnAndIncrCounter()));
-            neededReqIds.push_back(id);
-        }
-    }
-
-    int32_t getNewReqID(int32_t oldID)
-    {
-        return regionReqIdMap.find(oldID)->second;
-    }
-
-    std::vector<int32_t> getReqIds (){
-        //could use move semantics here
-        std::vector<int32_t> reqIdsCopy(neededReqIds);
-        return reqIdsCopy;
-    } 
-
-    private: 
-    size_t size;
-    int read_offset;
-    int write_offset;
-    int buffer_counter;
-    std::vector<int8_t> raw;
-
-    private:
-    std::map<int32_t, int32_t> regionReqIdMap; //maps old reqids to new ones
-    std::vector<int32_t> neededReqIds; //list of old reqIds needed in child op
-
-};
-/*
-int main(){
-    MakeshiftSerializer ms;
-    int a=3; 
-    char g='a'; 
-    ms.pack<int>(a);
-    ms.pack<char>(g);
-    ms.pack<int>(a);
-    ms.pack<char>(g);
-    std::cout<<ms.read<int>()<<std::endl;;
-    std::cout<<ms.read<char>()<<std::endl;;
-    std::cout<<ms.read<int>()<<std::endl;;
-    std::cout<<ms.read<char>()<<std::endl;;
-    std::cout<<ms.read<int>()<<std::endl;;
-    ms.reset_reader();
-    std::cout<<ms.read<int>()<<std::endl;;
-     
-}*/
-}

From e8b902112ee781fdd34ed81751488ef1f98eda90 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@c0001.stanford.edu>
Date: Thu, 14 Oct 2021 19:35:25 -0700
Subject: [PATCH 11/44] fix future stuff

---
 legate/core/runtime.py             | 4 +++-
 src/core/data/store.h              | 2 ++
 src/core/utilities/deserializer.cc | 1 -
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index 409dfadb6..9040db7aa 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -1209,7 +1209,7 @@ def build_fused_op(self,ops):
                 #this metadata will be fed into the fused op as inputs
                 meta_maps, fusion_metadata = self.serialize_multiop_metadata(numpy_context, op_subset)
                 fused_task.add_fusion_metadata(fusion_metadata) #sets fused_task._is_fused to true
-
+                
                 #add typical inputs and outputs of all subtasks to fused task
                 for op in op_subset:
                     for scalar in op._scalar_args:
@@ -1222,6 +1222,8 @@ def build_fused_op(self,ops):
                         fused_task.add_output(output)   
                     for future in op._futures:
                         fused_task.add_future(future)
+                print(fused_task)
+                print(fused_task.__dict__)
                 new_op_list.append(fused_task)
         dprint("new op list", new_op_list)
         return new_op_list        
diff --git a/src/core/data/store.h b/src/core/data/store.h
index effa17a8c..eef2896d4 100644
--- a/src/core/data/store.h
+++ b/src/core/data/store.h
@@ -245,6 +245,8 @@ class FutureWrapper {
  private:
   mutable bool uninitialized_{true};
   mutable void* rawptr_{nullptr};
+
+ friend class MakeshiftSerializer;
 };
 
 class Store {
diff --git a/src/core/utilities/deserializer.cc b/src/core/utilities/deserializer.cc
index fcc22c57c..3aa297811 100644
--- a/src/core/utilities/deserializer.cc
+++ b/src/core/utilities/deserializer.cc
@@ -137,7 +137,6 @@ void TaskDeserializer::_unpack(FutureWrapper& value)
     future   = futures_[0];
     futures_ = futures_.subspan(1);
   }
-
   value = FutureWrapper(read_only, field_size, domain, future, has_storage && first_task_);
 }
 

From 823808320791e7733135ef019672b59db590f50d Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@c0001.stanford.edu>
Date: Thu, 14 Oct 2021 23:48:33 -0700
Subject: [PATCH 12/44] re add serializer, fix horrible merge bug

---
 legate.py                                  |   1 +
 legate/core/operation.py                   |   4 -
 legate/core/runtime.py                     |  18 +--
 legate/core/solver.py                      |   1 -
 src/core/utilities/makeshift_serializer.cc | 155 +++++++++++++++++++
 src/core/utilities/makeshift_serializer.h  | 168 +++++++++++++++++++++
 6 files changed, 332 insertions(+), 15 deletions(-)
 create mode 100644 src/core/utilities/makeshift_serializer.cc
 create mode 100644 src/core/utilities/makeshift_serializer.h

diff --git a/legate.py b/legate.py
index 5e053b637..e4197d3a3 100755
--- a/legate.py
+++ b/legate.py
@@ -829,6 +829,7 @@ def driver():
         args.not_control_replicable,
         args.cores_per_node,
         args.launcher,
+        args.verbose,
         args.gasnet_trace,
         args.eager_alloc,
         args.launcher_extra,
diff --git a/legate/core/operation.py b/legate/core/operation.py
index 612519636..7b32c4f11 100644
--- a/legate/core/operation.py
+++ b/legate/core/operation.py
@@ -98,10 +98,6 @@ def add_temp(self, store):
         self._check_store(store)
         self._temps.append(store) #this may not be necessary
 
-    def add_output(self, store):
-        self._check_store(store)
-        self._outputs.append(store)
-
     def add_reduction(self, store, redop):
         self._check_store(store)
         if store.scalar:
diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index 9040db7aa..a30893a3b 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -43,7 +43,7 @@
 from .solver import Partitioner, Strategy
 from .store import RegionField, Store, FusionMetadata
 
-debugPrint = True
+debugPrint = False
 
 #debug printing
 def zprint(*args):
@@ -1222,18 +1222,16 @@ def build_fused_op(self,ops):
                         fused_task.add_output(output)   
                     for future in op._futures:
                         fused_task.add_future(future)
-                print(fused_task)
-                print(fused_task.__dict__)
                 new_op_list.append(fused_task)
         dprint("new op list", new_op_list)
         return new_op_list        
 
     def _launch_outstanding(self):
-        dprint("launching final outstanding ops")
+        print("launching final outstanding ops")
         if len(self._outstanding_ops):
             ops = self._outstanding_ops
             self._outstanding_ops = []
-            self._schedule(ops, force_eval=True)
+            #self._schedule(ops, force_eval=True)
                
    
     def _schedule(self, ops, force_eval=False):
@@ -1258,11 +1256,11 @@ def _schedule(self, ops, force_eval=False):
         if len(ops)==1 and self._clearing_pipe:
             strategy = ops[0].strategy
         else: #else do to the partition
-            must_be_single = any(len(op.scalar_outputs) > 0 for op in ops)
-            partitioner = Partitioner(self, ops, must_be_single=must_be_single)
-            strategy = partitioner.partition_stores()
-        for op in ops:
-            op.launch(strategy)
+            for op in ops:
+                must_be_single = any(len(op.scalar_outputs) > 0 for op in [op])
+                partitioner = Partitioner(self, [op], must_be_single=must_be_single)
+                strategy = partitioner.partition_stores()
+                op.launch(strategy)
 
     def submit(self, op):
         #always launch ops that've been processed for fusion
diff --git a/legate/core/solver.py b/legate/core/solver.py
index e9bb18a6b..d0510e5ec 100644
--- a/legate/core/solver.py
+++ b/legate/core/solver.py
@@ -288,5 +288,4 @@ def partition_stores(self):
         if must_be_1d_launch and color_shape is not None:
             color_shape = Shape((color_shape.volume(),))
 
-        print("key_stores", key_stores)    
         return Strategy(color_shape, partitions, fspaces, key_stores)
diff --git a/src/core/utilities/makeshift_serializer.cc b/src/core/utilities/makeshift_serializer.cc
new file mode 100644
index 000000000..8b0e47dc0
--- /dev/null
+++ b/src/core/utilities/makeshift_serializer.cc
@@ -0,0 +1,155 @@
+#include "core/utilities/makeshift_serializer.h"
+
+namespace legate{
+
+    void MakeshiftSerializer::packScalar(const Scalar& scalar){
+        pack((bool) scalar.is_tuple()); 
+        pack((LegateTypeCode) scalar.code_); 
+        int32_t size = scalar.size();
+        packWithoutType(scalar.data_, size);    
+    }
+
+    void MakeshiftSerializer::packTransform(const StoreTransform* trans){
+
+        if (trans==nullptr){
+            int32_t neg= -1;
+            pack((int32_t) neg);
+        }
+        else{
+            int32_t code = trans->getTransformCode();
+            pack((int32_t) code);
+            switch (code) {
+                case -1: {
+                    break;
+  
+                }
+                case LEGATE_CORE_TRANSFORM_SHIFT: {
+                    Shift * shifter = (Shift*) trans;
+                    pack((int32_t) shifter->dim_);
+                    pack((int64_t) shifter->offset_);
+                    packTransform(trans->parent_.get());
+                    break;
+                }
+                case LEGATE_CORE_TRANSFORM_PROMOTE: {
+                    Promote * promoter = (Promote*) trans;
+                    pack((int32_t) promoter->extra_dim_);
+                    pack((int64_t) promoter->dim_size_);
+                    packTransform(trans->parent_.get());
+                    break;
+                }
+                case LEGATE_CORE_TRANSFORM_PROJECT: {
+                    Project * projector = (Project*) trans;
+                    pack((int32_t) projector->dim_);
+                    pack((int64_t) projector->coord_);
+                    packTransform(trans->parent_.get());
+                    break;
+                }
+                case LEGATE_CORE_TRANSFORM_TRANSPOSE: {
+                    Transpose * projector = (Transpose*) trans;
+                    packTransform(trans->parent_.get());
+                    break;
+                }
+                case LEGATE_CORE_TRANSFORM_DELINEARIZE: {
+                    Delinearize * projector = (Delinearize*) trans;
+                    packTransform(trans->parent_.get());
+                    break;
+                }
+            }
+        }
+    }
+/*
+    case LEGATE_CORE_TRANSFORM_SHIFT: {
+      auto dim    = unpack<int32_t>();
+      auto offset = unpack<int64_t>();
+      auto parent = unpack_transform();
+      return std::make_unique<Shift>(dim, offset, std::move(parent));
+    }
+    case LEGATE_CORE_TRANSFORM_PROMOTE: {
+      auto extra_dim = unpack<int32_t>();
+      auto dim_size  = unpack<int64_t>();
+      auto parent    = unpack_transform();
+      return std::make_unique<Promote>(extra_dim, dim_size, std::move(parent));
+    }
+    case LEGATE_CORE_TRANSFORM_PROJECT: {
+      auto dim    = unpack<int32_t>();
+      auto coord  = unpack<int64_t>();
+      auto parent = unpack_transform();
+      return std::make_unique<Project>(dim, coord, std::move(parent));
+    }
+    case LEGATE_CORE_TRANSFORM_TRANSPOSE: {
+      auto axes   = unpack<std::vector<int32_t>>();
+      auto parent = unpack_transform();
+      return std::make_unique<Transpose>(std::move(axes), std::move(parent));
+    }
+    case LEGATE_CORE_TRANSFORM_DELINEARIZE: {
+      auto dim    = unpack<int32_t>();
+      auto sizes  = unpack<std::vector<int64_t>>();
+      auto parent = unpack_transform();
+      return std::make_unique<Delinearize>(dim, std::move(sizes), std::move(parent));
+    }
+
+    def _serialize_transform(self, buf):
+        if self._parent is not None:
+            self._transform.serialize(buf)
+            self._parent._serialize_transform(buf)
+        else:
+            buf.pack_32bit_int(-1)
+*/
+    void MakeshiftSerializer::packBuffer(const Store& buffer)
+    {
+        pack((bool) buffer.is_future2()); //is_future
+        pack((int32_t) buffer.dim());
+        //int32_t code = buffer.code();
+        pack((int32_t)  buffer.code());
+        //pack transform:
+        //pack transform code
+        packTransform(buffer.transform_.get());
+
+        //if _isfuture
+        if(buffer.is_future_)
+        {   
+
+            //pack future_wrapper
+            pack((bool) buffer.future_.read_only_);
+
+            pack((bool) !buffer.future_.uninitialized_);
+
+            pack((int32_t) buffer.future_.field_size_);
+            auto dom = buffer.future_.domain();
+            pack((uint32_t) dom.dim);
+            for (int32_t i =0; i<dom.dim; i++)
+            {
+                pack((int64_t) dom.rect_data[i + dom.dim] + 1);
+            }
+        }   
+        //elif dim>=0
+        else if (buffer.dim()>=0){
+            pack((int32_t) buffer.redop_id_);
+            //pack region field
+                //pack dim
+                pack((int32_t) buffer.region_field_.dim()); 
+                //pack idx (req idx) //need to map regions to idx
+                unsigned newID = getNewReqID(buffer.region_field_.reqIdx_);
+                //pack((uint32_t) buffer.region_field_.reqIdx_); 
+                pack((uint32_t) newID); 
+                //pack fid (field id)
+                pack((int32_t) buffer.region_field_.fid_); 
+        }
+        else
+        {   
+            //pack redop_id
+            pack((int32_t) buffer.redop_id_);
+            //pack region field
+                //pack dim; always 1 in a buffer
+                pack((int32_t) 1); 
+                //pack idx (req idx) //need to map regions to idx
+                unsigned newID = getNewReqID(buffer.region_field_.reqIdx_);
+                pack((uint32_t) newID); 
+                //pack fid (field id)
+                pack((int32_t) buffer.region_field_.fid_); 
+        }   
+   }
+
+
+
+}
diff --git a/src/core/utilities/makeshift_serializer.h b/src/core/utilities/makeshift_serializer.h
new file mode 100644
index 000000000..81a85b2f2
--- /dev/null
+++ b/src/core/utilities/makeshift_serializer.h
@@ -0,0 +1,168 @@
+
+#pragma once
+#include <iostream>
+#include <vector>
+#include "core/data/store.h"
+#include "core/data/scalar.h"
+#include "core/data/transform.h"
+#include <map>
+
+namespace legate {
+
+class Scalar;
+class Store;
+class MakeshiftSerializer{
+    
+    public:
+    MakeshiftSerializer(){
+        size=512;
+        raw.resize(size); 
+        write_offset=0;
+        read_offset=0;
+        buffer_counter=0;
+    }
+    void zero(){
+        //memset ((void*)raw.data(),0,raw.size());
+        write_offset=0;
+        buffer_counter=0;
+        neededReqIds.clear();
+        regionReqIdMap.clear();
+    }
+/*
+    template <typename T> void pack(T&& arg) 
+    {
+        T copy = arg;
+        pack(copy); //call l-value version
+    }
+*/
+    template <typename T> void pack(T arg) 
+    {
+        int8_t * argAddr = (int8_t*) &arg;
+        if (size<=write_offset+sizeof(T))
+        {
+            resize(sizeof(T));
+        }
+        //for (int i=0; i<sizeof(T); i++)
+        //{
+        //   raw[write_offset+i] = *reinterpret_cast<const int8_t*>((argAddr)+i);
+        //}
+        memcpy(raw.data()+write_offset, argAddr, sizeof(T));
+        //std::cout<<"reint "<<*reinterpret_cast<T*>(raw.data()+write_offset)<<std::endl;;
+        write_offset+=sizeof(T);
+        //std::cout<<"    "<<write_offset<<std::endl;
+    }
+ 
+    void packWithoutType(const void* arg, int argSize) 
+    {
+        const int8_t* argByte =(int8_t*) arg;
+        //std::cout<<"data of size: "<<argSize<<std::endl;
+        if (size<=write_offset+argSize)
+        {
+            resize(argSize);
+        }
+        for (int i=0; i<argSize; i++){
+            raw[write_offset+i] = *reinterpret_cast<const int8_t*>(argByte+i);
+        }
+        write_offset+=argSize;
+        //std::cout<<"    "<<write_offset<<std::endl;
+    }
+
+    void packScalar(const Scalar& scalar);
+
+    void packBuffer(const Store& input);
+
+    void packTransform(const StoreTransform* trans);
+    
+    template <typename T> T read() 
+    {
+        if (read_offset<write_offset)
+        {
+            T datum = *reinterpret_cast<T*>(raw.data()+read_offset);
+            read_offset+=sizeof(T);
+            return datum;
+        }
+        else{
+            std::cout<<"finished reading buffer"<<std::endl;
+            return NULL;
+        }
+    }
+
+    void resize(size_t argSize){
+        while(size<=write_offset+argSize)
+        {
+            //std::cout<<"resizing from "<<size<<" to "<<2*size<<std::endl; 
+            size=2*size;
+            raw.resize(size);
+        }
+    }
+
+    void reset_reader(){
+        read_offset=0;
+    }
+
+    int8_t* ptr(){
+        return raw.data();
+    }
+
+    int buffSize(){
+        return write_offset;
+    }
+
+    int32_t returnAndIncrCounter(){
+        int32_t old = buffer_counter;
+        buffer_counter++;
+        return old;
+    }
+    
+    //map old reqIdx to new reqIdx
+    void addReqID(int32_t id){
+        //register the region reqID if it hasn't been seen yet for this op
+        if (regionReqIdMap.find(id)==regionReqIdMap.end())
+        {
+            regionReqIdMap.insert(std::pair<int32_t, int32_t>(id, returnAndIncrCounter()));
+            neededReqIds.push_back(id);
+        }
+    }
+
+    int32_t getNewReqID(int32_t oldID)
+    {
+        return regionReqIdMap.find(oldID)->second;
+    }
+
+    std::vector<int32_t> getReqIds (){
+        //could use move semantics here
+        std::vector<int32_t> reqIdsCopy(neededReqIds);
+        return reqIdsCopy;
+    } 
+
+    private: 
+    size_t size;
+    int read_offset;
+    int write_offset;
+    int buffer_counter;
+    std::vector<int8_t> raw;
+
+    private:
+    std::map<int32_t, int32_t> regionReqIdMap; //maps old reqids to new ones
+    std::vector<int32_t> neededReqIds; //list of old reqIds needed in child op
+
+};
+/*
+int main(){
+    MakeshiftSerializer ms;
+    int a=3; 
+    char g='a'; 
+    ms.pack<int>(a);
+    ms.pack<char>(g);
+    ms.pack<int>(a);
+    ms.pack<char>(g);
+    std::cout<<ms.read<int>()<<std::endl;;
+    std::cout<<ms.read<char>()<<std::endl;;
+    std::cout<<ms.read<int>()<<std::endl;;
+    std::cout<<ms.read<char>()<<std::endl;;
+    std::cout<<ms.read<int>()<<std::endl;;
+    ms.reset_reader();
+    std::cout<<ms.read<int>()<<std::endl;;
+     
+}*/
+}

From 46856c7895b22fb472881bdf9110d41157e7d5e8 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@c0001.stanford.edu>
Date: Fri, 22 Oct 2021 15:06:40 -0700
Subject: [PATCH 13/44] debugging crap

---
 legate/core/launcher.py                    |   3 +-
 legate/core/partition.py                   |   5 +-
 legate/core/runtime.py                     | 231 +++++++++++++--------
 legate/core/store.py                       |  30 +++
 src/core/utilities/makeshift_serializer.cc |   1 +
 5 files changed, 179 insertions(+), 91 deletions(-)

diff --git a/legate/core/launcher.py b/legate/core/launcher.py
index ef6f3d306..78924acfe 100644
--- a/legate/core/launcher.py
+++ b/legate/core/launcher.py
@@ -570,12 +570,13 @@ def add_store(self, args, store, proj, perm, tag, flags):
         if store.kind is Future:
             if store.has_storage:
                 self.add_future(store.storage)
-            elif perm == Permission.READ or perm == Permission.REDUCTION:
+            elif (perm == Permission.READ or perm == Permission.REDUCTION):
                 raise RuntimeError(
                     "Read access to an uninitialized store is disallowed"
                 )
             read_only = perm == Permission.READ
             args.append(FutureStoreArg(store, read_only, store.has_storage))
+            #args.append(FutureStoreArg(store, perm, store.has_storage))
 
         else:
             region = store.storage.region
diff --git a/legate/core/partition.py b/legate/core/partition.py
index 073772166..b7a517713 100644
--- a/legate/core/partition.py
+++ b/legate/core/partition.py
@@ -165,11 +165,12 @@ def construct(self, region, complete=False):
             transform = Transform(tile_shape.ndim, tile_shape.ndim)
             for idx, size in enumerate(tile_shape):
                 transform.trans[idx, idx] = size
-
+            print(self)
+            print("ndim" , tile_shape.ndim, "offset", self._offset, "tile_shape", self._tile_shape)
             lo = Shape((0,) * tile_shape.ndim) + self._offset
             hi = self._tile_shape - 1 + self._offset
-
             extent = Rect(hi, lo, exclusive=False)
+            print("extent", extent)
 
             color_space = self._runtime.find_or_create_index_space(
                 self.color_shape
diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index a30893a3b..d48ee6092 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -43,15 +43,20 @@
 from .solver import Partitioner, Strategy
 from .store import RegionField, Store, FusionMetadata
 
-debugPrint = False
+debugPrint = True
+futureBugPrint = True
 
-#debug printing
 def zprint(*args):
     return
 if debugPrint:
-    dprint = print
+    drint = print
 else:
-    dprint = zprint
+    drint = zprint
+
+if futureBugPrint:
+    frint = print
+else:
+    frint = zprint
 
 
 # A Field holds a reference to a field in a region tree
@@ -709,11 +714,15 @@ def can_fuse(self):
             # so the calls to Partitioner() and partition_stores done kill perf
             partitioner = Partitioner(self.runtime, [op], must_be_single=must_be_single)
             self.partitioners.append( partitioner )
+            print(op._inputs)
+            import pdb; pdb.set_trace()
             strategy = partitioner.partition_stores()
+            if len(op.inputs)>1:
+                proj = strategy.get_projection(op._inputs[1])
             self.strategies.append(strategy)
 
         results = [constraint.apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) for constraint in self.constraints]
-        dprint("fuse results", results)
+        drint("fuse results", results)
         all_fusable = [result[0] for result in results]
         interval_sets = [result[1] for result in results]
   
@@ -732,12 +741,13 @@ def can_fuse(self):
                         newset.append((news, newe))
             curr_set=newset
         fusable,final_set = self.supress_small_fusions(curr_set, self.runtime._fusion_threshold)
-        dprint("curset", curr_set)
+        drint("curset", curr_set)
 
-        dprint("final_set", final_set)
-        dprint("all fusable", fusable)
-        dprint("intervals", interval_sets)
-        return reduce(lambda x,y: x and y, all_fusable), final_set, self.strategies
+        drint("final_set", final_set)
+        drint("all fusable", fusable)
+        drint("intervals", interval_sets)
+        #return reduce(lambda x,y: x and y, all_fusable), final_set, self.strategies
+        return fusable, final_set, self.strategies
 
 class FusionConstraint(object):
     def apply(self, contexts, runtime, ops, partitioners, strategies):
@@ -755,33 +765,6 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
         else:
            return False, [(0,0)]
 """
-  NUMPY_BINARY_OP        = 400000,
-  NUMPY_SCALAR_BINARY_OP = 400002,
-  NUMPY_FILL             = 400003,
-  NUMPY_SCALAR_UNARY_RED = 400004,
-  NUMPY_UNARY_RED        = 400005,
-  NUMPY_UNARY_OP         = 400006,
-  NUMPY_SCALAR_UNARY_OP  = 400007,
-  NUMPY_BINARY_RED       = 400008,
-  NUMPY_CONVERT          = 400010,
-  NUMPY_SCALAR_CONVERT   = 400011,
-  NUMPY_WHERE            = 400012,
-  NUMPY_SCALAR_WHERE     = 400013,
-  NUMPY_READ             = 400014,
-  NUMPY_WRITE            = 400015,
-  NUMPY_DIAG             = 400016,
-  NUMPY_MATMUL           = 400017,
-  NUMPY_MATVECMUL        = 400018,
-  NUMPY_DOT              = 400019,
-  NUMPY_BINCOUNT         = 400020,
-  NUMPY_EYE              = 400021,
-  NUMPY_RAND             = 400022,
-  NUMPY_ARANGE           = 400023,
-  NUMPY_TRANSPOSE        = 400024,
-  NUMPY_TILE             = 400025,
-  NUMPY_NONZERO          = 400026,
-  NUMPY_DOUBLE_BINARY_OP = 400027,
-  NUMPY_FUSED_OP         = 400028,
 enum NumPyOpCode {
   NUMPY_ARANGE           = 1,
   NUMPY_BINARY_OP        = 2,
@@ -819,7 +802,9 @@ def __init__(self):
 
         #these ops are always fusable
         self.validIDs.add(2) #Binary op
-        self.validIDs.add(18) #Unary op
+        #self.validIDs.add(5) #convert op
+        #self.validIDs.add(18) #Unary op
+        #self.validIDs.add(9) #Fill op
 
         # the following are conditionally fusable
         # they will be processed in the a subsequent level of filtering
@@ -828,17 +813,17 @@ def __init__(self):
         # are NOT consumed by a subsequent op in the window
         # however they can be printed, which we cannot detect in the runtime
         # without static analysis, so consider these terminal fusable
-        self.validIDs.add(400004) #Scalar unary red      
-        self.validIDs.add(400005) #Unary red      
+        #self.validIDs.add(400004) #Scalar unary red      
+        #self.validIDs.add(400005) #Unary red      
 
         # as all scalars are futures,
         # so we can just check if both Futures are "ready"
         # more powerfully, we can also create a dependency tree
         # of ops, and assuming they're all scalar ops, 
         # and the "roots" are ready, we can fuse
-        self.validIDs.add(400002) #Scalar Binary op
-        self.validIDs.add(400007) #Scalar Unary op
-        self.validIDs.add(400008) #Scalar binary red     
+        #self.validIDs.add(400002) #Scalar Binary op
+        #self.validIDs.add(400007) #Scalar Unary op
+        #self.validIDs.add(400008) #Scalar binary red     
 
         #a matmul is valid if it is the last op in the sequence
         #unless if it followed by a matmul of the exact same size 
@@ -851,9 +836,9 @@ def __init__(self):
 
     def apply(self, contexts, runtime, ops, partitioners, strategies):
         results = [int(op._task_id) in self.validIDs for op in ops]
+        drint("valids", results)
         fusable_intervals = []
         start, end =0,0
-        rolling=False
         while end<len(results):
             result = results[end]
             if result:
@@ -869,8 +854,8 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
                     end = start
         if start<end:
             fusable_intervals.append((start,end))
-        dprint(fusable_intervals)   
-        dprint("allFusableOps", results)
+        drint(fusable_intervals)   
+        drint("allFusableOps", results)
         fusability_exists = reduce(lambda x,y: x or y,[int(op._task_id) in self.validIDs for op in ops])
         return (fusability_exists, fusable_intervals)
 
@@ -880,24 +865,34 @@ class ValidScalarProducers(FusionConstraint):
 class IdenticalProjection(FusionConstraint):
     """Fusion rule that only ops with identical
        projection functors can be fused"""
-    def apply(self, contexts, runtime, ops, partitioners, strategies):
 
+    def apply(self, contexts, runtime, ops, partitioners, strategies):
+        linkset = {}
         store_to_ops = {}
+        base_window = [0, len(ops)]
+        
         for i, op in enumerate(ops):
             bufferSet = {}
- 
             # find the set union of input and output buffers for the op
             for input in op._inputs:
                 if input not in bufferSet:
+                    print(input)
                     proj = strategies[i].get_projection(input)
                     if hasattr(proj, 'part'):
                         bufferSet[input]=proj
-
+                        if input not in linkset:
+                            linkset[input] = [i]
+                        else:
+                            linkset[input].append(i)
             for output in op._outputs:
                 if output not in bufferSet:
                     proj = strategies[i].get_projection(output)
                     if hasattr(proj, 'part'):
                         bufferSet[output]=proj
+                        if output not in linkset:
+                            linkset[output] = [i]
+                        else:
+                            linkset[output].append(i)
 
             # for each op in the union, record its associated transform
             for buffer in bufferSet.keys():
@@ -907,18 +902,28 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
                     store_to_ops[buffer] = [matrix]
                 else:
                     store_to_ops[buffer].append(matrix)
-
+        
+        for keys, vals in linkset.items():
+            print("key", keys)
+            print("\t val, ", vals)
+            
+ 
+        print()
         # for each buffer, check all it's associated transforms/partitions
         # across ops are equivalent 
         for store, matrices in store_to_ops.items():
             if len(matrices)>1: 
                 first = matrices[0]
+                print(store, matrices)
                 for matrix in matrices:
                     if not (matrix==first).all():
-                        return False, [(0,0)]
+                        indices = linkset[store]
+                        print("must split", indices)
+                        return True, [(0,indices[1]), (indices[1],len(ops))]
         return True, [(0,len(ops))]
 
 
+
 class IdenticalLaunchShapes(FusionConstraint):
     """Fusion rule that only ops with identical
        launch shapes can be fused"""
@@ -926,12 +931,11 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
         launch_shapes = []
         for i in range(len(ops)):
             launch_shapes.append(strategies[i]._launch_shape)
-        dprint(strategies[3].__dict__)
-        dprint('launch shapes', launch_shapes)
+        drint('launch shapes', launch_shapes)
         first_shape = launch_shapes[0]
         for launch_shape in launch_shapes:
             if launch_shape!=first_shape:
-                return False, [(0,0)]
+                return True, [(0,1),(1,len(ops))]
         return True, [(0,len(ops))]
 
 
@@ -979,7 +983,7 @@ def __init__(self, core_library):
         # Legate operations.
         self._outstanding_ops = []
         self._window_size =10
-        self._fusion_threshold =4
+        self._fusion_threshold =2
         self._clearing_pipe = False
 
         # Now we initialize managers
@@ -1134,18 +1138,13 @@ def serialize_multiop_metadata(self, numpy_context, ops):
         meta_arrs =  (input_starts, output_starts, offset_starts, offsets, reduction_starts,  scalar_starts, 
                       future_starts, op_ids)
         fusion_metadata = FusionMetadata(*meta_arrs)
-
-        #TODO: remove me
-        #inst, oust, offst, offs = map(npo.array, (input_starts, output_starts, offset_starts, offsets))
-        #meta_arrs_np =  map(npo.array, meta_arrs)
-        #def make_deferred(inst):
-        #    return numpy_runtime.find_or_create_array_thunk(inst, stacklevel=0, defer=True) 
-        #meta_maps = map(make_deferred, meta_arrs_np)
         meta_maps=None
         return meta_maps, fusion_metadata
    
 
     def build_fused_op(self,ops):
+        for i in range(len(ops)):
+            self.propogateFuture(ops[i])
         fusion_checker = FusionChecker(ops, self._contexts, self)
         fusion_checker.register_constraint(NumpyContextExists())
         fusion_checker.register_constraint(AllValidOps())
@@ -1155,8 +1154,8 @@ def build_fused_op(self,ops):
 
         #short circuit         
         if not can_fuse:
-            dprint("CANNOT FUSE!")
-            return None
+            drint("CANNOT FUSE!")
+            return False, partitions
 
         super_strats = []
         super_fspaces = []
@@ -1168,7 +1167,7 @@ def build_fused_op(self,ops):
             super_fspace = {}
             super_keystore = set()
             start,end = fusable_set
-            dprint("creating fusable set for", start, end)
+            drint("creating fusable set for", start, end)
             for j in range(start,end):
                 super_strat = {**(super_strat.copy()), **partitions[j]._strategy}
                 super_fspace = {**(super_fspace.copy()), **partitions[j]._fspaces}
@@ -1177,7 +1176,7 @@ def build_fused_op(self,ops):
             super_fspaces.append(super_fspace)
             super_keystores.append(super_keystore)
             super_strategies.append(Strategy(partitions[start]._launch_shape, super_strat, super_fspace, super_keystore))
-        dprint("lens", len(super_strats), len(super_fspaces), len(super_strategies), len(super_keystore))
+        drint("lens", len(super_strats), len(super_fspaces), len(super_strategies), len(super_keystore))
         """
         super_strat = {}
         super_fspace = {}
@@ -1211,61 +1210,117 @@ def build_fused_op(self,ops):
                 fused_task.add_fusion_metadata(fusion_metadata) #sets fused_task._is_fused to true
                 
                 #add typical inputs and outputs of all subtasks to fused task
-                for op in op_subset:
+                for j,op in enumerate(op_subset):
                     for scalar in op._scalar_args:
                         fused_task.add_scalar_arg(scalar[0], ty.int32)
                     for reduction in op._reductions:
                         fused_task.add_reduction(reduction)
+
                     for input in op._inputs:
+                        #if input._storage is None:
+                        if i==1:
+                            frint("building fused in", i,j,op._task_id, input)
                         fused_task.add_input(input)   
                     for output in op._outputs:
+                        if i==1:
+                            frint("building fused out", i,j,op._task_id, output)
                         fused_task.add_output(output)   
+                    self.propogateFuture(op)
                     for future in op._futures:
                         fused_task.add_future(future)
                 new_op_list.append(fused_task)
-        dprint("new op list", new_op_list)
-        return new_op_list        
+        return new_op_list, True       
 
-    def _launch_outstanding(self):
-        print("launching final outstanding ops")
+    def _launch_outstanding(self, force_eval=True):
+        print("launching final outstanding ops", [op._task_id for op in self._outstanding_ops])
         if len(self._outstanding_ops):
             ops = self._outstanding_ops
             self._outstanding_ops = []
-            #self._schedule(ops, force_eval=True)
-               
+            self._schedule(ops, force_eval)
+ 
+    def _launch_one(self):
+        if len(self._outstanding_ops):
+            op = self._outstanding_ops[0]
+            self._outstanding_ops = self._outstanding_ops[1:]
+            self._schedule([op], force_eval=True)
+
+    def propogateFuture(self,op):
+        return
+        for input in op._inputs:
+            start = input
+            if input._storage is None: 
+                print("needs healing", input, op._task_id)
+                while start._storage is None and start._parent:
+                    start=start._parent
+                input._storage = start._storage
+                
+     
    
     def _schedule(self, ops, force_eval=False):
         ids = [op._task_id for op in ops]
-        dprint("ids", ids)
-        #try fusing tasks
+        print(force_eval, "ids", ids)
+        #case 1: try fusing current window of tasks
+        #if partially or fully fusable, 
+        #schedule the new set of tasks
+        strats = False
         if len(ops)>=2 and (not force_eval):
-            fused_task_list = self.build_fused_op(ops)
+            fused_task_list,strats = self.build_fused_op(ops)
             if fused_task_list:
-                dprint("start clearing pipe")
+                frint("created fused list", [op._task_id for op in fused_task_list])
+                drint("start clearing pipe")
                 self._clearing_pipe = True
                 for task in fused_task_list:
                     task.execute() 
                 self._clearing_pipe = False
-                dprint("stop clearing pipe")
+                drint("stop clearing pipe")
                 return
 
-        #if we cann't fuse op launch them individually
-
-        # tasks processed for fusion already have  
-        # their strategy "baked in"
+        # case 2: tasks  processed for fusion already have  
+        # their strategy "baked in", as we already partitioned
+        # them when testing fusion legality (in case 1)
         if len(ops)==1 and self._clearing_pipe:
             strategy = ops[0].strategy
-        else: #else do to the partition
-            for op in ops:
-                must_be_single = any(len(op.scalar_outputs) > 0 for op in [op])
-                partitioner = Partitioner(self, [op], must_be_single=must_be_single)
-                strategy = partitioner.partition_stores()
-                op.launch(strategy)
+            for input in ops[0]._inputs:
+                #if input._storage is None:
+                frint("launch fused input", ops[0]._task_id, input)
+                proj = ops[0].strategy.get_projection(input)
+                if hasattr(proj, 'part'):
+                   frint("strat", proj.part.index_partition.functor.transform.trans)
+                   frint("strat1", proj.part.index_partition.functor.__dict__)
+            self.propogateFuture(ops[0])
+
+            for output in ops[0]._outputs:
+                #if output._storage is None:
+                frint("launch used output", ops[0]._task_id, output)
+            ops[0].launch(strategy)
+
+        # case 3: execute the ops normally 
+        # if we already checked the ops for fusability,
+        # then the ops' buffers have already been partitioned
+        else:
+            if not strats: #ops were not check for fusability, so partition them
+                for op in ops:
+                    must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op])
+                    partitioner = Partitioner(self, [op], must_be_single=must_be_single)
+                    strategy = partitioner.partition_stores()
+                    op.strategy = strategy
+            else: #strategies already calculated during failed attempt to fuse 
+                for i,op in enumerate(ops):
+                    op.strategy = strats[i]
+            for i,op in enumerate(ops):
+                for input in op._inputs:
+                    print("in", input)
+                    if input._storage is None:
+                        frint("launch ufused input", op._task_id, input)
+                self.propogateFuture(op)
+                op.launch(op.strategy)
+
 
     def submit(self, op):
         #always launch ops that've been processed for fusion
         #do not re-add to the window
         #as the these ops already waited in the window
+        #print(op.__dict__)
         if self._clearing_pipe:
             self._schedule([op])
         else:
diff --git a/legate/core/store.py b/legate/core/store.py
index 7140a0f70..c5dceca29 100644
--- a/legate/core/store.py
+++ b/legate/core/store.py
@@ -423,6 +423,13 @@ def __init__(
 
     @property
     def shape(self):
+        if self._shape is None:
+            # If someone wants to access the shape of an unbound
+            # store before it is set, that means the producer task is
+            # sitting in the queue, so we should flush the queue.
+            self._runtime._launch_outstanding(False)
+            # At this point, we should have the shape set.
+            assert self._shape is not None
         return self._shape
 
     @property
@@ -462,6 +469,28 @@ def storage(self):
         Store. These will have exactly the type specified by `.kind`.
         """
         if self._storage is None:
+            print("store none, launching", [op._task_id for op in self._runtime._outstanding_ops])
+            self._runtime._launch_outstanding(False)
+            """
+            if self._kind ==Future:
+                print("future")
+            while(self._storage is None and len(self._runtime._outstanding_ops)):
+                print("launch_one")
+                print([op._task_id for op in self._runtime._outstanding_ops])
+                #self._runtime._launch_outstanding()
+                self._runtime._launch_one()
+            """ 
+            """
+            if True:
+                import pdb; pdb.set_trace()
+                start = self
+                while start._storage is None and start._parent:
+                    start=start._parent
+                if start._storage:
+                    self._storage = start._storage
+                else:
+                    self._runtime._launch_outstanding()
+            """
             if self.unbound:
                 raise RuntimeError(
                     "Storage of a variable size store cannot be retrieved "
@@ -471,6 +500,7 @@ def storage(self):
             #       if necessary
             if self._parent is None:
                 if self._kind is Future:
+                    print("supressing in store.py")
                     raise ValueError(
                         "Illegal to access the storage of an uninitialized "
                         "Legate store of volume 1 with scalar optimization"
diff --git a/src/core/utilities/makeshift_serializer.cc b/src/core/utilities/makeshift_serializer.cc
index 8b0e47dc0..7dacd29ce 100644
--- a/src/core/utilities/makeshift_serializer.cc
+++ b/src/core/utilities/makeshift_serializer.cc
@@ -119,6 +119,7 @@ namespace legate{
             pack((uint32_t) dom.dim);
             for (int32_t i =0; i<dom.dim; i++)
             {
+                std::cout<<"packing "<<i<<" "<<dom.rect_data[i + dom.dim]+1<<std::endl;
                 pack((int64_t) dom.rect_data[i + dom.dim] + 1);
             }
         }   

From db65e43fe0a22d9a2f82edc509b94100909a11eb Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@c0002.stanford.edu>
Date: Mon, 25 Oct 2021 14:34:46 -0700
Subject: [PATCH 14/44] op registry working

---
 legate/core/constraints.py                 |   4 +-
 legate/core/partition.py                   |   3 -
 legate/core/runtime.py                     | 151 +++++++++++++--------
 legate/core/solver.py                      |   9 +-
 legate/core/store.py                       |   5 +-
 src/core/runtime/context.cc                |  11 ++
 src/core/runtime/context.h                 |  14 +-
 src/core/runtime/runtime.cc                |   5 +
 src/core/runtime/runtime.h                 |   7 +-
 src/core/task/task.cc                      |   6 +
 src/core/task/task.h                       |   3 +
 src/core/utilities/makeshift_serializer.cc |  10 +-
 12 files changed, 156 insertions(+), 72 deletions(-)

diff --git a/legate/core/constraints.py b/legate/core/constraints.py
index cd81306e5..93f89bd36 100644
--- a/legate/core/constraints.py
+++ b/legate/core/constraints.py
@@ -56,7 +56,7 @@ def reduce(self):
 
 class PartSym(Expr):
     def __init__(self, op, store, id, disjoint, complete):
-        self._op = op
+        #self._op = op
         self._store = store
         self._id = id
         self._disjoint = disjoint
@@ -73,9 +73,11 @@ def closed(self):
     def __repr__(self):
         disj = "D" if self._disjoint else "A"
         comp = "C" if self._complete else "I"
+        return f"X{self._id}({disj},{comp})"
         return f"X{self._id}({disj},{comp})@{self._op.get_name()}"
 
     def __hash__(self):
+        return hash(self._id)
         return hash((self._op, self._id))
 
     def subst(self, mapping):
diff --git a/legate/core/partition.py b/legate/core/partition.py
index 51d9be6d6..25463054c 100644
--- a/legate/core/partition.py
+++ b/legate/core/partition.py
@@ -180,12 +180,9 @@ def construct(self, region, complete=False):
             transform = Transform(tile_shape.ndim, tile_shape.ndim)
             for idx, size in enumerate(tile_shape):
                 transform.trans[idx, idx] = size
-            print(self)
-            print("ndim" , tile_shape.ndim, "offset", self._offset, "tile_shape", self._tile_shape)
             lo = Shape((0,) * tile_shape.ndim) + self._offset
             hi = self._tile_shape - 1 + self._offset
             extent = Rect(hi, lo, exclusive=False)
-            print("extent", extent)
 
             color_space = self._runtime.find_or_create_index_space(
                 self.color_shape
diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index 0328da914..639bb9f98 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -42,9 +42,9 @@
 from .shape import Shape
 from .solver import Partitioner, Strategy
 from .store import RegionField, Store, FusionMetadata
-
-debugPrint = True
-futureBugPrint = True
+import numpy as np
+debugPrint = False
+futureBugPrint = False
 
 def zprint(*args):
     return
@@ -708,17 +708,20 @@ def supress_small_fusions(self, intervals, threshold):
         return fusable, final_set
 
     def can_fuse(self):
-        must_be_single = any(len(op.scalar_outputs) > 0 for op in self.ops)
         for op in self.ops:
-            # TODO: cache as much as of the partitioner results as possible
-            # so the calls to Partitioner() and partition_stores done kill perf
+            must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op])
             partitioner = Partitioner(self.runtime, [op], must_be_single=must_be_single)
             self.partitioners.append( partitioner )
-            print(op._inputs)
-            import pdb; pdb.set_trace()
             strategy = partitioner.partition_stores()
+            for output, part, in zip(op._outputs, op._output_parts):
+                partition = strategy.get_partition(part)
+                output.set_key_partition(partition)
+                key_part = partition
+                for input in op._inputs:
+                    if input.shape==output.shape:
+                        input.set_key_partition(key_part)
             if len(op.inputs)>1:
-                proj = strategy.get_projection(op._inputs[1])
+                proj = strategy.get_projection(op._input_parts[1])
             self.strategies.append(strategy)
 
         results = [constraint.apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) for constraint in self.constraints]
@@ -803,8 +806,9 @@ def __init__(self):
         #these ops are always fusable
         self.validIDs.add(2) #Binary op
         #self.validIDs.add(5) #convert op
-        #self.validIDs.add(18) #Unary op
-        #self.validIDs.add(9) #Fill op
+        self.validIDs.add(18) #Unary op
+        self.validIDs.add(9) #Fill op
+        self.validIDs.add(14) #Fill op
 
         # the following are conditionally fusable
         # they will be processed in the a subsequent level of filtering
@@ -871,54 +875,65 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
         store_to_ops = {}
         base_window = [0, len(ops)]
         
-        for i, op in enumerate(ops):
+        intervals = []
+        start=0
+        end = len(ops)
+        i=0
+        #for i, op in enumerate(ops):
+        while i<end:
+            op=ops[i]
             bufferSet = {}
             # find the set union of input and output buffers for the op
-            for input in op._inputs:
+            for input, part in zip(op._inputs, op._input_parts):
                 if input not in bufferSet:
-                    print(input)
-                    proj = strategies[i].get_projection(input)
+                    proj = strategies[i].get_projection(part)
                     if hasattr(proj, 'part'):
+                        #bufferSet[input]=proj
                         bufferSet[input]=proj
                         if input not in linkset:
                             linkset[input] = [i]
                         else:
                             linkset[input].append(i)
-            for output in op._outputs:
+            for output, part in zip(op._outputs, op._output_parts):
                 if output not in bufferSet:
-                    proj = strategies[i].get_projection(output)
+                    proj = strategies[i].get_projection(part)
                     if hasattr(proj, 'part'):
                         bufferSet[output]=proj
                         if output not in linkset:
                             linkset[output] = [i]
                         else:
                             linkset[output].append(i)
-
+            if i==0: #we only iterate from i==1 onwards
+                i+=1
+                continue
             # for each op in the union, record its associated transform
             for buffer in bufferSet.keys():
                 proj = bufferSet[buffer]
                 matrix = proj.part.index_partition.functor.transform.trans
                 if buffer not in store_to_ops:
-                    store_to_ops[buffer] = [matrix]
-                else:
-                    store_to_ops[buffer].append(matrix)
-        
-        for keys, vals in linkset.items():
-            print("key", keys)
-            print("\t val, ", vals)
-            
- 
-        print()
+                    store_to_ops[buffer] = matrix
+                else: #we see a new projection for the same buffer
+                    if not np.array_equal(matrix, store_to_ops[buffer]):
+                        intervals.append((start, i))
+                        start=i
+                        i=start+1
+                        store_to_ops={}
+                        continue
+            i+=1
+        if start<end:
+            intervals.append((start,end))
+        return True, intervals
+         
+        #TODO: remove me    
         # for each buffer, check all it's associated transforms/partitions
         # across ops are equivalent 
+        seperators = []
         for store, matrices in store_to_ops.items():
             if len(matrices)>1: 
                 first = matrices[0]
-                print(store, matrices)
                 for matrix in matrices:
                     if not (matrix==first).all():
                         indices = linkset[store]
-                        print("must split", indices)
                         return True, [(0,indices[1]), (indices[1],len(ops))]
         return True, [(0,len(ops))]
 
@@ -931,11 +946,29 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
         launch_shapes = []
         for i in range(len(ops)):
             launch_shapes.append(strategies[i]._launch_shape)
-        drint('launch shapes', launch_shapes)
+        #print(launch_shapes)
+        """
         first_shape = launch_shapes[0]
         for launch_shape in launch_shapes:
             if launch_shape!=first_shape:
                 return True, [(0,1),(1,len(ops))]
+        """
+        intervals =[]
+        i=1
+        start=0
+        end = len(launch_shapes)
+        while i<end:
+            leftNone = launch_shapes[i] is None and (launch_shapes[i-1] is not None)
+            rightNone = launch_shapes[i-1] is None and (launch_shapes[i] is not None)
+            if leftNone or rightNone or launch_shapes[i]!=launch_shapes[i-1]:
+                intervals.append((start, i))
+                start=i
+                i=start+1
+            else:
+                i+=1
+        if start<end:
+            intervals.append((start, end))
+        return True, intervals
         return True, [(0,len(ops))]
 
 
@@ -985,9 +1018,9 @@ def __init__(self, core_library):
         self._window_size =10
         self._fusion_threshold =2
         self._clearing_pipe = False
-        self._window_size = self._core_context.get_tunable(
-            legion.LEGATE_CORE_TUNABLE_WINDOW_SIZE,
-            ty.uint32,
+        #self._window_size = self._core_context.get_tunable(
+        #    legion.LEGATE_CORE_TUNABLE_WINDOW_SIZE,
+        #    ty.uint32,
 
         # Now we initialize managers
         self._attachment_manager = AttachmentManager(self)
@@ -1178,7 +1211,7 @@ def build_fused_op(self,ops):
             for j in range(start,end):
                 super_strat = {**(super_strat.copy()), **partitions[j]._strategy}
                 super_fspace = {**(super_fspace.copy()), **partitions[j]._fspaces}
-                super_keystore = super_keystore.union(partitions[j]._key_stores)
+                super_keystore = super_keystore.union(partitions[j]._key_parts)
             super_strats.append(super_strat)
             super_fspaces.append(super_fspace)
             super_keystores.append(super_keystore)
@@ -1196,7 +1229,7 @@ def build_fused_op(self,ops):
         fused_id = self._contexts["legate.numpy"].fused_id
         numpy_context = self._contexts["legate.numpy"]
         numpy_runtime = numpy_context._library.runtime
-
+        z=0
         new_op_list = []
         for i,fusable_set in enumerate(fusable_sets):
             start, end = fusable_set
@@ -1209,14 +1242,13 @@ def build_fused_op(self,ops):
             elif end-start > 1:
                 #initialize fused task
                 fused_task = numpy_context.create_task(fused_id)
-                fused_task.strategy = super_strategies[i]
-       
-                #serialize necessary metadata on all encapsulated ops 
+
+               #serialize necessary metadata on all encapsulated ops 
                 #this metadata will be fed into the fused op as inputs
                 meta_maps, fusion_metadata = self.serialize_multiop_metadata(numpy_context, op_subset)
                 fused_task.add_fusion_metadata(fusion_metadata) #sets fused_task._is_fused to true
-                
                 #add typical inputs and outputs of all subtasks to fused task
+                key_part = None
                 for j,op in enumerate(op_subset):
                     for scalar in op._scalar_args:
                         fused_task.add_scalar_arg(scalar[0], ty.int32)
@@ -1225,21 +1257,27 @@ def build_fused_op(self,ops):
 
                     for input in op._inputs:
                         #if input._storage is None:
-                        if i==1:
-                            frint("building fused in", i,j,op._task_id, input)
                         fused_task.add_input(input)   
-                    for output in op._outputs:
-                        if i==1:
-                            frint("building fused out", i,j,op._task_id, output)
+                    for output,part in zip(op._outputs, op._output_parts):
                         fused_task.add_output(output)   
-                    self.propogateFuture(op)
+                        if key_part==None:
+                            key_part = partitions[z].get_partition(part)
+                          
+                    self.propogateFuture(fused_task)
                     for future in op._futures:
                         fused_task.add_future(future)
+                    z+=1
                 new_op_list.append(fused_task)
+        for i,fused_task in enumerate(new_op_list):
+            must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [fused_task])
+            partitioner = Partitioner(self, [fused_task], must_be_single=must_be_single)
+            strategy = partitioner.partition_stores()
+            #fused_task.strategy = super_strategies[i]
+            fused_task.strategy = strategy
+
         return new_op_list, True       
 
     def _launch_outstanding(self, force_eval=True):
-        print("launching final outstanding ops", [op._task_id for op in self._outstanding_ops])
         if len(self._outstanding_ops):
             ops = self._outstanding_ops
             self._outstanding_ops = []
@@ -1252,11 +1290,10 @@ def _launch_one(self):
             self._schedule([op], force_eval=True)
 
     def propogateFuture(self,op):
-        return
+        return 
         for input in op._inputs:
             start = input
-            if input._storage is None: 
-                print("needs healing", input, op._task_id)
+            if input._kind is Future and input._storage is None: 
                 while start._storage is None and start._parent:
                     start=start._parent
                 input._storage = start._storage
@@ -1270,6 +1307,11 @@ def _schedule(self, ops, force_eval=False):
         #if partially or fully fusable, 
         #schedule the new set of tasks
         strats = False
+        #for op in ops:
+        #     must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op])
+        #     partitioner = Partitioner(self, [op], must_be_single=must_be_single)
+        #     strategy = partitioner.partition_stores()
+
         if len(ops)>=2 and (not force_eval):
             fused_task_list,strats = self.build_fused_op(ops)
             if fused_task_list:
@@ -1287,13 +1329,9 @@ def _schedule(self, ops, force_eval=False):
         # them when testing fusion legality (in case 1)
         if len(ops)==1 and self._clearing_pipe:
             strategy = ops[0].strategy
-            for input in ops[0]._inputs:
-                #if input._storage is None:
+            for input, part in zip(ops[0]._inputs, ops[0]._input_parts):
                 frint("launch fused input", ops[0]._task_id, input)
-                proj = ops[0].strategy.get_projection(input)
-                if hasattr(proj, 'part'):
-                   frint("strat", proj.part.index_partition.functor.transform.trans)
-                   frint("strat1", proj.part.index_partition.functor.__dict__)
+                proj = ops[0].strategy.get_projection(part)
             self.propogateFuture(ops[0])
 
             for output in ops[0]._outputs:
@@ -1316,7 +1354,6 @@ def _schedule(self, ops, force_eval=False):
                     op.strategy = strats[i]
             for i,op in enumerate(ops):
                 for input in op._inputs:
-                    print("in", input)
                     if input._storage is None:
                         frint("launch ufused input", op._task_id, input)
                 self.propogateFuture(op)
diff --git a/legate/core/solver.py b/legate/core/solver.py
index d539999d0..77628df1a 100644
--- a/legate/core/solver.py
+++ b/legate/core/solver.py
@@ -271,6 +271,7 @@ def cost(unknown):
             store = unknown._store
             return (
                 store.comm_volume(),
+                store._key_partition is None,
                 not store.has_key_partition(all_restrictions[unknown]),
             )
 
@@ -278,6 +279,7 @@ def cost(unknown):
 
         key_parts = set()
         prev_part = None
+        #import pdb; pdb.set_trace()
         for unknown in unknowns:
             if unknown in partitions:
                 continue
@@ -290,7 +292,10 @@ def cost(unknown):
             if isinstance(prev_part, NoPartition):
                 partition = prev_part
             else:
-                partition = store.compute_key_partition(restrictions)
+                if store._key_partition is not None:
+                    partition=store._key_partition
+                else:
+                    partition = store.compute_key_partition(restrictions)
                 key_parts.add(unknown)
 
             cls = constraints.find(unknown)
@@ -298,7 +303,7 @@ def cost(unknown):
                 if to_align in partitions:
                     continue
                 partitions[to_align] = partition
-
+                #print("ptype", to_align, (partition))
             prev_part = partition
 
         for lhs, rhs in dependent.items():
diff --git a/legate/core/store.py b/legate/core/store.py
index b8ed049c8..ce097db41 100644
--- a/legate/core/store.py
+++ b/legate/core/store.py
@@ -472,10 +472,9 @@ def storage(self):
         # If someone is trying to retreive the storage of a store,
         # we need to execute outstanding operations so that we know
         # it has been initialized correctly.
-        self._runtime.flush_scheduling_window()
+        self._runtime._launch_outstanding(False)
         if self._storage is None:
-            print("store none, launching", [op._task_id for op in self._runtime._outstanding_ops])
-            self._runtime._launch_outstanding(False)
+            #print("store none, launching", [op._task_id for op in self._runtime._outstanding_ops])
             """
             if self._kind ==Future:
                 print("future")
diff --git a/src/core/runtime/context.cc b/src/core/runtime/context.cc
index 79bfbf6a4..4884c0110 100644
--- a/src/core/runtime/context.cc
+++ b/src/core/runtime/context.cc
@@ -154,6 +154,17 @@ TaskContext::TaskContext(const Legion::Task* task,
   scalars_    = dez.unpack<std::vector<Scalar>>();
 
 }
+/*
+  TaskContext::TaskContext(std::vector<Store>& inputs, std::vector<Store>& outputs,
+              std::vector<Store>& reductions, std::vector<Scalar>& scalars) 
+  : inputs_(inputs), outputs_(outputs), reductions_(reductions), scalars_(scalars) 
+{
+    regions_ = NULL;
+    context_ = NULL;
+    runtime_ = NULL;
+    task_ = NULL;
+}
+*/
 
 ReturnValues TaskContext::pack_return_values() const
 {
diff --git a/src/core/runtime/context.h b/src/core/runtime/context.h
index 891452cc9..c9cb29462 100644
--- a/src/core/runtime/context.h
+++ b/src/core/runtime/context.h
@@ -110,11 +110,23 @@ class LibraryContext {
 // of the Legion API.
 class TaskContext {
  public:
+  TaskContext() = default;
+
   TaskContext(const Legion::Task* task,
               const std::vector<Legion::PhysicalRegion>& regions,
               Legion::Context context,
               Legion::Runtime* runtime);
 
+  TaskContext(const Legion::Task* task, const std::vector<Legion::PhysicalRegion> regions)
+//             std::vector<Store>& inputs, std::vector<Store>& outputs, std::vector<Scalar>& scalars)
+  : task_(task), regions_(regions) 
+ {
+    //inputs_=inputs;  
+    //outputs_=outputs;
+    //scalars_=scalars;
+}
+
+
  public:
   std::vector<Store>& inputs() { return inputs_; }
   std::vector<Store>& outputs() { return outputs_; }
@@ -131,7 +143,7 @@ class TaskContext {
   Legion::Runtime* runtime_;
   FusionMetadata fusionMetadata;
 
- private:
+ public:
   std::vector<Store> inputs_, outputs_, reductions_;
   std::vector<Scalar> scalars_;
 };
diff --git a/src/core/runtime/runtime.cc b/src/core/runtime/runtime.cc
index aab251955..4f4c679ed 100644
--- a/src/core/runtime/runtime.cc
+++ b/src/core/runtime/runtime.cc
@@ -32,6 +32,11 @@ Logger log_legate("legate");
 
 // This is the unique string name for our library which can be used
 // from both C++ and Python to generate IDs
+
+using LegateVariantImpl = void (*)(TaskContext&);
+/*static */ std::vector<std::pair<int64_t, LegateVariantImpl> > Core::opIDs = *(new std::vector<std::pair<int64_t, LegateVariantImpl> >());
+/*static */ std::unordered_map<long, LegateVariantImpl> Core::cpuDescriptors = *(new std::unordered_map<long, LegateVariantImpl>());
+
 static const char* const core_library_name = "legate.core";
 
 /*static*/ bool Core::show_progress = false;
diff --git a/src/core/runtime/runtime.h b/src/core/runtime/runtime.h
index 03b62e0c8..e5ba54b35 100644
--- a/src/core/runtime/runtime.h
+++ b/src/core/runtime/runtime.h
@@ -19,9 +19,12 @@
 #include "legion.h"
 
 #include "core/utilities/typedefs.h"
-
+#include "core/runtime/context.h"
+#include <unordered_map>
 namespace legate {
 
+using LegateVariantImpl = void (*)(TaskContext&);
+
 extern uint32_t extract_env(const char* env_name,
                             const uint32_t default_value,
                             const uint32_t test_value);
@@ -30,6 +33,8 @@ class Core {
  public:
   static void parse_config(void);
   static void shutdown(void);
+  static std::unordered_map<int64_t, LegateVariantImpl> cpuDescriptors; 
+  static std::vector<std::pair<int64_t, LegateVariantImpl> > opIDs;
 
  public:
   // Configuration settings
diff --git a/src/core/task/task.cc b/src/core/task/task.cc
index 1cc9f1e43..301bfa2a7 100644
--- a/src/core/task/task.cc
+++ b/src/core/task/task.cc
@@ -35,6 +35,7 @@ void LegateTaskRegistrar::record_variant(TaskID tid,
   assert((kind == Processor::LOC_PROC) || (kind == Processor::TOC_PROC) ||
          (kind == Processor::OMP_PROC));
 
+
   // Buffer these up until we can do our actual registration with the runtime
   pending_task_variants_.push_back(PendingTaskVariant(
     tid,
@@ -56,6 +57,11 @@ void LegateTaskRegistrar::record_variant(TaskID tid,
 
 void LegateTaskRegistrar::register_all_tasks(Runtime* runtime, LibraryContext& context)
 {
+  for (auto& taskIdx : Core::opIDs){
+    auto newID = context.get_task_id(taskIdx.first);
+    Core::cpuDescriptors.insert(std::pair<int64_t, LegateVariantImpl>((int64_t) newID, taskIdx.second));
+  }
+
   // Do all our registrations
   for (auto& task : pending_task_variants_) {
     task.task_id =
diff --git a/src/core/task/task.h b/src/core/task/task.h
index 06befc089..3d464bbab 100644
--- a/src/core/task/task.h
+++ b/src/core/task/task.h
@@ -127,6 +127,8 @@ class LegateTask {
                                bool inner      = false,
                                bool idempotent = false)
   {
+    
+    
     // Construct the code descriptor for this task so that the library
     // can register it later when it is ready
     Legion::CodeDescriptor desc(
@@ -134,6 +136,7 @@ class LegateTask {
         legion_task_wrapper<ReturnValues, LegateTask<T>::template legate_task_wrapper<TASK_PTR>>);
     auto task_id = T::TASK_ID;
 
+    Core::opIDs.push_back(std::pair<int64_t, LegateVariantImpl>((int64_t)task_id, TASK_PTR));
     T::Registrar::record_variant(task_id,
                                  T::task_name(),
                                  desc,
diff --git a/src/core/utilities/makeshift_serializer.cc b/src/core/utilities/makeshift_serializer.cc
index 7dacd29ce..06f2d3997 100644
--- a/src/core/utilities/makeshift_serializer.cc
+++ b/src/core/utilities/makeshift_serializer.cc
@@ -108,18 +108,20 @@ namespace legate{
         //if _isfuture
         if(buffer.is_future_)
         {   
-
+            //std::cout<<"packing future"<<std::endl;
             //pack future_wrapper
             pack((bool) buffer.future_.read_only_);
-
-            pack((bool) !buffer.future_.uninitialized_);
+            bool good = true;
+            //std::cout<<"uninit "<<buffer.future_.uninitialized_<<std::endl;
+            //pack((bool) !buffer.future_.uninitialized_);
+            pack((bool) good);
 
             pack((int32_t) buffer.future_.field_size_);
             auto dom = buffer.future_.domain();
             pack((uint32_t) dom.dim);
             for (int32_t i =0; i<dom.dim; i++)
             {
-                std::cout<<"packing "<<i<<" "<<dom.rect_data[i + dom.dim]+1<<std::endl;
+                //std::cout<<"packing "<<i<<" "<<dom.rect_data[i + dom.dim]+1<<std::endl;
                 pack((int64_t) dom.rect_data[i + dom.dim] + 1);
             }
         }   

From ddd8dc77455f8f1e16f79014b39331df66e368a3 Mon Sep 17 00:00:00 2001
From: Marcin Zalewski <mzalewski@nvidia.com>
Date: Wed, 27 Oct 2021 14:18:39 -0700
Subject: [PATCH 15/44] Change the pip package name to match the conda package
 and update version.

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 749f42373..ac6f7dc02 100755
--- a/setup.py
+++ b/setup.py
@@ -69,8 +69,8 @@ def run(self):
     # Remove the recurse argument from the list
     sys.argv.remove("--recurse")
     setup(
-        name="legate.core",
-        version="0.1",
+        name="legate-core",
+        version="21.10.00",
         packages=["legate", "legate.core", "legate.timing"],
         cmdclass={"build_py": my_build_py},
     )

From ef677e6448b11b76d8b91e8ebd17652dc8d992a6 Mon Sep 17 00:00:00 2001
From: Marcin Zalewski <mzalewski@nvidia.com>
Date: Wed, 27 Oct 2021 23:46:45 -0700
Subject: [PATCH 16/44] Fix the version of Legion to a particular commit

---
 install.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/install.py b/install.py
index 8f9a32db8..0b3ca61b5 100755
--- a/install.py
+++ b/install.py
@@ -142,6 +142,7 @@ def git_clone(repo_dir, url, branch=None, tag=None, commit=None):
         verbose_check_call(
             ["git", "submodule", "update", "--init"], cwd=repo_dir
         )
+        git_reset(repo_dir, commit)
     else:
         verbose_check_call(
             [
@@ -162,10 +163,13 @@ def git_reset(repo_dir, refspec):
     verbose_check_call(["git", "reset", "--hard", refspec], cwd=repo_dir)
 
 
-def git_update(repo_dir, branch=None):
-    verbose_check_call(["git", "pull", "--ff-only"], cwd=repo_dir)
+def git_update(repo_dir, branch=None, tag=None, commit=None):
     if branch is not None:
         verbose_check_call(["git", "checkout", branch], cwd=repo_dir)
+        verbose_check_call(["git", "pull", "--ff-only"], cwd=repo_dir)
+    else:
+        verbose_check_call(["git", "fetch"], cwd=repo_dir)
+        verbose_check_call(["git", "checkout", commit or tag], cwd=repo_dir)
 
 
 def load_json_config(filename):
@@ -209,13 +213,14 @@ def install_gasnet(gasnet_dir, conduit, thread_count):
     shutil.rmtree(temp_dir)
 
 
-def install_legion(legion_src_dir, branch="legate_stable"):
+def install_legion(legion_src_dir, branch, commit="3141d7c0"):
     print("Legate is installing Legion into a local directory...")
     # For now all we have to do is clone legion since we build it with Legate
     git_clone(
         legion_src_dir,
         url="https://gitlab.com/StanfordLegion/legion.git",
         branch=branch,
+        commit=commit
     )
 
 
@@ -228,9 +233,9 @@ def install_thrust(thrust_dir):
     )
 
 
-def update_legion(legion_src_dir, branch="legate_stable"):
+def update_legion(legion_src_dir, branch, commit="3141d7c0"):
     # Make sure we are on the right branch for single/multi-node
-    git_update(legion_src_dir, branch=branch)
+    git_update(legion_src_dir, branch=branch, commit=commit)
 
 
 def build_legion(

From ba955e280ef575bcb93181bc5ce22787f68625f3 Mon Sep 17 00:00:00 2001
From: Marcin Zalewski <mzalewski@nvidia.com>
Date: Thu, 28 Oct 2021 00:24:15 -0700
Subject: [PATCH 17/44] Do not find a default branch for the release

---
 install.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/install.py b/install.py
index 0b3ca61b5..f6ddbb75c 100755
--- a/install.py
+++ b/install.py
@@ -220,7 +220,7 @@ def install_legion(legion_src_dir, branch, commit="3141d7c0"):
         legion_src_dir,
         url="https://gitlab.com/StanfordLegion/legion.git",
         branch=branch,
-        commit=commit
+        commit=commit,
     )
 
 
@@ -562,8 +562,10 @@ def install(
 
     legate_core_dir = os.path.dirname(os.path.realpath(__file__))
 
-    if legion_branch is None:
-        legion_branch = find_default_legion_branch(legate_core_dir)
+    # For the release, we will use a hardcoded commit unless user asks for
+    # a branch
+    #    if legion_branch is None:
+    #        legion_branch = find_default_legion_branch(legate_core_dir)
 
     cmake_config = os.path.join(legate_core_dir, ".cmake.json")
     dump_json_config(cmake_config, cmake)

From de337cfe033d5c332dda6bf8ac69b63956fcb2b4 Mon Sep 17 00:00:00 2001
From: Marcin Zalewski <mzalewski@nvidia.com>
Date: Thu, 28 Oct 2021 01:03:19 -0700
Subject: [PATCH 18/44] Change the Legion checkout target

---
 install.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/install.py b/install.py
index f6ddbb75c..d18a44a42 100755
--- a/install.py
+++ b/install.py
@@ -213,7 +213,7 @@ def install_gasnet(gasnet_dir, conduit, thread_count):
     shutil.rmtree(temp_dir)
 
 
-def install_legion(legion_src_dir, branch, commit="3141d7c0"):
+def install_legion(legion_src_dir, branch, commit="d0907f4c"):
     print("Legate is installing Legion into a local directory...")
     # For now all we have to do is clone legion since we build it with Legate
     git_clone(
@@ -233,7 +233,7 @@ def install_thrust(thrust_dir):
     )
 
 
-def update_legion(legion_src_dir, branch, commit="3141d7c0"):
+def update_legion(legion_src_dir, branch, commit="d0907f4c"):
     # Make sure we are on the right branch for single/multi-node
     git_update(legion_src_dir, branch=branch, commit=commit)
 

From 78335cc7682b961910119d2b899d6e49328db3e3 Mon Sep 17 00:00:00 2001
From: Marcin Zalewski <marcin.zalewski@gmail.com>
Date: Thu, 28 Oct 2021 09:13:48 -0700
Subject: [PATCH 19/44] Bumped up the version of pyarrow

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 80f3fbd55..d72135b2e 100644
--- a/README.md
+++ b/README.md
@@ -221,7 +221,7 @@ no support for Windows.
 
 The Legate Core currently requires Python >= 3.7 and the following packages:
 
-  - `pyarrow=1.0.1`
+  - `pyarrow=5.0.0`
   - `numpy`
   - `cffi`
   - [CUDA](https://developer.nvidia.com/cuda-downloads) >= 8.0

From 369909dbfa11ce0cfb3cff3f3e049443a5306a9b Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@g0001.stanford.edu>
Date: Sun, 31 Oct 2021 23:50:16 -0700
Subject: [PATCH 20/44] gpu descriptors

---
 install.py                  | 4 ++--
 legate/core/runtime.py      | 7 +++----
 src/core/runtime/runtime.cc | 2 ++
 src/core/runtime/runtime.h  | 2 ++
 src/core/task/task.cc       | 7 +++++++
 src/core/task/task.h        | 6 +++++-
 6 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/install.py b/install.py
index 7c45e3ece..b6f9132a5 100755
--- a/install.py
+++ b/install.py
@@ -806,8 +806,8 @@ def driver():
     )
     parser.add_argument(
         "--cuda",
-        action=BooleanFlag,
-        default=os.environ.get("USE_CUDA", "0") == "1",
+        action= BooleanFlag,
+        default=True,#os.environ.get("USE_CUDA", "0") == "1",
         help="Build Legate with CUDA support.",
     )
     parser.add_argument(
diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index 639bb9f98..ce34986e7 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -807,8 +807,8 @@ def __init__(self):
         self.validIDs.add(2) #Binary op
         #self.validIDs.add(5) #convert op
         self.validIDs.add(18) #Unary op
-        self.validIDs.add(9) #Fill op
-        self.validIDs.add(14) #Fill op
+        #self.validIDs.add(9) #Fill op
+        #self.validIDs.add(14) #Fill op
 
         # the following are conditionally fusable
         # they will be processed in the a subsequent level of filtering
@@ -1302,7 +1302,7 @@ def propogateFuture(self,op):
    
     def _schedule(self, ops, force_eval=False):
         ids = [op._task_id for op in ops]
-        print(force_eval, "ids", ids)
+        #print(force_eval, "ids", ids)
         #case 1: try fusing current window of tasks
         #if partially or fully fusable, 
         #schedule the new set of tasks
@@ -1311,7 +1311,6 @@ def _schedule(self, ops, force_eval=False):
         #     must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op])
         #     partitioner = Partitioner(self, [op], must_be_single=must_be_single)
         #     strategy = partitioner.partition_stores()
-
         if len(ops)>=2 and (not force_eval):
             fused_task_list,strats = self.build_fused_op(ops)
             if fused_task_list:
diff --git a/src/core/runtime/runtime.cc b/src/core/runtime/runtime.cc
index 4f4c679ed..eb57f46d8 100644
--- a/src/core/runtime/runtime.cc
+++ b/src/core/runtime/runtime.cc
@@ -35,7 +35,9 @@ Logger log_legate("legate");
 
 using LegateVariantImpl = void (*)(TaskContext&);
 /*static */ std::vector<std::pair<int64_t, LegateVariantImpl> > Core::opIDs = *(new std::vector<std::pair<int64_t, LegateVariantImpl> >());
+/*static */ std::vector<std::pair<int64_t, LegateVariantImpl> > Core::gpuOpIDs = *(new std::vector<std::pair<int64_t, LegateVariantImpl> >());
 /*static */ std::unordered_map<long, LegateVariantImpl> Core::cpuDescriptors = *(new std::unordered_map<long, LegateVariantImpl>());
+/*static */ std::unordered_map<long, LegateVariantImpl> Core::gpuDescriptors = *(new std::unordered_map<long, LegateVariantImpl>());
 
 static const char* const core_library_name = "legate.core";
 
diff --git a/src/core/runtime/runtime.h b/src/core/runtime/runtime.h
index e5ba54b35..6d1dc8341 100644
--- a/src/core/runtime/runtime.h
+++ b/src/core/runtime/runtime.h
@@ -34,7 +34,9 @@ class Core {
   static void parse_config(void);
   static void shutdown(void);
   static std::unordered_map<int64_t, LegateVariantImpl> cpuDescriptors; 
+  static std::unordered_map<int64_t, LegateVariantImpl> gpuDescriptors; 
   static std::vector<std::pair<int64_t, LegateVariantImpl> > opIDs;
+  static std::vector<std::pair<int64_t, LegateVariantImpl> > gpuOpIDs;
 
  public:
   // Configuration settings
diff --git a/src/core/task/task.cc b/src/core/task/task.cc
index 301bfa2a7..6ee5fb69b 100644
--- a/src/core/task/task.cc
+++ b/src/core/task/task.cc
@@ -62,6 +62,13 @@ void LegateTaskRegistrar::register_all_tasks(Runtime* runtime, LibraryContext& c
     Core::cpuDescriptors.insert(std::pair<int64_t, LegateVariantImpl>((int64_t) newID, taskIdx.second));
   }
 
+  for (auto& taskIdx : Core::gpuOpIDs){
+    auto newID = context.get_task_id(taskIdx.first);
+    Core::gpuDescriptors.insert(std::pair<int64_t, LegateVariantImpl>((int64_t) newID, taskIdx.second));
+  }
+
+
+
   // Do all our registrations
   for (auto& task : pending_task_variants_) {
     task.task_id =
diff --git a/src/core/task/task.h b/src/core/task/task.h
index 3d464bbab..dc18b9f0f 100644
--- a/src/core/task/task.h
+++ b/src/core/task/task.h
@@ -136,7 +136,11 @@ class LegateTask {
         legion_task_wrapper<ReturnValues, LegateTask<T>::template legate_task_wrapper<TASK_PTR>>);
     auto task_id = T::TASK_ID;
 
-    Core::opIDs.push_back(std::pair<int64_t, LegateVariantImpl>((int64_t)task_id, TASK_PTR));
+    if (kind ==Legion::Processor::LOC_PROC){
+        Core::opIDs.push_back(std::pair<int64_t, LegateVariantImpl>((int64_t)task_id, TASK_PTR));
+    }else if (kind ==Legion::Processor::TOC_PROC){
+        Core::gpuOpIDs.push_back(std::pair<int64_t, LegateVariantImpl>((int64_t)task_id, TASK_PTR));
+    }
     T::Registrar::record_variant(task_id,
                                  T::task_name(),
                                  desc,

From ab0c0448474058bdd71e525a261affa049a2b3d8 Mon Sep 17 00:00:00 2001
From: Wonchan Lee <wonchanl@nvidia.com>
Date: Mon, 1 Nov 2021 11:35:03 -0700
Subject: [PATCH 21/44] Remove back edges from partition symbols back to
 operations to avoid object cycles

---
 legate/core/constraints.py | 13 +++++++++----
 legate/core/operation.py   |  3 ++-
 legate/core/solver.py      | 18 +++++++++---------
 3 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/legate/core/constraints.py b/legate/core/constraints.py
index cd81306e5..39e85bdcc 100644
--- a/legate/core/constraints.py
+++ b/legate/core/constraints.py
@@ -55,8 +55,9 @@ def reduce(self):
 
 
 class PartSym(Expr):
-    def __init__(self, op, store, id, disjoint, complete):
-        self._op = op
+    def __init__(self, op_hash, op_name, store, id, disjoint, complete):
+        self._op_hash = op_hash
+        self._op_name = op_name
         self._store = store
         self._id = id
         self._disjoint = disjoint
@@ -66,6 +67,10 @@ def __init__(self, op, store, id, disjoint, complete):
     def ndim(self):
         return self._store.ndim
 
+    @property
+    def store(self):
+        return self._store
+
     @property
     def closed(self):
         return False
@@ -73,10 +78,10 @@ def closed(self):
     def __repr__(self):
         disj = "D" if self._disjoint else "A"
         comp = "C" if self._complete else "I"
-        return f"X{self._id}({disj},{comp})@{self._op.get_name()}"
+        return f"X{self._id}({disj},{comp})@{self._op_name}"
 
     def __hash__(self):
-        return hash((self._op, self._id))
+        return hash((self._op_hash, self._id))
 
     def subst(self, mapping):
         return Lit(mapping[self])
diff --git a/legate/core/operation.py b/legate/core/operation.py
index c2f13c524..5e8283dd5 100644
--- a/legate/core/operation.py
+++ b/legate/core/operation.py
@@ -158,7 +158,8 @@ def _get_symbol_id(self):
 
     def declare_partition(self, store, disjoint=True, complete=True):
         sym = PartSym(
-            self,
+            self._op_id,
+            self.get_name(),
             store,
             self._get_symbol_id(),
             disjoint=disjoint,
diff --git a/legate/core/solver.py b/legate/core/solver.py
index 58321c00f..b4d5c5e80 100644
--- a/legate/core/solver.py
+++ b/legate/core/solver.py
@@ -114,16 +114,16 @@ def launch_domain(self):
 
     def get_projection(self, part):
         partition = self.get_partition(part)
-        return partition.get_requirement(self._launch_shape, part._store)
+        return partition.get_requirement(self._launch_shape, part.store)
 
     def get_partition(self, part):
-        assert not part._store.unbound
+        assert not part.store.unbound
         if part not in self._strategy:
             raise ValueError(f"No strategy is found for {part}")
         return self._strategy[part]
 
     def get_field_space(self, part):
-        assert part._store.unbound
+        assert part.store.unbound
         if part not in self._fspaces:
             raise ValueError(f"No strategy is found for {part}")
         return self._fspaces[part]
@@ -160,7 +160,7 @@ def _solve_broadcast_constraints(
     ):
         to_remove = OrderedSet()
         for unknown in unknowns:
-            store = unknown._store
+            store = unknown.store
             if not (store.kind is Future or unknown in broadcasts):
                 continue
 
@@ -183,7 +183,7 @@ def _solve_unbound_constraints(
     ):
         to_remove = OrderedSet()
         for unknown in unknowns:
-            store = unknown._store
+            store = unknown.store
             if not store.unbound:
                 continue
 
@@ -193,7 +193,7 @@ def _solve_unbound_constraints(
                 continue
 
             cls = constraints.find(unknown)
-            assert all(to_align._store.unbound for to_align in cls)
+            assert all(to_align.store.unbound for to_align in cls)
 
             fspace = self._runtime.create_field_space()
             for to_align in cls:
@@ -206,7 +206,7 @@ def _solve_unbound_constraints(
     def _find_restrictions(cls):
         merged = None
         for unknown in cls:
-            store = unknown._store
+            store = unknown.store
             restrictions = store.find_restrictions()
             if merged is None:
                 merged = restrictions
@@ -268,7 +268,7 @@ def partition_stores(self):
         all_restrictions = self._find_all_restrictions(unknowns, constraints)
 
         def cost(unknown):
-            store = unknown._store
+            store = unknown.store
             return (
                 -store.comm_volume(),
                 not store.has_key_partition(all_restrictions[unknown]),
@@ -284,7 +284,7 @@ def cost(unknown):
             elif unknown in dependent:
                 continue
 
-            store = unknown._store
+            store = unknown.store
             restrictions = all_restrictions[unknown]
 
             if isinstance(prev_part, NoPartition):

From 54d3bb8fc20baf186246bd7b97271f1a01142459 Mon Sep 17 00:00:00 2001
From: Wonchan Lee <wonchanl@nvidia.com>
Date: Mon, 1 Nov 2021 15:39:06 -0700
Subject: [PATCH 22/44] Make sure we don't create cycles between region fields
 and attachments

---
 legate/core/runtime.py | 60 +++++++++++++++++++++++++++++++-----------
 1 file changed, 45 insertions(+), 15 deletions(-)

diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index 6bd9a5839..c06de76e1 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -16,6 +16,7 @@
 import gc
 import math
 import struct
+import weakref
 from collections import deque
 from functools import reduce
 
@@ -319,11 +320,19 @@ def __init__(self, ptr, extent, region_field):
         self.ptr = ptr
         self.extent = extent
         self.end = ptr + extent - 1
-        self.region_field = region_field
+        self._region_field = weakref.ref(region_field)
 
     def overlaps(self, other):
         return not (self.end < other.ptr or other.end < self.ptr)
 
+    @property
+    def region_field(self):
+        return self._region_field()
+
+    @region_field.setter
+    def region_field(self, region_field):
+        self._region_field = weakref.ref(region_field)
+
 
 class AttachmentManager(object):
     def __init__(self, runtime):
@@ -359,22 +368,35 @@ def attachment_key(alloc):
 
     def has_attachment(self, alloc):
         key = self.attachment_key(alloc)
-        return key in self._attachments
+        attachment = self._attachments.get(key, None)
+        return attachment is not None and attachment.region_field
 
     def reuse_existing_attachment(self, alloc):
         key = self.attachment_key(alloc)
-        if key not in self._attachments:
+        attachment = self._attachments.get(key, None)
+        if attachment is None:
             return None
-        attachment = self._attachments[key]
-        return attachment.region_field
+        rf = attachment.region_field
+        # If the region field is already collected, we don't need to keep
+        # track of it for de-duplication.
+        if rf is None:
+            del self._attachments[key]
+        return rf
 
     def attach_external_allocation(self, alloc, region_field):
         key = self.attachment_key(alloc)
-        if key in self._attachments:
+        attachment = self._attachments.get(key, None)
+        if not (attachment is None or attachment.region_field is None):
             raise RuntimeError(
                 "Cannot attach two different RegionFields to the same buffer"
             )
-        attachment = Attachment(*key, region_field)
+        if attachment is None:
+            attachment = Attachment(*key, region_field)
+        else:
+            attachment.region_field = region_field
+            # We temporary remove the attachment from the map for
+            # the following alias checking
+            del self._attachments[key]
         for other in self._attachments.values():
             if other.overlaps(attachment):
                 raise RuntimeError(
@@ -382,7 +404,19 @@ def attach_external_allocation(self, alloc, region_field):
                 )
         self._attachments[key] = attachment
 
-    def detach_external_allocation(self, alloc, detach, defer):
+    def _remove_allocation(self, alloc):
+        key = self.attachment_key(alloc)
+        if key not in self._attachments:
+            raise RuntimeError("Unable to find attachment to remove")
+        del self._attachments[key]
+
+    def detach_external_allocation(
+        self, alloc, detach, defer=False, previously_deferred=False
+    ):
+        # If the detachment was previously deferred, then we don't
+        # need to remove the allocation from the map again.
+        if not previously_deferred:
+            self._remove_allocation(alloc)
         if defer:
             # If we need to defer this until later do that now
             self._deferred_detachments.append((alloc, detach))
@@ -391,12 +425,6 @@ def detach_external_allocation(self, alloc, detach, defer):
         # Dangle a reference to the field off the future to prevent the
         # field from being recycled until the detach is done
         future.field_reference = detach.field
-        # We also need to tell the core legate library that this buffer
-        # is no longer attached
-        key = self.attachment_key(alloc)
-        if key not in self._attachments:
-            raise RuntimeError("Unable to find attachment to remove")
-        del self._attachments[key]
         # If the future is already ready, then no need to track it
         if future.is_ready():
             return
@@ -417,7 +445,9 @@ def perform_detachments(self):
         detachments = self._deferred_detachments
         self._deferred_detachments = list()
         for alloc, detach in detachments:
-            self.detach_external_allocation(alloc, detach, defer=False)
+            self.detach_external_allocation(
+                alloc, detach, defer=False, previously_deferred=True
+            )
 
     def prune_detachments(self):
         to_remove = []

From d6ccdd2747d9f2bb72de21658f0a7906810991aa Mon Sep 17 00:00:00 2001
From: Wonchan Lee <wonchanl@nvidia.com>
Date: Wed, 3 Nov 2021 15:25:17 -0700
Subject: [PATCH 23/44] Handle cases where one instance is used by multiple
 mappings

---
 src/core/mapping/base_mapper.cc | 46 ++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/src/core/mapping/base_mapper.cc b/src/core/mapping/base_mapper.cc
index fd4cfa04c..9f679de9e 100644
--- a/src/core/mapping/base_mapper.cc
+++ b/src/core/mapping/base_mapper.cc
@@ -429,7 +429,7 @@ void BaseMapper::map_task(const MapperContext ctx,
 
   // Map each field separately for each of the logical regions
   std::vector<PhysicalInstance> needed_acquires;
-  std::map<PhysicalInstance, uint32_t> instances_to_mappings;
+  std::map<PhysicalInstance, std::set<uint32_t>> instances_to_mappings;
   for (uint32_t mapping_idx = 0; mapping_idx < mappings.size(); ++mapping_idx) {
     auto& mapping    = mappings[mapping_idx];
     auto req_indices = mapping.requirement_indices();
@@ -457,7 +457,7 @@ void BaseMapper::map_task(const MapperContext ctx,
       needed_acquires.push_back(result);
 
     for (auto req_idx : req_indices) output.chosen_instances[req_idx].push_back(result);
-    instances_to_mappings[result] = mapping_idx;
+    instances_to_mappings[result].insert(mapping_idx);
   }
 
   // Do an acquire on all the instances so we have our result
@@ -471,27 +471,31 @@ void BaseMapper::map_task(const MapperContext ctx,
     filter_failed_acquires(needed_acquires, failed_acquires);
 
     for (auto failed_acquire : failed_acquires) {
-      auto mapping_idx = instances_to_mappings[failed_acquire];
-      auto& mapping    = mappings[mapping_idx];
-      auto req_indices = mapping.requirement_indices();
-
-      std::vector<std::reference_wrapper<const RegionRequirement>> reqs;
-      for (auto req_idx : req_indices) reqs.push_back(std::cref(task.regions[req_idx]));
-
-      for (auto req_idx : req_indices) {
-        auto& instances   = output.chosen_instances[req_idx];
-        uint32_t inst_idx = 0;
-        for (; inst_idx < instances.size(); ++inst_idx)
-          if (instances[inst_idx] == failed_acquire) break;
-        instances.erase(instances.begin() + inst_idx);
-      }
+      auto affected_mappings = instances_to_mappings[failed_acquire];
+      instances_to_mappings.erase(failed_acquire);
+
+      for (auto& mapping_idx : affected_mappings) {
+        auto& mapping    = mappings[mapping_idx];
+        auto req_indices = mapping.requirement_indices();
+
+        std::vector<std::reference_wrapper<const RegionRequirement>> reqs;
+        for (auto req_idx : req_indices) reqs.push_back(std::cref(task.regions[req_idx]));
+
+        for (auto req_idx : req_indices) {
+          auto& instances   = output.chosen_instances[req_idx];
+          uint32_t inst_idx = 0;
+          for (; inst_idx < instances.size(); ++inst_idx)
+            if (instances[inst_idx] == failed_acquire) break;
+          instances.erase(instances.begin() + inst_idx);
+        }
 
-      PhysicalInstance result;
-      if (map_legate_store(ctx, task, mapping, reqs, task.target_proc, result))
-        needed_acquires.push_back(result);
+        PhysicalInstance result;
+        if (map_legate_store(ctx, task, mapping, reqs, task.target_proc, result))
+          needed_acquires.push_back(result);
 
-      for (auto req_idx : req_indices) output.chosen_instances[req_idx].push_back(result);
-      instances_to_mappings[result] = mapping_idx;
+        for (auto req_idx : req_indices) output.chosen_instances[req_idx].push_back(result);
+        instances_to_mappings[result].insert(mapping_idx);
+      }
     }
   }
 }

From 51dd00f6acf0ea32569498a131ceebd44f76ccaf Mon Sep 17 00:00:00 2001
From: Manolis Papadakis <mpapadakis@nvidia.com>
Date: Fri, 5 Nov 2021 13:29:35 -0700
Subject: [PATCH 24/44] Fix import of legion CFFI

---
 legate/core/__init__.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/legate/core/__init__.py b/legate/core/__init__.py
index 8c607466f..b3fa32b9c 100644
--- a/legate/core/__init__.py
+++ b/legate/core/__init__.py
@@ -19,10 +19,10 @@
 # Perform a check to see if we're running inside of Legion Python
 # If we're not then we should raise an error message
 try:
-    from legion_cffi import ffi, lib as legion
+    from legion_cffi import lib as _legion
 
     # Now confirm that we are actually inside of a task
-    if legion.legion_runtime_has_context():
+    if _legion.legion_runtime_has_context():
         using_legion_python = True
     else:
         using_legion_python = False
@@ -115,6 +115,10 @@
     ReductionOp,
 )
 
+# NOTE: This needs to come after the imports from legate.core.legion, as we
+# are overriding that module's name.
+from legion_cffi import ffi, lib as legion
+
 # Import the PyArrow type system
 from pyarrow import (
     DataType,

From f388f07bb36594d9359676184a9e64c911b2309c Mon Sep 17 00:00:00 2001
From: Wonchan Lee <wonchanl@nvidia.com>
Date: Fri, 5 Nov 2021 16:25:15 -0700
Subject: [PATCH 25/44] Make sure we flush deferred detachments

---
 legate/core/runtime.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index c06de76e1..957019d4e 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -851,6 +851,8 @@ def destroy(self):
         self.destroyed = True
 
     def dispatch(self, op, redop=None):
+        self._attachment_manager.perform_detachments()
+        self._attachment_manager.prune_detachments()
         if redop:
             return op.launch(self.legion_runtime, self.legion_context, redop)
         else:

From 345f275c48330cf612353c18e24f4439d0d126ac Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@g0001.stanford.edu>
Date: Fri, 5 Nov 2021 18:13:36 -0700
Subject: [PATCH 26/44] reduction fix

---
 legate/core/runtime.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index ce34986e7..f151ff426 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -807,8 +807,8 @@ def __init__(self):
         self.validIDs.add(2) #Binary op
         #self.validIDs.add(5) #convert op
         self.validIDs.add(18) #Unary op
-        #self.validIDs.add(9) #Fill op
-        #self.validIDs.add(14) #Fill op
+        self.validIDs.add(9) #Fill op
+        #self.validIDs.add(14) #read op
 
         # the following are conditionally fusable
         # they will be processed in the a subsequent level of filtering
@@ -969,7 +969,6 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
         if start<end:
             intervals.append((start, end))
         return True, intervals
-        return True, [(0,len(ops))]
 
 
    
@@ -1017,6 +1016,7 @@ def __init__(self, core_library):
         self._outstanding_ops = []
         self._window_size =10
         self._fusion_threshold =2
+        self._opLens = []
         self._clearing_pipe = False
         #self._window_size = self._core_context.get_tunable(
         #    legion.LEGATE_CORE_TUNABLE_WINDOW_SIZE,
@@ -1104,7 +1104,7 @@ def destroy(self):
         # Before we clean up the runtime, we should execute all outstanding
         # operations.
         self.flush_scheduling_window()
-
+        #print(self._opLens)
         # Destroy all libraries. Note that we should do this
         # from the lastly added one to the first one
         for context in reversed(self._context_list):
@@ -1157,6 +1157,9 @@ def serialize_multiop_metadata(self, numpy_context, ops):
                     future_start+=1
             for o,output in enumerate(op._outputs):
                 offsets.append(-(o+1)) 
+            for r,reduction in enumerate(op._reductions):
+                offsets.append(-(r+1)) 
+ 
             op_ids.append(numpy_context.get_task_id(op._task_id._value_))
 
             offset_start+=(len(op._inputs)+len(op._outputs))
@@ -1277,8 +1280,10 @@ def build_fused_op(self,ops):
 
         return new_op_list, True       
 
-    def _launch_outstanding(self, force_eval=True):
+    def _launch_outstanding(self, force_eval=True):        
         if len(self._outstanding_ops):
+            #print("launching outstanding", ops)
+            self._opLens.append(len(self._outstanding_ops))
             ops = self._outstanding_ops
             self._outstanding_ops = []
             self._schedule(ops, force_eval)
@@ -1369,6 +1374,7 @@ def submit(self, op):
         else:
             self._outstanding_ops.append(op)
             if len(self._outstanding_ops) >= self._window_size:
+                self._opLens.append(len(self._outstanding_ops))
                 ops = self._outstanding_ops
                 self._outstanding_ops = []
                 self._schedule(ops)
@@ -1391,6 +1397,7 @@ def _scheduleNew(self, ops):
     def flush_scheduling_window(self):
         if len(self._outstanding_ops) == 0:
             return
+        self._opLens.append(len(self._outstanding_ops))
         ops = self._outstanding_ops
         self._outstanding_ops = []
         self._schedule(ops)

From 1f9a655b3efc473217fa3f243af031a9bdc12d31 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@g0001.stanford.edu>
Date: Sat, 13 Nov 2021 15:14:36 -0800
Subject: [PATCH 27/44] put new constraint stuff back in

---
 legate/core/constraints.py |  11 ++--
 legate/core/operation.py   |  17 ++++-
 legate/core/runtime.py     | 124 ++++++++++++++++++++++++++-----------
 legate/core/solver.py      |   4 ++
 4 files changed, 115 insertions(+), 41 deletions(-)

diff --git a/legate/core/constraints.py b/legate/core/constraints.py
index 93f89bd36..0609b8980 100644
--- a/legate/core/constraints.py
+++ b/legate/core/constraints.py
@@ -55,13 +55,15 @@ def reduce(self):
 
 
 class PartSym(Expr):
-    def __init__(self, op, store, id, disjoint, complete):
-        #self._op = op
+    def __init__(self, op_hash, op_name, store, id, disjoint, complete):
+        self._op_hash = op_hash
+        self._op_name = op_name
         self._store = store
         self._id = id
         self._disjoint = disjoint
         self._complete = complete
 
+
     @property
     def ndim(self):
         return self._store.ndim
@@ -73,11 +75,12 @@ def closed(self):
     def __repr__(self):
         disj = "D" if self._disjoint else "A"
         comp = "C" if self._complete else "I"
-        return f"X{self._id}({disj},{comp})"
+        #return f"X{self._id}({disj},{comp})"
         return f"X{self._id}({disj},{comp})@{self._op.get_name()}"
 
     def __hash__(self):
-        return hash(self._id)
+        #return hash(self._id)
+        return hash((self._op_hash, self._id))
         return hash((self._op, self._id))
 
     def subst(self, mapping):
diff --git a/legate/core/operation.py b/legate/core/operation.py
index fc1b9ae18..d3424e4ff 100644
--- a/legate/core/operation.py
+++ b/legate/core/operation.py
@@ -19,7 +19,11 @@
 from .launcher import CopyLauncher, TaskLauncher
 from .store import Store
 from .utils import OrderedSet
-
+from .legion import (
+    FieldSpace,
+    Future
+)
+ 
 
 class Operation(object):
     def __init__(self, context, mapper_id=0, op_id=0):
@@ -165,7 +169,8 @@ def _get_symbol_id(self):
 
     def declare_partition(self, store, disjoint=True, complete=True):
         sym = PartSym(
-            self,
+            self._op_id,
+            self.get_name(),
             store,
             self._get_symbol_id(),
             disjoint=disjoint,
@@ -179,6 +184,7 @@ def declare_partition(self, store, disjoint=True, complete=True):
         return sym
 
 
+
 class Task(Operation):
     def __init__(self, context, task_id, mapper_id=0, op_id=0):
         Operation.__init__(self, context, mapper_id=mapper_id, op_id=op_id)
@@ -222,19 +228,26 @@ def launch(self, strategy):
             # We update the key partition of a store only when it gets updated
             temp.set_key_partition(partition)
         """
+        #print("inputs")
         for input, input_part in zip(self._inputs, self._input_parts):
             proj = strategy.get_projection(input_part)
+            #if (input._kind==Future):
+            #    print(input, proj)
             tag = self.get_tag(strategy, input_part)
             launcher.add_input(input, proj, tag=tag)
+        #print("outputs", len(self._outputs))
         for output, output_part in zip(self._outputs, self._output_parts):
             if output.unbound:
                 continue
             proj = strategy.get_projection(output_part)
+            #if (output._kind==Future):
+            #    print(output, proj)
             tag = self.get_tag(strategy, output_part)
             launcher.add_output(output, proj, tag=tag)
             partition = strategy.get_partition(output_part)
             # We update the key partition of a store only when it gets updated
             output.set_key_partition(partition)
+        #print()
         for ((reduction, redop), reduction_part) in zip(
             self._reductions, self._reduction_parts
         ):
diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index f151ff426..d3c68b31e 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -708,7 +708,7 @@ def supress_small_fusions(self, intervals, threshold):
         return fusable, final_set
 
     def can_fuse(self):
-        for op in self.ops:
+        for op in reversed(self.ops):
             must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op])
             partitioner = Partitioner(self.runtime, [op], must_be_single=must_be_single)
             self.partitioners.append( partitioner )
@@ -723,6 +723,7 @@ def can_fuse(self):
             if len(op.inputs)>1:
                 proj = strategy.get_projection(op._input_parts[1])
             self.strategies.append(strategy)
+        self.strategies.reverse()
 
         results = [constraint.apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) for constraint in self.constraints]
         drint("fuse results", results)
@@ -763,7 +764,7 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
 
 class NumpyContextExists(FusionConstraint):
     def apply(self, contexts, runtime, ops, partitioners, strategies):
-        if "legate.numpy" in contexts:
+        if "cunumeric" in contexts:
             return True, [(0, len(ops))]
         else:
            return False, [(0,0)]
@@ -803,12 +804,13 @@ class AllValidOps(FusionConstraint):
     def __init__(self):
         self.validIDs = set()
 
-        #these ops are always fusable
+        #these ops are almost always fusable
         self.validIDs.add(2) #Binary op
-        #self.validIDs.add(5) #convert op
         self.validIDs.add(18) #Unary op
         self.validIDs.add(9) #Fill op
+        self.validIDs.add(20) #Where op
         #self.validIDs.add(14) #read op
+        #self.validIDs.add(5) #convert op
 
         # the following are conditionally fusable
         # they will be processed in the a subsequent level of filtering
@@ -924,20 +926,6 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
             intervals.append((start,end))
         return True, intervals
          
-        #TODO: remove me    
-        # for each buffer, check all it's associated transforms/partitions
-        # across ops are equivalent 
-        seperators = []
-        for store, matrices in store_to_ops.items():
-            if len(matrices)>1: 
-                first = matrices[0]
-                for matrix in matrices:
-                    if not (matrix==first).all():
-                        indices = linkset[store]
-                        return True, [(0,indices[1]), (indices[1],len(ops))]
-        return True, [(0,len(ops))]
-
-
 
 class IdenticalLaunchShapes(FusionConstraint):
     """Fusion rule that only ops with identical
@@ -971,6 +959,52 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
         return True, intervals
 
 
+
+class ValidProducerConsumer(FusionConstraint):
+    """In a fused op, there cannot be a producer consumer
+        relationship between different views of the same buffers"""
+
+    def apply(self, contexts, runtime, ops, partitioners, strategies):
+        childMap = {}
+        intervals = []
+        i, start=0, 0
+        end = len(ops)
+        def getRoot(store):
+            while store._parent:
+                store = store._parent
+            return store
+        
+        while i<end:
+            op = ops[i] 
+
+            #check consumers view of root array
+            #is the same as the producers view
+            isSame = True
+            for input in op._inputs:
+                inputRoot = getRoot(input) 
+                if inputRoot in childMap:
+                    isSame = isSame and childMap[inputRoot] == input
+
+            if not isSame:                     
+                intervals.append((start, i))
+                start=i
+                i=start+1
+                childMap = {}
+            else:
+                i=i+1 
+
+            #register producers view of buffer
+            for output in op._outputs:
+                outputRoot = getRoot(output)
+                if outputRoot not in childMap:  
+                    childMap[outputRoot] = output
+            #end while
+        if start<end:
+           intervals.append((start,end))
+        return True, intervals
+ 
+
+
    
 class Runtime(object):
     def __init__(self, core_library):
@@ -1014,13 +1048,14 @@ def __init__(self, core_library):
         # to be dispatched. This list allows cross library introspection for
         # Legate operations.
         self._outstanding_ops = []
-        self._window_size =10
+        self._window_size=10
         self._fusion_threshold =2
         self._opLens = []
+        self._fusedOpLens = []
         self._clearing_pipe = False
         #self._window_size = self._core_context.get_tunable(
         #    legion.LEGATE_CORE_TUNABLE_WINDOW_SIZE,
-        #    ty.uint32,
+        #    ty.uint32,)
 
         # Now we initialize managers
         self._attachment_manager = AttachmentManager(self)
@@ -1104,7 +1139,16 @@ def destroy(self):
         # Before we clean up the runtime, we should execute all outstanding
         # operations.
         self.flush_scheduling_window()
+        #print("opLens")
         #print(self._opLens)
+        #print("fusedOpLens")
+        #print(self._fusedOpLens)
+        #import pickle
+        #with open('fusedOpLens.pkl', 'wb') as f:
+        #    pickle.dump(self._fusedOpLens, f)
+        #with open('flushOpLens.pkl', 'wb') as f:
+        #    pickle.dump(self._fusedOpLens, f)
+ 
         # Destroy all libraries. Note that we should do this
         # from the lastly added one to the first one
         for context in reversed(self._context_list):
@@ -1193,12 +1237,13 @@ def build_fused_op(self,ops):
         fusion_checker.register_constraint(AllValidOps())
         fusion_checker.register_constraint(IdenticalLaunchShapes())
         fusion_checker.register_constraint(IdenticalProjection())
+        fusion_checker.register_constraint(ValidProducerConsumer())
         can_fuse,fusable_sets, partitions = fusion_checker.can_fuse()
-
+        #print("flag 1")
         #short circuit         
-        if not can_fuse:
-            drint("CANNOT FUSE!")
-            return False, partitions
+        #if not can_fuse:
+        ##    drint("CANNOT FUSE!")
+        #    return False, partitions
 
         super_strats = []
         super_fspaces = []
@@ -1219,7 +1264,8 @@ def build_fused_op(self,ops):
             super_fspaces.append(super_fspace)
             super_keystores.append(super_keystore)
             super_strategies.append(Strategy(partitions[start]._launch_shape, super_strat, super_fspace, super_keystore))
-        drint("lens", len(super_strats), len(super_fspaces), len(super_strategies), len(super_keystore))
+        #print("created supers")
+        #drint("lens", len(super_strats), len(super_fspaces), len(super_strategies), len(super_keystore))
         """
         super_strat = {}
         super_fspace = {}
@@ -1229,15 +1275,17 @@ def build_fused_op(self,ops):
         """
         #super_strategy = Strategy(partitions[0]._launch_shape, super_strat, super_fspace)
         #hacky way to get numpy context and designated fused task id
-        fused_id = self._contexts["legate.numpy"].fused_id
-        numpy_context = self._contexts["legate.numpy"]
+        fused_id = self._contexts["cunumeric"].fused_id
+        numpy_context = self._contexts["cunumeric"]
         numpy_runtime = numpy_context._library.runtime
         z=0
+        #print("fsets", fusable_sets)
         new_op_list = []
         for i,fusable_set in enumerate(fusable_sets):
             start, end = fusable_set
             op_subset = ops[start:end]
             #if nothing to fuse, just use the original op
+            #self._fusedOpLens.append(len(op_subset))
             if end-start==1:
                 normal_op = ops[start]
                 normal_op.strategy =  super_strategies[i]
@@ -1263,27 +1311,31 @@ def build_fused_op(self,ops):
                         fused_task.add_input(input)   
                     for output,part in zip(op._outputs, op._output_parts):
                         fused_task.add_output(output)   
-                        if key_part==None:
-                            key_part = partitions[z].get_partition(part)
+                        #if key_part==None:
+                        #    key_part = partitions[z].get_partition(part)
                           
-                    self.propogateFuture(fused_task)
+                    #self.propogateFuture(fused_task)
                     for future in op._futures:
                         fused_task.add_future(future)
                     z+=1
                 new_op_list.append(fused_task)
+        #print("flag 2")
+        strats=[]
         for i,fused_task in enumerate(new_op_list):
             must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [fused_task])
             partitioner = Partitioner(self, [fused_task], must_be_single=must_be_single)
             strategy = partitioner.partition_stores()
             #fused_task.strategy = super_strategies[i]
             fused_task.strategy = strategy
-
-        return new_op_list, True       
+            strats.append(strategy)
+            #print("star",strategy)
+        #print("flag 3")
+        return new_op_list, strats       
 
     def _launch_outstanding(self, force_eval=True):        
         if len(self._outstanding_ops):
             #print("launching outstanding", ops)
-            self._opLens.append(len(self._outstanding_ops))
+            #self._opLens.append(len(self._outstanding_ops))
             ops = self._outstanding_ops
             self._outstanding_ops = []
             self._schedule(ops, force_eval)
@@ -1318,6 +1370,7 @@ def _schedule(self, ops, force_eval=False):
         #     strategy = partitioner.partition_stores()
         if len(ops)>=2 and (not force_eval):
             fused_task_list,strats = self.build_fused_op(ops)
+            #print("flist", fused_task_list)
             if fused_task_list:
                 frint("created fused list", [op._task_id for op in fused_task_list])
                 drint("start clearing pipe")
@@ -1347,6 +1400,7 @@ def _schedule(self, ops, force_eval=False):
         # if we already checked the ops for fusability,
         # then the ops' buffers have already been partitioned
         else:
+            #print("normal execution", ids)
             if not strats: #ops were not check for fusability, so partition them
                 for op in ops:
                     must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op])
@@ -1374,7 +1428,7 @@ def submit(self, op):
         else:
             self._outstanding_ops.append(op)
             if len(self._outstanding_ops) >= self._window_size:
-                self._opLens.append(len(self._outstanding_ops))
+                #self._opLens.append(len(self._outstanding_ops))
                 ops = self._outstanding_ops
                 self._outstanding_ops = []
                 self._schedule(ops)
@@ -1397,7 +1451,7 @@ def _scheduleNew(self, ops):
     def flush_scheduling_window(self):
         if len(self._outstanding_ops) == 0:
             return
-        self._opLens.append(len(self._outstanding_ops))
+        #self._opLens.append(len(self._outstanding_ops))
         ops = self._outstanding_ops
         self._outstanding_ops = []
         self._schedule(ops)
diff --git a/legate/core/solver.py b/legate/core/solver.py
index 77628df1a..fb6ba0ed6 100644
--- a/legate/core/solver.py
+++ b/legate/core/solver.py
@@ -269,6 +269,10 @@ def partition_stores(self):
 
         def cost(unknown):
             store = unknown._store
+            return (
+                -store.comm_volume(),
+                not store.has_key_partition(all_restrictions[unknown]),
+            )
             return (
                 store.comm_volume(),
                 store._key_partition is None,

From e2afa73edf31198846f2664c3086d56507309c33 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@g0001.stanford.edu>
Date: Mon, 15 Nov 2021 15:05:56 -0800
Subject: [PATCH 28/44] constant optimization

---
 legate/core/constraints.py      |  2 +-
 legate/core/launcher.py         |  2 ++
 legate/core/runtime.py          | 61 ++++++++++++---------------------
 legate/core/solver.py           | 10 +++---
 legate/core/store.py            |  1 -
 src/core/data/store.cc          |  4 +--
 src/core/mapping/core_mapper.cc |  2 +-
 7 files changed, 33 insertions(+), 49 deletions(-)

diff --git a/legate/core/constraints.py b/legate/core/constraints.py
index 0609b8980..26db3e8cc 100644
--- a/legate/core/constraints.py
+++ b/legate/core/constraints.py
@@ -76,7 +76,7 @@ def __repr__(self):
         disj = "D" if self._disjoint else "A"
         comp = "C" if self._complete else "I"
         #return f"X{self._id}({disj},{comp})"
-        return f"X{self._id}({disj},{comp})@{self._op.get_name()}"
+        return f"X{self._id}({disj},{comp})@{self._op_name}"
 
     def __hash__(self):
         #return hash(self._id)
diff --git a/legate/core/launcher.py b/legate/core/launcher.py
index 78924acfe..c97ff6967 100644
--- a/legate/core/launcher.py
+++ b/legate/core/launcher.py
@@ -263,6 +263,8 @@ def __init__(self, region, permission, proj, tag, flags):
         self.region = region
         self.permission = permission
         self.proj = proj
+        #print(proj.__dict__)
+        #print(region)
         self.tag = tag
         self.flags = flags
 
diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index d3c68b31e..90d4d1af3 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -807,9 +807,10 @@ def __init__(self):
         #these ops are almost always fusable
         self.validIDs.add(2) #Binary op
         self.validIDs.add(18) #Unary op
+        #self.validIDs.add(5) #Convert op
         self.validIDs.add(9) #Fill op
         self.validIDs.add(20) #Where op
-        #self.validIDs.add(14) #read op
+        self.validIDs.add(14) #read op
         #self.validIDs.add(5) #convert op
 
         # the following are conditionally fusable
@@ -934,13 +935,6 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
         launch_shapes = []
         for i in range(len(ops)):
             launch_shapes.append(strategies[i]._launch_shape)
-        #print(launch_shapes)
-        """
-        first_shape = launch_shapes[0]
-        for launch_shape in launch_shapes:
-            if launch_shape!=first_shape:
-                return True, [(0,1),(1,len(ops))]
-        """
         intervals =[]
         i=1
         start=0
@@ -948,6 +942,11 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
         while i<end:
             leftNone = launch_shapes[i] is None and (launch_shapes[i-1] is not None)
             rightNone = launch_shapes[i-1] is None and (launch_shapes[i] is not None)
+            #leftRead = leftNone and int(ops[i]._task_id)==14
+            #rightRead = rightNone and int(ops[i-1]._task_id)==14
+            #if leftRead or rightRead:
+            #    i=i+1
+            #elif leftNone or rightNone or launch_shapes[i]!=launch_shapes[i-1]:
             if leftNone or rightNone or launch_shapes[i]!=launch_shapes[i-1]:
                 intervals.append((start, i))
                 start=i
@@ -956,6 +955,7 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
                 i+=1
         if start<end:
             intervals.append((start, end))
+        #print("intervals", intervals, launch_shapes)
         return True, intervals
 
 
@@ -976,7 +976,6 @@ def getRoot(store):
         
         while i<end:
             op = ops[i] 
-
             #check consumers view of root array
             #is the same as the producers view
             isSame = True
@@ -984,6 +983,8 @@ def getRoot(store):
                 inputRoot = getRoot(input) 
                 if inputRoot in childMap:
                     isSame = isSame and childMap[inputRoot] == input
+                    #if input!= childMap[inputRoot]:
+                    #    print(input, childMap[inputRoot])
 
             if not isSame:                     
                 intervals.append((start, i))
@@ -992,7 +993,6 @@ def getRoot(store):
                 childMap = {}
             else:
                 i=i+1 
-
             #register producers view of buffer
             for output in op._outputs:
                 outputRoot = getRoot(output)
@@ -1001,6 +1001,7 @@ def getRoot(store):
             #end while
         if start<end:
            intervals.append((start,end))
+        #print("v", intervals)
         return True, intervals
  
 
@@ -1048,7 +1049,7 @@ def __init__(self, core_library):
         # to be dispatched. This list allows cross library introspection for
         # Legate operations.
         self._outstanding_ops = []
-        self._window_size=10
+        self._window_size=1
         self._fusion_threshold =2
         self._opLens = []
         self._fusedOpLens = []
@@ -1205,7 +1206,6 @@ def serialize_multiop_metadata(self, numpy_context, ops):
                 offsets.append(-(r+1)) 
  
             op_ids.append(numpy_context.get_task_id(op._task_id._value_))
-
             offset_start+=(len(op._inputs)+len(op._outputs))
             input_start+=len(op._inputs)
             output_start+=len(op._outputs)
@@ -1235,20 +1235,19 @@ def build_fused_op(self,ops):
         fusion_checker = FusionChecker(ops, self._contexts, self)
         fusion_checker.register_constraint(NumpyContextExists())
         fusion_checker.register_constraint(AllValidOps())
-        fusion_checker.register_constraint(IdenticalLaunchShapes())
+        #fusion_checker.register_constraint(IdenticalLaunchShapes())
         fusion_checker.register_constraint(IdenticalProjection())
         fusion_checker.register_constraint(ValidProducerConsumer())
         can_fuse,fusable_sets, partitions = fusion_checker.can_fuse()
-        #print("flag 1")
         #short circuit         
         #if not can_fuse:
         ##    drint("CANNOT FUSE!")
         #    return False, partitions
-
+        
         super_strats = []
         super_fspaces = []
         super_strategies = []
-        super_keystores = []
+        super_keystores = [] 
         for fusable_set in fusable_sets:   
             #create super strategy for this fusable set
             super_strat = {}
@@ -1264,8 +1263,8 @@ def build_fused_op(self,ops):
             super_fspaces.append(super_fspace)
             super_keystores.append(super_keystore)
             super_strategies.append(Strategy(partitions[start]._launch_shape, super_strat, super_fspace, super_keystore))
-        #print("created supers")
         #drint("lens", len(super_strats), len(super_fspaces), len(super_strategies), len(super_keystore))
+       
         """
         super_strat = {}
         super_fspace = {}
@@ -1279,7 +1278,6 @@ def build_fused_op(self,ops):
         numpy_context = self._contexts["cunumeric"]
         numpy_runtime = numpy_context._library.runtime
         z=0
-        #print("fsets", fusable_sets)
         new_op_list = []
         for i,fusable_set in enumerate(fusable_sets):
             start, end = fusable_set
@@ -1288,7 +1286,7 @@ def build_fused_op(self,ops):
             #self._fusedOpLens.append(len(op_subset))
             if end-start==1:
                 normal_op = ops[start]
-                normal_op.strategy =  super_strategies[i]
+                normal_op.strategy =  partitions[z]._strategy#uper_strategies[i]
                 new_op_list.append(normal_op)
             elif end-start > 1:
                 #initialize fused task
@@ -1305,21 +1303,18 @@ def build_fused_op(self,ops):
                         fused_task.add_scalar_arg(scalar[0], ty.int32)
                     for reduction in op._reductions:
                         fused_task.add_reduction(reduction)
-
+                    isScalarConversion = len(op._outputs) ==1 and len(op._inputs)==1
+                    if int(op._task_id)==14 and isScalarConversion: #for handling scalars
+                        op._outputs[0]._storage = op._inputs[0]._storage    
                     for input in op._inputs:
-                        #if input._storage is None:
                         fused_task.add_input(input)   
                     for output,part in zip(op._outputs, op._output_parts):
                         fused_task.add_output(output)   
-                        #if key_part==None:
-                        #    key_part = partitions[z].get_partition(part)
-                          
                     #self.propogateFuture(fused_task)
                     for future in op._futures:
                         fused_task.add_future(future)
                     z+=1
                 new_op_list.append(fused_task)
-        #print("flag 2")
         strats=[]
         for i,fused_task in enumerate(new_op_list):
             must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [fused_task])
@@ -1328,8 +1323,7 @@ def build_fused_op(self,ops):
             #fused_task.strategy = super_strategies[i]
             fused_task.strategy = strategy
             strats.append(strategy)
-            #print("star",strategy)
-        #print("flag 3")
+            #strats.append( super_strategies[i])
         return new_op_list, strats       
 
     def _launch_outstanding(self, force_eval=True):        
@@ -1339,12 +1333,7 @@ def _launch_outstanding(self, force_eval=True):
             ops = self._outstanding_ops
             self._outstanding_ops = []
             self._schedule(ops, force_eval)
- 
-    def _launch_one(self):
-        if len(self._outstanding_ops):
-            op = self._outstanding_ops[0]
-            self._outstanding_ops = self._outstanding_ops[1:]
-            self._schedule([op], force_eval=True)
+
 
     def propogateFuture(self,op):
         return 
@@ -1354,8 +1343,6 @@ def propogateFuture(self,op):
                 while start._storage is None and start._parent:
                     start=start._parent
                 input._storage = start._storage
-                
-     
    
     def _schedule(self, ops, force_eval=False):
         ids = [op._task_id for op in ops]
@@ -1364,10 +1351,6 @@ def _schedule(self, ops, force_eval=False):
         #if partially or fully fusable, 
         #schedule the new set of tasks
         strats = False
-        #for op in ops:
-        #     must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op])
-        #     partitioner = Partitioner(self, [op], must_be_single=must_be_single)
-        #     strategy = partitioner.partition_stores()
         if len(ops)>=2 and (not force_eval):
             fused_task_list,strats = self.build_fused_op(ops)
             #print("flist", fused_task_list)
diff --git a/legate/core/solver.py b/legate/core/solver.py
index fb6ba0ed6..a600dd1dc 100644
--- a/legate/core/solver.py
+++ b/legate/core/solver.py
@@ -273,11 +273,11 @@ def cost(unknown):
                 -store.comm_volume(),
                 not store.has_key_partition(all_restrictions[unknown]),
             )
-            return (
-                store.comm_volume(),
-                store._key_partition is None,
-                not store.has_key_partition(all_restrictions[unknown]),
-            )
+            #return (
+            #    store.comm_volume(),
+            #    store._key_partition is None,
+            #    not store.has_key_partition(all_restrictions[unknown]),
+            #)
 
         unknowns = sorted(unknowns, key=cost)
 
diff --git a/legate/core/store.py b/legate/core/store.py
index ce097db41..600aae9c5 100644
--- a/legate/core/store.py
+++ b/legate/core/store.py
@@ -474,7 +474,6 @@ def storage(self):
         # it has been initialized correctly.
         self._runtime._launch_outstanding(False)
         if self._storage is None:
-            #print("store none, launching", [op._task_id for op in self._runtime._outstanding_ops])
             """
             if self._kind ==Future:
                 print("future")
diff --git a/src/core/data/store.cc b/src/core/data/store.cc
index f114ed4de..fd40a4dd9 100644
--- a/src/core/data/store.cc
+++ b/src/core/data/store.cc
@@ -127,8 +127,8 @@ Domain FutureWrapper::domain() const { return domain_; }
 ReturnValue FutureWrapper::pack() const
 {
   if (nullptr == rawptr_) {
-    fprintf(stderr, "Found an uninitialized Legate store\n");
-    assert(false);
+    //fprintf(stderr, "Found an uninitialized Legate store\n");
+    //assert(false);
   }
   return ReturnValue(rawptr_, field_size_);
 }
diff --git a/src/core/mapping/core_mapper.cc b/src/core/mapping/core_mapper.cc
index 6d8d57ad7..4138d670a 100644
--- a/src/core/mapping/core_mapper.cc
+++ b/src/core/mapping/core_mapper.cc
@@ -202,7 +202,7 @@ Mapper::MapperSyncModel CoreMapper::get_mapper_sync_model(void) const
 
 void CoreMapper::select_task_options(const MapperContext ctx, const Task& task, TaskOptions& output)
 {
-  std::cout<<"task_id "<<task.task_id<<std::endl;
+  //std::cout<<"task_id "<<task.task_id<<std::endl;
   //assert(context.valid_task_id(task.task_id));
   if (task.tag == LEGATE_CPU_VARIANT) {
     assert(!local_cpus.empty());

From 80aa90fd129aeec09496fd95dc2e2ccc8ee526a3 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@g0001.stanford.edu>
Date: Sun, 21 Nov 2021 18:27:30 -0800
Subject: [PATCH 29/44] better constant opt

---
 legate/core/runtime.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index 90d4d1af3..109fb4266 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -807,7 +807,7 @@ def __init__(self):
         #these ops are almost always fusable
         self.validIDs.add(2) #Binary op
         self.validIDs.add(18) #Unary op
-        #self.validIDs.add(5) #Convert op
+        self.validIDs.add(5) #Convert op
         self.validIDs.add(9) #Fill op
         self.validIDs.add(20) #Where op
         self.validIDs.add(14) #read op
@@ -1049,7 +1049,7 @@ def __init__(self, core_library):
         # to be dispatched. This list allows cross library introspection for
         # Legate operations.
         self._outstanding_ops = []
-        self._window_size=1
+        self._window_size=50
         self._fusion_threshold =2
         self._opLens = []
         self._fusedOpLens = []
@@ -1248,6 +1248,7 @@ def build_fused_op(self,ops):
         super_fspaces = []
         super_strategies = []
         super_keystores = [] 
+        z=0
         for fusable_set in fusable_sets:   
             #create super strategy for this fusable set
             super_strat = {}
@@ -1299,6 +1300,12 @@ def build_fused_op(self,ops):
                 #add typical inputs and outputs of all subtasks to fused task
                 key_part = None
                 for j,op in enumerate(op_subset):
+                    #if int(op._task_id) == 5:
+                        #fused_task.add_output(op._outputs[0])
+                        #fused_task.add_input(op._inputs[0])
+                        #fused_task.add_broadcast(op._inputs[0])
+                        #fused_task.add_broadcast(op._outputs[0])
+                        #continue
                     for scalar in op._scalar_args:
                         fused_task.add_scalar_arg(scalar[0], ty.int32)
                     for reduction in op._reductions:
@@ -1322,6 +1329,7 @@ def build_fused_op(self,ops):
             strategy = partitioner.partition_stores()
             #fused_task.strategy = super_strategies[i]
             fused_task.strategy = strategy
+            #print("\t q ",i, strategy)
             strats.append(strategy)
             #strats.append( super_strategies[i])
         return new_op_list, strats       
@@ -1371,7 +1379,7 @@ def _schedule(self, ops, force_eval=False):
             strategy = ops[0].strategy
             for input, part in zip(ops[0]._inputs, ops[0]._input_parts):
                 frint("launch fused input", ops[0]._task_id, input)
-                proj = ops[0].strategy.get_projection(part)
+                #proj = ops[0].strategy.get_projection(part)
             self.propogateFuture(ops[0])
 
             for output in ops[0]._outputs:

From f9eb11963e3281e17b6b3624504b20e30abae80e Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@g0001.stanford.edu>
Date: Mon, 22 Nov 2021 13:57:58 -0800
Subject: [PATCH 30/44] terminal dots

---
 legate/core/runtime.py | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index 109fb4266..75be54545 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -803,16 +803,19 @@ class AllValidOps(FusionConstraint):
     """
     def __init__(self):
         self.validIDs = set()
-
+        self.terminals = set()
         #these ops are almost always fusable
         self.validIDs.add(2) #Binary op
         self.validIDs.add(18) #Unary op
         self.validIDs.add(5) #Convert op
         self.validIDs.add(9) #Fill op
         self.validIDs.add(20) #Where op
-        self.validIDs.add(14) #read op
+        self.validIDs.add(7)
+        #self.validIDs.add(14) #read op
         #self.validIDs.add(5) #convert op
 
+        self.terminals.add(7)
+
         # the following are conditionally fusable
         # they will be processed in the a subsequent level of filtering
  
@@ -843,18 +846,24 @@ def __init__(self):
 
     def apply(self, contexts, runtime, ops, partitioners, strategies):
         results = [int(op._task_id) in self.validIDs for op in ops]
+
         drint("valids", results)
         fusable_intervals = []
         start, end =0,0
         while end<len(results):
             result = results[end]
-            if result:
+            if result and (ops[end]._task_id not in self.terminals):
                 end=end+1
             else:
                 if start<end:
-                    fusable_intervals.append((start,end))
-                    start=end 
-                    end=start
+                    if ops[end]._task_id in self.terminals:
+                        fusable_intervals.append((start,end+1))
+                        start=end+1
+                        end=start
+                    else:
+                        fusable_intervals.append((start,end))
+                        start=end 
+                        end=start
                 else:
                     fusable_intervals.append((start, start+1))
                     start=start+1
@@ -1235,7 +1244,7 @@ def build_fused_op(self,ops):
         fusion_checker = FusionChecker(ops, self._contexts, self)
         fusion_checker.register_constraint(NumpyContextExists())
         fusion_checker.register_constraint(AllValidOps())
-        #fusion_checker.register_constraint(IdenticalLaunchShapes())
+        fusion_checker.register_constraint(IdenticalLaunchShapes())
         fusion_checker.register_constraint(IdenticalProjection())
         fusion_checker.register_constraint(ValidProducerConsumer())
         can_fuse,fusable_sets, partitions = fusion_checker.can_fuse()
@@ -1308,11 +1317,11 @@ def build_fused_op(self,ops):
                         #continue
                     for scalar in op._scalar_args:
                         fused_task.add_scalar_arg(scalar[0], ty.int32)
-                    for reduction in op._reductions:
-                        fused_task.add_reduction(reduction)
+                    for (reduction, redop) in op._reductions:
+                        fused_task.add_reduction(reduction, redop)
                     isScalarConversion = len(op._outputs) ==1 and len(op._inputs)==1
-                    if int(op._task_id)==14 and isScalarConversion: #for handling scalars
-                        op._outputs[0]._storage = op._inputs[0]._storage    
+                    #if int(op._task_id)==14 and isScalarConversion: #for handling scalars
+                    #    op._outputs[0]._storage = op._inputs[0]._storage    
                     for input in op._inputs:
                         fused_task.add_input(input)   
                     for output,part in zip(op._outputs, op._output_parts):

From f07381ea1e44bde3471db0e62680ce6194f3e925 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@g0002.stanford.edu>
Date: Wed, 1 Dec 2021 09:39:54 -0800
Subject: [PATCH 31/44] reuse partitions

---
 legate/core/operation.py |  35 +++----
 legate/core/runtime.py   | 198 ++++++++++++++++++---------------------
 2 files changed, 105 insertions(+), 128 deletions(-)

diff --git a/legate/core/operation.py b/legate/core/operation.py
index 1dba97e90..8b13c92e8 100644
--- a/legate/core/operation.py
+++ b/legate/core/operation.py
@@ -161,6 +161,7 @@ def execute(self):
 
     def get_tag(self, strategy, part):
         if strategy.is_key_part(part):
+            return 0
             return 1  # LEGATE_CORE_KEY_STORE_TAG
         else:
             return 0
@@ -217,40 +218,30 @@ def launch(self, strategy):
 
         if self._is_fused:
             launcher.add_fusion_metadata(self._is_fused, self._fusion_metadata)
-        """
-        for input in self._inputs:
-            proj = strategy.get_projection(input)
-            tag = self.get_tag(strategy, input)
-            launcher.add_input(input, proj, tag=tag)
-        for temp in self._temps:
-            proj = strategy.get_projection(temp)
-            launcher.add_temp(temp, proj)
-            partition = strategy.get_partition(temp)
-            # We update the key partition of a store only when it gets updated
-            temp.set_key_partition(partition)
-        """
-        #print("inputs")
-        for input, input_part in zip(self._inputs, self._input_parts):
+        if  self._is_fused: #fused ops re-use encapsulated unfused partitions
+            input_parts = self._unfused_input_parts
+            output_parts = self._unfused_output_parts
+            reduction_parts = self._unfused_reduction_parts
+        else:
+            input_parts = self._input_parts
+            output_parts = self._output_parts
+            reduction_parts = self._reduction_parts
+
+        for input, input_part in zip(self._inputs, input_parts):
             proj = strategy.get_projection(input_part)
-            #if (input._kind==Future):
-            #    print(input, proj)
             tag = self.get_tag(strategy, input_part)
             launcher.add_input(input, proj, tag=tag)
-        #print("outputs", len(self._outputs))
-        for output, output_part in zip(self._outputs, self._output_parts):
+        for output, output_part in zip(self._outputs, output_parts):
             if output.unbound:
                 continue
             proj = strategy.get_projection(output_part)
-            #if (output._kind==Future):
-            #    print(output, proj)
             tag = self.get_tag(strategy, output_part)
             launcher.add_output(output, proj, tag=tag)
             partition = strategy.get_partition(output_part)
             # We update the key partition of a store only when it gets updated
             output.set_key_partition(partition)
-        #print()
         for ((reduction, redop), reduction_part) in zip(
-            self._reductions, self._reduction_parts
+            self._reductions, reduction_parts
         ):
             partition = strategy.get_partition(reduction_part)
             can_read_write = partition.is_disjoint_for(strategy, reduction)
diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index 8f6e3ea16..6f3029ab5 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -25,6 +25,10 @@
 
 from legate.core import types as ty
 
+import datetime
+import cProfile
+pr = cProfile.Profile()
+
 from .context import Context
 from .corelib import CoreLib
 from .launcher import TaskLauncher
@@ -756,6 +760,7 @@ def supress_small_fusions(self, intervals, threshold):
         return fusable, final_set
 
     def can_fuse(self):
+        #starttime = datetime.datetime.now()
         for op in reversed(self.ops):
             must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op])
             partitioner = Partitioner(self.runtime, [op], must_be_single=must_be_single)
@@ -768,20 +773,32 @@ def can_fuse(self):
                 for input in op._inputs:
                     if input.shape==output.shape:
                         input.set_key_partition(key_part)
-            if len(op.inputs)>1:
-                proj = strategy.get_projection(op._input_parts[1])
+            #if len(op.inputs)>1:
+            #    proj = strategy.get_projection(op._input_parts[1])
             self.strategies.append(strategy)
         self.strategies.reverse()
-
+        """
+        stoptime = datetime.datetime.now()
+        delta=stoptime-starttime
+        total = delta.total_seconds() * 1000.0
+        print("partime", total, len(self.ops))
+        """     
+        #starttime = datetime.datetime.now()
         results = [constraint.apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) for constraint in self.constraints]
+        """
+        stoptime = datetime.datetime.now()
+        delta=stoptime-starttime
+        total = delta.total_seconds() * 1000.0
+        print("applytime", total)
+        """
+        #starttime = datetime.datetime.now()
         drint("fuse results", results)
         all_fusable = [result[0] for result in results]
         interval_sets = [result[1] for result in results]
   
         #intersect intervals
-        #this is a very, very bad way of doing this,
-        # in the future I'll just "intersect" in place
-        # as we apply constraints
+        #this is an inefficient way of doing this,
+        #but it takes little time in practice
         curr_set = interval_sets[0]
         for interval_set in interval_sets[1:]:
             newset = []
@@ -793,6 +810,14 @@ def can_fuse(self):
                         newset.append((news, newe))
             curr_set=newset
         fusable,final_set = self.supress_small_fusions(curr_set, self.runtime._fusion_threshold)
+         
+        """
+        stoptime = datetime.datetime.now()
+        delta=stoptime-starttime
+        total = delta.total_seconds() * 1000.0
+        print("filtertime", total)
+        """
+
         drint("curset", curr_set)
 
         drint("final_set", final_set)
@@ -816,34 +841,8 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
             return True, [(0, len(ops))]
         else:
            return False, [(0,0)]
-"""
-enum NumPyOpCode {
-  NUMPY_ARANGE           = 1,
-  NUMPY_BINARY_OP        = 2,
-  NUMPY_BINARY_RED       = 3,
-  NUMPY_BINCOUNT         = 4,
-  NUMPY_CONVERT          = 5,
-  NUMPY_DIAG             = 6,
-  NUMPY_DOT              = 7,
-  NUMPY_EYE              = 8,
-  NUMPY_FILL             = 9,
-  NUMPY_MATMUL           = 10,
-  NUMPY_MATVECMUL        = 11,
-  NUMPY_NONZERO          = 12,
-  NUMPY_RAND             = 13,
-  NUMPY_READ             = 14,
-  NUMPY_SCALAR_UNARY_RED = 15,
-  NUMPY_TILE             = 16,
-  NUMPY_TRANSPOSE        = 17,
-  NUMPY_UNARY_OP         = 18,
-  NUMPY_UNARY_RED        = 19,
-  NUMPY_WHERE            = 20,
-  NUMPY_WRITE            = 21,
-  NUMPY_DOUBLE_BINARY_OP  = 23,
-  NUMPY_FUSED_OP            = 24,
-}
-
-"""
+
+
 class AllValidOps(FusionConstraint):
     """
     Class for only fusing only potentially fusable ops.
@@ -854,43 +853,12 @@ def __init__(self):
         self.terminals = set()
         #these ops are almost always fusable
         self.validIDs.add(2) #Binary op
-        self.validIDs.add(18) #Unary op
-        self.validIDs.add(5) #Convert op
-        self.validIDs.add(9) #Fill op
+        self.validIDs.add(10) #Fill op
         self.validIDs.add(20) #Where op
-        self.validIDs.add(7)
-        #self.validIDs.add(14) #read op
-        #self.validIDs.add(5) #convert op
-
-        self.terminals.add(7)
-
-        # the following are conditionally fusable
-        # they will be processed in the a subsequent level of filtering
- 
-        # scalar producing ops are valid if the scalars they produce
-        # are NOT consumed by a subsequent op in the window
-        # however they can be printed, which we cannot detect in the runtime
-        # without static analysis, so consider these terminal fusable
-        #self.validIDs.add(400004) #Scalar unary red      
-        #self.validIDs.add(400005) #Unary red      
-
-        # as all scalars are futures,
-        # so we can just check if both Futures are "ready"
-        # more powerfully, we can also create a dependency tree
-        # of ops, and assuming they're all scalar ops, 
-        # and the "roots" are ready, we can fuse
-        #self.validIDs.add(400002) #Scalar Binary op
-        #self.validIDs.add(400007) #Scalar Unary op
-        #self.validIDs.add(400008) #Scalar binary red     
-
-        #a matmul is valid if it is the last op in the sequence
-        #unless if it followed by a matmul of the exact same size 
-        #so it is terminal fusable
-        #self.validIDs.add(400017) #Matmul
-
-        #vector dot is binary op + scalar producing reduction
-        #it is thus terminal fusable
-        #self.validIDs.add(400019) #dot
+        #self.validIDs.add(6) #Where op
+        #self.validIDs.add(5) #Convert op
+        self.validIDs.add(7) #dot op
+        self.terminals.add(7) #dot op is only fusable as a terminal op in a window
 
     def apply(self, contexts, runtime, ops, partitioners, strategies):
         results = [int(op._task_id) in self.validIDs for op in ops]
@@ -948,7 +916,6 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
                 if input not in bufferSet:
                     proj = strategies[i].get_projection(part)
                     if hasattr(proj, 'part'):
-                        #bufferSet[input]=proj
                         bufferSet[input]=proj
                         if input not in linkset:
                             linkset[input] = [i]
@@ -982,6 +949,7 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
             i+=1
         if start<end:
             intervals.append((start,end))
+        #print("proj", intervals)
         return True, intervals
          
 
@@ -992,6 +960,7 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
         launch_shapes = []
         for i in range(len(ops)):
             launch_shapes.append(strategies[i]._launch_shape)
+        #print("ls", launch_shapes)
         intervals =[]
         i=1
         start=0
@@ -1012,7 +981,7 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
                 i+=1
         if start<end:
             intervals.append((start, end))
-        #print("intervals", intervals, launch_shapes)
+        #print(intervals)
         return True, intervals
 
 
@@ -1040,8 +1009,6 @@ def getRoot(store):
                 inputRoot = getRoot(input) 
                 if inputRoot in childMap:
                     isSame = isSame and childMap[inputRoot] == input
-                    #if input!= childMap[inputRoot]:
-                    #    print(input, childMap[inputRoot])
 
             if not isSame:                     
                 intervals.append((start, i))
@@ -1058,7 +1025,6 @@ def getRoot(store):
             #end while
         if start<end:
            intervals.append((start,end))
-        #print("v", intervals)
         return True, intervals
  
 
@@ -1106,7 +1072,7 @@ def __init__(self, core_library):
         # to be dispatched. This list allows cross library introspection for
         # Legate operations.
         self._outstanding_ops = []
-        self._window_size=50
+        self._window_size=4
         self._fusion_threshold =2
         self._opLens = []
         self._fusedOpLens = []
@@ -1216,7 +1182,7 @@ def destroy(self):
         #    pickle.dump(self._fusedOpLens, f)
         #with open('flushOpLens.pkl', 'wb') as f:
         #    pickle.dump(self._fusedOpLens, f)
- 
+        #pr.print_stats(sort='time')
         # Destroy all libraries. Note that we should do this
         # from the lastly added one to the first one
         for context in reversed(self._context_list):
@@ -1289,8 +1255,6 @@ def serialize_multiop_metadata(self, numpy_context, ops):
         scalar_starts.append(scalar_start)
         future_starts.append(future_start)
 
-        #turn metadata maps into deferred arrays
-        #then load them into the task as the initial inputs
         meta_arrs =  (input_starts, output_starts, offset_starts, offsets, reduction_starts,  scalar_starts, 
                       future_starts, op_ids)
         fusion_metadata = FusionMetadata(*meta_arrs)
@@ -1299,6 +1263,7 @@ def serialize_multiop_metadata(self, numpy_context, ops):
    
 
     def build_fused_op(self,ops):
+
         for i in range(len(ops)):
             self.propogateFuture(ops[i])
         fusion_checker = FusionChecker(ops, self._contexts, self)
@@ -1308,11 +1273,12 @@ def build_fused_op(self,ops):
         fusion_checker.register_constraint(IdenticalProjection())
         fusion_checker.register_constraint(ValidProducerConsumer())
         can_fuse,fusable_sets, partitions = fusion_checker.can_fuse()
+
         #short circuit         
         #if not can_fuse:
         ##    drint("CANNOT FUSE!")
         #    return False, partitions
-        
+                
         super_strats = []
         super_fspaces = []
         super_strategies = []
@@ -1335,13 +1301,14 @@ def build_fused_op(self,ops):
             super_strategies.append(Strategy(partitions[start]._launch_shape, super_strat, super_fspace, super_keystore))
         #drint("lens", len(super_strats), len(super_fspaces), len(super_strategies), len(super_keystore))
        
-        """
         super_strat = {}
         super_fspace = {}
         for partition in partitions:
             super_strat = {**(super_strat.copy()), **partition._strategy}  
             super_fspace = {**(super_fspace.copy()), **partition._fspaces}
-        """
+        
+
+        #starttime = datetime.datetime.now()
         #super_strategy = Strategy(partitions[0]._launch_shape, super_strat, super_fspace)
         #hacky way to get numpy context and designated fused task id
         fused_id = self._contexts["cunumeric"].fused_id
@@ -1358,54 +1325,67 @@ def build_fused_op(self,ops):
                 normal_op = ops[start]
                 normal_op.strategy =  partitions[z]._strategy#uper_strategies[i]
                 new_op_list.append(normal_op)
+                z+=1
             elif end-start > 1:
                 #initialize fused task
                 fused_task = numpy_context.create_task(fused_id)
-
-               #serialize necessary metadata on all encapsulated ops 
+                #serialize necessary metadata on all encapsulated ops 
                 #this metadata will be fed into the fused op as inputs
                 meta_maps, fusion_metadata = self.serialize_multiop_metadata(numpy_context, op_subset)
                 fused_task.add_fusion_metadata(fusion_metadata) #sets fused_task._is_fused to true
                 #add typical inputs and outputs of all subtasks to fused task
                 key_part = None
+                fused_task._unfused_input_parts = []
+                fused_task._unfused_output_parts = []
+                fused_task._unfused_reduction_parts = []
+                #isScalarConversion = len(op._outputs) ==1 and len(op._inputs)==1
                 for j,op in enumerate(op_subset):
-                    #if int(op._task_id) == 5:
-                        #fused_task.add_output(op._outputs[0])
-                        #fused_task.add_input(op._inputs[0])
-                        #fused_task.add_broadcast(op._inputs[0])
-                        #fused_task.add_broadcast(op._outputs[0])
-                        #continue
                     for scalar in op._scalar_args:
                         fused_task.add_scalar_arg(scalar[0], ty.int32)
-                    for (reduction, redop) in op._reductions:
+                    for (reduction, redop), part in zip(op._reductions, op._reduction_parts):
                         fused_task.add_reduction(reduction, redop)
-                    isScalarConversion = len(op._outputs) ==1 and len(op._inputs)==1
-                    #if int(op._task_id)==14 and isScalarConversion: #for handling scalars
-                    #    op._outputs[0]._storage = op._inputs[0]._storage    
-                    for input in op._inputs:
+                        fused_task._unfused_reduction_parts.append(part)
+                    for input,part in zip(op._inputs, op._input_parts):
+                        #print(j, part)
                         fused_task.add_input(input)   
+                        fused_task._unfused_input_parts.append(part)
                     for output,part in zip(op._outputs, op._output_parts):
                         fused_task.add_output(output)   
-                    #self.propogateFuture(fused_task)
+                        fused_task._unfused_output_parts.append(part)
                     for future in op._futures:
                         fused_task.add_future(future)
                     z+=1
                 new_op_list.append(fused_task)
         strats=[]
+        """
+        stoptime = datetime.datetime.now()
+        delta=stoptime-starttime
+        total = delta.total_seconds() * 1000.0
+        print("buildtime", total)
+        """
+   
+        #pr.enable()
+        redoPar=False
+        #starttime = datetime.datetime.now()
         for i,fused_task in enumerate(new_op_list):
-            must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [fused_task])
-            partitioner = Partitioner(self, [fused_task], must_be_single=must_be_single)
-            strategy = partitioner.partition_stores()
-            #fused_task.strategy = super_strategies[i]
-            fused_task.strategy = strategy
-            #print("\t q ",i, strategy)
-            strats.append(strategy)
-            #strats.append( super_strategies[i])
+            if redoPar:
+                must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [fused_task])
+                partitioner = Partitioner(self, [fused_task], must_be_single=must_be_single)
+                strategy = partitioner.partition_stores()
+                fused_task.strategy = strategy
+                strats.append(strategy)
+            else:
+                fused_task.strategy = super_strategies[i]
+                strats.append( super_strategies[i])
+        #stoptime = datetime.datetime.now()
+        #delta=stoptime-starttime
+        #total = delta.total_seconds() * 1000.0
+        #print("repartime", total)
+        #pr.disable()
         return new_op_list, strats       
 
     def _launch_outstanding(self, force_eval=True):        
         if len(self._outstanding_ops):
-            #print("launching outstanding", ops)
             #self._opLens.append(len(self._outstanding_ops))
             ops = self._outstanding_ops
             self._outstanding_ops = []
@@ -1429,8 +1409,14 @@ def _schedule(self, ops, force_eval=False):
         #schedule the new set of tasks
         strats = False
         if len(ops)>=2 and (not force_eval):
+            #start = datetime.datetime.now()
+            #pr.enable()
             fused_task_list,strats = self.build_fused_op(ops)
-            #print("flist", fused_task_list)
+            #pr.disable()
+            #stop = datetime.datetime.now()
+            #delta=stop-start
+            #total = delta.total_seconds() * 1000.0
+            #print("time", total)
             if fused_task_list:
                 frint("created fused list", [op._task_id for op in fused_task_list])
                 drint("start clearing pipe")

From ea1237758b1e2b7888872d335ceaa5393dc3c6f7 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@sapling.stanford.edu>
Date: Thu, 2 Dec 2021 17:37:21 -0800
Subject: [PATCH 32/44] install.py

---
 install.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/install.py b/install.py
index 559ea7eaf..6a3313e62 100755
--- a/install.py
+++ b/install.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+#/home/shiv1/pypy/pypy3.8-v7.3.7-linux64/bin/python3.8
 
 # Copyright 2021 NVIDIA Corporation
 #
@@ -377,6 +378,7 @@ def build_legion(
 
         legion_python_dir = os.path.join(legion_src_dir, "bindings", "python")
         if clean_first:
+            print("cleaning!\n")
             verbose_check_call(
                 ["make"] + flags + ["clean"], cwd=legion_python_dir
             )
@@ -897,7 +899,6 @@ def driver():
         "--clean",
         dest="clean_first",
         action=BooleanFlag,
-        #default=False,
         default=False,
         help="Clean before build, and pull latest Legion.",
     )

From d7a8dabddd6f7817e417630da3a7f632dd50f94b Mon Sep 17 00:00:00 2001
From: Shiv Sundram <sundram1@lassen69.coral.llnl.gov>
Date: Sun, 5 Dec 2021 22:07:47 -0800
Subject: [PATCH 33/44] new way of applying constraints

---
 legate/core/runtime.py | 139 +++++++++++++++++++++++++++++++----------
 1 file changed, 106 insertions(+), 33 deletions(-)

diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index 6f3029ab5..862f9e263 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -783,33 +783,44 @@ def can_fuse(self):
         total = delta.total_seconds() * 1000.0
         print("partime", total, len(self.ops))
         """     
+        streamApply = False
         #starttime = datetime.datetime.now()
-        results = [constraint.apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) for constraint in self.constraints]
+        if not streamApply:
+            results = [constraint.apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) for constraint in self.constraints]
+            all_fusable = [result[0] for result in results]
+            interval_sets = [result[1] for result in results]
+        else:
+            alpha = self.constraints[0].apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies)
+            beta = self.constraints[1].apply2(self.contexts, self.runtime, self.ops, alpha[1],self.partitioners, self.strategies)
+            beta = self.constraints[2].apply2(self.contexts, self.runtime, self.ops, beta[1], self.partitioners, self.strategies)
+        #print("beta", beta)
         """
         stoptime = datetime.datetime.now()
         delta=stoptime-starttime
         total = delta.total_seconds() * 1000.0
         print("applytime", total)
         """
-        #starttime = datetime.datetime.now()
-        drint("fuse results", results)
-        all_fusable = [result[0] for result in results]
-        interval_sets = [result[1] for result in results]
+        starttime = datetime.datetime.now()
+        #drint("fuse results", results)
   
         #intersect intervals
         #this is an inefficent way of doing this,
-        #but it takes little time in practice
-        curr_set = interval_sets[0]
-        for interval_set in interval_sets[1:]:
-            newset = []
-            for aset in curr_set:
-                for bset in interval_set:
-                    if not (aset[0] > bset[1] or bset[0] > aset[1]): 
-                        news = max(aset[0], bset[0])
-                        newe = min(aset[1], bset[1])
-                        newset.append((news, newe))
-            curr_set=newset
-        fusable,final_set = self.supress_small_fusions(curr_set, self.runtime._fusion_threshold)
+        #"""
+        if not streamApply:
+            curr_set = interval_sets[0]
+            for interval_set in interval_sets[1:]:
+                newset = []
+                for aset in curr_set:
+                    for bset in interval_set:
+                        if not (aset[0] > bset[1] or bset[0] > aset[1]): 
+                            news = max(aset[0], bset[0])
+                            newe = min(aset[1], bset[1])
+                            newset.append((news, newe))
+                curr_set=newset
+        #"""
+            fusable,final_set = self.supress_small_fusions(curr_set, self.runtime._fusion_threshold)
+        else:
+            fusable,final_set = self.supress_small_fusions(beta[1], self.runtime._fusion_threshold)
          
         """
         stoptime = datetime.datetime.now()
@@ -818,11 +829,11 @@ def can_fuse(self):
         print("filtertime", total)
         """
 
-        drint("curset", curr_set)
+        #drint("curset", curr_set)
 
-        drint("final_set", final_set)
-        drint("all fusable", fusable)
-        drint("intervals", interval_sets)
+        #drint("final_set", final_set)
+        #drint("all fusable", fusable)
+        #drint("intervals", interval_sets)
         #return reduce(lambda x,y: x and y, all_fusable), final_set, self.strategies
         return fusable, final_set, self.strategies
 
@@ -857,8 +868,8 @@ def __init__(self):
         self.validIDs.add(20) #Where op
         #self.validIDs.add(6) #Where op
         #self.validIDs.add(5) #Convert op
-        self.validIDs.add(7) #dot op
-        self.terminals.add(7) #dot op is only fusable as a terminal op in a window
+        #self.validIDs.add(7) #dot op
+        #self.terminals.add(7) #dot op is only fusable as a terminal op in a window
 
     def apply(self, contexts, runtime, ops, partitioners, strategies):
         results = [int(op._task_id) in self.validIDs for op in ops]
@@ -917,19 +928,19 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
                     proj = strategies[i].get_projection(part)
                     if hasattr(proj, 'part'):
                         bufferSet[input]=proj
-                        if input not in linkset:
-                            linkset[input] = [i]
-                        else:
-                            linkset[input].append(i)
+                        #if input not in linkset:
+                        #    linkset[input] = [i]
+                        #else:
+                        #    linkset[input].append(i)
             for output, part in zip(op._outputs, op._output_parts):
                 if output not in bufferSet:
                     proj = strategies[i].get_projection(part)
                     if hasattr(proj, 'part'):
                         bufferSet[output]=proj
-                        if output not in linkset:
-                            linkset[output] = [i]
-                        else:
-                            linkset[output].append(i)
+                        #if output not in linkset:
+                        #    linkset[output] = [i]
+                        #else:
+                        #    linkset[output].append(i)
             if i==0: #we only iterate from i==1 onwards
                 i+=1
                 continue
@@ -984,6 +995,30 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
         #print(intervals)
         return True, intervals
 
+    def apply2(self, contexts, runtime, ops, baseIntervals, partitioners, strategies):
+        launch_shapes = []
+        for i in range(len(ops)):
+            launch_shapes.append(strategies[i]._launch_shape)
+        #print("ls", launch_shapes)
+        intervals =[]
+        for baseInterval in baseIntervals:
+            start=baseInterval[0]
+            i=start+1
+            end = baseInterval[1]
+            while i<end:
+                leftNone = launch_shapes[i] is None and (launch_shapes[i-1] is not None)
+                rightNone = launch_shapes[i-1] is None and (launch_shapes[i] is not None)
+                if leftNone or rightNone or launch_shapes[i]!=launch_shapes[i-1]:
+                    intervals.append((start, i))
+                    start=i
+                    i=start+1
+                else:
+                    i+=1
+            if start<end:
+                intervals.append((start, end))
+        return True, intervals
+
+
 
 
 class ValidProducerConsumer(FusionConstraint):
@@ -1028,6 +1063,45 @@ def getRoot(store):
         return True, intervals
  
 
+    def apply2(self, contexts, runtime, ops, baseIntervals,  partitioners, strategies):
+        childMap = {}
+        intervals = []
+        i, start=0, 0
+        end = len(ops)
+        def getRoot(store):
+            while store._parent:
+                store = store._parent
+            return store
+        for baseInterval in baseIntervals:
+            start, end = baseInterval 
+            while i<end:
+                op = ops[i] 
+                #check consumers view of root array
+                #is the same as the producers view
+                isSame = True
+                for input in op._inputs:
+                    inputRoot = getRoot(input) 
+                    if inputRoot in childMap:
+                        isSame = isSame and childMap[inputRoot] == input
+
+                if not isSame:                     
+                    intervals.append((start, i))
+                    start=i
+                    i=start+1
+                    childMap = {}
+                else:
+                    i=i+1 
+                #register producers view of buffer
+                for output in op._outputs:
+                    outputRoot = getRoot(output)
+                    if outputRoot not in childMap:  
+                        childMap[outputRoot] = output
+            #end while
+            if start<end:
+                intervals.append((start,end))
+        return True, intervals
+ 
+
 
    
 class Runtime(object):
@@ -1072,7 +1146,7 @@ def __init__(self, core_library):
         # to be dispatched. This list allows cross library introspection for
         # Legate operations.
         self._outstanding_ops = []
-        self._window_size=4
+        self._window_size=1
         self._fusion_threshold =2
         self._opLens = []
         self._fusedOpLens = []
@@ -1267,7 +1341,6 @@ def build_fused_op(self,ops):
         for i in range(len(ops)):
             self.propogateFuture(ops[i])
         fusion_checker = FusionChecker(ops, self._contexts, self)
-        fusion_checker.register_constraint(NumpyContextExists())
         fusion_checker.register_constraint(AllValidOps())
         fusion_checker.register_constraint(IdenticalLaunchShapes())
         fusion_checker.register_constraint(IdenticalProjection())

From fe66dad4418a7cc237146f09ee5c9ab4a793c461 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@sapling.stanford.edu>
Date: Tue, 7 Dec 2021 14:53:19 -0800
Subject: [PATCH 34/44] minor cleanup

---
 legate/core/__init__.py    |   4 --
 legate/core/constraints.py |   1 -
 legate/core/launcher.py    |  16 +----
 legate/core/legion.py      |   1 -
 legate/core/operation.py   |   6 --
 legate/core/runtime.py     | 119 ++++---------------------------------
 legate/core/solver.py      |   2 -
 legate/core/store.py       |  33 ----------
 8 files changed, 14 insertions(+), 168 deletions(-)

diff --git a/legate/core/__init__.py b/legate/core/__init__.py
index 526355308..19933da35 100644
--- a/legate/core/__init__.py
+++ b/legate/core/__init__.py
@@ -120,10 +120,6 @@
 # are overriding that module's name.
 from legion_cffi import ffi, lib as legion
 
-# NOTE: This needs to come after the imports from legate.core.legion, as we
-# are overriding that module's name.
-from legion_cffi import ffi, lib as legion
-
 # Import the PyArrow type system
 from pyarrow import (
     DataType,
diff --git a/legate/core/constraints.py b/legate/core/constraints.py
index 276631d2a..39e85bdcc 100644
--- a/legate/core/constraints.py
+++ b/legate/core/constraints.py
@@ -63,7 +63,6 @@ def __init__(self, op_hash, op_name, store, id, disjoint, complete):
         self._disjoint = disjoint
         self._complete = complete
 
-
     @property
     def ndim(self):
         return self._store.ndim
diff --git a/legate/core/launcher.py b/legate/core/launcher.py
index ffde8e126..e396b0894 100644
--- a/legate/core/launcher.py
+++ b/legate/core/launcher.py
@@ -263,8 +263,6 @@ def __init__(self, region, permission, proj, tag, flags):
         self.region = region
         self.permission = permission
         self.proj = proj
-        #print(proj.__dict__)
-        #print(region)
         self.tag = tag
         self.flags = flags
 
@@ -572,13 +570,12 @@ def add_store(self, args, store, proj, perm, tag, flags):
         if store.kind is Future:
             if store.has_storage:
                 self.add_future(store.storage)
-            elif (perm == Permission.READ or perm == Permission.REDUCTION):
+            elif perm == Permission.READ or perm == Permission.REDUCTION:
                 raise RuntimeError(
                     "Read access to an uninitialized store is disallowed"
                 )
             read_only = perm == Permission.READ
             args.append(FutureStoreArg(store, read_only, store.has_storage))
-            #args.append(FutureStoreArg(store, perm, store.has_storage))
 
         else:
             region = store.storage.region
@@ -605,12 +602,6 @@ def add_output(self, store, proj, tag=0, flags=0):
             self._outputs, store, proj, Permission.WRITE, tag, flags
         )
 
-    # currently this is adding to outputs but we can have a seperate "temps" array in the core
-    def add_temp(self, store, proj, tag=0, flags=0):
-        self.add_store(
-            self._outputs, store, proj, Permission.WRITE, tag, flags
-        )
-
     def add_reduction(self, store, proj, tag=0, flags=0, read_write=False):
         if read_write and store.kind is not Future:
             self.add_store(
@@ -674,11 +665,6 @@ def pack_fusion_metadata(argbuf, is_fused, fusion_metadata):
 
     def build_task(self, launch_domain, argbuf):
         self._req_analyzer.analyze_requirements()
-        #print("building task id", self._task_id)
-        #for req in self._req_analyzer._requirements:
-        #    print(req)
-        #    print(req[0].__dict__)
-        #    print()
         self._out_analyzer.analyze_requirements()
 
         #pack fusion metadata
diff --git a/legate/core/legion.py b/legate/core/legion.py
index 7127bfe4d..a3cd653c9 100644
--- a/legate/core/legion.py
+++ b/legate/core/legion.py
@@ -157,7 +157,6 @@ def legate_task_postamble(runtime, context):
 # This is a decorator for wrapping the launch method on launchers
 # to dispatch any unordered deletions while the task is live
 def dispatch(func):
-    #print("dispatching")
     def launch(launcher, runtime, context, *args):
         # This context should always be in the dictionary
         legate_task_progress(runtime, context)
diff --git a/legate/core/operation.py b/legate/core/operation.py
index 8b13c92e8..39806ad81 100644
--- a/legate/core/operation.py
+++ b/legate/core/operation.py
@@ -36,7 +36,6 @@ def __init__(self, context, mapper_id=0, op_id=0):
         self._reductions = []
         self._is_fused = False
         self._temps = []
-
         self._input_parts = []
         self._output_parts = []
         self._reduction_parts = []
@@ -123,10 +122,6 @@ def add_output(self, store, partition=None):
         self._outputs.append(store)
         self._output_parts.append(partition)
 
-    def add_temp(self, store):
-        self._check_store(store)
-        self._temps.append(store) #this may not be necessary
-
     def add_reduction(self, store, redop, partition=None):
         self._check_store(store)
         if store.kind is Future:
@@ -186,7 +181,6 @@ def declare_partition(self, store, disjoint=True, complete=True):
         return sym
 
 
-
 class Task(Operation):
     def __init__(self, context, task_id, mapper_id=0, op_id=0):
         Operation.__init__(self, context, mapper_id=mapper_id, op_id=op_id)
diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index 862f9e263..195f16072 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -48,20 +48,8 @@
 from .solver import Partitioner, Strategy
 from .store import RegionField, Store, FusionMetadata
 import numpy as np
-debugPrint = False
-futureBugPrint = False
 
-def zprint(*args):
-    return
-if debugPrint:
-    drint = print
-else:
-    drint = zprint
 
-if futureBugPrint:
-    frint = print
-else:
-    frint = zprint
 
 
 # A Field holds a reference to a field in a region tree
@@ -782,7 +770,9 @@ def can_fuse(self):
         delta=stoptime-starttime
         total = delta.total_seconds() * 1000.0
         print("partime", total, len(self.ops))
-        """     
+        """  
+        #TODO: have all constraints use "streamApply"
+        # this is a more efficient way/interface for generating fusable intervals   
         streamApply = False
         #starttime = datetime.datetime.now()
         if not streamApply:
@@ -793,7 +783,6 @@ def can_fuse(self):
             alpha = self.constraints[0].apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies)
             beta = self.constraints[1].apply2(self.contexts, self.runtime, self.ops, alpha[1],self.partitioners, self.strategies)
             beta = self.constraints[2].apply2(self.contexts, self.runtime, self.ops, beta[1], self.partitioners, self.strategies)
-        #print("beta", beta)
         """
         stoptime = datetime.datetime.now()
         delta=stoptime-starttime
@@ -801,11 +790,9 @@ def can_fuse(self):
         print("applytime", total)
         """
         starttime = datetime.datetime.now()
-        #drint("fuse results", results)
   
         #intersect intervals
         #this is an inefficent way of doing this,
-        #"""
         if not streamApply:
             curr_set = interval_sets[0]
             for interval_set in interval_sets[1:]:
@@ -817,7 +804,6 @@ def can_fuse(self):
                             newe = min(aset[1], bset[1])
                             newset.append((news, newe))
                 curr_set=newset
-        #"""
             fusable,final_set = self.supress_small_fusions(curr_set, self.runtime._fusion_threshold)
         else:
             fusable,final_set = self.supress_small_fusions(beta[1], self.runtime._fusion_threshold)
@@ -828,13 +814,6 @@ def can_fuse(self):
         total = delta.total_seconds() * 1000.0
         print("filtertime", total)
         """
-
-        #drint("curset", curr_set)
-
-        #drint("final_set", final_set)
-        #drint("all fusable", fusable)
-        #drint("intervals", interval_sets)
-        #return reduce(lambda x,y: x and y, all_fusable), final_set, self.strategies
         return fusable, final_set, self.strategies
 
 class FusionConstraint(object):
@@ -862,19 +841,13 @@ class AllValidOps(FusionConstraint):
     def __init__(self):
         self.validIDs = set()
         self.terminals = set()
-        #these ops are almost always fusable
         self.validIDs.add(2) #Binary op
         self.validIDs.add(10) #Fill op
-        self.validIDs.add(20) #Where op
-        #self.validIDs.add(6) #Where op
-        #self.validIDs.add(5) #Convert op
-        #self.validIDs.add(7) #dot op
-        #self.terminals.add(7) #dot op is only fusable as a terminal op in a window
+        self.validIDs.add(20) #Unary op
 
     def apply(self, contexts, runtime, ops, partitioners, strategies):
         results = [int(op._task_id) in self.validIDs for op in ops]
 
-        drint("valids", results)
         fusable_intervals = []
         start, end =0,0
         while end<len(results):
@@ -897,8 +870,6 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
                     end = start
         if start<end:
             fusable_intervals.append((start,end))
-        drint(fusable_intervals)   
-        drint("allFusableOps", results)
         fusability_exists = reduce(lambda x,y: x or y,[int(op._task_id) in self.validIDs for op in ops])
         return (fusability_exists, fusable_intervals)
 
@@ -960,7 +931,6 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
             i+=1
         if start<end:
             intervals.append((start,end))
-        #print("proj", intervals)
         return True, intervals
          
 
@@ -971,7 +941,6 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
         launch_shapes = []
         for i in range(len(ops)):
             launch_shapes.append(strategies[i]._launch_shape)
-        #print("ls", launch_shapes)
         intervals =[]
         i=1
         start=0
@@ -979,11 +948,6 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
         while i<end:
             leftNone = launch_shapes[i] is None and (launch_shapes[i-1] is not None)
             rightNone = launch_shapes[i-1] is None and (launch_shapes[i] is not None)
-            #leftRead = leftNone and int(ops[i]._task_id)==14
-            #rightRead = rightNone and int(ops[i-1]._task_id)==14
-            #if leftRead or rightRead:
-            #    i=i+1
-            #elif leftNone or rightNone or launch_shapes[i]!=launch_shapes[i-1]:
             if leftNone or rightNone or launch_shapes[i]!=launch_shapes[i-1]:
                 intervals.append((start, i))
                 start=i
@@ -992,14 +956,12 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
                 i+=1
         if start<end:
             intervals.append((start, end))
-        #print(intervals)
         return True, intervals
 
     def apply2(self, contexts, runtime, ops, baseIntervals, partitioners, strategies):
         launch_shapes = []
         for i in range(len(ops)):
             launch_shapes.append(strategies[i]._launch_shape)
-        #print("ls", launch_shapes)
         intervals =[]
         for baseInterval in baseIntervals:
             start=baseInterval[0]
@@ -1146,14 +1108,16 @@ def __init__(self, core_library):
         # to be dispatched. This list allows cross library introspection for
         # Legate operations.
         self._outstanding_ops = []
-        self._window_size=1
+
+        #self._window_size = self._core_context.get_tunable(
+        #    legion.LEGATE_CORE_TUNABLE_WINDOW_SIZE,
+        #    ty.uint32,)
+        self._window_size=50
         self._fusion_threshold =2
+        #used for logging window and fusion lengths
         self._opLens = []
         self._fusedOpLens = []
         self._clearing_pipe = False
-        #self._window_size = self._core_context.get_tunable(
-        #    legion.LEGATE_CORE_TUNABLE_WINDOW_SIZE,
-        #    ty.uint32,)
 
         # Now we initialize managers
         self._attachment_manager = AttachmentManager(self)
@@ -1247,16 +1211,6 @@ def destroy(self):
         # Before we clean up the runtime, we should execute all outstanding
         # operations.
         self.flush_scheduling_window()
-        #print("opLens")
-        #print(self._opLens)
-        #print("fusedOpLens")
-        #print(self._fusedOpLens)
-        #import pickle
-        #with open('fusedOpLens.pkl', 'wb') as f:
-        #    pickle.dump(self._fusedOpLens, f)
-        #with open('flushOpLens.pkl', 'wb') as f:
-        #    pickle.dump(self._fusedOpLens, f)
-        #pr.print_stats(sort='time')
         # Destroy all libraries. Note that we should do this
         # from the lastly added one to the first one
         for context in reversed(self._context_list):
@@ -1338,19 +1292,13 @@ def serialize_multiop_metadata(self, numpy_context, ops):
 
     def build_fused_op(self,ops):
 
-        for i in range(len(ops)):
-            self.propogateFuture(ops[i])
         fusion_checker = FusionChecker(ops, self._contexts, self)
         fusion_checker.register_constraint(AllValidOps())
         fusion_checker.register_constraint(IdenticalLaunchShapes())
-        fusion_checker.register_constraint(IdenticalProjection())
+        #this constraints doesn't seem to be necessary?
+        #fusion_checker.register_constraint(IdenticalProjection())
         fusion_checker.register_constraint(ValidProducerConsumer())
         can_fuse,fusable_sets, partitions = fusion_checker.can_fuse()
-
-        #short circuit         
-        #if not can_fuse:
-        ##    drint("CANNOT FUSE!")
-        #    return False, partitions
                 
         super_strats = []
         super_fspaces = []
@@ -1363,7 +1311,6 @@ def build_fused_op(self,ops):
             super_fspace = {}
             super_keystore = set()
             start,end = fusable_set
-            drint("creating fusable set for", start, end)
             for j in range(start,end):
                 super_strat = {**(super_strat.copy()), **partitions[j]._strategy}
                 super_fspace = {**(super_fspace.copy()), **partitions[j]._fspaces}
@@ -1372,7 +1319,6 @@ def build_fused_op(self,ops):
             super_fspaces.append(super_fspace)
             super_keystores.append(super_keystore)
             super_strategies.append(Strategy(partitions[start]._launch_shape, super_strat, super_fspace, super_keystore))
-        #drint("lens", len(super_strats), len(super_fspaces), len(super_strategies), len(super_keystore))
        
         super_strat = {}
         super_fspace = {}
@@ -1381,8 +1327,6 @@ def build_fused_op(self,ops):
             super_fspace = {**(super_fspace.copy()), **partition._fspaces}
         
 
-        #starttime = datetime.datetime.now()
-        #super_strategy = Strategy(partitions[0]._launch_shape, super_strat, super_fspace)
         #hacky way to get numpy context and designated fused task id
         fused_id = self._contexts["cunumeric"].fused_id
         numpy_context = self._contexts["cunumeric"]
@@ -1406,12 +1350,12 @@ def build_fused_op(self,ops):
                 #this metadata will be fed into the fused op as inputs
                 meta_maps, fusion_metadata = self.serialize_multiop_metadata(numpy_context, op_subset)
                 fused_task.add_fusion_metadata(fusion_metadata) #sets fused_task._is_fused to true
+
                 #add typical inputs and outputs of all subtasks to fused task
                 key_part = None
                 fused_task._unfused_input_parts = []
                 fused_task._unfused_output_parts = []
                 fused_task._unfused_reduction_parts = []
-                #isScalarConversion = len(op._outputs) ==1 and len(op._inputs)==1
                 for j,op in enumerate(op_subset):
                     for scalar in op._scalar_args:
                         fused_task.add_scalar_arg(scalar[0], ty.int32)
@@ -1419,7 +1363,6 @@ def build_fused_op(self,ops):
                         fused_task.add_reduction(reduction, redop)
                         fused_task._unfused_reduction_parts.append(part)
                     for input,part in zip(op._inputs, op._input_parts):
-                        #print(j, part)
                         fused_task.add_input(input)   
                         fused_task._unfused_input_parts.append(part)
                     for output,part in zip(op._outputs, op._output_parts):
@@ -1437,7 +1380,6 @@ def build_fused_op(self,ops):
         print("buildtime", total)
         """
    
-        #pr.enable()
         redoPar=False
         #starttime = datetime.datetime.now()
         for i,fused_task in enumerate(new_op_list):
@@ -1464,40 +1406,18 @@ def _launch_outstanding(self, force_eval=True):
             self._outstanding_ops = []
             self._schedule(ops, force_eval)
 
-
-    def propogateFuture(self,op):
-        return 
-        for input in op._inputs:
-            start = input
-            if input._kind is Future and input._storage is None: 
-                while start._storage is None and start._parent:
-                    start=start._parent
-                input._storage = start._storage
    
     def _schedule(self, ops, force_eval=False):
         ids = [op._task_id for op in ops]
-        #print(force_eval, "ids", ids)
         #case 1: try fusing current window of tasks
-        #if partially or fully fusable, 
-        #schedule the new set of tasks
         strats = False
         if len(ops)>=2 and (not force_eval):
-            #start = datetime.datetime.now()
-            #pr.enable()
             fused_task_list,strats = self.build_fused_op(ops)
-            #pr.disable()
-            #stop = datetime.datetime.now()
-            #delta=stop-start
-            #total = delta.total_seconds() * 1000.0
-            #print("time", total)
             if fused_task_list:
-                frint("created fused list", [op._task_id for op in fused_task_list])
-                drint("start clearing pipe")
                 self._clearing_pipe = True
                 for task in fused_task_list:
                     task.execute() 
                 self._clearing_pipe = False
-                drint("stop clearing pipe")
                 return
 
         # case 2: tasks  processed for fusion already have  
@@ -1505,21 +1425,12 @@ def _schedule(self, ops, force_eval=False):
         # them when testing fusion legality (in case 1)
         if len(ops)==1 and self._clearing_pipe:
             strategy = ops[0].strategy
-            for input, part in zip(ops[0]._inputs, ops[0]._input_parts):
-                frint("launch fused input", ops[0]._task_id, input)
-                #proj = ops[0].strategy.get_projection(part)
-            self.propogateFuture(ops[0])
-
-            for output in ops[0]._outputs:
-                #if output._storage is None:
-                frint("launch used output", ops[0]._task_id, output)
             ops[0].launch(strategy)
 
         # case 3: execute the ops normally 
         # if we already checked the ops for fusability,
         # then the ops' buffers have already been partitioned
         else:
-            #print("normal execution", ids)
             if not strats: #ops were not check for fusability, so partition them
                 for op in ops:
                     must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op])
@@ -1530,10 +1441,6 @@ def _schedule(self, ops, force_eval=False):
                 for i,op in enumerate(ops):
                     op.strategy = strats[i]
             for i,op in enumerate(ops):
-                for input in op._inputs:
-                    if input._storage is None:
-                        frint("launch ufused input", op._task_id, input)
-                self.propogateFuture(op)
                 op.launch(op.strategy)
 
 
diff --git a/legate/core/solver.py b/legate/core/solver.py
index 262497bf1..d5b9bdf27 100644
--- a/legate/core/solver.py
+++ b/legate/core/solver.py
@@ -283,7 +283,6 @@ def cost(unknown):
 
         key_parts = set()
         prev_part = None
-        #import pdb; pdb.set_trace()
         for unknown in unknowns:
             if unknown in partitions:
                 continue
@@ -307,7 +306,6 @@ def cost(unknown):
                 if to_align in partitions:
                     continue
                 partitions[to_align] = partition
-                #print("ptype", to_align, (partition))
             prev_part = partition
 
         for lhs, rhs in dependent.items():
diff --git a/legate/core/store.py b/legate/core/store.py
index ef2e58084..c8063f72f 100644
--- a/legate/core/store.py
+++ b/legate/core/store.py
@@ -536,26 +536,6 @@ def storage(self):
         # it has been initialized correctly.
         self._runtime._launch_outstanding(False)
         if self._storage is None:
-            """
-            if self._kind ==Future:
-                print("future")
-            while(self._storage is None and len(self._runtime._outstanding_ops)):
-                print("launch_one")
-                print([op._task_id for op in self._runtime._outstanding_ops])
-                #self._runtime._launch_outstanding()
-                self._runtime._launch_one()
-            """ 
-            """
-            if True:
-                import pdb; pdb.set_trace()
-                start = self
-                while start._storage is None and start._parent:
-                    start=start._parent
-                if start._storage:
-                    self._storage = start._storage
-                else:
-                    self._runtime._launch_outstanding()
-            """
             if self.unbound:
                 raise RuntimeError(
                     "Storage of a variable size store cannot be retrieved "
@@ -565,7 +545,6 @@ def storage(self):
             #       if necessary
             if self._parent is None:
                 if self._kind is Future:
-                    print("supressing in store.py")
                     raise ValueError(
                         "Illegal to access the storage of an uninitialized "
                         "Legate store of volume 1 with scalar optimization"
@@ -957,13 +936,8 @@ def packList(self, meta_list, buf):
         # aggregate the ints when packing
         # much faster than individually packing each int
         buf.pack_32bit_int_arr(meta_list)
-        #for elem in meta_list: 
-        #    buf.pack_32bit_int(elem)
 
     def pack(self, buf):
-       
-        #buf.pack_32bit_int(len(self._opIDs)) #nOps
-        #buf.pack_32bit_int(len(self._buffer_offsets)) #nIOBuffers+1
         superbuff = [len(self._opIDs)]+[len(self._buffer_offsets)]
         superbuff += self._input_starts
         superbuff += self._output_starts
@@ -974,11 +948,4 @@ def pack(self, buf):
         superbuff += self._future_starts
         superbuff += self._opIDs
         self.packList(superbuff, buf)
-        #self.packList(self._input_starts, buf)
-        #self.packList(self._output_starts, buf)
-        #self.packList(self._offset_starts, buf)
-        #self.packList(self._buffer_offsets, buf)
-        #self.packList(self._reduction_starts, buf)
-        #self.packList(self._scalar_starts, buf)
-        #self.packList(self._opIDs, buf)
 

From a73dec165788442c53e5e0a33f93d73a58f82f58 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@sapling.stanford.edu>
Date: Tue, 7 Dec 2021 15:12:23 -0800
Subject: [PATCH 35/44] more cleanup

---
 install.py              |  6 +----
 legate/core/launcher.py |  1 -
 legate/core/runtime.py  | 58 +++++------------------------------------
 3 files changed, 7 insertions(+), 58 deletions(-)

diff --git a/install.py b/install.py
index 6a3313e62..aabd00824 100755
--- a/install.py
+++ b/install.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-#/home/shiv1/pypy/pypy3.8-v7.3.7-linux64/bin/python3.8
 
 # Copyright 2021 NVIDIA Corporation
 #
@@ -126,7 +125,6 @@ def git_clone(repo_dir, url, branch=None, tag=None, commit=None):
         verbose_check_call(
             ["git", "submodule", "update", "--init"], cwd=repo_dir
         )
-        git_reset(repo_dir, commit)
     else:
         verbose_check_call(
             [
@@ -201,7 +199,6 @@ def install_legion(legion_src_dir, branch):
         legion_src_dir,
         url="https://gitlab.com/StanfordLegion/legion.git",
         branch=branch,
-        commit=commit,
     )
 
 
@@ -216,7 +213,7 @@ def install_thrust(thrust_dir):
 
 def update_legion(legion_src_dir, branch):
     # Make sure we are on the right branch for single/multi-node
-    git_update(legion_src_dir, branch=branch, commit=commit)
+    git_update(legion_src_dir, branch=branch)
 
 
 def build_legion(
@@ -378,7 +375,6 @@ def build_legion(
 
         legion_python_dir = os.path.join(legion_src_dir, "bindings", "python")
         if clean_first:
-            print("cleaning!\n")
             verbose_check_call(
                 ["make"] + flags + ["clean"], cwd=legion_python_dir
             )
diff --git a/legate/core/launcher.py b/legate/core/launcher.py
index e396b0894..837e53068 100644
--- a/legate/core/launcher.py
+++ b/legate/core/launcher.py
@@ -369,7 +369,6 @@ def coalesce(self):
         # promote them to read write permission.
         if len(all_perms - set([Permission.NO_ACCESS])) > 1:
             perm = Permission.READ_WRITE
-            #perm = Permission.WRITE
 
             # When the field requires read write permission,
             # all projections must be the same
diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index 195f16072..bf625b901 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -26,8 +26,6 @@
 from legate.core import types as ty
 
 import datetime
-import cProfile
-pr = cProfile.Profile()
 
 from .context import Context
 from .corelib import CoreLib
@@ -824,15 +822,6 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
          """
         raise NotImplementedError("Implement in derived classes")
 
-
-class NumpyContextExists(FusionConstraint):
-    def apply(self, contexts, runtime, ops, partitioners, strategies):
-        if "cunumeric" in contexts:
-            return True, [(0, len(ops))]
-        else:
-           return False, [(0,0)]
-
-
 class AllValidOps(FusionConstraint):
     """
     Class for only fusing only potentially fusable ops.
@@ -873,9 +862,6 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
         fusability_exists = reduce(lambda x,y: x or y,[int(op._task_id) in self.validIDs for op in ops])
         return (fusability_exists, fusable_intervals)
 
-class ValidScalarProducers(FusionConstraint):
-   """Checks all scalar producing are terminal ops"""
-
 class IdenticalProjection(FusionConstraint):
     """Fusion rule that only ops with identical
        projection functors can be fused"""
@@ -899,19 +885,11 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
                     proj = strategies[i].get_projection(part)
                     if hasattr(proj, 'part'):
                         bufferSet[input]=proj
-                        #if input not in linkset:
-                        #    linkset[input] = [i]
-                        #else:
-                        #    linkset[input].append(i)
             for output, part in zip(op._outputs, op._output_parts):
                 if output not in bufferSet:
                     proj = strategies[i].get_projection(part)
                     if hasattr(proj, 'part'):
                         bufferSet[output]=proj
-                        #if output not in linkset:
-                        #    linkset[output] = [i]
-                        #else:
-                        #    linkset[output].append(i)
             if i==0: #we only iterate from i==1 onwards
                 i+=1
                 continue
@@ -980,9 +958,6 @@ def apply2(self, contexts, runtime, ops, baseIntervals, partitioners, strategies
                 intervals.append((start, end))
         return True, intervals
 
-
-
-
 class ValidProducerConsumer(FusionConstraint):
     """In a fused op, there cannot be a producer consumer
         relationship between different views of the same buffers"""
@@ -1115,8 +1090,6 @@ def __init__(self, core_library):
         self._window_size=50
         self._fusion_threshold =2
         #used for logging window and fusion lengths
-        self._opLens = []
-        self._fusedOpLens = []
         self._clearing_pipe = False
 
         # Now we initialize managers
@@ -1287,7 +1260,7 @@ def serialize_multiop_metadata(self, numpy_context, ops):
                       future_starts, op_ids)
         fusion_metadata = FusionMetadata(*meta_arrs)
         meta_maps=None
-        return meta_maps, fusion_metadata
+        return fusion_metadata
    
 
     def build_fused_op(self,ops):
@@ -1300,10 +1273,7 @@ def build_fused_op(self,ops):
         fusion_checker.register_constraint(ValidProducerConsumer())
         can_fuse,fusable_sets, partitions = fusion_checker.can_fuse()
                 
-        super_strats = []
-        super_fspaces = []
         super_strategies = []
-        super_keystores = [] 
         z=0
         for fusable_set in fusable_sets:   
             #create super strategy for this fusable set
@@ -1315,40 +1285,28 @@ def build_fused_op(self,ops):
                 super_strat = {**(super_strat.copy()), **partitions[j]._strategy}
                 super_fspace = {**(super_fspace.copy()), **partitions[j]._fspaces}
                 super_keystore = super_keystore.union(partitions[j]._key_parts)
-            super_strats.append(super_strat)
-            super_fspaces.append(super_fspace)
-            super_keystores.append(super_keystore)
             super_strategies.append(Strategy(partitions[start]._launch_shape, super_strat, super_fspace, super_keystore))
        
-        super_strat = {}
-        super_fspace = {}
-        for partition in partitions:
-            super_strat = {**(super_strat.copy()), **partition._strategy}  
-            super_fspace = {**(super_fspace.copy()), **partition._fspaces}
-        
-
         #hacky way to get numpy context and designated fused task id
         fused_id = self._contexts["cunumeric"].fused_id
         numpy_context = self._contexts["cunumeric"]
         numpy_runtime = numpy_context._library.runtime
-        z=0
+        opID=0
         new_op_list = []
         for i,fusable_set in enumerate(fusable_sets):
             start, end = fusable_set
             op_subset = ops[start:end]
             #if nothing to fuse, just use the original op
-            #self._fusedOpLens.append(len(op_subset))
             if end-start==1:
                 normal_op = ops[start]
-                normal_op.strategy =  partitions[z]._strategy#uper_strategies[i]
+                normal_op.strategy =  partitions[opID]._strategy#uper_strategies[i]
                 new_op_list.append(normal_op)
-                z+=1
+                opID+=1
             elif end-start > 1:
                 #initialize fused task
                 fused_task = numpy_context.create_task(fused_id)
                 #serialize necessary metadata on all encapsulated ops 
-                #this metadata will be fed into the fused op as inputs
-                meta_maps, fusion_metadata = self.serialize_multiop_metadata(numpy_context, op_subset)
+                fusion_metadata = self.serialize_multiop_metadata(numpy_context, op_subset)
                 fused_task.add_fusion_metadata(fusion_metadata) #sets fused_task._is_fused to true
 
                 #add typical inputs and outputs of all subtasks to fused task
@@ -1370,7 +1328,7 @@ def build_fused_op(self,ops):
                         fused_task._unfused_output_parts.append(part)
                     for future in op._futures:
                         fused_task.add_future(future)
-                    z+=1
+                    opID+=1
                 new_op_list.append(fused_task)
         strats=[]
         """
@@ -1401,7 +1359,6 @@ def build_fused_op(self,ops):
 
     def _launch_outstanding(self, force_eval=True):        
         if len(self._outstanding_ops):
-            #self._opLens.append(len(self._outstanding_ops))
             ops = self._outstanding_ops
             self._outstanding_ops = []
             self._schedule(ops, force_eval)
@@ -1448,13 +1405,11 @@ def submit(self, op):
         #always launch ops that've been processed for fusion
         #do not re-add to the window
         #as the these ops already waited in the window
-        #print(op.__dict__)
         if self._clearing_pipe:
             self._schedule([op])
         else:
             self._outstanding_ops.append(op)
             if len(self._outstanding_ops) >= self._window_size:
-                #self._opLens.append(len(self._outstanding_ops))
                 ops = self._outstanding_ops
                 self._outstanding_ops = []
                 self._schedule(ops)
@@ -1477,7 +1432,6 @@ def _scheduleNew(self, ops):
     def flush_scheduling_window(self):
         if len(self._outstanding_ops) == 0:
             return
-        #self._opLens.append(len(self._outstanding_ops))
         ops = self._outstanding_ops
         self._outstanding_ops = []
         self._schedule(ops)

From 437e67eef6d239786d2c03dd10c0c70681daa893 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@sapling.stanford.edu>
Date: Sun, 12 Dec 2021 11:37:24 -0800
Subject: [PATCH 36/44] use alignment info when fusing

---
 legate/core/operation.py  |   6 +++
 legate/core/runtime.py    | 110 ++++++++++++++++++++++++++++++++------
 legate/core/solver.py     |   5 --
 src/core/utilities/span.h |   2 -
 4 files changed, 101 insertions(+), 22 deletions(-)

diff --git a/legate/core/operation.py b/legate/core/operation.py
index 39806ad81..861b042b5 100644
--- a/legate/core/operation.py
+++ b/legate/core/operation.py
@@ -151,6 +151,12 @@ def add_broadcast(self, store):
     def add_constraint(self, constraint):
         self._constraints.append(constraint)
 
+    def has_constraint(self, store1, store2):
+        part1 = self._get_unique_partition(store1)
+        part2 = self._get_unique_partition(store2)
+        cons = [str(con) for con in self._constraints]
+        return (str(part1 == part2) in cons) or (str(part2==part1) in cons)
+
     def execute(self):
         self._context.runtime.submit(self)
 
diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index bf625b901..6bea2b5c3 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -757,10 +757,9 @@ def can_fuse(self):
                 output.set_key_partition(partition)
                 key_part = partition
                 for input in op._inputs:
-                    if input.shape==output.shape:
+                    #check if input and output should be aligned
+                    if op.has_constraint(input, output):
                         input.set_key_partition(key_part)
-            #if len(op.inputs)>1:
-            #    proj = strategy.get_projection(op._input_parts[1])
             self.strategies.append(strategy)
         self.strategies.reverse()
         """
@@ -771,16 +770,16 @@ def can_fuse(self):
         """  
         #TODO: have all constraints use "streamApply"
         # this is a more efficient way/interface for generating fusable intervals   
-        streamApply = False
+        streamApply = True
         #starttime = datetime.datetime.now()
         if not streamApply:
             results = [constraint.apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) for constraint in self.constraints]
             all_fusable = [result[0] for result in results]
             interval_sets = [result[1] for result in results]
         else:
-            alpha = self.constraints[0].apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies)
-            beta = self.constraints[1].apply2(self.contexts, self.runtime, self.ops, alpha[1],self.partitioners, self.strategies)
-            beta = self.constraints[2].apply2(self.contexts, self.runtime, self.ops, beta[1], self.partitioners, self.strategies)
+            windows = [(0, len(self.ops))]
+            for constraint in self.constraints:
+                windows = constraint.apply2(self.contexts, self.runtime, self.ops, windows, self.partitioners, self.strategies)
         """
         stoptime = datetime.datetime.now()
         delta=stoptime-starttime
@@ -804,7 +803,7 @@ def can_fuse(self):
                 curr_set=newset
             fusable,final_set = self.supress_small_fusions(curr_set, self.runtime._fusion_threshold)
         else:
-            fusable,final_set = self.supress_small_fusions(beta[1], self.runtime._fusion_threshold)
+            fusable,final_set = self.supress_small_fusions(windows, self.runtime._fusion_threshold)
          
         """
         stoptime = datetime.datetime.now()
@@ -862,14 +861,42 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
         fusability_exists = reduce(lambda x,y: x or y,[int(op._task_id) in self.validIDs for op in ops])
         return (fusability_exists, fusable_intervals)
 
+    def apply2(self, contexts, runtime, ops, baseIntervals, partitioners, strategies):
+        fusable_intervals = []
+        results = [int(op._task_id) in self.validIDs for op in ops]
+        for baseInterval in baseIntervals:
+            start, end = baseInterval[0], baseInterval[0]
+            while end<baseInterval[1]:
+                result = results[end]
+                if result and (ops[end]._task_id not in self.terminals):
+                    end=end+1
+                else:
+                    if start<end:
+                        if ops[end]._task_id in self.terminals:
+                            fusable_intervals.append((start,end+1))
+                            start=end+1
+                            end=start
+                        else:
+                            fusable_intervals.append((start,end))
+                            start=end 
+                            end=start
+                    else:
+                        fusable_intervals.append((start, start+1))
+                        start=start+1
+                        end = start
+            if start<end:
+                fusable_intervals.append((start,end))
+        return fusable_intervals
+
+
+
+
 class IdenticalProjection(FusionConstraint):
     """Fusion rule that only ops with identical
        projection functors can be fused"""
 
     def apply(self, contexts, runtime, ops, partitioners, strategies):
-        linkset = {}
         store_to_ops = {}
-        base_window = [0, len(ops)]
         
         intervals = []
         start=0
@@ -910,7 +937,53 @@ def apply(self, contexts, runtime, ops, partitioners, strategies):
         if start<end:
             intervals.append((start,end))
         return True, intervals
-         
+     
+    
+    def apply2(self, contexts, runtime, ops, baseIntervals,  partitioners, strategies):
+        store_to_ops = {}
+        
+        intervals = []
+        i=0
+        #for i, op in enumerate(ops):
+        for baseInterval in baseIntervals:
+            start, end = baseInterval
+            i=start
+            while i<end:
+                op=ops[i]
+                bufferSet = {}
+                # find the set union of input and output buffers for the op
+                for input, part in zip(op._inputs, op._input_parts):
+                    if input not in bufferSet:
+                        proj = strategies[i].get_projection(part)
+                        if hasattr(proj, 'part'):
+                            bufferSet[input]=proj
+                for output, part in zip(op._outputs, op._output_parts):
+                    if output not in bufferSet:
+                        proj = strategies[i].get_projection(part)
+                        if hasattr(proj, 'part'):
+                            bufferSet[output]=proj
+                if i==0: #we only iterate from i==1 onwards
+                    i+=1
+                    continue
+                # for each op in the union, record its associated transform
+                for buffer in bufferSet.keys():
+                    proj = bufferSet[buffer]
+                    matrix = proj.part.index_partition.functor.transform.trans
+                    if buffer not in store_to_ops:
+                        store_to_ops[buffer] = matrix
+                    else: #we see a new projection for the same buffer
+                        if not np.array_equal(matrix, store_to_ops[buffer]):
+                            intervals.append((start, i))
+                            start=i
+                            i=start+1
+                            store_to_ops={}
+                            continue
+                i+=1
+            if start<end:
+                intervals.append((start,end))
+        return intervals
+
+
 
 class IdenticalLaunchShapes(FusionConstraint):
     """Fusion rule that only ops with identical
@@ -956,7 +1029,7 @@ def apply2(self, contexts, runtime, ops, baseIntervals, partitioners, strategies
                     i+=1
             if start<end:
                 intervals.append((start, end))
-        return True, intervals
+        return intervals
 
 class ValidProducerConsumer(FusionConstraint):
     """In a fused op, there cannot be a producer consumer
@@ -1036,7 +1109,7 @@ def getRoot(store):
             #end while
             if start<end:
                 intervals.append((start,end))
-        return True, intervals
+        return intervals
  
 
 
@@ -1241,6 +1314,7 @@ def serialize_multiop_metadata(self, numpy_context, ops):
             for r,reduction in enumerate(op._reductions):
                 offsets.append(-(r+1)) 
  
+            #todo: this will have to be generalized to work with any library
             op_ids.append(numpy_context.get_task_id(op._task_id._value_))
             offset_start+=(len(op._inputs)+len(op._outputs))
             input_start+=len(op._inputs)
@@ -1287,10 +1361,15 @@ def build_fused_op(self,ops):
                 super_keystore = super_keystore.union(partitions[j]._key_parts)
             super_strategies.append(Strategy(partitions[start]._launch_shape, super_strat, super_fspace, super_keystore))
        
-        #hacky way to get numpy context and designated fused task id
+       
+        #once fusion in the core is playing nicely with the mapepr
+        #the following two lines will be removed, and be replaced 
+        #with the 2 subsequent (commented out) lines
         fused_id = self._contexts["cunumeric"].fused_id
         numpy_context = self._contexts["cunumeric"]
-        numpy_runtime = numpy_context._library.runtime
+        #fused_id = self._contexts["legate.core"]._library.fused_id
+        #numpy_context = self._contexts["legate.core"]
+
         opID=0
         new_op_list = []
         for i,fusable_set in enumerate(fusable_sets):
@@ -1366,6 +1445,7 @@ def _launch_outstanding(self, force_eval=True):
    
     def _schedule(self, ops, force_eval=False):
         ids = [op._task_id for op in ops]
+        #print(ids)
         #case 1: try fusing current window of tasks
         strats = False
         if len(ops)>=2 and (not force_eval):
diff --git a/legate/core/solver.py b/legate/core/solver.py
index d5b9bdf27..51639f17d 100644
--- a/legate/core/solver.py
+++ b/legate/core/solver.py
@@ -273,11 +273,6 @@ def cost(unknown):
                 -store.comm_volume(),
                 not store.has_key_partition(all_restrictions[unknown]),
             )
-            #return (
-            #    store.comm_volume(),
-            #    store._key_partition is None,
-            #    not store.has_key_partition(all_restrictions[unknown]),
-            #)
 
         unknowns = sorted(unknowns, key=cost)
 
diff --git a/src/core/utilities/span.h b/src/core/utilities/span.h
index c839bb365..c0a20c5a8 100644
--- a/src/core/utilities/span.h
+++ b/src/core/utilities/span.h
@@ -35,7 +35,6 @@ struct Span {
  public:
   decltype(auto) operator[](size_t pos)
   {
-    //std::cout<<"pos "<<pos<<" "<<size_<<std::endl;
     assert(pos < size_);
     return data_[pos];
   }
@@ -43,7 +42,6 @@ struct Span {
  public:
   decltype(auto) subspan(size_t off)
   {
-    //std::cout<<"size "<<size_<<" off "<<off<<std::endl;
     assert(off <= size_);
     return Span(data_ + off, size_ - off);
   }

From 9594cb567aff46e6e2034e27312acac8e929077a Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@sapling.stanford.edu>
Date: Sun, 12 Dec 2021 11:50:05 -0800
Subject: [PATCH 37/44] new apply methods

---
 legate/core/runtime.py | 224 ++++-------------------------------------
 1 file changed, 18 insertions(+), 206 deletions(-)

diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index 6bea2b5c3..21793b891 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -746,7 +746,6 @@ def supress_small_fusions(self, intervals, threshold):
         return fusable, final_set
 
     def can_fuse(self):
-        #starttime = datetime.datetime.now()
         for op in reversed(self.ops):
             must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op])
             partitioner = Partitioner(self.runtime, [op], must_be_single=must_be_single)
@@ -756,71 +755,30 @@ def can_fuse(self):
                 partition = strategy.get_partition(part)
                 output.set_key_partition(partition)
                 key_part = partition
+                #check if input and output should be aligned
                 for input in op._inputs:
-                    #check if input and output should be aligned
                     if op.has_constraint(input, output):
                         input.set_key_partition(key_part)
             self.strategies.append(strategy)
         self.strategies.reverse()
-        """
-        stoptime = datetime.datetime.now()
-        delta=stoptime-starttime
-        total = delta.total_seconds() * 1000.0
-        print("partime", total, len(self.ops))
-        """  
-        #TODO: have all constraints use "streamApply"
-        # this is a more efficient way/interface for generating fusable intervals   
-        streamApply = True
-        #starttime = datetime.datetime.now()
-        if not streamApply:
-            results = [constraint.apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) for constraint in self.constraints]
-            all_fusable = [result[0] for result in results]
-            interval_sets = [result[1] for result in results]
-        else:
-            windows = [(0, len(self.ops))]
-            for constraint in self.constraints:
-                windows = constraint.apply2(self.contexts, self.runtime, self.ops, windows, self.partitioners, self.strategies)
-        """
-        stoptime = datetime.datetime.now()
-        delta=stoptime-starttime
-        total = delta.total_seconds() * 1000.0
-        print("applytime", total)
-        """
-        starttime = datetime.datetime.now()
+
+        windows = [(0, len(self.ops))]
+        for constraint in self.constraints:
+            windows = constraint.apply(self.contexts, self.runtime, self.ops, windows, self.partitioners, self.strategies)
   
-        #intersect intervals
-        #this is an inefficent way of doing this,
-        if not streamApply:
-            curr_set = interval_sets[0]
-            for interval_set in interval_sets[1:]:
-                newset = []
-                for aset in curr_set:
-                    for bset in interval_set:
-                        if not (aset[0] > bset[1] or bset[0] > aset[1]): 
-                            news = max(aset[0], bset[0])
-                            newe = min(aset[1], bset[1])
-                            newset.append((news, newe))
-                curr_set=newset
-            fusable,final_set = self.supress_small_fusions(curr_set, self.runtime._fusion_threshold)
-        else:
-            fusable,final_set = self.supress_small_fusions(windows, self.runtime._fusion_threshold)
-         
-        """
-        stoptime = datetime.datetime.now()
-        delta=stoptime-starttime
-        total = delta.total_seconds() * 1000.0
-        print("filtertime", total)
-        """
+        fusable,final_set = self.supress_small_fusions(windows, self.runtime._fusion_threshold)
         return fusable, final_set, self.strategies
 
+
 class FusionConstraint(object):
-    def apply(self, contexts, runtime, ops, partitioners, strategies):
+    def apply(self, contexts, runtime, ops, baseIntervals, partitioners, strategies):
         """"
          Abstract class for determining a rule that constrains
          which legate operations can be fused
          """
         raise NotImplementedError("Implement in derived classes")
 
+
 class AllValidOps(FusionConstraint):
     """
     Class for only fusing only potentially fusable ops.
@@ -833,35 +791,7 @@ def __init__(self):
         self.validIDs.add(10) #Fill op
         self.validIDs.add(20) #Unary op
 
-    def apply(self, contexts, runtime, ops, partitioners, strategies):
-        results = [int(op._task_id) in self.validIDs for op in ops]
-
-        fusable_intervals = []
-        start, end =0,0
-        while end<len(results):
-            result = results[end]
-            if result and (ops[end]._task_id not in self.terminals):
-                end=end+1
-            else:
-                if start<end:
-                    if ops[end]._task_id in self.terminals:
-                        fusable_intervals.append((start,end+1))
-                        start=end+1
-                        end=start
-                    else:
-                        fusable_intervals.append((start,end))
-                        start=end 
-                        end=start
-                else:
-                    fusable_intervals.append((start, start+1))
-                    start=start+1
-                    end = start
-        if start<end:
-            fusable_intervals.append((start,end))
-        fusability_exists = reduce(lambda x,y: x or y,[int(op._task_id) in self.validIDs for op in ops])
-        return (fusability_exists, fusable_intervals)
-
-    def apply2(self, contexts, runtime, ops, baseIntervals, partitioners, strategies):
+    def apply(self, contexts, runtime, ops, baseIntervals, partitioners, strategies):
         fusable_intervals = []
         results = [int(op._task_id) in self.validIDs for op in ops]
         for baseInterval in baseIntervals:
@@ -889,62 +819,15 @@ def apply2(self, contexts, runtime, ops, baseIntervals, partitioners, strategies
         return fusable_intervals
 
 
-
-
 class IdenticalProjection(FusionConstraint):
     """Fusion rule that only ops with identical
        projection functors can be fused"""
 
-    def apply(self, contexts, runtime, ops, partitioners, strategies):
+    def apply(self, contexts, runtime, ops, baseIntervals,  partitioners, strategies):
         store_to_ops = {}
         
         intervals = []
-        start=0
-        end = len(ops)
         i=0
-        #for i, op in enumerate(ops):
-        while i<end:
-            op=ops[i]
-            bufferSet = {}
-            # find the set union of input and output buffers for the op
-            for input, part in zip(op._inputs, op._input_parts):
-                if input not in bufferSet:
-                    proj = strategies[i].get_projection(part)
-                    if hasattr(proj, 'part'):
-                        bufferSet[input]=proj
-            for output, part in zip(op._outputs, op._output_parts):
-                if output not in bufferSet:
-                    proj = strategies[i].get_projection(part)
-                    if hasattr(proj, 'part'):
-                        bufferSet[output]=proj
-            if i==0: #we only iterate from i==1 onwards
-                i+=1
-                continue
-            # for each op in the union, record its associated transform
-            for buffer in bufferSet.keys():
-                proj = bufferSet[buffer]
-                matrix = proj.part.index_partition.functor.transform.trans
-                if buffer not in store_to_ops:
-                    store_to_ops[buffer] = matrix
-                else: #we see a new projection for the same buffer
-                    if not np.array_equal(matrix, store_to_ops[buffer]):
-                        intervals.append((start, i))
-                        start=i
-                        i=start+1
-                        store_to_ops={}
-                        continue
-            i+=1
-        if start<end:
-            intervals.append((start,end))
-        return True, intervals
-     
-    
-    def apply2(self, contexts, runtime, ops, baseIntervals,  partitioners, strategies):
-        store_to_ops = {}
-        
-        intervals = []
-        i=0
-        #for i, op in enumerate(ops):
         for baseInterval in baseIntervals:
             start, end = baseInterval
             i=start
@@ -984,32 +867,11 @@ def apply2(self, contexts, runtime, ops, baseIntervals,  partitioners, strategie
         return intervals
 
 
-
 class IdenticalLaunchShapes(FusionConstraint):
     """Fusion rule that only ops with identical
        launch shapes can be fused"""
-    def apply(self, contexts, runtime, ops, partitioners, strategies):
-        launch_shapes = []
-        for i in range(len(ops)):
-            launch_shapes.append(strategies[i]._launch_shape)
-        intervals =[]
-        i=1
-        start=0
-        end = len(launch_shapes)
-        while i<end:
-            leftNone = launch_shapes[i] is None and (launch_shapes[i-1] is not None)
-            rightNone = launch_shapes[i-1] is None and (launch_shapes[i] is not None)
-            if leftNone or rightNone or launch_shapes[i]!=launch_shapes[i-1]:
-                intervals.append((start, i))
-                start=i
-                i=start+1
-            else:
-                i+=1
-        if start<end:
-            intervals.append((start, end))
-        return True, intervals
 
-    def apply2(self, contexts, runtime, ops, baseIntervals, partitioners, strategies):
+    def apply(self, contexts, runtime, ops, baseIntervals, partitioners, strategies):
         launch_shapes = []
         for i in range(len(ops)):
             launch_shapes.append(strategies[i]._launch_shape)
@@ -1031,49 +893,12 @@ def apply2(self, contexts, runtime, ops, baseIntervals, partitioners, strategies
                 intervals.append((start, end))
         return intervals
 
+
 class ValidProducerConsumer(FusionConstraint):
     """In a fused op, there cannot be a producer consumer
         relationship between different views of the same buffers"""
 
-    def apply(self, contexts, runtime, ops, partitioners, strategies):
-        childMap = {}
-        intervals = []
-        i, start=0, 0
-        end = len(ops)
-        def getRoot(store):
-            while store._parent:
-                store = store._parent
-            return store
-        
-        while i<end:
-            op = ops[i] 
-            #check consumers view of root array
-            #is the same as the producers view
-            isSame = True
-            for input in op._inputs:
-                inputRoot = getRoot(input) 
-                if inputRoot in childMap:
-                    isSame = isSame and childMap[inputRoot] == input
-
-            if not isSame:                     
-                intervals.append((start, i))
-                start=i
-                i=start+1
-                childMap = {}
-            else:
-                i=i+1 
-            #register producers view of buffer
-            for output in op._outputs:
-                outputRoot = getRoot(output)
-                if outputRoot not in childMap:  
-                    childMap[outputRoot] = output
-            #end while
-        if start<end:
-           intervals.append((start,end))
-        return True, intervals
- 
-
-    def apply2(self, contexts, runtime, ops, baseIntervals,  partitioners, strategies):
+    def apply(self, contexts, runtime, ops, baseIntervals,  partitioners, strategies):
         childMap = {}
         intervals = []
         i, start=0, 0
@@ -1157,9 +982,9 @@ def __init__(self, core_library):
         # Legate operations.
         self._outstanding_ops = []
 
-        #self._window_size = self._core_context.get_tunable(
-        #    legion.LEGATE_CORE_TUNABLE_WINDOW_SIZE,
-        #    ty.uint32,)
+        self._window_size = self._core_context.get_tunable(
+            legion.LEGATE_CORE_TUNABLE_WINDOW_SIZE,
+            ty.uint32,)
         self._window_size=50
         self._fusion_threshold =2
         #used for logging window and fusion lengths
@@ -1342,8 +1167,7 @@ def build_fused_op(self,ops):
         fusion_checker = FusionChecker(ops, self._contexts, self)
         fusion_checker.register_constraint(AllValidOps())
         fusion_checker.register_constraint(IdenticalLaunchShapes())
-        #this constraints doesn't seem to be necessary?
-        #fusion_checker.register_constraint(IdenticalProjection())
+        fusion_checker.register_constraint(IdenticalProjection())
         fusion_checker.register_constraint(ValidProducerConsumer())
         can_fuse,fusable_sets, partitions = fusion_checker.can_fuse()
                 
@@ -1410,15 +1234,8 @@ def build_fused_op(self,ops):
                     opID+=1
                 new_op_list.append(fused_task)
         strats=[]
-        """
-        stoptime = datetime.datetime.now()
-        delta=stoptime-starttime
-        total = delta.total_seconds() * 1000.0
-        print("buildtime", total)
-        """
    
         redoPar=False
-        #starttime = datetime.datetime.now()
         for i,fused_task in enumerate(new_op_list):
             if redoPar:
                 must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [fused_task])
@@ -1429,11 +1246,6 @@ def build_fused_op(self,ops):
             else:
                 fused_task.strategy = super_strategies[i]
                 strats.append( super_strategies[i])
-        #stoptime = datetime.datetime.now()
-        #delta=stoptime-starttime
-        #total = delta.total_seconds() * 1000.0
-        #print("repartime", total)
-        #pr.disable()
         return new_op_list, strats       
 
     def _launch_outstanding(self, force_eval=True):        

From 239ae351fcc38683490be05efe0ea8d9cece8fcf Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@sapling.stanford.edu>
Date: Sun, 12 Dec 2021 17:01:17 -0800
Subject: [PATCH 38/44] removing serializer code

---
 src/core/data/scalar.h             |  2 --
 src/core/data/scalar.inl           |  1 -
 src/core/data/store.cc             | 13 ++++---------
 src/core/data/store.h              | 14 ++------------
 src/core/data/transform.cc         | 25 -------------------------
 src/core/data/transform.h          | 14 --------------
 src/core/runtime/context.cc        | 11 -----------
 src/core/runtime/context.h         | 10 +++-------
 src/core/utilities/deserializer.cc |  4 ++--
 9 files changed, 11 insertions(+), 83 deletions(-)

diff --git a/src/core/data/scalar.h b/src/core/data/scalar.h
index 205e51a51..852121a19 100644
--- a/src/core/data/scalar.h
+++ b/src/core/data/scalar.h
@@ -19,7 +19,6 @@
 #include "core/utilities/span.h"
 #include "core/utilities/type_traits.h"
 #include "core/utilities/typedefs.h"
-#include "core/utilities/makeshift_serializer.h"
 
 namespace legate {
 
@@ -59,7 +58,6 @@ class Scalar {
   LegateTypeCode code_{MAX_TYPE_NUMBER};
   const void* data_;
 
-  friend class MakeshiftSerializer;
 };
 
 }  // namespace legate
diff --git a/src/core/data/scalar.inl b/src/core/data/scalar.inl
index ec629708c..892ce414e 100644
--- a/src/core/data/scalar.inl
+++ b/src/core/data/scalar.inl
@@ -44,7 +44,6 @@ VAL Scalar::value() const
 template <typename VAL>
 Span<const VAL> Scalar::values() const
 {
-
   if (tuple_) {
     auto size = *static_cast<const uint32_t*>(data_);
     auto data = static_cast<const uint8_t*>(data_) + sizeof(uint32_t);
diff --git a/src/core/data/store.cc b/src/core/data/store.cc
index fd40a4dd9..ed21a6957 100644
--- a/src/core/data/store.cc
+++ b/src/core/data/store.cc
@@ -21,8 +21,8 @@ namespace legate {
 
 using namespace Legion;
 
-RegionField::RegionField(int32_t dim, const PhysicalRegion& pr, FieldID fid, unsigned reqIdx)
-  : dim_(dim), pr_(pr), fid_(fid), reqIdx_(reqIdx)
+RegionField::RegionField(int32_t dim, const PhysicalRegion& pr, FieldID fid)
+  : dim_(dim), pr_(pr), fid_(fid)
 {
   auto priv  = pr.get_privilege();
   readable_  = static_cast<bool>(priv & LEGION_READ_PRIV);
@@ -34,7 +34,6 @@ RegionField::RegionField(RegionField&& other) noexcept
   : dim_(other.dim_),
     pr_(other.pr_),
     fid_(other.fid_),
-    reqIdx_(other.reqIdx_),
     readable_(other.readable_),
     writable_(other.writable_),
     reducible_(other.reducible_)
@@ -46,7 +45,6 @@ RegionField& RegionField::operator=(RegionField&& other) noexcept
   dim_       = other.dim_;
   pr_        = other.pr_;
   fid_       = other.fid_;
-  reqIdx_    = other.reqIdx_; 
 
   readable_  = other.readable_;
   writable_  = other.writable_;
@@ -56,15 +54,14 @@ RegionField& RegionField::operator=(RegionField&& other) noexcept
 
 Domain RegionField::domain() const { return dim_dispatch(dim_, get_domain_fn{}, pr_); }
 
-OutputRegionField::OutputRegionField(const OutputRegion& out, FieldID fid, unsigned reqIdx) : out_(out), fid_(fid), reqIdx_(reqIdx) {}
+OutputRegionField::OutputRegionField(const OutputRegion& out, FieldID fid) : out_(out), fid_(fid) {}
 
 OutputRegionField::OutputRegionField(OutputRegionField&& other) noexcept
-  : bound_(other.bound_), out_(other.out_), fid_(other.fid_), reqIdx_(other.reqIdx_)
+  : bound_(other.bound_), out_(other.out_), fid_(other.fid_)
 {
   other.bound_ = false;
   other.out_   = OutputRegion();
   other.fid_   = -1;
-  //TODO, how should we invalidate reqIdx
 }
 
 OutputRegionField& OutputRegionField::operator=(OutputRegionField&& other) noexcept
@@ -72,12 +69,10 @@ OutputRegionField& OutputRegionField::operator=(OutputRegionField&& other) noexc
   bound_ = other.bound_;
   out_   = other.out_;
   fid_   = other.fid_;
-  reqIdx_= other.reqIdx_;
 
   other.bound_ = false;
   other.out_   = OutputRegion();
   other.fid_   = -1;
-  //TODO, how should we invalidate reqIdx
 
   return *this;
 }
diff --git a/src/core/data/store.h b/src/core/data/store.h
index eef2896d4..308c50cd1 100644
--- a/src/core/data/store.h
+++ b/src/core/data/store.h
@@ -22,14 +22,13 @@
 #include "core/task/return.h"
 #include "core/utilities/machine.h"
 #include "core/utilities/typedefs.h"
-#include "core/utilities/makeshift_serializer.h"
 
 namespace legate {
 
 class RegionField {
  public:
   RegionField() {}
-  RegionField(int32_t dim, const Legion::PhysicalRegion& pr, Legion::FieldID fid, unsigned reqIdx);
+  RegionField(int32_t dim, const Legion::PhysicalRegion& pr, Legion::FieldID fid);
 
  public:
   RegionField(RegionField&& other) noexcept;
@@ -139,7 +138,6 @@ class RegionField {
   template <int32_t DIM>
   Legion::Rect<DIM> shape() const;
   Legion::Domain domain() const;
-  unsigned getReqIdx() const {return reqIdx_; }
 
  public:
   bool is_readable() const { return readable_; }
@@ -150,20 +148,18 @@ class RegionField {
   int32_t dim_{-1};
   Legion::PhysicalRegion pr_{};
   Legion::FieldID fid_{-1U};
-  unsigned reqIdx_; //this gets packed as an unsigned
 
  private:
   bool readable_{false};
   bool writable_{false};
   bool reducible_{false};
 
-  friend class MakeshiftSerializer;
 };
 
 class OutputRegionField {
  public:
   OutputRegionField() {}
-  OutputRegionField(const Legion::OutputRegion& out, Legion::FieldID fid, unsigned reqIdx);
+  OutputRegionField(const Legion::OutputRegion& out, Legion::FieldID fid);
 
  public:
   OutputRegionField(OutputRegionField&& other) noexcept;
@@ -181,9 +177,7 @@ class OutputRegionField {
   bool bound_{false};
   Legion::OutputRegion out_{};
   Legion::FieldID fid_{-1U};
-  unsigned reqIdx_;  //this gets packed as an unsigned
 
-  friend class MakeshiftSerializer;
 };
 
 class FutureWrapper {
@@ -246,7 +240,6 @@ class FutureWrapper {
   mutable bool uninitialized_{true};
   mutable void* rawptr_{nullptr};
 
- friend class MakeshiftSerializer;
 };
 
 class Store {
@@ -275,7 +268,6 @@ class Store {
 
  public:
   int32_t dim() const { return dim_; }
-  bool is_future2() const { return is_future_; }
   LegateTypeCode code() const { return code_; }
 
  public:
@@ -302,7 +294,6 @@ class Store {
   template <int32_t DIM>
   Legion::Rect<DIM> shape() const;
   Legion::Domain domain() const;
-  unsigned getReqIdx() const {return region_field_.getReqIdx(); }
 
  public:
   bool is_readable() const { return readable_; }
@@ -341,7 +332,6 @@ class Store {
   bool writable_{false};
   bool reducible_{false};
 
- friend class MakeshiftSerializer;
 };
 
 //containts prefix sums for a sub-op
diff --git a/src/core/data/transform.cc b/src/core/data/transform.cc
index 4ca2b071e..0f2fddc66 100644
--- a/src/core/data/transform.cc
+++ b/src/core/data/transform.cc
@@ -45,11 +45,6 @@ Shift::Shift(int32_t dim, int64_t offset, StoreTransformP parent)
 {
 }
 
-int32_t Shift::getTransformCode() const
-{
-    return LEGATE_CORE_TRANSFORM_SHIFT;
-}
-
 Domain Shift::transform(const Domain& input) const
 {
   auto result = nullptr != parent_ ? parent_->transform(input) : input;
@@ -103,11 +98,6 @@ Promote::Promote(int32_t extra_dim, int64_t dim_size, StoreTransformP parent)
 {
 }
 
-int32_t Promote::getTransformCode() const
-{
-    return LEGATE_CORE_TRANSFORM_PROMOTE;
-}
-
 Domain Promote::transform(const Domain& input) const
 {
   auto promote = [](int32_t extra_dim, int64_t dim_size, const Domain& input) {
@@ -174,11 +164,6 @@ Project::Project(int32_t dim, int64_t coord, StoreTransformP parent)
 {
 }
 
-int32_t Project::getTransformCode() const
-{
-    return LEGATE_CORE_TRANSFORM_PROJECT;
-}
-
 Domain Project::transform(const Domain& input) const
 {
   auto project = [](int32_t collapsed_dim, const Domain& input) {
@@ -247,11 +232,6 @@ Transpose::Transpose(std::vector<int32_t>&& axes, StoreTransformP parent)
 {
 }
 
-int32_t Transpose::getTransformCode() const
-{
-    return LEGATE_CORE_TRANSFORM_TRANSPOSE;
-}
-
 Domain Transpose::transform(const Domain& input) const
 {
   auto transpose = [](const auto& axes, const Domain& input) {
@@ -335,11 +315,6 @@ Delinearize::Delinearize(int32_t dim, std::vector<int64_t>&& sizes, StoreTransfo
   for (auto size : sizes_) volume_ *= size;
 }
 
-int32_t Delinearize::getTransformCode() const
-{
-    return LEGATE_CORE_TRANSFORM_DELINEARIZE;
-}
-
 Domain Delinearize::transform(const Domain& input) const
 {
   auto delinearize = [](const auto dim, const auto ndim, const auto& strides, const Domain& input) {
diff --git a/src/core/data/transform.h b/src/core/data/transform.h
index 6c272b735..e39b75962 100644
--- a/src/core/data/transform.h
+++ b/src/core/data/transform.h
@@ -19,9 +19,7 @@
 #include <memory>
 
 #include "legion.h"
-#include "core/legate_c.h"
 
-class MakeshiftSerializer;
 namespace legate {
 
 class StoreTransform {
@@ -33,11 +31,9 @@ class StoreTransform {
  public:
   virtual Legion::Domain transform(const Legion::Domain& input) const           = 0;
   virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const = 0;
-  virtual int32_t getTransformCode() const =0;
   virtual void print(std::ostream& out) const                                   = 0;
 
  protected:
-  friend class MakeshiftSerializer;
   std::shared_ptr<StoreTransform> parent_{nullptr};
 };
 
@@ -51,13 +47,11 @@ class Shift : public StoreTransform {
  public:
   virtual Legion::Domain transform(const Legion::Domain& input) const override;
   virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override;
-  virtual int32_t getTransformCode() const override;
   virtual void print(std::ostream& out) const override;
 
  private:
   int32_t dim_;
   int64_t offset_; 
- friend class MakeshiftSerializer;
 };
 
 class Promote : public StoreTransform {
@@ -68,13 +62,11 @@ class Promote : public StoreTransform {
  public:
   virtual Legion::Domain transform(const Legion::Domain& input) const override;
   virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override;
-  virtual int32_t getTransformCode() const override;
   virtual void print(std::ostream& out) const override;
 
  private:
   int32_t extra_dim_;
   int64_t dim_size_;
- friend class MakeshiftSerializer;
 };
 
 class Project : public StoreTransform {
@@ -85,13 +77,11 @@ class Project : public StoreTransform {
  public:
   virtual Legion::Domain transform(const Legion::Domain& domain) const override;
   virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override;
-  virtual int32_t getTransformCode() const override;
   virtual void print(std::ostream& out) const override;
 
  private:
   int32_t dim_;
   int64_t coord_;
- friend class MakeshiftSerializer;
 };
 
 class Transpose : public StoreTransform {
@@ -102,12 +92,10 @@ class Transpose : public StoreTransform {
  public:
   virtual Legion::Domain transform(const Legion::Domain& domain) const override;
   virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override;
-  virtual int32_t getTransformCode() const override;
   virtual void print(std::ostream& out) const override;
 
  private:
   std::vector<int32_t> axes_;
- friend class MakeshiftSerializer;
 };
 
 class Delinearize : public StoreTransform {
@@ -120,7 +108,6 @@ class Delinearize : public StoreTransform {
  public:
   virtual Legion::Domain transform(const Legion::Domain& domain) const override;
   virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override;
-  virtual int32_t getTransformCode() const override;
   virtual void print(std::ostream& out) const override;
 
  private:
@@ -128,7 +115,6 @@ class Delinearize : public StoreTransform {
   std::vector<int64_t> sizes_;
   std::vector<int64_t> strides_;
   int64_t volume_;
- friend class MakeshiftSerializer;
 };
 
 }  // namespace legate
diff --git a/src/core/runtime/context.cc b/src/core/runtime/context.cc
index 4884c0110..79bfbf6a4 100644
--- a/src/core/runtime/context.cc
+++ b/src/core/runtime/context.cc
@@ -154,17 +154,6 @@ TaskContext::TaskContext(const Legion::Task* task,
   scalars_    = dez.unpack<std::vector<Scalar>>();
 
 }
-/*
-  TaskContext::TaskContext(std::vector<Store>& inputs, std::vector<Store>& outputs,
-              std::vector<Store>& reductions, std::vector<Scalar>& scalars) 
-  : inputs_(inputs), outputs_(outputs), reductions_(reductions), scalars_(scalars) 
-{
-    regions_ = NULL;
-    context_ = NULL;
-    runtime_ = NULL;
-    task_ = NULL;
-}
-*/
 
 ReturnValues TaskContext::pack_return_values() const
 {
diff --git a/src/core/runtime/context.h b/src/core/runtime/context.h
index c9cb29462..245024e10 100644
--- a/src/core/runtime/context.h
+++ b/src/core/runtime/context.h
@@ -18,6 +18,7 @@
 
 #include "legion.h"
 #include "core/data/scalar.h"
+#include "core/data/store.h"
 
 #include "core/task/return.h"
 
@@ -25,7 +26,7 @@ namespace legate {
 
 class Store;
 class Scalar;
-struct FusionMetadata;
+//struct FusionMetadata;
 
 struct ResourceConfig {
   int64_t max_tasks{1000000};
@@ -118,13 +119,8 @@ class TaskContext {
               Legion::Runtime* runtime);
 
   TaskContext(const Legion::Task* task, const std::vector<Legion::PhysicalRegion> regions)
-//             std::vector<Store>& inputs, std::vector<Store>& outputs, std::vector<Scalar>& scalars)
   : task_(task), regions_(regions) 
- {
-    //inputs_=inputs;  
-    //outputs_=outputs;
-    //scalars_=scalars;
-}
+ {}
 
 
  public:
diff --git a/src/core/utilities/deserializer.cc b/src/core/utilities/deserializer.cc
index 3aa297811..9a0813013 100644
--- a/src/core/utilities/deserializer.cc
+++ b/src/core/utilities/deserializer.cc
@@ -145,7 +145,7 @@ void TaskDeserializer::_unpack(RegionField& value)
   auto dim = unpack<int32_t>();
   auto idx = unpack<uint32_t>();
   auto fid = unpack<int32_t>();
-  value = RegionField(dim, regions_[idx], fid, idx);
+  value = RegionField(dim, regions_[idx], fid);
 }
 
 void TaskDeserializer::_unpack(OutputRegionField& value)
@@ -155,7 +155,7 @@ void TaskDeserializer::_unpack(OutputRegionField& value)
   auto idx = unpack<uint32_t>();
   auto fid = unpack<int32_t>();
 
-  value = OutputRegionField(outputs_[idx], fid, idx);
+  value = OutputRegionField(outputs_[idx], fid);
 }
 
 namespace mapping {

From b49557bfa69008cd9254bfe55c75dc978f8d3026 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@sapling.stanford.edu>
Date: Sun, 12 Dec 2021 17:08:42 -0800
Subject: [PATCH 39/44] more cleanup

---
 legate/core/operation.py | 1 -
 src/core/data/store.cc   | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/legate/core/operation.py b/legate/core/operation.py
index 861b042b5..5f148a5d0 100644
--- a/legate/core/operation.py
+++ b/legate/core/operation.py
@@ -35,7 +35,6 @@ def __init__(self, context, mapper_id=0, op_id=0):
         self._outputs = []
         self._reductions = []
         self._is_fused = False
-        self._temps = []
         self._input_parts = []
         self._output_parts = []
         self._reduction_parts = []
diff --git a/src/core/data/store.cc b/src/core/data/store.cc
index ed21a6957..4fd19b34c 100644
--- a/src/core/data/store.cc
+++ b/src/core/data/store.cc
@@ -122,8 +122,8 @@ Domain FutureWrapper::domain() const { return domain_; }
 ReturnValue FutureWrapper::pack() const
 {
   if (nullptr == rawptr_) {
-    //fprintf(stderr, "Found an uninitialized Legate store\n");
-    //assert(false);
+    fprintf(stderr, "Found an uninitialized Legate store\n");
+    assert(false);
   }
   return ReturnValue(rawptr_, field_size_);
 }

From e05e3ffd5171425cf2593101877a772de3dbf7a8 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@sapling.stanford.edu>
Date: Sun, 12 Dec 2021 18:01:04 -0800
Subject: [PATCH 40/44] remove fusion reference from core

---
 legate/core/corelib.py          |  3 ++-
 legate/core/runtime.py          | 30 ++++++++++++++++++------------
 src/core.mk                     | 12 +++++++-----
 src/core/legate_c.h             |  1 +
 src/core/mapping/core_mapper.cc |  3 +--
 src/core/runtime/runtime.cc     | 25 +++++++++++++++++++++++++
 src/core/runtime/runtime.h      |  2 ++
 src/core/task/task.cc           |  6 +++++-
 src/core/task/task.h            |  3 +++
 src/legate.h                    |  2 ++
 10 files changed, 66 insertions(+), 21 deletions(-)

diff --git a/legate/core/corelib.py b/legate/core/corelib.py
index ff35071a6..77800d20d 100644
--- a/legate/core/corelib.py
+++ b/legate/core/corelib.py
@@ -24,7 +24,7 @@
 class CoreLib(Library):
     def __init__(self):
         self._lib = None
-
+        
     def get_name(self):
         return "legate.core"
 
@@ -38,6 +38,7 @@ def get_c_header(self):
     def initialize(self, shared_lib):
         self._lib = shared_lib
         shared_lib.legate_parse_config()
+        #self.fused_id = self._lib.LEGATE_CORE_FUSED_TASK_ID
 
     def get_registration_callback(self):
         return "legate_core_perform_registration"
diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index 21793b891..b0e2c53bf 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -778,6 +778,17 @@ def apply(self, contexts, runtime, ops, baseIntervals, partitioners, strategies)
          """
         raise NotImplementedError("Implement in derived classes")
 
+class cuNumericContextExists(FusionConstraint):
+    """
+    Fusion currently exists as a cuNumeric operation, so without a "cunumeric" context
+    each op gets its own (i, i+1) interval. Can be removed once fusion becomes a core task.
+    """
+    def apply(self, contexts, runtime, ops, baseIntervals, partitioners, strategies):
+        if "cunumeric" in contexts:
+            return baseIntervals
+        else:
+           return [(i, i+1) for i in range(len(ops))]
+
 
 class AllValidOps(FusionConstraint):
     """
@@ -789,7 +800,7 @@ def __init__(self):
         self.terminals = set()
         self.validIDs.add(2) #Binary op
         self.validIDs.add(10) #Fill op
-        self.validIDs.add(20) #Unary op
+        self.validIDs.add(21) #Unary op
 
     def apply(self, contexts, runtime, ops, baseIntervals, partitioners, strategies):
         fusable_intervals = []
@@ -1165,6 +1176,7 @@ def serialize_multiop_metadata(self, numpy_context, ops):
     def build_fused_op(self,ops):
 
         fusion_checker = FusionChecker(ops, self._contexts, self)
+        fusion_checker.register_constraint(cuNumericContextExists())
         fusion_checker.register_constraint(AllValidOps())
         fusion_checker.register_constraint(IdenticalLaunchShapes())
         fusion_checker.register_constraint(IdenticalProjection())
@@ -1254,10 +1266,8 @@ def _launch_outstanding(self, force_eval=True):
             self._outstanding_ops = []
             self._schedule(ops, force_eval)
 
-   
     def _schedule(self, ops, force_eval=False):
         ids = [op._task_id for op in ops]
-        #print(ids)
         #case 1: try fusing current window of tasks
         strats = False
         if len(ops)>=2 and (not force_eval):
@@ -1267,30 +1277,26 @@ def _schedule(self, ops, force_eval=False):
                 for task in fused_task_list:
                     task.execute() 
                 self._clearing_pipe = False
-                return
 
         # case 2: tasks  processed for fusion already have  
         # their strategy "baked in", as we already partitioned
         # them when testing fusion legality (in case 1)
-        if len(ops)==1 and self._clearing_pipe:
+        elif len(ops)==1 and self._clearing_pipe:
             strategy = ops[0].strategy
             ops[0].launch(strategy)
 
         # case 3: execute the ops normally 
-        # if we already checked the ops for fusability,
-        # then the ops' buffers have already been partitioned
+        # partition if op wasn't checked for fusability
         else:
             if not strats: #ops were not check for fusability, so partition them
+                strats = []
                 for op in ops:
                     must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op])
                     partitioner = Partitioner(self, [op], must_be_single=must_be_single)
                     strategy = partitioner.partition_stores()
-                    op.strategy = strategy
-            else: #strategies already calculated during failed attempt to fuse 
-                for i,op in enumerate(ops):
-                    op.strategy = strats[i]
+                    strats.append(strategy)
             for i,op in enumerate(ops):
-                op.launch(op.strategy)
+                op.launch(strats[i])
 
 
     def submit(self, op):
diff --git a/src/core.mk b/src/core.mk
index 988d6741d..3aa885b2c 100644
--- a/src/core.mk
+++ b/src/core.mk
@@ -31,12 +31,13 @@ GEN_CPU_SRC	= core/legate_c.cc                 \
 							core/task/return.cc              \
 							core/task/task.cc                \
 							core/utilities/deserializer.cc   \
-							core/utilities/makeshift_serializer.cc \
 							core/utilities/machine.cc        \
-							core/utilities/linearize.cc
+							core/utilities/linearize.cc    # \
+							#core/fused/fused_op_gpu.cc     \
+							#core/fused/fused_op.cc
 
 ifeq ($(strip $(USE_CUDA)),1)
-GEN_CPU_SRC	+= core/gpu/cudalibs.cc
+GEN_CPU_SRC	+= core/gpu/cudalibs.cc 
 endif
 
 # Header files that we need to have installed for client legate libraries
@@ -63,6 +64,7 @@ INSTALL_HEADERS = legate.h                        \
 									core/utilities/dispatch.h       \
 									core/utilities/machine.h        \
 									core/utilities/span.h           \
-									core/utilities/makeshift_serializer.h  \
 									core/utilities/type_traits.h    \
-									core/utilities/typedefs.h
+									core/utilities/typedefs.h      #\
+							                #core/fused/fused_op.h           \
+							                #core/fused/fused_op_wrapper.h
diff --git a/src/core/legate_c.h b/src/core/legate_c.h
index d3e12a502..132328cf7 100644
--- a/src/core/legate_c.h
+++ b/src/core/legate_c.h
@@ -21,6 +21,7 @@ typedef enum legate_core_task_id_t {
   LEGATE_CORE_INITIALIZE_TASK_ID,
   LEGATE_CORE_FINALIZE_TASK_ID,
   LEGATE_CORE_EXTRACT_SCALAR_TASK_ID,
+  //LEGATE_CORE_FUSED_TASK_ID,
   LEGATE_CORE_NUM_TASK_IDS,  // must be last
 } legate_core_task_id_t;
 
diff --git a/src/core/mapping/core_mapper.cc b/src/core/mapping/core_mapper.cc
index 7f0c8ae0a..97a059166 100644
--- a/src/core/mapping/core_mapper.cc
+++ b/src/core/mapping/core_mapper.cc
@@ -215,8 +215,7 @@ Mapper::MapperSyncModel CoreMapper::get_mapper_sync_model(void) const
 
 void CoreMapper::select_task_options(const MapperContext ctx, const Task& task, TaskOptions& output)
 {
-  //std::cout<<"task_id "<<task.task_id<<std::endl;
-  //assert(context.valid_task_id(task.task_id));
+  assert(context.valid_task_id(task.task_id));
   if (task.tag == LEGATE_CPU_VARIANT) {
     assert(!local_cpus.empty());
     output.initial_proc = local_cpus.front();
diff --git a/src/core/runtime/runtime.cc b/src/core/runtime/runtime.cc
index eb57f46d8..85c21307c 100644
--- a/src/core/runtime/runtime.cc
+++ b/src/core/runtime/runtime.cc
@@ -18,6 +18,7 @@
 #include "core/runtime/context.h"
 #include "core/runtime/projection.h"
 #include "core/runtime/shard.h"
+#include "core/fused/fused_op.h"
 #include "core/utilities/deserializer.h"
 #include "legate.h"
 #ifdef LEGATE_USE_CUDA
@@ -36,8 +37,10 @@ Logger log_legate("legate");
 using LegateVariantImpl = void (*)(TaskContext&);
 /*static */ std::vector<std::pair<int64_t, LegateVariantImpl> > Core::opIDs = *(new std::vector<std::pair<int64_t, LegateVariantImpl> >());
 /*static */ std::vector<std::pair<int64_t, LegateVariantImpl> > Core::gpuOpIDs = *(new std::vector<std::pair<int64_t, LegateVariantImpl> >());
+/*static */ std::vector<std::pair<int64_t, LegateVariantImpl> > Core::ompOpIDs = *(new std::vector<std::pair<int64_t, LegateVariantImpl> >());
 /*static */ std::unordered_map<long, LegateVariantImpl> Core::cpuDescriptors = *(new std::unordered_map<long, LegateVariantImpl>());
 /*static */ std::unordered_map<long, LegateVariantImpl> Core::gpuDescriptors = *(new std::unordered_map<long, LegateVariantImpl>());
+/*static */ std::unordered_map<long, LegateVariantImpl> Core::ompDescriptors = *(new std::unordered_map<long, LegateVariantImpl>());
 
 static const char* const core_library_name = "legate.core";
 
@@ -180,6 +183,12 @@ void register_legate_core_tasks(Machine machine, Runtime* runtime, const Library
   runtime->attach_name(
     extract_scalar_task_id, extract_scalar_task_name, false /*mutable*/, true /*local only*/);
 
+  //const TaskID fused_task_id  = context.get_task_id(LEGATE_CORE_FUSED_TASK2_ID);
+  //const char* fused_task_name = "Legate Core Task Fusion";
+  //runtime->attach_name(
+  //  fused_task_id, fused_task_name, false /*mutable*/, true /*local only*/);
+
+
   auto make_registrar = [&](auto task_id, auto* task_name, auto proc_kind) {
     TaskVariantRegistrar registrar(task_id, task_name);
     registrar.add_constraint(ProcessorConstraint(proc_kind));
@@ -203,6 +212,19 @@ void register_legate_core_tasks(Machine machine, Runtime* runtime, const Library
     runtime->register_task_variant<ReturnValues, extract_scalar_task>(registrar,
                                                                       LEGATE_CPU_VARIANT);
   }
+  /*
+  {
+    auto registrar = make_registrar(fused_task_id, fused_task_name, Processor::LOC_PROC);
+    runtime->register_task_variant<legate::FusedOpTask2>(registrar, LEGATE_CPU_VARIANT);
+  }
+  {
+    auto registrar = make_registrar(fused_task_id, fused_task_name, Processor::TOC_PROC);
+    runtime->register_task_variant<legate::FusedOpTask2>(registrar, LEGATE_GPU_VARIANT);
+  }
+  */
+
+
+
 #ifdef LEGATE_USE_CUDA
   {
     auto registrar = make_registrar(initialize_task_id, initialize_task_name, Processor::TOC_PROC);
@@ -244,6 +266,9 @@ void register_legate_core_tasks(Machine machine, Runtime* runtime, const Library
   register_legate_core_projection_functors(runtime, context);
 
   register_legate_core_sharding_functors(runtime, context);
+
+  std::cout<<"performing legate core registration callback"<<std::endl;
+  CoreFused::get_registrar().register_all_tasks(runtime, context);
 }
 
 }  // namespace legate
diff --git a/src/core/runtime/runtime.h b/src/core/runtime/runtime.h
index 6d1dc8341..566ad4adb 100644
--- a/src/core/runtime/runtime.h
+++ b/src/core/runtime/runtime.h
@@ -35,8 +35,10 @@ class Core {
   static void shutdown(void);
   static std::unordered_map<int64_t, LegateVariantImpl> cpuDescriptors; 
   static std::unordered_map<int64_t, LegateVariantImpl> gpuDescriptors; 
+  static std::unordered_map<int64_t, LegateVariantImpl> ompDescriptors; 
   static std::vector<std::pair<int64_t, LegateVariantImpl> > opIDs;
   static std::vector<std::pair<int64_t, LegateVariantImpl> > gpuOpIDs;
+  static std::vector<std::pair<int64_t, LegateVariantImpl> > ompOpIDs;
 
  public:
   // Configuration settings
diff --git a/src/core/task/task.cc b/src/core/task/task.cc
index 4749aba8f..64adf10b9 100644
--- a/src/core/task/task.cc
+++ b/src/core/task/task.cc
@@ -35,7 +35,6 @@ void LegateTaskRegistrar::record_variant(TaskID tid,
   assert((kind == Processor::LOC_PROC) || (kind == Processor::TOC_PROC) ||
          (kind == Processor::OMP_PROC));
 
-
   // Buffer these up until we can do our actual registration with the runtime
   pending_task_variants_.push_back(PendingTaskVariant(tid,
                                                       false /*global*/,
@@ -68,6 +67,11 @@ void LegateTaskRegistrar::register_all_tasks(Runtime* runtime, LibraryContext& c
     Core::gpuDescriptors.insert(std::pair<int64_t, LegateVariantImpl>((int64_t) newID, taskIdx.second));
   }
 
+  for (auto& taskIdx : Core::ompOpIDs){
+    auto newID = context.get_task_id(taskIdx.first);
+    Core::ompDescriptors.insert(std::pair<int64_t, LegateVariantImpl>((int64_t) newID, taskIdx.second));
+  }
+
 
 
   // Do all our registrations
diff --git a/src/core/task/task.h b/src/core/task/task.h
index cc5d08628..a5ccb25df 100644
--- a/src/core/task/task.h
+++ b/src/core/task/task.h
@@ -140,6 +140,9 @@ class LegateTask {
     }else if (kind ==Legion::Processor::TOC_PROC){
         Core::gpuOpIDs.push_back(std::pair<int64_t, LegateVariantImpl>((int64_t)task_id, TASK_PTR));
     }
+    else if (kind ==Legion::Processor::OMP_PROC){
+        Core::ompOpIDs.push_back(std::pair<int64_t, LegateVariantImpl>((int64_t)task_id, TASK_PTR));
+    }
     T::Registrar::record_variant(task_id,
                                  T::task_name(),
                                  desc,
diff --git a/src/legate.h b/src/legate.h
index e786b5f87..8001a95ac 100644
--- a/src/legate.h
+++ b/src/legate.h
@@ -27,4 +27,6 @@
 #include "core/utilities/dispatch.h"
 #include "core/utilities/type_traits.h"
 #include "core/utilities/typedefs.h"
+//#include "core/fused/fused_op.h"
+//#include "core/fused/fused_op_wrapper.h"
 #include "legate_defines.h"

From a59e142cba0b8b27bdf06e35139370253ecf7999 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@sapling.stanford.edu>
Date: Sun, 12 Dec 2021 18:06:57 -0800
Subject: [PATCH 41/44] remove commented-out fusion code and CoreFused registration

---
 src/core.mk                 |  8 ++------
 src/core/legate_c.h         |  1 -
 src/core/runtime/runtime.cc | 19 -------------------
 3 files changed, 2 insertions(+), 26 deletions(-)

diff --git a/src/core.mk b/src/core.mk
index 3aa885b2c..337925308 100644
--- a/src/core.mk
+++ b/src/core.mk
@@ -32,9 +32,7 @@ GEN_CPU_SRC	= core/legate_c.cc                 \
 							core/task/task.cc                \
 							core/utilities/deserializer.cc   \
 							core/utilities/machine.cc        \
-							core/utilities/linearize.cc    # \
-							#core/fused/fused_op_gpu.cc     \
-							#core/fused/fused_op.cc
+							core/utilities/linearize.cc    
 
 ifeq ($(strip $(USE_CUDA)),1)
 GEN_CPU_SRC	+= core/gpu/cudalibs.cc 
@@ -65,6 +63,4 @@ INSTALL_HEADERS = legate.h                        \
 									core/utilities/machine.h        \
 									core/utilities/span.h           \
 									core/utilities/type_traits.h    \
-									core/utilities/typedefs.h      #\
-							                #core/fused/fused_op.h           \
-							                #core/fused/fused_op_wrapper.h
+									core/utilities/typedefs.h      
diff --git a/src/core/legate_c.h b/src/core/legate_c.h
index 132328cf7..d3e12a502 100644
--- a/src/core/legate_c.h
+++ b/src/core/legate_c.h
@@ -21,7 +21,6 @@ typedef enum legate_core_task_id_t {
   LEGATE_CORE_INITIALIZE_TASK_ID,
   LEGATE_CORE_FINALIZE_TASK_ID,
   LEGATE_CORE_EXTRACT_SCALAR_TASK_ID,
-  //LEGATE_CORE_FUSED_TASK_ID,
   LEGATE_CORE_NUM_TASK_IDS,  // must be last
 } legate_core_task_id_t;
 
diff --git a/src/core/runtime/runtime.cc b/src/core/runtime/runtime.cc
index 85c21307c..d869ec794 100644
--- a/src/core/runtime/runtime.cc
+++ b/src/core/runtime/runtime.cc
@@ -18,7 +18,6 @@
 #include "core/runtime/context.h"
 #include "core/runtime/projection.h"
 #include "core/runtime/shard.h"
-#include "core/fused/fused_op.h"
 #include "core/utilities/deserializer.h"
 #include "legate.h"
 #ifdef LEGATE_USE_CUDA
@@ -183,11 +182,6 @@ void register_legate_core_tasks(Machine machine, Runtime* runtime, const Library
   runtime->attach_name(
     extract_scalar_task_id, extract_scalar_task_name, false /*mutable*/, true /*local only*/);
 
-  //const TaskID fused_task_id  = context.get_task_id(LEGATE_CORE_FUSED_TASK2_ID);
-  //const char* fused_task_name = "Legate Core Task Fusion";
-  //runtime->attach_name(
-  //  fused_task_id, fused_task_name, false /*mutable*/, true /*local only*/);
-
 
   auto make_registrar = [&](auto task_id, auto* task_name, auto proc_kind) {
     TaskVariantRegistrar registrar(task_id, task_name);
@@ -212,17 +206,6 @@ void register_legate_core_tasks(Machine machine, Runtime* runtime, const Library
     runtime->register_task_variant<ReturnValues, extract_scalar_task>(registrar,
                                                                       LEGATE_CPU_VARIANT);
   }
-  /*
-  {
-    auto registrar = make_registrar(fused_task_id, fused_task_name, Processor::LOC_PROC);
-    runtime->register_task_variant<legate::FusedOpTask2>(registrar, LEGATE_CPU_VARIANT);
-  }
-  {
-    auto registrar = make_registrar(fused_task_id, fused_task_name, Processor::TOC_PROC);
-    runtime->register_task_variant<legate::FusedOpTask2>(registrar, LEGATE_GPU_VARIANT);
-  }
-  */
-
 
 
 #ifdef LEGATE_USE_CUDA
@@ -267,8 +250,6 @@ void register_legate_core_tasks(Machine machine, Runtime* runtime, const Library
 
   register_legate_core_sharding_functors(runtime, context);
 
-  std::cout<<"performing legate core registration callback"<<std::endl;
-  CoreFused::get_registrar().register_all_tasks(runtime, context);
 }
 
 }  // namespace legate

From 399d07005e7123f3f6e46234a1b028d5beed81e6 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@sapling.stanford.edu>
Date: Sun, 12 Dec 2021 18:12:09 -0800
Subject: [PATCH 42/44] more cleanup: stray blank lines and leftover fusion comments

---
 src/core/data/scalar.h      | 1 -
 src/core/data/store.h       | 5 +----
 src/core/runtime/context.h  | 2 --
 src/core/runtime/runtime.cc | 4 ----
 src/core/task/task.h        | 2 --
 src/legate.h                | 2 --
 6 files changed, 1 insertion(+), 15 deletions(-)

diff --git a/src/core/data/scalar.h b/src/core/data/scalar.h
index 852121a19..428c75386 100644
--- a/src/core/data/scalar.h
+++ b/src/core/data/scalar.h
@@ -57,7 +57,6 @@ class Scalar {
   bool tuple_{false};
   LegateTypeCode code_{MAX_TYPE_NUMBER};
   const void* data_;
-
 };
 
 }  // namespace legate
diff --git a/src/core/data/store.h b/src/core/data/store.h
index 308c50cd1..b14a5d850 100644
--- a/src/core/data/store.h
+++ b/src/core/data/store.h
@@ -17,6 +17,7 @@
 #pragma once
 
 #include "legion.h"
+
 #include "core/data/buffer.h"
 #include "core/data/transform.h"
 #include "core/task/return.h"
@@ -153,7 +154,6 @@ class RegionField {
   bool readable_{false};
   bool writable_{false};
   bool reducible_{false};
-
 };
 
 class OutputRegionField {
@@ -177,7 +177,6 @@ class OutputRegionField {
   bool bound_{false};
   Legion::OutputRegion out_{};
   Legion::FieldID fid_{-1U};
-
 };
 
 class FutureWrapper {
@@ -239,7 +238,6 @@ class FutureWrapper {
  private:
   mutable bool uninitialized_{true};
   mutable void* rawptr_{nullptr};
-
 };
 
 class Store {
@@ -331,7 +329,6 @@ class Store {
   bool readable_{false};
   bool writable_{false};
   bool reducible_{false};
-
 };
 
 //containts prefix sums for a sub-op
diff --git a/src/core/runtime/context.h b/src/core/runtime/context.h
index 245024e10..d93d1662f 100644
--- a/src/core/runtime/context.h
+++ b/src/core/runtime/context.h
@@ -26,7 +26,6 @@ namespace legate {
 
 class Store;
 class Scalar;
-//struct FusionMetadata;
 
 struct ResourceConfig {
   int64_t max_tasks{1000000};
@@ -36,7 +35,6 @@ struct ResourceConfig {
   int64_t max_shardings{0};
 };
 
-
 class ResourceScope {
  public:
   ResourceScope() = default;
diff --git a/src/core/runtime/runtime.cc b/src/core/runtime/runtime.cc
index d869ec794..a39159c66 100644
--- a/src/core/runtime/runtime.cc
+++ b/src/core/runtime/runtime.cc
@@ -182,7 +182,6 @@ void register_legate_core_tasks(Machine machine, Runtime* runtime, const Library
   runtime->attach_name(
     extract_scalar_task_id, extract_scalar_task_name, false /*mutable*/, true /*local only*/);
 
-
   auto make_registrar = [&](auto task_id, auto* task_name, auto proc_kind) {
     TaskVariantRegistrar registrar(task_id, task_name);
     registrar.add_constraint(ProcessorConstraint(proc_kind));
@@ -206,8 +205,6 @@ void register_legate_core_tasks(Machine machine, Runtime* runtime, const Library
     runtime->register_task_variant<ReturnValues, extract_scalar_task>(registrar,
                                                                       LEGATE_CPU_VARIANT);
   }
-
-
 #ifdef LEGATE_USE_CUDA
   {
     auto registrar = make_registrar(initialize_task_id, initialize_task_name, Processor::TOC_PROC);
@@ -249,7 +246,6 @@ void register_legate_core_tasks(Machine machine, Runtime* runtime, const Library
   register_legate_core_projection_functors(runtime, context);
 
   register_legate_core_sharding_functors(runtime, context);
-
 }
 
 }  // namespace legate
diff --git a/src/core/task/task.h b/src/core/task/task.h
index a5ccb25df..e83019a08 100644
--- a/src/core/task/task.h
+++ b/src/core/task/task.h
@@ -126,8 +126,6 @@ class LegateTask {
                                bool inner      = false,
                                bool idempotent = false)
   {
-    
-    
     // Construct the code descriptor for this task so that the library
     // can register it later when it is ready
     Legion::CodeDescriptor desc(
diff --git a/src/legate.h b/src/legate.h
index 8001a95ac..e786b5f87 100644
--- a/src/legate.h
+++ b/src/legate.h
@@ -27,6 +27,4 @@
 #include "core/utilities/dispatch.h"
 #include "core/utilities/type_traits.h"
 #include "core/utilities/typedefs.h"
-//#include "core/fused/fused_op.h"
-//#include "core/fused/fused_op_wrapper.h"
 #include "legate_defines.h"

From 5bb4df59cc9f7793719970d5dda60a0f7bf638b1 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@g0002.stanford.edu>
Date: Wed, 20 Apr 2022 15:35:57 -0700
Subject: [PATCH 43/44] partitioning fix: re-partition fused tasks, forward alignments, fix solver class ids

---
 legate/core/operation.py | 11 +++------
 legate/core/runtime.py   | 51 +++++++---------------------------------
 legate/core/solver.py    |  2 +-
 3 files changed, 13 insertions(+), 51 deletions(-)

diff --git a/legate/core/operation.py b/legate/core/operation.py
index 5f148a5d0..a5a5e53e4 100644
--- a/legate/core/operation.py
+++ b/legate/core/operation.py
@@ -217,14 +217,9 @@ def launch(self, strategy):
 
         if self._is_fused:
             launcher.add_fusion_metadata(self._is_fused, self._fusion_metadata)
-        if  self._is_fused: #fused ops re-use encapsulated unfused partitions
-            input_parts = self._unfused_input_parts
-            output_parts = self._unfused_output_parts
-            reduction_parts = self._unfused_reduction_parts
-        else:
-            input_parts = self._input_parts
-            output_parts = self._output_parts
-            reduction_parts = self._reduction_parts
+        input_parts = self._input_parts
+        output_parts = self._output_parts
+        reduction_parts = self._reduction_parts
 
         for input, input_part in zip(self._inputs, input_parts):
             proj = strategy.get_projection(input_part)
diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index b0e2c53bf..d7762b7ce 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -46,6 +46,7 @@
 from .solver import Partitioner, Strategy
 from .store import RegionField, Store, FusionMetadata
 import numpy as np
+from .constraints import Alignment
 
 
 
@@ -751,14 +752,6 @@ def can_fuse(self):
             partitioner = Partitioner(self.runtime, [op], must_be_single=must_be_single)
             self.partitioners.append( partitioner )
             strategy = partitioner.partition_stores()
-            for output, part, in zip(op._outputs, op._output_parts):
-                partition = strategy.get_partition(part)
-                output.set_key_partition(partition)
-                key_part = partition
-                #check if input and output should be aligned
-                for input in op._inputs:
-                    if op.has_constraint(input, output):
-                        input.set_key_partition(key_part)
             self.strategies.append(strategy)
         self.strategies.reverse()
 
@@ -1183,28 +1176,11 @@ def build_fused_op(self,ops):
         fusion_checker.register_constraint(ValidProducerConsumer())
         can_fuse,fusable_sets, partitions = fusion_checker.can_fuse()
                 
-        super_strategies = []
-        z=0
-        for fusable_set in fusable_sets:   
-            #create super strategy for this fusable set
-            super_strat = {}
-            super_fspace = {}
-            super_keystore = set()
-            start,end = fusable_set
-            for j in range(start,end):
-                super_strat = {**(super_strat.copy()), **partitions[j]._strategy}
-                super_fspace = {**(super_fspace.copy()), **partitions[j]._fspaces}
-                super_keystore = super_keystore.union(partitions[j]._key_parts)
-            super_strategies.append(Strategy(partitions[start]._launch_shape, super_strat, super_fspace, super_keystore))
-       
-       
         #once fusion in the core is playing nicely with the mapepr
         #the following two lines will be removed, and be replaced 
         #with the 2 subsequent (commented out) lines
         fused_id = self._contexts["cunumeric"].fused_id
         numpy_context = self._contexts["cunumeric"]
-        #fused_id = self._contexts["legate.core"]._library.fused_id
-        #numpy_context = self._contexts["legate.core"]
 
         opID=0
         new_op_list = []
@@ -1225,39 +1201,30 @@ def build_fused_op(self,ops):
                 fused_task.add_fusion_metadata(fusion_metadata) #sets fused_task._is_fused to true
 
                 #add typical inputs and outputs of all subtasks to fused task
-                key_part = None
-                fused_task._unfused_input_parts = []
-                fused_task._unfused_output_parts = []
-                fused_task._unfused_reduction_parts = []
                 for j,op in enumerate(op_subset):
                     for scalar in op._scalar_args:
                         fused_task.add_scalar_arg(scalar[0], ty.int32)
                     for (reduction, redop), part in zip(op._reductions, op._reduction_parts):
                         fused_task.add_reduction(reduction, redop)
-                        fused_task._unfused_reduction_parts.append(part)
                     for input,part in zip(op._inputs, op._input_parts):
                         fused_task.add_input(input)   
-                        fused_task._unfused_input_parts.append(part)
                     for output,part in zip(op._outputs, op._output_parts):
                         fused_task.add_output(output)   
-                        fused_task._unfused_output_parts.append(part)
                     for future in op._futures:
                         fused_task.add_future(future)
+                    for constraint in op._constraints:
+                        if (isinstance(constraint, Alignment)):
+                            fused_task.add_alignment(constraint._lhs.store, constraint._rhs.store)
                     opID+=1
                 new_op_list.append(fused_task)
         strats=[]
    
-        redoPar=False
         for i,fused_task in enumerate(new_op_list):
-            if redoPar:
-                must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [fused_task])
-                partitioner = Partitioner(self, [fused_task], must_be_single=must_be_single)
-                strategy = partitioner.partition_stores()
-                fused_task.strategy = strategy
-                strats.append(strategy)
-            else:
-                fused_task.strategy = super_strategies[i]
-                strats.append( super_strategies[i])
+            must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [fused_task])
+            partitioner = Partitioner(self, [fused_task], must_be_single=must_be_single)
+            strategy = partitioner.partition_stores()
+            fused_task.strategy = strategy
+            strats.append(strategy)
         return new_op_list, strats       
 
     def _launch_outstanding(self, force_eval=True):        
diff --git a/legate/core/solver.py b/legate/core/solver.py
index 51639f17d..84aa8180c 100644
--- a/legate/core/solver.py
+++ b/legate/core/solver.py
@@ -35,7 +35,7 @@ def empty(self):
     def _add(self, var1, var2):
         cls = set([var1, var2])
         cls_id = self._next_class_id
-        self._next_class_id + 1
+        self._next_class_id += 1
         self._classes[cls_id] = cls
         self._class_ids[var1] = cls_id
         self._class_ids[var2] = cls_id

From 36c40cb1ebf7a809fe1196569c81be44a41d4586 Mon Sep 17 00:00:00 2001
From: Shiv Sundram <shiv1@sapling.stanford.edu>
Date: Mon, 13 Jun 2022 19:13:18 -0700
Subject: [PATCH 44/44] choose midpoint partition as key partition for each fusion window

---
 legate/core/runtime.py | 57 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 54 insertions(+), 3 deletions(-)

diff --git a/legate/core/runtime.py b/legate/core/runtime.py
index d7762b7ce..3d433637b 100644
--- a/legate/core/runtime.py
+++ b/legate/core/runtime.py
@@ -758,9 +758,55 @@ def can_fuse(self):
         windows = [(0, len(self.ops))]
         for constraint in self.constraints:
             windows = constraint.apply(self.contexts, self.runtime, self.ops, windows, self.partitioners, self.strategies)
-  
+
+        #for i,strategy in enumerate(self.strategies):
+        #    print(i,"i", strategy)
+        old_strategies = self.strategies[:]
+        old_strategies.reverse()
+        self.strategies = []
+        #for i,strategy in enumerate(old_strategies):
+        #    print(i,strategy)
+        ist=0
+        keyps = []
+        for window in reversed(windows):
+            fusable,final_set = self.supress_small_fusions(windows, self.runtime._fusion_threshold)
+            local_partitions =  []
+            if window[0] == window[1]:
+                continue
+            for op in reversed(self.ops[ window[0]:window[1] ]):
+                strategy = old_strategies[ist]
+                #print("looking in", strategy)
+           
+                for output, part, in zip(op._outputs, op._output_parts):
+                    #print("need", part)
+                    partition = strategy.get_partition(part)
+                    local_partitions.append(partition)
+                ist+=1
+            midpoint = int(len(local_partitions)/2)
+            partition = local_partitions[midpoint]
+            keyps.append(partition)
+            #print("selected", midpoint, len(local_partitions), partition)
+            for op in reversed(self.ops[ window[0]:window[1] ]):
+                #print("selected", partition)
+                for output, part, in zip(op._outputs, op._output_parts):
+                    output.reset_key_partition()
+                    output.set_key_partition(partition)
+                    strategy._strategy[part] = partition
+                    key_part = partition
+                    #check if input and output should be aligned
+                    for input, ipart in zip(op._inputs, op._input_parts):
+                        if input.shape== output.shape:
+                            input.reset_key_partition()
+                            input.set_key_partition(partition)
+                            strategy._strategy[ipart] = partition
+                self.strategies.append(strategy)
+            #return fusable, final_set, self.strategies
+        self.strategies.reverse()
+        keyps.reverse()
+
+      
         fusable,final_set = self.supress_small_fusions(windows, self.runtime._fusion_threshold)
-        return fusable, final_set, self.strategies
+        return fusable, final_set, self.strategies, keyps
 
 
 class FusionConstraint(object):
@@ -1174,7 +1220,7 @@ def build_fused_op(self,ops):
         fusion_checker.register_constraint(IdenticalLaunchShapes())
         fusion_checker.register_constraint(IdenticalProjection())
         fusion_checker.register_constraint(ValidProducerConsumer())
-        can_fuse,fusable_sets, partitions = fusion_checker.can_fuse()
+        can_fuse,fusable_sets, partitions, keyps = fusion_checker.can_fuse()
                 
         #once fusion in the core is playing nicely with the mapepr
         #the following two lines will be removed, and be replaced 
@@ -1221,10 +1267,15 @@ def build_fused_op(self,ops):
    
         for i,fused_task in enumerate(new_op_list):
             must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [fused_task])
+            for output, part, in zip(fused_task._outputs, fused_task._output_parts):
+                output.set_key_partition(keyps[i])
+            for input, part, in zip(fused_task._inputs, fused_task._input_parts):
+                input.set_key_partition(keyps[i])
             partitioner = Partitioner(self, [fused_task], must_be_single=must_be_single)
             strategy = partitioner.partition_stores()
             fused_task.strategy = strategy
             strats.append(strategy)
+            #print(i, strategy)
         return new_op_list, strats       
 
     def _launch_outstanding(self, force_eval=True):