From 8ff02615570ee31a38122b6876212cb4a80872b7 Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Mon, 27 Sep 2021 15:06:00 -0700 Subject: [PATCH 01/44] task fusion and legality constraints --- install.py | 2 +- legate/core/launcher.py | 15 ++- legate/core/legion.py | 2 + legate/core/operation.py | 16 ++- legate/core/runtime.py | 214 ++++++++++++++++++++++++++++++++-- src/core.mk | 4 +- src/data/scalar.h | 3 + src/data/scalar.inl | 1 + src/data/store.cc | 14 ++- src/data/store.h | 15 ++- src/mapping/mapper.cc | 3 +- src/runtime/context.h | 9 +- src/utilities/deserializer.cc | 6 +- src/utilities/span.h | 2 + 14 files changed, 279 insertions(+), 27 deletions(-) diff --git a/install.py b/install.py index bfc6211fa..ac8ce559e 100755 --- a/install.py +++ b/install.py @@ -887,7 +887,7 @@ def driver(): "--clean", dest="clean_first", action=BooleanFlag, - default=True, + default=False, help="Clean before build, and pull latest Legion.", ) parser.add_argument( diff --git a/legate/core/launcher.py b/legate/core/launcher.py index 10486d629..bd09acf62 100644 --- a/legate/core/launcher.py +++ b/legate/core/launcher.py @@ -364,6 +364,7 @@ def coalesce(self): # promote them to read write permission. if len(all_perms - set([Permission.NO_ACCESS])) > 1: perm = Permission.READ_WRITE + #perm = Permission.WRITE # When the field requires read write permission, # all projections must be the same @@ -440,6 +441,7 @@ def insert(self, req, field_id): field_set.insert(field_id, req.permission, proj_info) def analyze_requirements(self): + #import pdb; pdb.set_trace() for region, field_set in self._field_sets.items(): perm_map = field_set.coalesce() for key, fields in perm_map.items(): @@ -568,7 +570,6 @@ def add_store(self, args, store, proj, perm, tag, flags): else: region = store.storage.region field_id = store.storage.field.field_id - req = RegionReq(region, perm, proj, tag, flags) self._req_analyzer.insert(req, field_id) @@ -591,6 +592,12 @@ def add_output(self, store, proj, tag=0, flags=0): self._outputs, store, proj, Permission.WRITE, tag, flags ) + # currently this is adding to outputs but we can have a seperate "temps" array in the core + def add_temp(self, store, proj, tag=0, flags=0): + self.add_store( + self._outputs, store, proj, Permission.WRITE, tag, flags + ) + def add_reduction(self, store, proj, tag=0, flags=0, read_write=False): if read_write: self.add_store( @@ -642,13 +649,17 @@ def pack_args(argbuf, args): def build_task(self, launch_domain, argbuf): self._req_analyzer.analyze_requirements() + #print("building task id", self._task_id) + for req in self._req_analyzer._requirements: + print(req) + print(req[0].__dict__) + print() self._out_analyzer.analyze_requirements() self.pack_args(argbuf, self._inputs) self.pack_args(argbuf, self._outputs) self.pack_args(argbuf, self._reductions) self.pack_args(argbuf, self._scalars) - task = IndexTask( self.legion_task_id, launch_domain, diff --git a/legate/core/legion.py b/legate/core/legion.py index 5d1211099..ebadb5e65 100644 --- a/legate/core/legion.py +++ b/legate/core/legion.py @@ -155,6 +155,7 @@ def legate_task_postamble(runtime, context): # This is a decorator for wrapping the launch method on launchers # to dispatch any unordered deletions while the task is live def dispatch(func): + #print("dispatching") def launch(launcher, runtime, context, *args): # This context should always be in the dictionary legate_task_progress(runtime, context) @@ -4744,6 +4745,7 @@ def get_string(self): if self.string is None or self.arglen != len(self.args): fmtstr = "".join(self.fmt) assert len(fmtstr) == len(self.args) + 1 + print(self.args) self.string = struct.pack(fmtstr, *self.args) self.arglen = len(self.args) return self.string diff --git a/legate/core/operation.py b/legate/core/operation.py index e8f093eb5..52bd14a12 100644 --- a/legate/core/operation.py +++ b/legate/core/operation.py @@ -28,6 +28,7 @@ def __init__(self, context, mapper_id=0): self._inputs = [] self._outputs = [] self._reductions = [] + self._temps = [] self._future_output = None self._future_reduction = None self._constraints = EqClass() @@ -99,6 +100,14 @@ def add_output(self, store): else: self._outputs.append(store) + def add_temp(self, store): + self._check_store(store) + self._temps.append(store) #this may not be necessary + + def add_output(self, store): + self._check_store(store) + self._outputs.append(store) + def add_reduction(self, store, redop): self._check_store(store) if store.kind is Future: @@ -143,10 +152,15 @@ def add_future(self, future): def launch(self, strategy): launcher = TaskLauncher(self.context, self._task_id, self.mapper_id) - for input in self._inputs: proj = strategy.get_projection(input) launcher.add_input(input, proj) + for temp in self._temps: + proj = strategy.get_projection(temp) + launcher.add_temp(temp, proj) + partition = strategy.get_partition(temp) + # We update the key partition of a store only when it gets updated + temp.set_key_partition(partition) for output in self._outputs: if output.unbound: continue diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 05a96d446..a85913f2b 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -41,7 +41,7 @@ from .shape import Shape from .solver import Partitioner from .store import RegionField, Store - +import numpy as npo # A Field holds a reference to a field in a region tree # that can be used by many different RegionField objects @@ -711,6 +711,110 @@ def record_partition(self, index_space, functor, index_partition): self._index_partitions[key] = index_partition +class FusionChecker(object): + def __init__(self, ops, contexts, runtime): + """ + This is a class containing a list of constraints for fusing ops + It emits whether or not a given list of ops can be fused + """ + self.constraints = [] + self.ops = ops + self.contexts = contexts + self.runtime=runtime + + def register_constraint(self, fusion_constraint_rule): + self.constraints.append(fusion_constraint_rule) + + def can_fuse(self): + results = [constraint.apply(self.contexts, self.runtime, self.ops) for constraint in self.constraints] + print(results) + return reduce(lambda x,y: x and y, results) + +class FusionConstraint(object): + def apply(self, contexts, runtime, ops): + """" + Abstract class for determining a rule that constrains + which legate operations can be fused + """ + raise NotImplementedError("Implement in derived classes") + + +class NumpyContextExists(FusionConstraint): + def apply(self, contexts, runtime, ops): + return "legate.numpy" in contexts + + +class AllBinaryOps(FusionConstraint): + """Temporary class for only fusing Binary Ops. + This constrains will be removed""" + def apply(self, contexts, runtime, ops): + allBinary = reduce(lambda x,y: x and y,[int(op._task_id)==400000 for op in ops]) + return allBinary + + +class IdenticalProjection(FusionConstraint): + """Fusion rule that only ops with identical + projection functors can be fused""" + def apply(self, contexts, runtime, ops): + partitioners = [] + strategies = [] + must_be_single = any(op._future_output is not None for op in ops) + for op in ops: + partitioner = Partitioner(runtime, ops, must_be_single=must_be_single) + strategy = partitioner.partition_stores() + + store_to_ops = {} + for op in ops: + bufferSet = {} + for input in op._inputs: + if input not in bufferSet: + proj = strategy.get_projection(input) + if hasattr(proj, 'part'): + bufferSet[input]=proj + + for output in op._outputs: + if output not in bufferSet: + proj = strategy.get_projection(output) + if hasattr(proj, 'part'): + bufferSet[output]=proj + + for buffer in bufferSet.keys(): + proj = bufferSet[buffer] + matrix = proj.part.index_partition.functor.transform.trans + if buffer not in store_to_ops: + store_to_ops[buffer] = [matrix] + else: + store_to_ops[buffer].append(matrix) + for store, matrices in store_to_ops.items(): + if len(matrices)>1: + allEqual = reduce(lambda x,y: x==y, matrices) + if not allEqual: + return False + print(store_to_ops) + return True + + +class IdenticalLaunchShapes(FusionConstraint): + """Fusion rule that only ops with identical + launch shapes can be fused""" + def apply(self, contexts, runtime, ops): + partitioners = [] + strategies = [] + launch_shapes = [] + must_be_single = any(op._future_output is not None for op in ops) + for op in ops: + partitioner = Partitioner(runtime, ops, must_be_single=must_be_single) + strategy = partitioner.partition_stores() + launch_shapes.append(strategy._launch_shape) + first_shape = launch_shapes[0] + print(launch_shapes) + for launch_shape in launch_shapes: + if launch_shape!=first_shape: + return False + return True + + + class Runtime(object): def __init__(self, core_library): """ @@ -753,7 +857,7 @@ def __init__(self, core_library): # to be dispatched. This list allows cross library introspection for # Legate operations. self._outstanding_ops = [] - self._window_size = 1 + self._window_size = 2 # Now we initialize managers self._attachment_manager = AttachmentManager(self) @@ -857,20 +961,111 @@ def dispatch(self, op, redop=None): else: return op.launch(self.legion_runtime, self.legion_context) - def _schedule(self, ops): + + def build_fused_binary_numpy_op(self,ops): + fusion_checker = FusionChecker(ops, self._contexts, self) + fusion_checker.register_constraint(NumpyContextExists()) + fusion_checker.register_constraint(AllBinaryOps()) + fusion_checker.register_constraint(IdenticalLaunchShapes()) + fusion_checker.register_constraint(IdenticalProjection()) + can_fuse = fusion_checker.can_fuse() + + if not can_fuse: + return None + + #hacky way to get numpy context and designated fused task id + fused_id = self._contexts["legate.numpy"].fused_id + numpy_context = self._contexts["legate.numpy"] + numpy_runtime = numpy_context._library.runtime + #initialize fused task + fused_task = numpy_context.create_task(fused_id) + + #generate offset maps for all inputs + input_starts, output_starts, offset_starts, offsets= [],[],[],[] + input_start, output_start, offset_start = 0,0,0 + + for op in ops: + input_starts.append(input_start) + output_starts.append(output_start) + offset_starts.append(offset_start) + + for i,input in enumerate(op._inputs): + offsets.append(i+1) + for o,output in enumerate(op._outputs): + offsets.append(-(o+1)) + + offset_start+=(len(op._inputs)+len(op._outputs)) + input_start+=len(op._inputs) + output_start+=len(op._outputs) + + #terminators + input_starts.append(input_start) + output_starts.append(output_start) + offset_starts.append(offset_start) + + #turn offset maps into deferred arrays + #then load them into the task as the initial inputs + #print(input_starts, output_starts, offset_starts, offsets) + inst, oust, offst, offs = map(npo.array, (input_starts, output_starts, offset_starts, offsets)) + def make_deferred(inst): + return numpy_runtime.find_or_create_array_thunk(inst, stacklevel=0, defer=True) + offset_maps = map(make_deferred, (inst, oust, offst, offs)) + + #add offset maps to task + for offset_map in offset_maps: + fused_task.add_input(offset_map.base) + fused_task.add_broadcast(offset_map.base) + + #add typical inputs and outputs to task + for op in ops: + for scalar in op._scalar_args: + fused_task.add_scalar_arg(scalar[0], ty.int32) + for input in op._inputs: + fused_task.add_input(input) + for output in op._outputs: + fused_task.add_output(output) + + return fused_task + + def _launch_outstanding(self): + print("launching final outstanding ops") + if len(self._outstanding_ops): + ops = self._outstanding_ops + self._outstanding_ops = [] + self._schedule(ops, force_eval=True) + + + def _schedule(self, ops, force_eval=False): + ids = [op._task_id for op in ops] + #print("current ops", ids) + #try fusing tasks + if len(ops)>=2 and (not force_eval): + fused_task = self.build_fused_binary_numpy_op(ops) + if fused_task: + fused_task.execute() + return + + #if we cann't fuse op launch them individually must_be_single = any(op._future_output is not None for op in ops) partitioner = Partitioner(self, ops, must_be_single=must_be_single) strategy = partitioner.partition_stores() - for op in ops: + #print("task_id", op._task_id, int(op._task_id)) + #print("inputs", op._inputs) + #print("outputs", op._outputs) op.launch(strategy) def submit(self, op): - self._outstanding_ops.append(op) - if len(self._outstanding_ops) >= self._window_size: - ops = self._outstanding_ops - self._outstanding_ops = [] - self._schedule(ops) + #always launch a fused op, dont add it to the window + #as the encapsulated ops already waited in the window + if int(op._task_id)==400028: + self._schedule([op]) + else: + self._outstanding_ops.append(op) + if len(self._outstanding_ops) >= self._window_size: + ops = self._outstanding_ops + self._outstanding_ops = [] + self._schedule(ops) def _progress_unordered_operations(self): legion.legion_context_progress_unordered_operations( @@ -1065,6 +1260,7 @@ def record_partition(self, index_space, functor, index_partition): def _cleanup_legate_runtime(): global _runtime + _runtime._launch_outstanding() _runtime.destroy() del _runtime gc.collect() diff --git a/src/core.mk b/src/core.mk index deedb74eb..8765eca36 100644 --- a/src/core.mk +++ b/src/core.mk @@ -25,7 +25,8 @@ GEN_CPU_SRC = legate_c.cc \ runtime/runtime.cc \ runtime/shard.cc \ task/task.cc \ - utilities/deserializer.cc + utilities/deserializer.cc \ + utilities/makeshift_serializer.cc ifeq ($(strip $(USE_CUDA)),1) GEN_CPU_SRC += gpu/cudalibs.cc @@ -55,4 +56,5 @@ INSTALL_HEADERS = legate.h \ utilities/dispatch.h \ utilities/span.h \ utilities/type_traits.h \ + utilities/makeshift_serializer.h \ utilities/typedefs.h diff --git a/src/data/scalar.h b/src/data/scalar.h index 18d58c45f..fc1c1ede4 100644 --- a/src/data/scalar.h +++ b/src/data/scalar.h @@ -18,6 +18,7 @@ #include "utilities/span.h" #include "utilities/typedefs.h" +#include "utilities/makeshift_serializer.h" namespace legate { @@ -44,6 +45,8 @@ class Scalar { bool tuple_{false}; LegateTypeCode code_{MAX_TYPE_NUMBER}; const void* data_; + + friend class MakeshiftSerializer; }; } // namespace legate diff --git a/src/data/scalar.inl b/src/data/scalar.inl index bb9cfa5cb..3e49fe08b 100644 --- a/src/data/scalar.inl +++ b/src/data/scalar.inl @@ -25,6 +25,7 @@ VAL Scalar::value() const template Span Scalar::values() const { + if (tuple_) { auto size = *static_cast(data_); auto data = static_cast(data_) + sizeof(uint32_t); diff --git a/src/data/store.cc b/src/data/store.cc index e8a70ca67..d9accf3a4 100644 --- a/src/data/store.cc +++ b/src/data/store.cc @@ -21,8 +21,8 @@ namespace legate { using namespace Legion; -RegionField::RegionField(int32_t dim, const PhysicalRegion& pr, FieldID fid) - : dim_(dim), pr_(pr), fid_(fid) +RegionField::RegionField(int32_t dim, const PhysicalRegion& pr, FieldID fid, unsigned reqIdx) + : dim_(dim), pr_(pr), fid_(fid), reqIdx_(reqIdx) { auto priv = pr.get_privilege(); readable_ = static_cast(priv & LEGION_READ_PRIV); @@ -34,6 +34,7 @@ RegionField::RegionField(RegionField&& other) noexcept : dim_(other.dim_), pr_(other.pr_), fid_(other.fid_), + reqIdx_(other.reqIdx_), readable_(other.readable_), writable_(other.writable_), reducible_(other.reducible_) @@ -45,6 +46,8 @@ RegionField& RegionField::operator=(RegionField&& other) noexcept dim_ = other.dim_; pr_ = other.pr_; fid_ = other.fid_; + reqIdx_ = other.reqIdx_; + readable_ = other.readable_; writable_ = other.writable_; reducible_ = other.reducible_; @@ -53,14 +56,15 @@ RegionField& RegionField::operator=(RegionField&& other) noexcept Domain RegionField::domain() const { return dim_dispatch(dim_, get_domain_fn{}, pr_); } -OutputRegionField::OutputRegionField(const OutputRegion& out, FieldID fid) : out_(out), fid_(fid) {} +OutputRegionField::OutputRegionField(const OutputRegion& out, FieldID fid, unsigned reqIdx) : out_(out), fid_(fid), reqIdx_(reqIdx) {} OutputRegionField::OutputRegionField(OutputRegionField&& other) noexcept - : bound_(other.bound_), out_(other.out_), fid_(other.fid_) + : bound_(other.bound_), out_(other.out_), fid_(other.fid_), reqIdx_(other.reqIdx_) { other.bound_ = false; other.out_ = OutputRegion(); other.fid_ = -1; + //TODO, how should we invalidate reqIdx } OutputRegionField& OutputRegionField::operator=(OutputRegionField&& other) noexcept @@ -68,10 +72,12 @@ OutputRegionField& OutputRegionField::operator=(OutputRegionField&& other) noexc bound_ = other.bound_; out_ = other.out_; fid_ = other.fid_; + reqIdx_= other.reqIdx_; other.bound_ = false; other.out_ = OutputRegion(); other.fid_ = -1; + //TODO, how should we invalidate reqIdx return *this; } diff --git a/src/data/store.h b/src/data/store.h index 10c4428ff..4b31fd049 100644 --- a/src/data/store.h +++ b/src/data/store.h @@ -21,13 +21,14 @@ #include "data/buffer.h" #include "data/transform.h" #include "utilities/typedefs.h" +#include "utilities/makeshift_serializer.h" namespace legate { class RegionField { public: RegionField() {} - RegionField(int32_t dim, const Legion::PhysicalRegion& pr, Legion::FieldID fid); + RegionField(int32_t dim, const Legion::PhysicalRegion& pr, Legion::FieldID fid, unsigned reqIdx); public: RegionField(RegionField&& other) noexcept; @@ -147,17 +148,20 @@ class RegionField { int32_t dim_{-1}; Legion::PhysicalRegion pr_{}; Legion::FieldID fid_{-1U}; + unsigned reqIdx_; //this gets packed as an unsigned private: bool readable_{false}; bool writable_{false}; bool reducible_{false}; + + friend class MakeshiftSerializer; }; class OutputRegionField { public: OutputRegionField() {} - OutputRegionField(const Legion::OutputRegion& out, Legion::FieldID fid); + OutputRegionField(const Legion::OutputRegion& out, Legion::FieldID fid, unsigned reqIdx); public: OutputRegionField(OutputRegionField&& other) noexcept; @@ -175,6 +179,9 @@ class OutputRegionField { bool bound_{false}; Legion::OutputRegion out_{}; Legion::FieldID fid_{-1U}; + unsigned reqIdx_; //this gets packed as an unsigned + + friend class MakeshiftSerializer; }; class FutureWrapper { @@ -237,6 +244,7 @@ class Store { public: int32_t dim() const { return dim_; } + bool is_future() const { return is_future_; } LegateTypeCode code() const { return code_; } public: @@ -296,8 +304,9 @@ class Store { bool readable_{false}; bool writable_{false}; bool reducible_{false}; -}; + friend class MakeshiftSerializer; +}; } // namespace legate #include "data/store.inl" diff --git a/src/mapping/mapper.cc b/src/mapping/mapper.cc index 1e43b11ed..546097a09 100644 --- a/src/mapping/mapper.cc +++ b/src/mapping/mapper.cc @@ -195,7 +195,8 @@ Mapper::MapperSyncModel CoreMapper::get_mapper_sync_model(void) const void CoreMapper::select_task_options(const MapperContext ctx, const Task& task, TaskOptions& output) { - assert(context.valid_task_id(task.task_id)); + std::cout<<"task_id "<& outputs() { return outputs_; } std::vector& reductions() { return reductions_; } std::vector& scalars() { return scalars_; } + //Deserializer dez; + //Serializer dez; - private: + public: const Legion::Task* task_; const std::vector& regions_; Legion::Context context_; diff --git a/src/utilities/deserializer.cc b/src/utilities/deserializer.cc index 466ef03f8..c4d63733a 100644 --- a/src/utilities/deserializer.cc +++ b/src/utilities/deserializer.cc @@ -91,7 +91,7 @@ void Deserializer::_unpack(RegionField& value) auto idx = unpack(); auto fid = unpack(); - value = RegionField(dim, regions_[idx], fid); + value = RegionField(dim, regions_[idx], fid, idx); } void Deserializer::_unpack(OutputRegionField& value) @@ -101,12 +101,12 @@ void Deserializer::_unpack(OutputRegionField& value) auto idx = unpack(); auto fid = unpack(); - value = OutputRegionField(outputs_[idx], fid); + value = OutputRegionField(outputs_[idx], fid, idx); } std::unique_ptr Deserializer::unpack_transform() { - auto code = unpack(); + int32_t code = unpack(); switch (code) { case -1: { return nullptr; diff --git a/src/utilities/span.h b/src/utilities/span.h index c0a20c5a8..c839bb365 100644 --- a/src/utilities/span.h +++ b/src/utilities/span.h @@ -35,6 +35,7 @@ struct Span { public: decltype(auto) operator[](size_t pos) { + //std::cout<<"pos "< Date: Mon, 27 Sep 2021 15:08:51 -0700 Subject: [PATCH 02/44] makeshift serializer for inline ops --- src/utilities/makeshift_serializer.cc | 58 ++++++++++++ src/utilities/makeshift_serializer.h | 121 ++++++++++++++++++++++++++ 2 files changed, 179 insertions(+) create mode 100644 src/utilities/makeshift_serializer.cc create mode 100644 src/utilities/makeshift_serializer.h diff --git a/src/utilities/makeshift_serializer.cc b/src/utilities/makeshift_serializer.cc new file mode 100644 index 000000000..fe96a2820 --- /dev/null +++ b/src/utilities/makeshift_serializer.cc @@ -0,0 +1,58 @@ +#include "utilities/makeshift_serializer.h" + +namespace legate{ + + void MakeshiftSerializer::packScalar(const Scalar& scalar){ + pack((bool) scalar.is_tuple()); + pack((LegateTypeCode) scalar.code_); + int32_t size = scalar.size(); + packWithoutType(scalar.data_, size); + } + + void MakeshiftSerializer::packBuffer(const Store& buffer) + { + pack((bool) buffer.is_future()); //is_future + pack((int32_t) buffer.dim()); + //int32_t code = buffer.code(); + pack((int32_t) buffer.code()); + //pack transform: + //pack trasnform code + int32_t neg= -1; + pack((int32_t) neg); + //skip the rest for now, assume no transform, for now pack -1 + // no need to implement this for benchmarking purposes + // TODO: implement transform packing + // TODO: add "code" to transform object + //if _isfuture + if(buffer.is_future_) + { + //pack future_wrapper + } + //elif dim>=0 + else if (buffer.dim()>=0){ + pack((int32_t) buffer.redop_id_); + //pack reigon field + //pack dim + pack((int32_t) buffer.region_field_.dim()); + //pack idx (req idx) //need to map regions to idx + pack((uint32_t) buffer.region_field_.reqIdx_); + //pack fid (field id) + pack((int32_t) buffer.region_field_.fid_); + } + else + { + //pack redop_id + pack((int32_t) buffer.redop_id_); + //pack reigon field + //pack dim; always 1 in an buffer + pack((int32_t) 1); + //pack idx (req idx) //need to map regions to idx + pack((uint32_t) buffer.region_field_.reqIdx_); + //pack fid (field id) + pack((int32_t) buffer.region_field_.fid_); + } + } + + + +} diff --git a/src/utilities/makeshift_serializer.h b/src/utilities/makeshift_serializer.h new file mode 100644 index 000000000..a4eec808f --- /dev/null +++ b/src/utilities/makeshift_serializer.h @@ -0,0 +1,121 @@ + +#pragma once +#include +#include +#include "data/store.h" +#include "data/scalar.h" +namespace legate { + +class Scalar; +class Store; +class MakeshiftSerializer{ + + public: + MakeshiftSerializer(){ + size=128; + raw.resize(size); + write_offset=0; + read_offset=0; + } +/* + template void pack(T&& arg) + { + T copy = arg; + pack(copy); //call l-value version + } +*/ + template void pack(T arg) + { + int8_t * argAddr = (int8_t*) &arg; + //std::cout<((argAddr)+i); + } + //std::cout<<"reint "<<*reinterpret_cast(raw.data()+write_offset)<(argByte+i); + } + write_offset+=argSize; + //std::cout<<" "< T read() + { + if (read_offset(raw.data()+read_offset); + read_offset+=sizeof(T); + return datum; + } + else{ + std::cout<<"finished reading buffer"< raw; +}; +/* +int main(){ + MakeshiftSerializer ms; + int a=3; + char g='a'; + ms.pack(a); + ms.pack(g); + ms.pack(a); + ms.pack(g); + std::cout<()<()<()<()<()<()< Date: Wed, 29 Sep 2021 07:35:17 -0700 Subject: [PATCH 03/44] reductions scalars, opids, need to remove dynamic allocations --- legate/core/launcher.py | 8 ++-- legate/core/legion.py | 2 +- legate/core/runtime.py | 93 +++++++++++++++++++++++++---------------- 3 files changed, 61 insertions(+), 42 deletions(-) diff --git a/legate/core/launcher.py b/legate/core/launcher.py index bd09acf62..8894bc0ee 100644 --- a/legate/core/launcher.py +++ b/legate/core/launcher.py @@ -650,10 +650,10 @@ def pack_args(argbuf, args): def build_task(self, launch_domain, argbuf): self._req_analyzer.analyze_requirements() #print("building task id", self._task_id) - for req in self._req_analyzer._requirements: - print(req) - print(req[0].__dict__) - print() + #for req in self._req_analyzer._requirements: + #print(req) + #print(req[0].__dict__) + #print() self._out_analyzer.analyze_requirements() self.pack_args(argbuf, self._inputs) diff --git a/legate/core/legion.py b/legate/core/legion.py index ebadb5e65..ff5c97dd1 100644 --- a/legate/core/legion.py +++ b/legate/core/legion.py @@ -4745,7 +4745,7 @@ def get_string(self): if self.string is None or self.arglen != len(self.args): fmtstr = "".join(self.fmt) assert len(fmtstr) == len(self.args) + 1 - print(self.args) + #print(self.args) self.string = struct.pack(fmtstr, *self.args) self.arglen = len(self.args) return self.string diff --git a/legate/core/runtime.py b/legate/core/runtime.py index a85913f2b..0e6bda47f 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -727,7 +727,7 @@ def register_constraint(self, fusion_constraint_rule): def can_fuse(self): results = [constraint.apply(self.contexts, self.runtime, self.ops) for constraint in self.constraints] - print(results) + #print(results) return reduce(lambda x,y: x and y, results) class FusionConstraint(object): @@ -790,7 +790,7 @@ def apply(self, contexts, runtime, ops): allEqual = reduce(lambda x,y: x==y, matrices) if not allEqual: return False - print(store_to_ops) + #print(store_to_ops) return True @@ -807,7 +807,7 @@ def apply(self, contexts, runtime, ops): strategy = partitioner.partition_stores() launch_shapes.append(strategy._launch_shape) first_shape = launch_shapes[0] - print(launch_shapes) + #print(launch_shapes) for launch_shape in launch_shapes: if launch_shape!=first_shape: return False @@ -857,7 +857,7 @@ def __init__(self, core_library): # to be dispatched. This list allows cross library introspection for # Legate operations. self._outstanding_ops = [] - self._window_size = 2 + self._window_size = 1 # Now we initialize managers self._attachment_manager = AttachmentManager(self) @@ -962,64 +962,86 @@ def dispatch(self, op, redop=None): return op.launch(self.legion_runtime, self.legion_context) - def build_fused_binary_numpy_op(self,ops): - fusion_checker = FusionChecker(ops, self._contexts, self) - fusion_checker.register_constraint(NumpyContextExists()) - fusion_checker.register_constraint(AllBinaryOps()) - fusion_checker.register_constraint(IdenticalLaunchShapes()) - fusion_checker.register_constraint(IdenticalProjection()) - can_fuse = fusion_checker.can_fuse() - - if not can_fuse: - return None - - #hacky way to get numpy context and designated fused task id - fused_id = self._contexts["legate.numpy"].fused_id - numpy_context = self._contexts["legate.numpy"] - numpy_runtime = numpy_context._library.runtime - #initialize fused task - fused_task = numpy_context.create_task(fused_id) - - #generate offset maps for all inputs + def serialize_multiop_metadata(self, numpy_runtime, ops): + """creates a 'header' for a fused op that denotes metadata + on each ops inputs, outputs, reductions and scalars + """ + #generate offset maps for all inputs to serialize metadata input_starts, output_starts, offset_starts, offsets= [],[],[],[] + reduction_starts, scalar_starts,op_ids = [], [], [] input_start, output_start, offset_start = 0,0,0 + reduction_start, scalar_start = 0,0 for op in ops: input_starts.append(input_start) output_starts.append(output_start) offset_starts.append(offset_start) + reduction_starts.append(reduction_start) + scalar_starts.append(scalar_start) for i,input in enumerate(op._inputs): offsets.append(i+1) for o,output in enumerate(op._outputs): - offsets.append(-(o+1)) + offsets.append(-(o+1)) + op_ids.append(op._task_id._value_) offset_start+=(len(op._inputs)+len(op._outputs)) input_start+=len(op._inputs) output_start+=len(op._outputs) + reduction_start+=len(op._reductions) + scalar_start+=len(op._scalar_args) #terminators input_starts.append(input_start) output_starts.append(output_start) offset_starts.append(offset_start) + reduction_starts.append(reduction_start) + scalar_starts.append(scalar_start) - #turn offset maps into deferred arrays + #turn metadata maps into deferred arrays #then load them into the task as the initial inputs - #print(input_starts, output_starts, offset_starts, offsets) - inst, oust, offst, offs = map(npo.array, (input_starts, output_starts, offset_starts, offsets)) + meta_arrs = (input_starts, output_starts, offset_starts, offsets, reduction_starts, scalar_starts, op_ids) + #inst, oust, offst, offs = map(npo.array, (input_starts, output_starts, offset_starts, offsets)) + meta_arrs_np = map(npo.array, meta_arrs) def make_deferred(inst): return numpy_runtime.find_or_create_array_thunk(inst, stacklevel=0, defer=True) - offset_maps = map(make_deferred, (inst, oust, offst, offs)) + meta_maps = map(make_deferred, meta_arrs_np) + return meta_maps + + + def build_fused_op(self,ops): + fusion_checker = FusionChecker(ops, self._contexts, self) + fusion_checker.register_constraint(NumpyContextExists()) + fusion_checker.register_constraint(AllBinaryOps()) + fusion_checker.register_constraint(IdenticalLaunchShapes()) + fusion_checker.register_constraint(IdenticalProjection()) + can_fuse = fusion_checker.can_fuse() - #add offset maps to task - for offset_map in offset_maps: - fused_task.add_input(offset_map.base) - fused_task.add_broadcast(offset_map.base) + if not can_fuse: + return None + + #hacky way to get numpy context and designated fused task id + fused_id = self._contexts["legate.numpy"].fused_id + numpy_context = self._contexts["legate.numpy"] + numpy_runtime = numpy_context._library.runtime + #initialize fused task + fused_task = numpy_context.create_task(fused_id) + + #serialize necessary metadata on all encapsulated ops + #this metadata will be fed into the fused op as inputs + meta_maps = self.serialize_multiop_metadata(numpy_runtime, ops) + + #add metadata maps to task as inputs + for meta_map in meta_maps: + fused_task.add_input(meta_map.base) + fused_task.add_broadcast(meta_map.base) - #add typical inputs and outputs to task + #add typical inputs and outputs of all subtasks to fused task for op in ops: for scalar in op._scalar_args: fused_task.add_scalar_arg(scalar[0], ty.int32) + for reduction in op._reductions: + fused_task.add_reduction(reduction) for input in op._inputs: fused_task.add_input(input) for output in op._outputs: @@ -1040,7 +1062,7 @@ def _schedule(self, ops, force_eval=False): #print("current ops", ids) #try fusing tasks if len(ops)>=2 and (not force_eval): - fused_task = self.build_fused_binary_numpy_op(ops) + fused_task = self.build_fused_op(ops) if fused_task: fused_task.execute() return @@ -1050,9 +1072,6 @@ def _schedule(self, ops, force_eval=False): partitioner = Partitioner(self, ops, must_be_single=must_be_single) strategy = partitioner.partition_stores() for op in ops: - #print("task_id", op._task_id, int(op._task_id)) - #print("inputs", op._inputs) - #print("outputs", op._outputs) op.launch(strategy) def submit(self, op): From a4e21d8af18d2c094d67e7a01aa43ddc6d1f772a Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Wed, 29 Sep 2021 15:59:45 -0700 Subject: [PATCH 04/44] fusion metadata passed via serialization now --- legate/core/launcher.py | 21 ++++++++++++++- legate/core/legion.py | 1 - legate/core/operation.py | 10 +++++++ legate/core/runtime.py | 41 ++++++++++++++++++---------- legate/core/store.py | 38 ++++++++++++++++++++++++++ src/data/store.h | 19 +++++++++++++ src/runtime/context.cc | 1 + src/runtime/context.h | 6 +++-- src/utilities/deserializer.cc | 50 +++++++++++++++++++++++++++++++++++ src/utilities/deserializer.h | 4 +++ 10 files changed, 173 insertions(+), 18 deletions(-) diff --git a/legate/core/launcher.py b/legate/core/launcher.py index 8894bc0ee..06d275a47 100644 --- a/legate/core/launcher.py +++ b/legate/core/launcher.py @@ -441,7 +441,6 @@ def insert(self, req, field_id): field_set.insert(field_id, req.permission, proj_info) def analyze_requirements(self): - #import pdb; pdb.set_trace() for region, field_set in self._field_sets.items(): perm_map = field_set.coalesce() for key, fields in perm_map.items(): @@ -531,6 +530,8 @@ def __init__(self, context, task_id, mapper_id=0, tag=0): self._sharding_space = None self._point = None self._output_regions = list() + self._is_fused = False + self._fusion_metadata = None @property def library_task_id(self): @@ -641,12 +642,24 @@ def set_sharding_space(self, space): def set_point(self, point): self._point = point + def add_fusion_metadata(self, is_fused, fusion_metadata): + self._is_fused = is_fused + self._fusion_metadata = fusion_metadata + @staticmethod def pack_args(argbuf, args): argbuf.pack_32bit_uint(len(args)) for arg in args: arg.pack(argbuf) + + @staticmethod + def pack_fusion_metadata(argbuf, is_fused, fusion_metadata): + argbuf.pack_bool(is_fused) + if is_fused: + fusion_metadata.pack(argbuf) + + def build_task(self, launch_domain, argbuf): self._req_analyzer.analyze_requirements() #print("building task id", self._task_id) @@ -656,6 +669,9 @@ def build_task(self, launch_domain, argbuf): #print() self._out_analyzer.analyze_requirements() + #pack fusion metadata + self.pack_fusion_metadata(argbuf, self._is_fused, self._fusion_metadata) + self.pack_args(argbuf, self._inputs) self.pack_args(argbuf, self._outputs) self.pack_args(argbuf, self._reductions) @@ -685,6 +701,9 @@ def build_task(self, launch_domain, argbuf): def build_single_task(self, argbuf): self._req_analyzer.analyze_requirements() self._out_analyzer.analyze_requirements() + + #pack fusion metadata + self.pack_fusion_metadata(argbuf, self._is_fused, self._fusion_metadata) self.pack_args(argbuf, self._inputs) self.pack_args(argbuf, self._outputs) diff --git a/legate/core/legion.py b/legate/core/legion.py index ff5c97dd1..3d6fa5299 100644 --- a/legate/core/legion.py +++ b/legate/core/legion.py @@ -4745,7 +4745,6 @@ def get_string(self): if self.string is None or self.arglen != len(self.args): fmtstr = "".join(self.fmt) assert len(fmtstr) == len(self.args) + 1 - #print(self.args) self.string = struct.pack(fmtstr, *self.args) self.arglen = len(self.args) return self.string diff --git a/legate/core/operation.py b/legate/core/operation.py index 52bd14a12..e8a6ad952 100644 --- a/legate/core/operation.py +++ b/legate/core/operation.py @@ -33,6 +33,7 @@ def __init__(self, context, mapper_id=0): self._future_reduction = None self._constraints = EqClass() self._broadcasts = set() + self._is_fused = False @property def context(self): @@ -139,6 +140,7 @@ def __init__(self, context, task_id, mapper_id=0): self._task_id = task_id self._scalar_args = [] self._futures = [] + self._fusion_metadata = None def add_scalar_arg(self, value, dtype): self._scalar_args.append((value, dtype)) @@ -150,8 +152,16 @@ def add_dtype_arg(self, dtype): def add_future(self, future): self._futures.append(future) + def add_fusion_metadata(self, fusion_metadata): + self._is_fused = True + self._fusion_metadata = fusion_metadata + def launch(self, strategy): launcher = TaskLauncher(self.context, self._task_id, self.mapper_id) + + if self._is_fused: + launcher.add_fusion_metadata(self._is_fused, self._fusion_metadata) + for input in self._inputs: proj = strategy.get_projection(input) launcher.add_input(input, proj) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 0e6bda47f..c35e19388 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -40,7 +40,7 @@ from .partition import Restriction from .shape import Shape from .solver import Partitioner -from .store import RegionField, Store +from .store import RegionField, Store, FusionMetadata import numpy as npo # A Field holds a reference to a field in a region tree @@ -759,6 +759,9 @@ def apply(self, contexts, runtime, ops): partitioners = [] strategies = [] must_be_single = any(op._future_output is not None for op in ops) + + # TODO: cache as much as of the partitioner results as possible + # so the calls to Partitioner() and partition_stores done kill perf for op in ops: partitioner = Partitioner(runtime, ops, must_be_single=must_be_single) strategy = partitioner.partition_stores() @@ -766,6 +769,8 @@ def apply(self, contexts, runtime, ops): store_to_ops = {} for op in ops: bufferSet = {} + + # find the set union of input and output buffers for the op for input in op._inputs: if input not in bufferSet: proj = strategy.get_projection(input) @@ -778,6 +783,7 @@ def apply(self, contexts, runtime, ops): if hasattr(proj, 'part'): bufferSet[output]=proj + # for each op in the union, record its associated transform for buffer in bufferSet.keys(): proj = bufferSet[buffer] matrix = proj.part.index_partition.functor.transform.trans @@ -785,12 +791,14 @@ def apply(self, contexts, runtime, ops): store_to_ops[buffer] = [matrix] else: store_to_ops[buffer].append(matrix) + + # for each buffer, check all it's associated transforms/partitions + # across ops are equivalent for store, matrices in store_to_ops.items(): if len(matrices)>1: allEqual = reduce(lambda x,y: x==y, matrices) if not allEqual: return False - #print(store_to_ops) return True @@ -803,6 +811,8 @@ def apply(self, contexts, runtime, ops): launch_shapes = [] must_be_single = any(op._future_output is not None for op in ops) for op in ops: + # TODO: cache as much as of the partitioner results as possible + # so the calls to Partitioner() and partition_stores done kill perf partitioner = Partitioner(runtime, ops, must_be_single=must_be_single) strategy = partitioner.partition_stores() launch_shapes.append(strategy._launch_shape) @@ -821,8 +831,8 @@ def __init__(self, core_library): This is a class that implements the Legate runtime. The Runtime object provides high-level APIs for Legate libraries to use services in the Legion runtime. The Runtime centralizes - resource management for all the libraries so that they can - focus on implementing their domain logic. + resource management for all the libraries so that they can + focus on implementing their domain logic. """ try: @@ -857,7 +867,7 @@ def __init__(self, core_library): # to be dispatched. This list allows cross library introspection for # Legate operations. self._outstanding_ops = [] - self._window_size = 1 + self._window_size = 5 # Now we initialize managers self._attachment_manager = AttachmentManager(self) @@ -1001,20 +1011,22 @@ def serialize_multiop_metadata(self, numpy_runtime, ops): #turn metadata maps into deferred arrays #then load them into the task as the initial inputs meta_arrs = (input_starts, output_starts, offset_starts, offsets, reduction_starts, scalar_starts, op_ids) + fusion_metadata = FusionMetadata(*meta_arrs) + #inst, oust, offst, offs = map(npo.array, (input_starts, output_starts, offset_starts, offsets)) meta_arrs_np = map(npo.array, meta_arrs) def make_deferred(inst): return numpy_runtime.find_or_create_array_thunk(inst, stacklevel=0, defer=True) meta_maps = map(make_deferred, meta_arrs_np) - return meta_maps + return meta_maps, fusion_metadata def build_fused_op(self,ops): fusion_checker = FusionChecker(ops, self._contexts, self) fusion_checker.register_constraint(NumpyContextExists()) fusion_checker.register_constraint(AllBinaryOps()) - fusion_checker.register_constraint(IdenticalLaunchShapes()) - fusion_checker.register_constraint(IdenticalProjection()) + #fusion_checker.register_constraint(IdenticalLaunchShapes()) + #fusion_checker.register_constraint(IdenticalProjection()) can_fuse = fusion_checker.can_fuse() if not can_fuse: @@ -1026,15 +1038,16 @@ def build_fused_op(self,ops): numpy_runtime = numpy_context._library.runtime #initialize fused task fused_task = numpy_context.create_task(fused_id) - + #serialize necessary metadata on all encapsulated ops #this metadata will be fed into the fused op as inputs - meta_maps = self.serialize_multiop_metadata(numpy_runtime, ops) + meta_maps, fusion_metadata = self.serialize_multiop_metadata(numpy_runtime, ops) + fused_task.add_fusion_metadata(fusion_metadata) #sets fused_task._is_fused to true #add metadata maps to task as inputs - for meta_map in meta_maps: - fused_task.add_input(meta_map.base) - fused_task.add_broadcast(meta_map.base) + #for meta_map in meta_maps: + # fused_task.add_input(meta_map.base) + # fused_task.add_broadcast(meta_map.base) #add typical inputs and outputs of all subtasks to fused task for op in ops: @@ -1059,7 +1072,7 @@ def _launch_outstanding(self): def _schedule(self, ops, force_eval=False): ids = [op._task_id for op in ops] - #print("current ops", ids) + #try fusing tasks if len(ops)>=2 and (not force_eval): fused_task = self.build_fused_op(ops) diff --git a/legate/core/store.py b/legate/core/store.py index 3b9297a74..517c8def2 100644 --- a/legate/core/store.py +++ b/legate/core/store.py @@ -754,3 +754,41 @@ def find_or_create_partition(self, functor): part = converted.construct(self.storage.region, complete=complete) self._partitions[functor] = (part, proj) return part, proj + + +class FusionMetadata(object): + def __init__( + self, + input_starts, + output_starts, + offset_starts, + buffer_offsets, + reduction_starts, + scalar_starts, + opIDs + ): + self._input_starts = input_starts + self._output_starts = output_starts + self._offset_starts = offset_starts + self._buffer_offsets = buffer_offsets + self._reduction_starts = reduction_starts + self._scalar_starts = scalar_starts + self._opIDs = opIDs + + def packList(self, meta_list, buf): + for elem in meta_list: + buf.pack_32bit_int(elem) + + def pack(self, buf): + + buf.pack_32bit_int(len(self._opIDs)) #nOps + buf.pack_32bit_int(len(self._buffer_offsets)) #nIOBuffers+1 + + self.packList(self._input_starts, buf) + self.packList(self._output_starts, buf) + self.packList(self._offset_starts, buf) + self.packList(self._buffer_offsets, buf) + self.packList(self._reduction_starts, buf) + self.packList(self._scalar_starts, buf) + self.packList(self._opIDs, buf) + diff --git a/src/data/store.h b/src/data/store.h index 4b31fd049..f862c8959 100644 --- a/src/data/store.h +++ b/src/data/store.h @@ -307,6 +307,25 @@ class Store { friend class MakeshiftSerializer; }; + +//containts prefix sums for a sub-op +//to index into its own data +struct FusionMetadata { + public: + bool isFused; + int32_t nOps; + int32_t nBuffers; + std::vector inputStarts; + std::vector outputStarts; + std::vector offsetStarts; + std::vector offsets; // can contain negative elements + std::vector reductionStarts; + std::vector scalarStarts; + std::vector opIDs; +}; + + + } // namespace legate #include "data/store.inl" diff --git a/src/runtime/context.cc b/src/runtime/context.cc index 1a2a4d39d..bd1e316c2 100644 --- a/src/runtime/context.cc +++ b/src/runtime/context.cc @@ -143,6 +143,7 @@ TaskContext::TaskContext(const Legion::Task* task, : task_(task), regions_(regions), context_(context), runtime_(runtime) { Deserializer dez(task, regions); + fusionMetadata = dez.unpack(); inputs_ = dez.unpack>(); outputs_ = dez.unpack>(); reductions_ = dez.unpack>(); diff --git a/src/runtime/context.h b/src/runtime/context.h index f2bd32553..c92ce8a14 100644 --- a/src/runtime/context.h +++ b/src/runtime/context.h @@ -17,11 +17,13 @@ #pragma once #include "legion.h" +#include "data/scalar.h" namespace legate { class Store; class Scalar; +struct FusionMetadata; struct ResourceConfig { int64_t max_tasks{1000000}; @@ -31,6 +33,7 @@ struct ResourceConfig { int64_t max_shardings{0}; }; + class ResourceScope { public: ResourceScope() = default; @@ -111,14 +114,13 @@ class TaskContext { std::vector& outputs() { return outputs_; } std::vector& reductions() { return reductions_; } std::vector& scalars() { return scalars_; } - //Deserializer dez; - //Serializer dez; public: const Legion::Task* task_; const std::vector& regions_; Legion::Context context_; Legion::Runtime* runtime_; + FusionMetadata fusionMetadata; private: std::vector inputs_, outputs_, reductions_; diff --git a/src/utilities/deserializer.cc b/src/utilities/deserializer.cc index c4d63733a..d4d4c92ca 100644 --- a/src/utilities/deserializer.cc +++ b/src/utilities/deserializer.cc @@ -38,6 +38,56 @@ void Deserializer::_unpack(LegateTypeCode& value) value = static_cast(unpack()); } + +void Deserializer::_unpack(FusionMetadata& metadata){ + metadata.isFused = unpack(); + if (!metadata.isFused){ + return; + } + //exit out if the this is not a fused op + metadata.nOps = unpack(); + metadata.nBuffers = unpack(); + int nOps = metadata.nOps; + int nBuffers = metadata.nBuffers; + + metadata.inputStarts.resize(nOps+1); + metadata.outputStarts.resize(nOps+1); + metadata.offsetStarts.resize(nOps+1); + metadata.offsets.resize(nBuffers+1); + metadata.reductionStarts.resize(nOps+1); + metadata.scalarStarts.resize(nOps+1); + metadata.opIDs.resize(nOps); + //TODO: wrap this up to reuse code` + for (int i=0; i(); + } + for (int i=0; i(); + } + for (int i=0; i(); + } + for (int i=0; i(); + } + for (int i=0; i(); + } + for (int i=0; i(); + } + for (int i=0; i(); + } +} + void Deserializer::_unpack(Store& value) { auto is_future = unpack(); diff --git a/src/utilities/deserializer.h b/src/utilities/deserializer.h index 7d739aa6e..d142ab1c9 100644 --- a/src/utilities/deserializer.h +++ b/src/utilities/deserializer.h @@ -33,6 +33,7 @@ class Scalar; class FutureWrapper; class RegionField; class OutputRegionField; +struct FusionMetadata; class Deserializer { public: @@ -47,6 +48,8 @@ class Deserializer { return std::move(value); } + //void unpackFusionMetadata(bool& isFused); + private: template != MAX_TYPE_NUMBER>* = nullptr> void _unpack(T& value) @@ -70,6 +73,7 @@ class Deserializer { void _unpack(FutureWrapper& value); void _unpack(RegionField& value); void _unpack(OutputRegionField& value); + void _unpack(FusionMetadata& value); private: std::unique_ptr unpack_transform(); From 9b57fc863951ea9ab2d6b4d476d7120ab99f3401 Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Wed, 29 Sep 2021 21:31:19 -0700 Subject: [PATCH 05/44] remove redundant store partitioning --- legate/core/runtime.py | 82 ++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 39 deletions(-) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index c35e19388..67d89ba17 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -39,7 +39,7 @@ ) from .partition import Restriction from .shape import Shape -from .solver import Partitioner +from .solver import Partitioner, Strategy from .store import RegionField, Store, FusionMetadata import numpy as npo @@ -721,17 +721,26 @@ def __init__(self, ops, contexts, runtime): self.ops = ops self.contexts = contexts self.runtime=runtime + self.partitioners = [] + self.strategies = [] def register_constraint(self, fusion_constraint_rule): self.constraints.append(fusion_constraint_rule) def can_fuse(self): - results = [constraint.apply(self.contexts, self.runtime, self.ops) for constraint in self.constraints] - #print(results) - return reduce(lambda x,y: x and y, results) + must_be_single = any(op._future_output is not None for op in self.ops) + for op in self.ops: + # TODO: cache as much as of the partitioner results as possible + # so the calls to Partitioner() and partition_stores done kill perf + partitioner = Partitioner(self.runtime, [op], must_be_single=must_be_single) + self.partitioners.append( partitioner ) + strategy = partitioner.partition_stores() + self.strategies.append(strategy) + results = [constraint.apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) for constraint in self.constraints] + return reduce(lambda x,y: x and y, results), self.strategies class FusionConstraint(object): - def apply(self, contexts, runtime, ops): + def apply(self, contexts, runtime, ops, partitioners, strategies): """" Abstract class for determining a rule that constrains which legate operations can be fused @@ -740,14 +749,14 @@ def apply(self, contexts, runtime, ops): class NumpyContextExists(FusionConstraint): - def apply(self, contexts, runtime, ops): + def apply(self, contexts, runtime, ops, partitioners, strategies): return "legate.numpy" in contexts class AllBinaryOps(FusionConstraint): """Temporary class for only fusing Binary Ops. This constrains will be removed""" - def apply(self, contexts, runtime, ops): + def apply(self, contexts, runtime, ops, partitioners, strategies): allBinary = reduce(lambda x,y: x and y,[int(op._task_id)==400000 for op in ops]) return allBinary @@ -755,31 +764,22 @@ def apply(self, contexts, runtime, ops): class IdenticalProjection(FusionConstraint): """Fusion rule that only ops with identical projection functors can be fused""" - def apply(self, contexts, runtime, ops): - partitioners = [] - strategies = [] - must_be_single = any(op._future_output is not None for op in ops) - - # TODO: cache as much as of the partitioner results as possible - # so the calls to Partitioner() and partition_stores done kill perf - for op in ops: - partitioner = Partitioner(runtime, ops, must_be_single=must_be_single) - strategy = partitioner.partition_stores() + def apply(self, contexts, runtime, ops, partitioners, strategies): store_to_ops = {} - for op in ops: + for i, op in enumerate(ops): bufferSet = {} # find the set union of input and output buffers for the op for input in op._inputs: if input not in bufferSet: - proj = strategy.get_projection(input) + proj = strategies[i].get_projection(input) if hasattr(proj, 'part'): bufferSet[input]=proj for output in op._outputs: if output not in bufferSet: - proj = strategy.get_projection(output) + proj = strategies[i].get_projection(output) if hasattr(proj, 'part'): bufferSet[output]=proj @@ -805,19 +805,11 @@ def apply(self, contexts, runtime, ops): class IdenticalLaunchShapes(FusionConstraint): """Fusion rule that only ops with identical launch shapes can be fused""" - def apply(self, contexts, runtime, ops): - partitioners = [] - strategies = [] + def apply(self, contexts, runtime, ops, partitioners, strategies): launch_shapes = [] - must_be_single = any(op._future_output is not None for op in ops) - for op in ops: - # TODO: cache as much as of the partitioner results as possible - # so the calls to Partitioner() and partition_stores done kill perf - partitioner = Partitioner(runtime, ops, must_be_single=must_be_single) - strategy = partitioner.partition_stores() - launch_shapes.append(strategy._launch_shape) + for i in range(len(ops)): + launch_shapes.append(strategies[i]._launch_shape) first_shape = launch_shapes[0] - #print(launch_shapes) for launch_shape in launch_shapes: if launch_shape!=first_shape: return False @@ -867,7 +859,7 @@ def __init__(self, core_library): # to be dispatched. This list allows cross library introspection for # Legate operations. self._outstanding_ops = [] - self._window_size = 5 + self._window_size =1 # Now we initialize managers self._attachment_manager = AttachmentManager(self) @@ -1025,20 +1017,27 @@ def build_fused_op(self,ops): fusion_checker = FusionChecker(ops, self._contexts, self) fusion_checker.register_constraint(NumpyContextExists()) fusion_checker.register_constraint(AllBinaryOps()) - #fusion_checker.register_constraint(IdenticalLaunchShapes()) - #fusion_checker.register_constraint(IdenticalProjection()) - can_fuse = fusion_checker.can_fuse() + fusion_checker.register_constraint(IdenticalLaunchShapes()) + fusion_checker.register_constraint(IdenticalProjection()) + can_fuse, partitions = fusion_checker.can_fuse() if not can_fuse: return None + super_strat = {} + super_fspace = {} + for partition in partitions: + super_strat = {**(super_strat.copy()), **partition._strategy} + super_fspace = {**(super_fspace.copy()), **partition._fspaces} + super_strategy = Strategy(partitions[0]._launch_shape, super_strat, super_fspace) #hacky way to get numpy context and designated fused task id fused_id = self._contexts["legate.numpy"].fused_id numpy_context = self._contexts["legate.numpy"] numpy_runtime = numpy_context._library.runtime #initialize fused task fused_task = numpy_context.create_task(fused_id) - + fused_task.strategy = super_strategy + #serialize necessary metadata on all encapsulated ops #this metadata will be fed into the fused op as inputs meta_maps, fusion_metadata = self.serialize_multiop_metadata(numpy_runtime, ops) @@ -1081,9 +1080,14 @@ def _schedule(self, ops, force_eval=False): return #if we cann't fuse op launch them individually - must_be_single = any(op._future_output is not None for op in ops) - partitioner = Partitioner(self, ops, must_be_single=must_be_single) - strategy = partitioner.partition_stores() + #fused tasks already have their strategy + if len(ops)==1 and ops[0]._task_id==400028: + strategy = ops[0].strategy + + else: + must_be_single = any(op._future_output is not None for op in ops) + partitioner = Partitioner(self, ops, must_be_single=must_be_single) + strategy = partitioner.partition_stores() for op in ops: op.launch(strategy) From bf7973ab677e8711944d0f89d1949babd28bcb3d Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Thu, 30 Sep 2021 17:41:24 -0700 Subject: [PATCH 06/44] remove creation of deferred arrays --- legate/core/runtime.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 67d89ba17..705fde3b4 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -859,7 +859,7 @@ def __init__(self, core_library): # to be dispatched. This list allows cross library introspection for # Legate operations. self._outstanding_ops = [] - self._window_size =1 + self._window_size =10 # Now we initialize managers self._attachment_manager = AttachmentManager(self) @@ -1006,10 +1006,11 @@ def serialize_multiop_metadata(self, numpy_runtime, ops): fusion_metadata = FusionMetadata(*meta_arrs) #inst, oust, offst, offs = map(npo.array, (input_starts, output_starts, offset_starts, offsets)) - meta_arrs_np = map(npo.array, meta_arrs) + #meta_arrs_np = map(npo.array, meta_arrs) def make_deferred(inst): return numpy_runtime.find_or_create_array_thunk(inst, stacklevel=0, defer=True) - meta_maps = map(make_deferred, meta_arrs_np) + #meta_maps = map(make_deferred, meta_arrs_np) + meta_maps=None return meta_maps, fusion_metadata From b6121e7539e769a046668719f72aa9827a269d02 Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Fri, 1 Oct 2021 14:34:37 -0700 Subject: [PATCH 07/44] optimized packing, some transform packing --- legate/core/legion.py | 8 ++- legate/core/runtime.py | 5 +- legate/core/store.py | 33 +++++++---- src/data/transform.cc | 36 ++++++++++++ src/data/transform.h | 18 +++++- src/utilities/makeshift_serializer.cc | 82 ++++++++++++++++++++++++++- src/utilities/makeshift_serializer.h | 19 +++++-- 7 files changed, 175 insertions(+), 26 deletions(-) diff --git a/legate/core/legion.py b/legate/core/legion.py index 3d6fa5299..b498bfb8e 100644 --- a/legate/core/legion.py +++ b/legate/core/legion.py @@ -4560,6 +4560,12 @@ def pack_32bit_int(self, arg): self.size += 4 self.add_arg(arg, legion.LEGION_TYPE_INT32) + def pack_32bit_int_arr(self, arg): + self.fmt.append(str(len(arg))+"i") + size = len(arg) + self.size += 4*size + self.args += arg + def pack_64bit_int(self, arg): self.fmt.append("q") self.size += 8 @@ -4744,7 +4750,7 @@ def pack_dtype(self, dtype): def get_string(self): if self.string is None or self.arglen != len(self.args): fmtstr = "".join(self.fmt) - assert len(fmtstr) == len(self.args) + 1 + #assert len(fmtstr) == len(self.args) + 1 self.string = struct.pack(fmtstr, *self.args) self.arglen = len(self.args) return self.string diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 705fde3b4..3a8fa1e9b 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -41,7 +41,6 @@ from .shape import Shape from .solver import Partitioner, Strategy from .store import RegionField, Store, FusionMetadata -import numpy as npo # A Field holds a reference to a field in a region tree # that can be used by many different RegionField objects @@ -1007,8 +1006,8 @@ def serialize_multiop_metadata(self, numpy_runtime, ops): #inst, oust, offst, offs = map(npo.array, (input_starts, output_starts, offset_starts, offsets)) #meta_arrs_np = map(npo.array, meta_arrs) - def make_deferred(inst): - return numpy_runtime.find_or_create_array_thunk(inst, stacklevel=0, defer=True) + #def make_deferred(inst): + # return numpy_runtime.find_or_create_array_thunk(inst, stacklevel=0, defer=True) #meta_maps = map(make_deferred, meta_arrs_np) meta_maps=None return meta_maps, fusion_metadata diff --git a/legate/core/store.py b/legate/core/store.py index 517c8def2..fafdc941c 100644 --- a/legate/core/store.py +++ b/legate/core/store.py @@ -776,19 +776,28 @@ def __init__( self._opIDs = opIDs def packList(self, meta_list, buf): - for elem in meta_list: - buf.pack_32bit_int(elem) + buf.pack_32bit_int_arr(meta_list) + #for elem in meta_list: + # buf.pack_32bit_int(elem) def pack(self, buf): - buf.pack_32bit_int(len(self._opIDs)) #nOps - buf.pack_32bit_int(len(self._buffer_offsets)) #nIOBuffers+1 - - self.packList(self._input_starts, buf) - self.packList(self._output_starts, buf) - self.packList(self._offset_starts, buf) - self.packList(self._buffer_offsets, buf) - self.packList(self._reduction_starts, buf) - self.packList(self._scalar_starts, buf) - self.packList(self._opIDs, buf) + #buf.pack_32bit_int(len(self._opIDs)) #nOps + #buf.pack_32bit_int(len(self._buffer_offsets)) #nIOBuffers+1 + superbuff = [len(self._opIDs)]+[len(self._buffer_offsets)] + superbuff += self._input_starts + superbuff += self._output_starts + superbuff += self._offset_starts + superbuff += self._buffer_offsets + superbuff += self._reduction_starts + superbuff += self._scalar_starts + superbuff += self._opIDs + self.packList(superbuff, buf) + #self.packList(self._input_starts, buf) + #self.packList(self._output_starts, buf) + #self.packList(self._offset_starts, buf) + #self.packList(self._buffer_offsets, buf) + #self.packList(self._reduction_starts, buf) + #self.packList(self._scalar_starts, buf) + #self.packList(self._opIDs, buf) diff --git a/src/data/transform.cc b/src/data/transform.cc index c9860fc65..0c117883b 100644 --- a/src/data/transform.cc +++ b/src/data/transform.cc @@ -22,6 +22,17 @@ using namespace Legion; using StoreTransformP = std::unique_ptr; +/* +typedef enum legate_core_transform_t { + LEGATE_CORE_TRANSFORM_SHIFT = 100, + LEGATE_CORE_TRANSFORM_PROMOTE, + LEGATE_CORE_TRANSFORM_PROJECT, + LEGATE_CORE_TRANSFORM_TRANSPOSE, + LEGATE_CORE_TRANSFORM_DELINEARIZE, +} legate_core_transform_t; +*/ + + DomainAffineTransform combine(const DomainAffineTransform& lhs, const DomainAffineTransform& rhs) { DomainAffineTransform result; @@ -39,6 +50,11 @@ Shift::Shift(int32_t dim, int64_t offset, StoreTransformP&& parent) { } +int32_t Shift::getTransformCode() const +{ + return LEGATE_CORE_TRANSFORM_SHIFT; +} + Domain Shift::transform(const Domain& input) const { auto result = nullptr != parent_ ? parent_->transform(input) : input; @@ -81,6 +97,11 @@ Promote::Promote(int32_t extra_dim, int64_t dim_size, StoreTransformP&& parent) { } +int32_t Promote::getTransformCode() const +{ + return LEGATE_CORE_TRANSFORM_PROMOTE; +} + Domain Promote::transform(const Domain& input) const { auto promote = [](int32_t extra_dim, int64_t dim_size, const Domain& input) { @@ -136,6 +157,11 @@ Project::Project(int32_t dim, int64_t coord, StoreTransformP&& parent) { } +int32_t Project::getTransformCode() const +{ + return LEGATE_CORE_TRANSFORM_PROJECT; +} + Domain Project::transform(const Domain& input) const { auto project = [](int32_t collapsed_dim, const Domain& input) { @@ -193,6 +219,11 @@ Transpose::Transpose(std::vector&& axes, StoreTransformP&& parent) { } +int32_t Transpose::getTransformCode() const +{ + return LEGATE_CORE_TRANSFORM_TRANSPOSE; +} + Domain Transpose::transform(const Domain& input) const { auto transpose = [](const auto& axes, const Domain& input) { @@ -246,6 +277,11 @@ Delinearize::Delinearize(int32_t dim, std::vector&& sizes, StoreTransfo for (auto size : sizes_) volume_ *= size; } +int32_t Delinearize::getTransformCode() const +{ + return LEGATE_CORE_TRANSFORM_DELINEARIZE; +} + Domain Delinearize::transform(const Domain& input) const { Domain output; diff --git a/src/data/transform.h b/src/data/transform.h index 1f2453df8..cc1433968 100644 --- a/src/data/transform.h +++ b/src/data/transform.h @@ -19,7 +19,9 @@ #include #include "legion.h" +#include "legate_c.h" +class MakeshiftSerializer; namespace legate { class StoreTransform { @@ -31,9 +33,10 @@ class StoreTransform { public: virtual Legion::Domain transform(const Legion::Domain& input) const = 0; virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const = 0; - + virtual int32_t getTransformCode() const =0; protected: std::unique_ptr parent_{nullptr}; + friend class MakeshiftSerializer; }; class Shift : public StoreTransform { @@ -44,10 +47,11 @@ class Shift : public StoreTransform { public: virtual Legion::Domain transform(const Legion::Domain& input) const override; virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override; - + virtual int32_t getTransformCode() const override; private: int32_t dim_; - int64_t offset_; + int64_t offset_; + friend class MakeshiftSerializer; }; class Promote : public StoreTransform { @@ -58,10 +62,12 @@ class Promote : public StoreTransform { public: virtual Legion::Domain transform(const Legion::Domain& input) const override; virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override; + virtual int32_t getTransformCode() const override; private: int32_t extra_dim_; int64_t dim_size_; + friend class MakeshiftSerializer; }; class Project : public StoreTransform { @@ -72,10 +78,12 @@ class Project : public StoreTransform { public: virtual Legion::Domain transform(const Legion::Domain& domain) const override; virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override; + virtual int32_t getTransformCode() const override; private: int32_t dim_; int64_t coord_; + friend class MakeshiftSerializer; }; class Transpose : public StoreTransform { @@ -86,9 +94,11 @@ class Transpose : public StoreTransform { public: virtual Legion::Domain transform(const Legion::Domain& domain) const override; virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override; + virtual int32_t getTransformCode() const override; private: std::vector axes_; + friend class MakeshiftSerializer; }; class Delinearize : public StoreTransform { @@ -101,12 +111,14 @@ class Delinearize : public StoreTransform { public: virtual Legion::Domain transform(const Legion::Domain& domain) const override; virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override; + virtual int32_t getTransformCode() const override; private: int32_t dim_; std::vector sizes_; std::vector strides_; int64_t volume_; + friend class MakeshiftSerializer; }; } // namespace legate diff --git a/src/utilities/makeshift_serializer.cc b/src/utilities/makeshift_serializer.cc index fe96a2820..b04ee4836 100644 --- a/src/utilities/makeshift_serializer.cc +++ b/src/utilities/makeshift_serializer.cc @@ -9,6 +9,84 @@ namespace legate{ packWithoutType(scalar.data_, size); } + void MakeshiftSerializer::packTransform(const StoreTransform* trans){ + + if (trans==nullptr){ + int32_t neg= -1; + pack((int32_t) neg); + } + else{ + int32_t code = trans->getTransformCode(); + switch (code) { + case -1: { + } + case LEGATE_CORE_TRANSFORM_SHIFT: { + Shift * shifter = (Shift*) trans; + pack((int32_t) shifter->dim_); + pack((int64_t) shifter->offset_); + packTransform(trans->parent_.get()); + } + case LEGATE_CORE_TRANSFORM_PROMOTE: { + Promote * promoter = (Promote*) trans; + pack((int32_t) promoter->extra_dim_); + pack((int64_t) promoter->dim_size_); + packTransform(trans->parent_.get()); + } + case LEGATE_CORE_TRANSFORM_PROJECT: { + Project * projector = (Project*) trans; + pack((int32_t) projector->dim_); + pack((int64_t) projector->coord_); + packTransform(trans->parent_.get()); + } + case LEGATE_CORE_TRANSFORM_TRANSPOSE: { + Transpose * projector = (Transpose*) trans; + packTransform(trans->parent_.get()); + } + case LEGATE_CORE_TRANSFORM_DELINEARIZE: { + Delinearize * projector = (Delinearize*) trans; + packTransform(trans->parent_.get()); + } + } + } + } +/* + case LEGATE_CORE_TRANSFORM_SHIFT: { + auto dim = unpack(); + auto offset = unpack(); + auto parent = unpack_transform(); + return std::make_unique(dim, offset, std::move(parent)); + } + case LEGATE_CORE_TRANSFORM_PROMOTE: { + auto extra_dim = unpack(); + auto dim_size = unpack(); + auto parent = unpack_transform(); + return std::make_unique(extra_dim, dim_size, std::move(parent)); + } + case LEGATE_CORE_TRANSFORM_PROJECT: { + auto dim = unpack(); + auto coord = unpack(); + auto parent = unpack_transform(); + return std::make_unique(dim, coord, std::move(parent)); + } + case LEGATE_CORE_TRANSFORM_TRANSPOSE: { + auto axes = unpack>(); + auto parent = unpack_transform(); + return std::make_unique(std::move(axes), std::move(parent)); + } + case LEGATE_CORE_TRANSFORM_DELINEARIZE: { + auto dim = unpack(); + auto sizes = unpack>(); + auto parent = unpack_transform(); + return std::make_unique(dim, std::move(sizes), std::move(parent)); + } + + def _serialize_transform(self, buf): + if self._parent is not None: + self._transform.serialize(buf) + self._parent._serialize_transform(buf) + else: + buf.pack_32bit_int(-1) +*/ void MakeshiftSerializer::packBuffer(const Store& buffer) { pack((bool) buffer.is_future()); //is_future @@ -17,8 +95,8 @@ namespace legate{ pack((int32_t) buffer.code()); //pack transform: //pack trasnform code - int32_t neg= -1; - pack((int32_t) neg); + packTransform(buffer.transform_.get()); + //skip the rest for now, assume no transform, for now pack -1 // no need to implement this for benchmarking purposes // TODO: implement transform packing diff --git a/src/utilities/makeshift_serializer.h b/src/utilities/makeshift_serializer.h index a4eec808f..ecbf98101 100644 --- a/src/utilities/makeshift_serializer.h +++ b/src/utilities/makeshift_serializer.h @@ -4,6 +4,8 @@ #include #include "data/store.h" #include "data/scalar.h" +#include "data/transform.h" + namespace legate { class Scalar; @@ -12,11 +14,15 @@ class MakeshiftSerializer{ public: MakeshiftSerializer(){ - size=128; + size=512; raw.resize(size); write_offset=0; read_offset=0; } + void zero(){ + //memset ((void*)raw.data(),0,raw.size()); + write_offset=0; + } /* template void pack(T&& arg) { @@ -32,10 +38,11 @@ class MakeshiftSerializer{ { resize(sizeof(T)); } - for (int i=0; i((argAddr)+i); - } + //for (int i=0; i((argAddr)+i); + //} + memcpy(raw.data()+write_offset, argAddr, sizeof(T)); //std::cout<<"reint "<<*reinterpret_cast(raw.data()+write_offset)< T read() { From e3708bfde139b4c735f2ee1b7cfbac9f3b3703ea Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Sun, 3 Oct 2021 20:30:20 -0700 Subject: [PATCH 08/44] more stuff --- legate/core/launcher.py | 8 +++--- legate/core/legion.py | 1 + legate/core/runtime.py | 9 ++++--- src/data/store.h | 2 ++ src/runtime/context.cc | 3 +++ src/utilities/deserializer.cc | 1 - src/utilities/makeshift_serializer.cc | 7 +++-- src/utilities/makeshift_serializer.h | 39 +++++++++++++++++++++++++++ 8 files changed, 59 insertions(+), 11 deletions(-) diff --git a/legate/core/launcher.py b/legate/core/launcher.py index 06d275a47..525163bf3 100644 --- a/legate/core/launcher.py +++ b/legate/core/launcher.py @@ -663,10 +663,10 @@ def pack_fusion_metadata(argbuf, is_fused, fusion_metadata): def build_task(self, launch_domain, argbuf): self._req_analyzer.analyze_requirements() #print("building task id", self._task_id) - #for req in self._req_analyzer._requirements: - #print(req) - #print(req[0].__dict__) - #print() + for req in self._req_analyzer._requirements: + print(req) + print(req[0].__dict__) + print() self._out_analyzer.analyze_requirements() #pack fusion metadata diff --git a/legate/core/legion.py b/legate/core/legion.py index b498bfb8e..26e7395f2 100644 --- a/legate/core/legion.py +++ b/legate/core/legion.py @@ -3624,6 +3624,7 @@ def launch(self, runtime, context): """ num_outputs = len(self.outputs) if num_outputs == 0: + return return Future( legion.legion_task_launcher_execute( runtime, context, self.launcher diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 3a8fa1e9b..35a31a197 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -794,10 +794,11 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): # for each buffer, check all it's associated transforms/partitions # across ops are equivalent for store, matrices in store_to_ops.items(): - if len(matrices)>1: - allEqual = reduce(lambda x,y: x==y, matrices) - if not allEqual: - return False + if len(matrices)>1: + first = matrices[0] + for matrix in matrices: + if not (matrix==first).all(): + return False return True diff --git a/src/data/store.h b/src/data/store.h index f862c8959..b2c6b7e91 100644 --- a/src/data/store.h +++ b/src/data/store.h @@ -138,6 +138,7 @@ class RegionField { template Legion::Rect shape() const; Legion::Domain domain() const; + unsigned getReqIdx() const {return reqIdx_; } public: bool is_readable() const { return readable_; } @@ -271,6 +272,7 @@ class Store { template Legion::Rect shape() const; Legion::Domain domain() const; + unsigned getReqIdx() const {return region_field_.getReqIdx(); } public: bool is_readable() const { return readable_; } diff --git a/src/runtime/context.cc b/src/runtime/context.cc index bd1e316c2..038d2f85f 100644 --- a/src/runtime/context.cc +++ b/src/runtime/context.cc @@ -20,6 +20,8 @@ #include "data/store.h" #include "runtime/context.h" #include "utilities/deserializer.h" +#include +#include namespace legate { @@ -148,6 +150,7 @@ TaskContext::TaskContext(const Legion::Task* task, outputs_ = dez.unpack>(); reductions_ = dez.unpack>(); scalars_ = dez.unpack>(); + } } // namespace legate diff --git a/src/utilities/deserializer.cc b/src/utilities/deserializer.cc index d4d4c92ca..77f28bf92 100644 --- a/src/utilities/deserializer.cc +++ b/src/utilities/deserializer.cc @@ -140,7 +140,6 @@ void Deserializer::_unpack(RegionField& value) auto dim = unpack(); auto idx = unpack(); auto fid = unpack(); - value = RegionField(dim, regions_[idx], fid, idx); } diff --git a/src/utilities/makeshift_serializer.cc b/src/utilities/makeshift_serializer.cc index b04ee4836..83ed87ce2 100644 --- a/src/utilities/makeshift_serializer.cc +++ b/src/utilities/makeshift_serializer.cc @@ -113,7 +113,9 @@ namespace legate{ //pack dim pack((int32_t) buffer.region_field_.dim()); //pack idx (req idx) //need to map regions to idx - pack((uint32_t) buffer.region_field_.reqIdx_); + unsigned newID = getNewReqID(buffer.region_field_.reqIdx_); + //pack((uint32_t) buffer.region_field_.reqIdx_); + pack((uint32_t) newID); //pack fid (field id) pack((int32_t) buffer.region_field_.fid_); } @@ -125,7 +127,8 @@ namespace legate{ //pack dim; always 1 in an buffer pack((int32_t) 1); //pack idx (req idx) //need to map regions to idx - pack((uint32_t) buffer.region_field_.reqIdx_); + unsigned newID = getNewReqID(buffer.region_field_.reqIdx_); + pack((uint32_t) newID); //pack fid (field id) pack((int32_t) buffer.region_field_.fid_); } diff --git a/src/utilities/makeshift_serializer.h b/src/utilities/makeshift_serializer.h index ecbf98101..94eefc684 100644 --- a/src/utilities/makeshift_serializer.h +++ b/src/utilities/makeshift_serializer.h @@ -5,6 +5,7 @@ #include "data/store.h" #include "data/scalar.h" #include "data/transform.h" +#include namespace legate { @@ -18,10 +19,14 @@ class MakeshiftSerializer{ raw.resize(size); write_offset=0; read_offset=0; + buffer_counter=0; } void zero(){ //memset ((void*)raw.data(),0,raw.size()); write_offset=0; + buffer_counter=0; + neededReqIds.clear(); + regionReqIdMap.clear(); } /* template void pack(T&& arg) @@ -103,11 +108,45 @@ class MakeshiftSerializer{ int buffSize(){ return write_offset; } + + int32_t returnAndIncrCounter(){ + int32_t old = buffer_counter; + buffer_counter++; + return old; + } + + //map old reqIdx to new reqIdx + void addReqID(int32_t id){ + //register the region reqID if it hasn't been seen yet for this op + if (regionReqIdMap.find(id)==regionReqIdMap.end()) + { + regionReqIdMap.insert(std::pair(id, returnAndIncrCounter())); + neededReqIds.push_back(id); + } + } + + int32_t getNewReqID(int32_t oldID) + { + return regionReqIdMap.find(oldID)->second; + } + + std::vector getReqIds (){ + //could use move semantics here + std::vector reqIdsCopy(neededReqIds); + return reqIdsCopy; + } + private: size_t size; int read_offset; int write_offset; + int buffer_counter; std::vector raw; + + private: + std::map regionReqIdMap; //maps old reqids to new ones + std::vector neededReqIds; //list of old reqIds needed in child op + }; /* int main(){ From 8eca06fb33d068be6be342237541b54550deff69 Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Thu, 14 Oct 2021 15:04:26 -0700 Subject: [PATCH 09/44] partial fusion --- install.py | 1 + legate/core/launcher.py | 8 +- legate/core/legion.py | 1 - legate/core/runtime.py | 309 ++++++++++++++++++++------ legate/core/store.py | 5 + src/data/store.h | 1 + src/utilities/deserializer.cc | 6 + src/utilities/makeshift_serializer.cc | 18 +- src/utilities/makeshift_serializer.h | 1 - 9 files changed, 278 insertions(+), 72 deletions(-) diff --git a/install.py b/install.py index ac8ce559e..aa7ee4a0d 100755 --- a/install.py +++ b/install.py @@ -887,6 +887,7 @@ def driver(): "--clean", dest="clean_first", action=BooleanFlag, + #default=False, default=False, help="Clean before build, and pull latest Legion.", ) diff --git a/legate/core/launcher.py b/legate/core/launcher.py index 525163bf3..094a7b95c 100644 --- a/legate/core/launcher.py +++ b/legate/core/launcher.py @@ -663,10 +663,10 @@ def pack_fusion_metadata(argbuf, is_fused, fusion_metadata): def build_task(self, launch_domain, argbuf): self._req_analyzer.analyze_requirements() #print("building task id", self._task_id) - for req in self._req_analyzer._requirements: - print(req) - print(req[0].__dict__) - print() + #for req in self._req_analyzer._requirements: + # print(req) + # print(req[0].__dict__) + # print() self._out_analyzer.analyze_requirements() #pack fusion metadata diff --git a/legate/core/legion.py b/legate/core/legion.py index 26e7395f2..b498bfb8e 100644 --- a/legate/core/legion.py +++ b/legate/core/legion.py @@ -3624,7 +3624,6 @@ def launch(self, runtime, context): """ num_outputs = len(self.outputs) if num_outputs == 0: - return return Future( legion.legion_task_launcher_execute( runtime, context, self.launcher diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 35a31a197..d609b5af5 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -42,6 +42,17 @@ from .solver import Partitioner, Strategy from .store import RegionField, Store, FusionMetadata +debugPrint = False + +#debug printing +def zprint(*args): + return +if debugPrint: + dprint = print +else: + dprint = zprint + + # A Field holds a reference to a field in a region tree # that can be used by many different RegionField objects class Field(object): @@ -521,7 +532,6 @@ def compute_launch_shape(self, store, restrictions): for dim, restriction in enumerate(restrictions): if restriction != Restriction.RESTRICTED: to_partition += (shape[dim],) - launch_shape = self._compute_launch_shape(to_partition) if launch_shape is None: return None @@ -726,6 +736,20 @@ def __init__(self, ops, contexts, runtime): def register_constraint(self, fusion_constraint_rule): self.constraints.append(fusion_constraint_rule) + def supress_small_fusions(self, intervals, threshold): + #find if there's a fusable sub window of length + #greater than or equal to fusion_thresh + final_set = [] + fusable=False + for interval in intervals: + if interval[1] - interval[0] >=threshold: + final_set.append(interval) + fusable = True + else: + for i in range(interval[0], interval[1]): + final_set.append((i, i+1)) + return fusable, final_set + def can_fuse(self): must_be_single = any(op._future_output is not None for op in self.ops) for op in self.ops: @@ -735,8 +759,33 @@ def can_fuse(self): self.partitioners.append( partitioner ) strategy = partitioner.partition_stores() self.strategies.append(strategy) + results = [constraint.apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) for constraint in self.constraints] - return reduce(lambda x,y: x and y, results), self.strategies + dprint("fuse results", results) + all_fusable = [result[0] for result in results] + interval_sets = [result[1] for result in results] + + #intersect intervals + #this is a very, very bad way of doing this, + # in the future I'll just "intersect" in place + # as we apply constraints + curr_set = interval_sets[0] + for interval_set in interval_sets[1:]: + newset = [] + for aset in curr_set: + for bset in interval_set: + if not (aset[0] > bset[1] or bset[0] > aset[1]): + news = max(aset[0], bset[0]) + newe = min(aset[1], bset[1]) + newset.append((news, newe)) + curr_set=newset + fusable,final_set = self.supress_small_fusions(curr_set, self.runtime._fusion_threshold) + dprint("curset", curr_set) + + dprint("final_set", final_set) + dprint("all fusable", fusable) + dprint("intervals", interval_sets) + return reduce(lambda x,y: x and y, all_fusable), final_set, self.strategies class FusionConstraint(object): def apply(self, contexts, runtime, ops, partitioners, strategies): @@ -749,16 +798,106 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): class NumpyContextExists(FusionConstraint): def apply(self, contexts, runtime, ops, partitioners, strategies): - return "legate.numpy" in contexts + if "legate.numpy" in contexts: + return True, [(0, len(ops))] + else: + return False, [(0,0)] +""" + NUMPY_BINARY_OP = 400000, + NUMPY_SCALAR_BINARY_OP = 400002, + NUMPY_FILL = 400003, + NUMPY_SCALAR_UNARY_RED = 400004, + NUMPY_UNARY_RED = 400005, + NUMPY_UNARY_OP = 400006, + NUMPY_SCALAR_UNARY_OP = 400007, + NUMPY_BINARY_RED = 400008, + NUMPY_CONVERT = 400010, + NUMPY_SCALAR_CONVERT = 400011, + NUMPY_WHERE = 400012, + NUMPY_SCALAR_WHERE = 400013, + NUMPY_READ = 400014, + NUMPY_WRITE = 400015, + NUMPY_DIAG = 400016, + NUMPY_MATMUL = 400017, + NUMPY_MATVECMUL = 400018, + NUMPY_DOT = 400019, + NUMPY_BINCOUNT = 400020, + NUMPY_EYE = 400021, + NUMPY_RAND = 400022, + NUMPY_ARANGE = 400023, + NUMPY_TRANSPOSE = 400024, + NUMPY_TILE = 400025, + NUMPY_NONZERO = 400026, + NUMPY_DOUBLE_BINARY_OP = 400027, + NUMPY_FUSED_OP = 400028, +""" +class AllValidOps(FusionConstraint): + """ + Class for only fusing only potentially fusable ops. + This class performs the first pass of legality filtering + """ + def __init__(self): + self.validIDs = set() + #these ops are always fusable + self.validIDs.add(400000) #Binary op + self.validIDs.add(400006) #Unary op -class AllBinaryOps(FusionConstraint): - """Temporary class for only fusing Binary Ops. - This constrains will be removed""" - def apply(self, contexts, runtime, ops, partitioners, strategies): - allBinary = reduce(lambda x,y: x and y,[int(op._task_id)==400000 for op in ops]) - return allBinary + # the following are conditionally fusable + # they will be processed in the a subsequent level of filtering + + # scalar producing ops are valid if the scalars they produce + # are NOT consumed by a subsequent op in the window + # however they can be printed, which we cannot detect in the runtime + # without static analysis, so consider these terminal fusable + self.validIDs.add(400004) #Scalar unary red + self.validIDs.add(400005) #Unary red + + # as all scalars are futures, + # so we can just check if both Futures are "ready" + # more powerfully, we can also create a dependency tree + # of ops, and assuming they're all scalar ops, + # and the "roots" are ready, we can fuse + self.validIDs.add(400002) #Scalar Binary op + self.validIDs.add(400007) #Scalar Unary op + self.validIDs.add(400008) #Scalar binary red + + #a matmul is valid if it is the last op in the sequence + #unless if it followed by a matmul of the exact same size + #so it is terminal fusable + #self.validIDs.add(400017) #Matmul + + #vector dot is binary op + scalar producing reduction + #it is thus terminal fusable + #self.validIDs.add(400019) #dot + def apply(self, contexts, runtime, ops, partitioners, strategies): + results = [int(op._task_id) in self.validIDs for op in ops] + fusable_intervals = [] + start, end =0,0 + rolling=False + while end 1: + #initialize fused task + fused_task = numpy_context.create_task(fused_id) + fused_task.strategy = super_strategies[i] + + #serialize necessary metadata on all encapsulated ops + #this metadata will be fed into the fused op as inputs + meta_maps, fusion_metadata = self.serialize_multiop_metadata(numpy_context, op_subset) + fused_task.add_fusion_metadata(fusion_metadata) #sets fused_task._is_fused to true + + #add typical inputs and outputs of all subtasks to fused task + for op in op_subset: + for scalar in op._scalar_args: + fused_task.add_scalar_arg(scalar[0], ty.int32) + for reduction in op._reductions: + fused_task.add_reduction(reduction) + for input in op._inputs: + fused_task.add_input(input) + for output in op._outputs: + fused_task.add_output(output) + for future in op._futures: + fused_task.add_future(future) + new_op_list.append(fused_task) + dprint("new op list", new_op_list) + return new_op_list def _launch_outstanding(self): - print("launching final outstanding ops") + dprint("launching final outstanding ops") if len(self._outstanding_ops): ops = self._outstanding_ops self._outstanding_ops = [] @@ -1072,20 +1250,26 @@ def _launch_outstanding(self): def _schedule(self, ops, force_eval=False): ids = [op._task_id for op in ops] - + dprint("ids", ids) #try fusing tasks if len(ops)>=2 and (not force_eval): - fused_task = self.build_fused_op(ops) - if fused_task: - fused_task.execute() + fused_task_list = self.build_fused_op(ops) + if fused_task_list: + dprint("start clearing pipe") + self._clearing_pipe = True + for task in fused_task_list: + task.execute() + self._clearing_pipe = False + dprint("stop clearing pipe") return #if we cann't fuse op launch them individually - #fused tasks already have their strategy - if len(ops)==1 and ops[0]._task_id==400028: + + # tasks processed for fusion already have + # their strategy "baked in" + if len(ops)==1 and self._clearing_pipe: strategy = ops[0].strategy - - else: + else: #else do to the partition must_be_single = any(op._future_output is not None for op in ops) partitioner = Partitioner(self, ops, must_be_single=must_be_single) strategy = partitioner.partition_stores() @@ -1093,9 +1277,10 @@ def _schedule(self, ops, force_eval=False): op.launch(strategy) def submit(self, op): - #always launch a fused op, dont add it to the window - #as the encapsulated ops already waited in the window - if int(op._task_id)==400028: + #always launch ops that've been processed for fusion + #do not re-add to the window + #as the these ops already waited in the window + if self._clearing_pipe: self._schedule([op]) else: self._outstanding_ops.append(op) diff --git a/legate/core/store.py b/legate/core/store.py index fafdc941c..3c8220cfc 100644 --- a/legate/core/store.py +++ b/legate/core/store.py @@ -765,6 +765,7 @@ def __init__( buffer_offsets, reduction_starts, scalar_starts, + future_starts, opIDs ): self._input_starts = input_starts @@ -773,9 +774,12 @@ def __init__( self._buffer_offsets = buffer_offsets self._reduction_starts = reduction_starts self._scalar_starts = scalar_starts + self._future_starts = future_starts self._opIDs = opIDs def packList(self, meta_list, buf): + # aggregate the ints when packing + # much faster than individually packing each int buf.pack_32bit_int_arr(meta_list) #for elem in meta_list: # buf.pack_32bit_int(elem) @@ -791,6 +795,7 @@ def pack(self, buf): superbuff += self._buffer_offsets superbuff += self._reduction_starts superbuff += self._scalar_starts + superbuff += self._future_starts superbuff += self._opIDs self.packList(superbuff, buf) #self.packList(self._input_starts, buf) diff --git a/src/data/store.h b/src/data/store.h index b2c6b7e91..8b93f83cf 100644 --- a/src/data/store.h +++ b/src/data/store.h @@ -323,6 +323,7 @@ struct FusionMetadata { std::vector offsets; // can contain negative elements std::vector reductionStarts; std::vector scalarStarts; + std::vector futureStarts; std::vector opIDs; }; diff --git a/src/utilities/deserializer.cc b/src/utilities/deserializer.cc index 77f28bf92..061d4d3cf 100644 --- a/src/utilities/deserializer.cc +++ b/src/utilities/deserializer.cc @@ -56,6 +56,7 @@ void Deserializer::_unpack(FusionMetadata& metadata){ metadata.offsets.resize(nBuffers+1); metadata.reductionStarts.resize(nOps+1); metadata.scalarStarts.resize(nOps+1); + metadata.futureStarts.resize(nOps+1); metadata.opIDs.resize(nOps); //TODO: wrap this up to reuse code` for (int i=0; i(); } + for (int i=0; i(); + } for (int i=0; i(); @@ -125,6 +130,7 @@ void Deserializer::_unpack(FutureWrapper& value) futures_ = futures_.subspan(1); auto point = unpack>(); + Domain domain; domain.dim = static_cast(point.size()); for (int32_t idx = 0; idx < domain.dim; ++idx) { diff --git a/src/utilities/makeshift_serializer.cc b/src/utilities/makeshift_serializer.cc index 83ed87ce2..bc5c5a02f 100644 --- a/src/utilities/makeshift_serializer.cc +++ b/src/utilities/makeshift_serializer.cc @@ -17,34 +17,42 @@ namespace legate{ } else{ int32_t code = trans->getTransformCode(); + pack((int32_t) code); switch (code) { case -1: { + break; + } case LEGATE_CORE_TRANSFORM_SHIFT: { Shift * shifter = (Shift*) trans; pack((int32_t) shifter->dim_); pack((int64_t) shifter->offset_); packTransform(trans->parent_.get()); + break; } case LEGATE_CORE_TRANSFORM_PROMOTE: { Promote * promoter = (Promote*) trans; pack((int32_t) promoter->extra_dim_); pack((int64_t) promoter->dim_size_); packTransform(trans->parent_.get()); + break; } case LEGATE_CORE_TRANSFORM_PROJECT: { Project * projector = (Project*) trans; pack((int32_t) projector->dim_); pack((int64_t) projector->coord_); packTransform(trans->parent_.get()); + break; } case LEGATE_CORE_TRANSFORM_TRANSPOSE: { Transpose * projector = (Transpose*) trans; packTransform(trans->parent_.get()); + break; } case LEGATE_CORE_TRANSFORM_DELINEARIZE: { Delinearize * projector = (Delinearize*) trans; packTransform(trans->parent_.get()); + break; } } } @@ -97,14 +105,16 @@ namespace legate{ //pack trasnform code packTransform(buffer.transform_.get()); - //skip the rest for now, assume no transform, for now pack -1 - // no need to implement this for benchmarking purposes - // TODO: implement transform packing - // TODO: add "code" to transform object //if _isfuture if(buffer.is_future_) { //pack future_wrapper + auto dom = buffer.future_.domain(); + pack((uint32_t) dom.dim); + for (int32_t i =0; i=0 else if (buffer.dim()>=0){ diff --git a/src/utilities/makeshift_serializer.h b/src/utilities/makeshift_serializer.h index 94eefc684..56b409e69 100644 --- a/src/utilities/makeshift_serializer.h +++ b/src/utilities/makeshift_serializer.h @@ -38,7 +38,6 @@ class MakeshiftSerializer{ template void pack(T arg) { int8_t * argAddr = (int8_t*) &arg; - //std::cout< Date: Thu, 14 Oct 2021 18:47:02 -0700 Subject: [PATCH 10/44] finishing merge --- legate.py | 2 +- legate/core/runtime.py | 48 ++++++-- legate/core/solver.py | 1 + src/core/data/store.h | 2 +- src/core/data/transform.h | 5 +- src/core/mapping/task.cc | 1 + src/core/mapping/task.h | 4 +- src/core/runtime/context.h | 2 +- src/core/utilities/deserializer.cc | 62 +++++++++- src/core/utilities/deserializer.h | 2 +- src/utilities/makeshift_serializer.cc | 149 ----------------------- src/utilities/makeshift_serializer.h | 168 -------------------------- 12 files changed, 108 insertions(+), 338 deletions(-) delete mode 100644 src/utilities/makeshift_serializer.cc delete mode 100644 src/utilities/makeshift_serializer.h diff --git a/legate.py b/legate.py index 95c21ce72..71c8c6699 100755 --- a/legate.py +++ b/legate.py @@ -830,7 +830,7 @@ def driver(): args.cores_per_node, args.launcher, args.verbose, - args.interpreter, + #args.interpreter, args.gasnet_trace, args.eager_alloc, args.launcher_extra, diff --git a/legate/core/runtime.py b/legate/core/runtime.py index ac5f702c3..409dfadb6 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -43,7 +43,7 @@ from .solver import Partitioner, Strategy from .store import RegionField, Store, FusionMetadata -debugPrint = False +debugPrint = True #debug printing def zprint(*args): @@ -703,7 +703,7 @@ def supress_small_fusions(self, intervals, threshold): return fusable, final_set def can_fuse(self): - must_be_single = any(op._future_output is not None for op in self.ops) + must_be_single = any(len(op.scalar_outputs) > 0 for op in self.ops) for op in self.ops: # TODO: cache as much as of the partitioner results as possible # so the calls to Partitioner() and partition_stores done kill perf @@ -782,6 +782,32 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): NUMPY_NONZERO = 400026, NUMPY_DOUBLE_BINARY_OP = 400027, NUMPY_FUSED_OP = 400028, +enum NumPyOpCode { + NUMPY_ARANGE = 1, + NUMPY_BINARY_OP = 2, + NUMPY_BINARY_RED = 3, + NUMPY_BINCOUNT = 4, + NUMPY_CONVERT = 5, + NUMPY_DIAG = 6, + NUMPY_DOT = 7, + NUMPY_EYE = 8, + NUMPY_FILL = 9, + NUMPY_MATMUL = 10, + NUMPY_MATVECMUL = 11, + NUMPY_NONZERO = 12, + NUMPY_RAND = 13, + NUMPY_READ = 14, + NUMPY_SCALAR_UNARY_RED = 15, + NUMPY_TILE = 16, + NUMPY_TRANSPOSE = 17, + NUMPY_UNARY_OP = 18, + NUMPY_UNARY_RED = 19, + NUMPY_WHERE = 20, + NUMPY_WRITE = 21, + NUMPY_DOUBLE_BINARY_OP = 23, + NUMPY_FUSED_OP = 24, +} + """ class AllValidOps(FusionConstraint): """ @@ -792,8 +818,8 @@ def __init__(self): self.validIDs = set() #these ops are always fusable - self.validIDs.add(400000) #Binary op - self.validIDs.add(400006) #Unary op + self.validIDs.add(2) #Binary op + self.validIDs.add(18) #Unary op # the following are conditionally fusable # they will be processed in the a subsequent level of filtering @@ -952,8 +978,8 @@ def __init__(self, core_library): # to be dispatched. This list allows cross library introspection for # Legate operations. self._outstanding_ops = [] - self._window_size =1 - self._fusion_threshold =10 + self._window_size =10 + self._fusion_threshold =4 self._clearing_pipe = False # Now we initialize managers @@ -1135,19 +1161,23 @@ def build_fused_op(self,ops): super_strats = [] super_fspaces = [] super_strategies = [] + super_keystores = [] for fusable_set in fusable_sets: #create super strategy for this fusable set super_strat = {} super_fspace = {} + super_keystore = set() start,end = fusable_set dprint("creating fusable set for", start, end) for j in range(start,end): super_strat = {**(super_strat.copy()), **partitions[j]._strategy} super_fspace = {**(super_fspace.copy()), **partitions[j]._fspaces} + super_keystore = super_keystore.union(partitions[j]._key_stores) super_strats.append(super_strat) super_fspaces.append(super_fspace) - super_strategies.append(Strategy(partitions[start]._launch_shape, super_strat, super_fspace)) - dprint("lens", len(super_strats), len(super_fspaces), len(super_strategies)) + super_keystores.append(super_keystore) + super_strategies.append(Strategy(partitions[start]._launch_shape, super_strat, super_fspace, super_keystore)) + dprint("lens", len(super_strats), len(super_fspaces), len(super_strategies), len(super_keystore)) """ super_strat = {} super_fspace = {} @@ -1226,7 +1256,7 @@ def _schedule(self, ops, force_eval=False): if len(ops)==1 and self._clearing_pipe: strategy = ops[0].strategy else: #else do to the partition - must_be_single = any(op._future_output is not None for op in ops) + must_be_single = any(len(op.scalar_outputs) > 0 for op in ops) partitioner = Partitioner(self, ops, must_be_single=must_be_single) strategy = partitioner.partition_stores() for op in ops: diff --git a/legate/core/solver.py b/legate/core/solver.py index d0510e5ec..e9bb18a6b 100644 --- a/legate/core/solver.py +++ b/legate/core/solver.py @@ -288,4 +288,5 @@ def partition_stores(self): if must_be_1d_launch and color_shape is not None: color_shape = Shape((color_shape.volume(),)) + print("key_stores", key_stores) return Strategy(color_shape, partitions, fspaces, key_stores) diff --git a/src/core/data/store.h b/src/core/data/store.h index 76e11e599..effa17a8c 100644 --- a/src/core/data/store.h +++ b/src/core/data/store.h @@ -273,7 +273,7 @@ class Store { public: int32_t dim() const { return dim_; } - bool is_future() const { return is_future_; } + bool is_future2() const { return is_future_; } LegateTypeCode code() const { return code_; } public: diff --git a/src/core/data/transform.h b/src/core/data/transform.h index 680b03cc5..6c272b735 100644 --- a/src/core/data/transform.h +++ b/src/core/data/transform.h @@ -19,7 +19,7 @@ #include #include "legion.h" -#include "legate_c.h" +#include "core/legate_c.h" class MakeshiftSerializer; namespace legate { @@ -85,11 +85,8 @@ class Project : public StoreTransform { public: virtual Legion::Domain transform(const Legion::Domain& domain) const override; virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override; -<<<<<<< HEAD:src/data/transform.h virtual int32_t getTransformCode() const override; -======= virtual void print(std::ostream& out) const override; ->>>>>>> source/branch-21.10:src/core/data/transform.h private: int32_t dim_; diff --git a/src/core/mapping/task.cc b/src/core/mapping/task.cc index 1f39a696a..404cc94f4 100644 --- a/src/core/mapping/task.cc +++ b/src/core/mapping/task.cc @@ -121,6 +121,7 @@ Task::Task(const LegionTask* task, : task_(task), library_(library) { MapperDeserializer dez(task, runtime, context); + fusionMetadata = dez.unpack(); inputs_ = dez.unpack>(); outputs_ = dez.unpack>(); reductions_ = dez.unpack>(); diff --git a/src/core/mapping/task.h b/src/core/mapping/task.h index 69efdc034..cf2533688 100644 --- a/src/core/mapping/task.h +++ b/src/core/mapping/task.h @@ -20,6 +20,7 @@ #include #include "core/data/scalar.h" +#include "core/data/store.h" #include "core/data/transform.h" #include "core/runtime/context.h" @@ -176,7 +177,8 @@ class Task { const LibraryContext& library_; const Legion::Task* task_; - private: + public: + FusionMetadata fusionMetadata; std::vector inputs_, outputs_, reductions_; std::vector scalars_; }; diff --git a/src/core/runtime/context.h b/src/core/runtime/context.h index 92152e769..891452cc9 100644 --- a/src/core/runtime/context.h +++ b/src/core/runtime/context.h @@ -17,7 +17,7 @@ #pragma once #include "legion.h" -#include "data/scalar.h" +#include "core/data/scalar.h" #include "core/task/return.h" diff --git a/src/core/utilities/deserializer.cc b/src/core/utilities/deserializer.cc index e784ad4d1..fcc22c57c 100644 --- a/src/core/utilities/deserializer.cc +++ b/src/core/utilities/deserializer.cc @@ -40,8 +40,8 @@ TaskDeserializer::TaskDeserializer(const LegionTask* task, first_task_ = !task->is_index_space || (task->index_point == task->index_domain.lo()); } -/* -void Deserializer::_unpack(FusionMetadata& metadata){ + +void TaskDeserializer::_unpack(FusionMetadata& metadata){ metadata.isFused = unpack(); if (!metadata.isFused){ return; @@ -94,7 +94,7 @@ void Deserializer::_unpack(FusionMetadata& metadata){ metadata.opIDs[i] = unpack(); } } -*/ + void TaskDeserializer::_unpack(Store& value) { auto is_future = unpack(); @@ -190,6 +190,62 @@ void MapperDeserializer::_unpack(Store& value) } } +void MapperDeserializer::_unpack(FusionMetadata& metadata){ + metadata.isFused = unpack(); + if (!metadata.isFused){ + return; + } + //exit out if the this is not a fused op + metadata.nOps = unpack(); + metadata.nBuffers = unpack(); + int nOps = metadata.nOps; + int nBuffers = metadata.nBuffers; + + metadata.inputStarts.resize(nOps+1); + metadata.outputStarts.resize(nOps+1); + metadata.offsetStarts.resize(nOps+1); + metadata.offsets.resize(nBuffers+1); + metadata.reductionStarts.resize(nOps+1); + metadata.scalarStarts.resize(nOps+1); + metadata.futureStarts.resize(nOps+1); + metadata.opIDs.resize(nOps); + //TODO: wrap this up to reuse code` + for (int i=0; i(); + } + for (int i=0; i(); + } + for (int i=0; i(); + } + for (int i=0; i(); + } + for (int i=0; i(); + } + for (int i=0; i(); + } + for (int i=0; i(); + } + for (int i=0; i(); + } +} + + + void MapperDeserializer::_unpack(FutureWrapper& value) { // We still need to deserialize these fields to get to the domain diff --git a/src/core/utilities/deserializer.h b/src/core/utilities/deserializer.h index 92f9d50dc..df1fe4f47 100644 --- a/src/core/utilities/deserializer.h +++ b/src/core/utilities/deserializer.h @@ -30,7 +30,6 @@ namespace legate { -struct FusionMetadata; template class BaseDeserializer { public: @@ -111,6 +110,7 @@ class MapperDeserializer : public BaseDeserializer { void _unpack(Store& value); void _unpack(FutureWrapper& value); void _unpack(RegionField& value, bool is_output_region); + void _unpack(FusionMetadata& value); private: Legion::Mapping::MapperRuntime* runtime_; diff --git a/src/utilities/makeshift_serializer.cc b/src/utilities/makeshift_serializer.cc deleted file mode 100644 index bc5c5a02f..000000000 --- a/src/utilities/makeshift_serializer.cc +++ /dev/null @@ -1,149 +0,0 @@ -#include "utilities/makeshift_serializer.h" - -namespace legate{ - - void MakeshiftSerializer::packScalar(const Scalar& scalar){ - pack((bool) scalar.is_tuple()); - pack((LegateTypeCode) scalar.code_); - int32_t size = scalar.size(); - packWithoutType(scalar.data_, size); - } - - void MakeshiftSerializer::packTransform(const StoreTransform* trans){ - - if (trans==nullptr){ - int32_t neg= -1; - pack((int32_t) neg); - } - else{ - int32_t code = trans->getTransformCode(); - pack((int32_t) code); - switch (code) { - case -1: { - break; - - } - case LEGATE_CORE_TRANSFORM_SHIFT: { - Shift * shifter = (Shift*) trans; - pack((int32_t) shifter->dim_); - pack((int64_t) shifter->offset_); - packTransform(trans->parent_.get()); - break; - } - case LEGATE_CORE_TRANSFORM_PROMOTE: { - Promote * promoter = (Promote*) trans; - pack((int32_t) promoter->extra_dim_); - pack((int64_t) promoter->dim_size_); - packTransform(trans->parent_.get()); - break; - } - case LEGATE_CORE_TRANSFORM_PROJECT: { - Project * projector = (Project*) trans; - pack((int32_t) projector->dim_); - pack((int64_t) projector->coord_); - packTransform(trans->parent_.get()); - break; - } - case LEGATE_CORE_TRANSFORM_TRANSPOSE: { - Transpose * projector = (Transpose*) trans; - packTransform(trans->parent_.get()); - break; - } - case LEGATE_CORE_TRANSFORM_DELINEARIZE: { - Delinearize * projector = (Delinearize*) trans; - packTransform(trans->parent_.get()); - break; - } - } - } - } -/* - case LEGATE_CORE_TRANSFORM_SHIFT: { - auto dim = unpack(); - auto offset = unpack(); - auto parent = unpack_transform(); - return std::make_unique(dim, offset, std::move(parent)); - } - case LEGATE_CORE_TRANSFORM_PROMOTE: { - auto extra_dim = unpack(); - auto dim_size = unpack(); - auto parent = unpack_transform(); - return std::make_unique(extra_dim, dim_size, std::move(parent)); - } - case LEGATE_CORE_TRANSFORM_PROJECT: { - auto dim = unpack(); - auto coord = unpack(); - auto parent = unpack_transform(); - return std::make_unique(dim, coord, std::move(parent)); - } - case LEGATE_CORE_TRANSFORM_TRANSPOSE: { - auto axes = unpack>(); - auto parent = unpack_transform(); - return std::make_unique(std::move(axes), std::move(parent)); - } - case LEGATE_CORE_TRANSFORM_DELINEARIZE: { - auto dim = unpack(); - auto sizes = unpack>(); - auto parent = unpack_transform(); - return std::make_unique(dim, std::move(sizes), std::move(parent)); - } - - def _serialize_transform(self, buf): - if self._parent is not None: - self._transform.serialize(buf) - self._parent._serialize_transform(buf) - else: - buf.pack_32bit_int(-1) -*/ - void MakeshiftSerializer::packBuffer(const Store& buffer) - { - pack((bool) buffer.is_future()); //is_future - pack((int32_t) buffer.dim()); - //int32_t code = buffer.code(); - pack((int32_t) buffer.code()); - //pack transform: - //pack trasnform code - packTransform(buffer.transform_.get()); - - //if _isfuture - if(buffer.is_future_) - { - //pack future_wrapper - auto dom = buffer.future_.domain(); - pack((uint32_t) dom.dim); - for (int32_t i =0; i=0 - else if (buffer.dim()>=0){ - pack((int32_t) buffer.redop_id_); - //pack reigon field - //pack dim - pack((int32_t) buffer.region_field_.dim()); - //pack idx (req idx) //need to map regions to idx - unsigned newID = getNewReqID(buffer.region_field_.reqIdx_); - //pack((uint32_t) buffer.region_field_.reqIdx_); - pack((uint32_t) newID); - //pack fid (field id) - pack((int32_t) buffer.region_field_.fid_); - } - else - { - //pack redop_id - pack((int32_t) buffer.redop_id_); - //pack reigon field - //pack dim; always 1 in an buffer - pack((int32_t) 1); - //pack idx (req idx) //need to map regions to idx - unsigned newID = getNewReqID(buffer.region_field_.reqIdx_); - pack((uint32_t) newID); - //pack fid (field id) - pack((int32_t) buffer.region_field_.fid_); - } - } - - - -} diff --git a/src/utilities/makeshift_serializer.h b/src/utilities/makeshift_serializer.h deleted file mode 100644 index 56b409e69..000000000 --- a/src/utilities/makeshift_serializer.h +++ /dev/null @@ -1,168 +0,0 @@ - -#pragma once -#include -#include -#include "data/store.h" -#include "data/scalar.h" -#include "data/transform.h" -#include - -namespace legate { - -class Scalar; -class Store; -class MakeshiftSerializer{ - - public: - MakeshiftSerializer(){ - size=512; - raw.resize(size); - write_offset=0; - read_offset=0; - buffer_counter=0; - } - void zero(){ - //memset ((void*)raw.data(),0,raw.size()); - write_offset=0; - buffer_counter=0; - neededReqIds.clear(); - regionReqIdMap.clear(); - } -/* - template void pack(T&& arg) - { - T copy = arg; - pack(copy); //call l-value version - } -*/ - template void pack(T arg) - { - int8_t * argAddr = (int8_t*) &arg; - if (size<=write_offset+sizeof(T)) - { - resize(sizeof(T)); - } - //for (int i=0; i((argAddr)+i); - //} - memcpy(raw.data()+write_offset, argAddr, sizeof(T)); - //std::cout<<"reint "<<*reinterpret_cast(raw.data()+write_offset)<(argByte+i); - } - write_offset+=argSize; - //std::cout<<" "< T read() - { - if (read_offset(raw.data()+read_offset); - read_offset+=sizeof(T); - return datum; - } - else{ - std::cout<<"finished reading buffer"<(id, returnAndIncrCounter())); - neededReqIds.push_back(id); - } - } - - int32_t getNewReqID(int32_t oldID) - { - return regionReqIdMap.find(oldID)->second; - } - - std::vector getReqIds (){ - //could use move semantics here - std::vector reqIdsCopy(neededReqIds); - return reqIdsCopy; - } - - private: - size_t size; - int read_offset; - int write_offset; - int buffer_counter; - std::vector raw; - - private: - std::map regionReqIdMap; //maps old reqids to new ones - std::vector neededReqIds; //list of old reqIds needed in child op - -}; -/* -int main(){ - MakeshiftSerializer ms; - int a=3; - char g='a'; - ms.pack(a); - ms.pack(g); - ms.pack(a); - ms.pack(g); - std::cout<()<()<()<()<()<()< Date: Thu, 14 Oct 2021 19:35:25 -0700 Subject: [PATCH 11/44] fix future stuff --- legate/core/runtime.py | 4 +++- src/core/data/store.h | 2 ++ src/core/utilities/deserializer.cc | 1 - 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 409dfadb6..9040db7aa 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -1209,7 +1209,7 @@ def build_fused_op(self,ops): #this metadata will be fed into the fused op as inputs meta_maps, fusion_metadata = self.serialize_multiop_metadata(numpy_context, op_subset) fused_task.add_fusion_metadata(fusion_metadata) #sets fused_task._is_fused to true - + #add typical inputs and outputs of all subtasks to fused task for op in op_subset: for scalar in op._scalar_args: @@ -1222,6 +1222,8 @@ def build_fused_op(self,ops): fused_task.add_output(output) for future in op._futures: fused_task.add_future(future) + print(fused_task) + print(fused_task.__dict__) new_op_list.append(fused_task) dprint("new op list", new_op_list) return new_op_list diff --git a/src/core/data/store.h b/src/core/data/store.h index effa17a8c..eef2896d4 100644 --- a/src/core/data/store.h +++ b/src/core/data/store.h @@ -245,6 +245,8 @@ class FutureWrapper { private: mutable bool uninitialized_{true}; mutable void* rawptr_{nullptr}; + + friend class MakeshiftSerializer; }; class Store { diff --git a/src/core/utilities/deserializer.cc b/src/core/utilities/deserializer.cc index fcc22c57c..3aa297811 100644 --- a/src/core/utilities/deserializer.cc +++ b/src/core/utilities/deserializer.cc @@ -137,7 +137,6 @@ void TaskDeserializer::_unpack(FutureWrapper& value) future = futures_[0]; futures_ = futures_.subspan(1); } - value = FutureWrapper(read_only, field_size, domain, future, has_storage && first_task_); } From 823808320791e7733135ef019672b59db590f50d Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Thu, 14 Oct 2021 23:48:33 -0700 Subject: [PATCH 12/44] re add serializer, fix horrible merge bug --- legate.py | 1 + legate/core/operation.py | 4 - legate/core/runtime.py | 18 +-- legate/core/solver.py | 1 - src/core/utilities/makeshift_serializer.cc | 155 +++++++++++++++++++ src/core/utilities/makeshift_serializer.h | 168 +++++++++++++++++++++ 6 files changed, 332 insertions(+), 15 deletions(-) create mode 100644 src/core/utilities/makeshift_serializer.cc create mode 100644 src/core/utilities/makeshift_serializer.h diff --git a/legate.py b/legate.py index 5e053b637..e4197d3a3 100755 --- a/legate.py +++ b/legate.py @@ -829,6 +829,7 @@ def driver(): args.not_control_replicable, args.cores_per_node, args.launcher, + args.verbose, args.gasnet_trace, args.eager_alloc, args.launcher_extra, diff --git a/legate/core/operation.py b/legate/core/operation.py index 612519636..7b32c4f11 100644 --- a/legate/core/operation.py +++ b/legate/core/operation.py @@ -98,10 +98,6 @@ def add_temp(self, store): self._check_store(store) self._temps.append(store) #this may not be necessary - def add_output(self, store): - self._check_store(store) - self._outputs.append(store) - def add_reduction(self, store, redop): self._check_store(store) if store.scalar: diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 9040db7aa..a30893a3b 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -43,7 +43,7 @@ from .solver import Partitioner, Strategy from .store import RegionField, Store, FusionMetadata -debugPrint = True +debugPrint = False #debug printing def zprint(*args): @@ -1222,18 +1222,16 @@ def build_fused_op(self,ops): fused_task.add_output(output) for future in op._futures: fused_task.add_future(future) - print(fused_task) - print(fused_task.__dict__) new_op_list.append(fused_task) dprint("new op list", new_op_list) return new_op_list def _launch_outstanding(self): - dprint("launching final outstanding ops") + print("launching final outstanding ops") if len(self._outstanding_ops): ops = self._outstanding_ops self._outstanding_ops = [] - self._schedule(ops, force_eval=True) + #self._schedule(ops, force_eval=True) def _schedule(self, ops, force_eval=False): @@ -1258,11 +1256,11 @@ def _schedule(self, ops, force_eval=False): if len(ops)==1 and self._clearing_pipe: strategy = ops[0].strategy else: #else do to the partition - must_be_single = any(len(op.scalar_outputs) > 0 for op in ops) - partitioner = Partitioner(self, ops, must_be_single=must_be_single) - strategy = partitioner.partition_stores() - for op in ops: - op.launch(strategy) + for op in ops: + must_be_single = any(len(op.scalar_outputs) > 0 for op in [op]) + partitioner = Partitioner(self, [op], must_be_single=must_be_single) + strategy = partitioner.partition_stores() + op.launch(strategy) def submit(self, op): #always launch ops that've been processed for fusion diff --git a/legate/core/solver.py b/legate/core/solver.py index e9bb18a6b..d0510e5ec 100644 --- a/legate/core/solver.py +++ b/legate/core/solver.py @@ -288,5 +288,4 @@ def partition_stores(self): if must_be_1d_launch and color_shape is not None: color_shape = Shape((color_shape.volume(),)) - print("key_stores", key_stores) return Strategy(color_shape, partitions, fspaces, key_stores) diff --git a/src/core/utilities/makeshift_serializer.cc b/src/core/utilities/makeshift_serializer.cc new file mode 100644 index 000000000..8b0e47dc0 --- /dev/null +++ b/src/core/utilities/makeshift_serializer.cc @@ -0,0 +1,155 @@ +#include "core/utilities/makeshift_serializer.h" + +namespace legate{ + + void MakeshiftSerializer::packScalar(const Scalar& scalar){ + pack((bool) scalar.is_tuple()); + pack((LegateTypeCode) scalar.code_); + int32_t size = scalar.size(); + packWithoutType(scalar.data_, size); + } + + void MakeshiftSerializer::packTransform(const StoreTransform* trans){ + + if (trans==nullptr){ + int32_t neg= -1; + pack((int32_t) neg); + } + else{ + int32_t code = trans->getTransformCode(); + pack((int32_t) code); + switch (code) { + case -1: { + break; + + } + case LEGATE_CORE_TRANSFORM_SHIFT: { + Shift * shifter = (Shift*) trans; + pack((int32_t) shifter->dim_); + pack((int64_t) shifter->offset_); + packTransform(trans->parent_.get()); + break; + } + case LEGATE_CORE_TRANSFORM_PROMOTE: { + Promote * promoter = (Promote*) trans; + pack((int32_t) promoter->extra_dim_); + pack((int64_t) promoter->dim_size_); + packTransform(trans->parent_.get()); + break; + } + case LEGATE_CORE_TRANSFORM_PROJECT: { + Project * projector = (Project*) trans; + pack((int32_t) projector->dim_); + pack((int64_t) projector->coord_); + packTransform(trans->parent_.get()); + break; + } + case LEGATE_CORE_TRANSFORM_TRANSPOSE: { + Transpose * projector = (Transpose*) trans; + packTransform(trans->parent_.get()); + break; + } + case LEGATE_CORE_TRANSFORM_DELINEARIZE: { + Delinearize * projector = (Delinearize*) trans; + packTransform(trans->parent_.get()); + break; + } + } + } + } +/* + case LEGATE_CORE_TRANSFORM_SHIFT: { + auto dim = unpack(); + auto offset = unpack(); + auto parent = unpack_transform(); + return std::make_unique(dim, offset, std::move(parent)); + } + case LEGATE_CORE_TRANSFORM_PROMOTE: { + auto extra_dim = unpack(); + auto dim_size = unpack(); + auto parent = unpack_transform(); + return std::make_unique(extra_dim, dim_size, std::move(parent)); + } + case LEGATE_CORE_TRANSFORM_PROJECT: { + auto dim = unpack(); + auto coord = unpack(); + auto parent = unpack_transform(); + return std::make_unique(dim, coord, std::move(parent)); + } + case LEGATE_CORE_TRANSFORM_TRANSPOSE: { + auto axes = unpack>(); + auto parent = unpack_transform(); + return std::make_unique(std::move(axes), std::move(parent)); + } + case LEGATE_CORE_TRANSFORM_DELINEARIZE: { + auto dim = unpack(); + auto sizes = unpack>(); + auto parent = unpack_transform(); + return std::make_unique(dim, std::move(sizes), std::move(parent)); + } + + def _serialize_transform(self, buf): + if self._parent is not None: + self._transform.serialize(buf) + self._parent._serialize_transform(buf) + else: + buf.pack_32bit_int(-1) +*/ + void MakeshiftSerializer::packBuffer(const Store& buffer) + { + pack((bool) buffer.is_future2()); //is_future + pack((int32_t) buffer.dim()); + //int32_t code = buffer.code(); + pack((int32_t) buffer.code()); + //pack transform: + //pack trasnform code + packTransform(buffer.transform_.get()); + + //if _isfuture + if(buffer.is_future_) + { + + //pack future_wrapper + pack((bool) buffer.future_.read_only_); + + pack((bool) !buffer.future_.uninitialized_); + + pack((int32_t) buffer.future_.field_size_); + auto dom = buffer.future_.domain(); + pack((uint32_t) dom.dim); + for (int32_t i =0; i=0 + else if (buffer.dim()>=0){ + pack((int32_t) buffer.redop_id_); + //pack reigon field + //pack dim + pack((int32_t) buffer.region_field_.dim()); + //pack idx (req idx) //need to map regions to idx + unsigned newID = getNewReqID(buffer.region_field_.reqIdx_); + //pack((uint32_t) buffer.region_field_.reqIdx_); + pack((uint32_t) newID); + //pack fid (field id) + pack((int32_t) buffer.region_field_.fid_); + } + else + { + //pack redop_id + pack((int32_t) buffer.redop_id_); + //pack reigon field + //pack dim; always 1 in an buffer + pack((int32_t) 1); + //pack idx (req idx) //need to map regions to idx + unsigned newID = getNewReqID(buffer.region_field_.reqIdx_); + pack((uint32_t) newID); + //pack fid (field id) + pack((int32_t) buffer.region_field_.fid_); + } + } + + + +} diff --git a/src/core/utilities/makeshift_serializer.h b/src/core/utilities/makeshift_serializer.h new file mode 100644 index 000000000..81a85b2f2 --- /dev/null +++ b/src/core/utilities/makeshift_serializer.h @@ -0,0 +1,168 @@ + +#pragma once +#include +#include +#include "core/data/store.h" +#include "core/data/scalar.h" +#include "core/data/transform.h" +#include + +namespace legate { + +class Scalar; +class Store; +class MakeshiftSerializer{ + + public: + MakeshiftSerializer(){ + size=512; + raw.resize(size); + write_offset=0; + read_offset=0; + buffer_counter=0; + } + void zero(){ + //memset ((void*)raw.data(),0,raw.size()); + write_offset=0; + buffer_counter=0; + neededReqIds.clear(); + regionReqIdMap.clear(); + } +/* + template void pack(T&& arg) + { + T copy = arg; + pack(copy); //call l-value version + } +*/ + template void pack(T arg) + { + int8_t * argAddr = (int8_t*) &arg; + if (size<=write_offset+sizeof(T)) + { + resize(sizeof(T)); + } + //for (int i=0; i((argAddr)+i); + //} + memcpy(raw.data()+write_offset, argAddr, sizeof(T)); + //std::cout<<"reint "<<*reinterpret_cast(raw.data()+write_offset)<(argByte+i); + } + write_offset+=argSize; + //std::cout<<" "< T read() + { + if (read_offset(raw.data()+read_offset); + read_offset+=sizeof(T); + return datum; + } + else{ + std::cout<<"finished reading buffer"<(id, returnAndIncrCounter())); + neededReqIds.push_back(id); + } + } + + int32_t getNewReqID(int32_t oldID) + { + return regionReqIdMap.find(oldID)->second; + } + + std::vector getReqIds (){ + //could use move semantics here + std::vector reqIdsCopy(neededReqIds); + return reqIdsCopy; + } + + private: + size_t size; + int read_offset; + int write_offset; + int buffer_counter; + std::vector raw; + + private: + std::map regionReqIdMap; //maps old reqids to new ones + std::vector neededReqIds; //list of old reqIds needed in child op + +}; +/* +int main(){ + MakeshiftSerializer ms; + int a=3; + char g='a'; + ms.pack(a); + ms.pack(g); + ms.pack(a); + ms.pack(g); + std::cout<()<()<()<()<()<()< Date: Fri, 22 Oct 2021 15:06:40 -0700 Subject: [PATCH 13/44] debugging crap --- legate/core/launcher.py | 3 +- legate/core/partition.py | 5 +- legate/core/runtime.py | 231 +++++++++++++-------- legate/core/store.py | 30 +++ src/core/utilities/makeshift_serializer.cc | 1 + 5 files changed, 179 insertions(+), 91 deletions(-) diff --git a/legate/core/launcher.py b/legate/core/launcher.py index ef6f3d306..78924acfe 100644 --- a/legate/core/launcher.py +++ b/legate/core/launcher.py @@ -570,12 +570,13 @@ def add_store(self, args, store, proj, perm, tag, flags): if store.kind is Future: if store.has_storage: self.add_future(store.storage) - elif perm == Permission.READ or perm == Permission.REDUCTION: + elif (perm == Permission.READ or perm == Permission.REDUCTION): raise RuntimeError( "Read access to an uninitialized store is disallowed" ) read_only = perm == Permission.READ args.append(FutureStoreArg(store, read_only, store.has_storage)) + #args.append(FutureStoreArg(store, perm, store.has_storage)) else: region = store.storage.region diff --git a/legate/core/partition.py b/legate/core/partition.py index 073772166..b7a517713 100644 --- a/legate/core/partition.py +++ b/legate/core/partition.py @@ -165,11 +165,12 @@ def construct(self, region, complete=False): transform = Transform(tile_shape.ndim, tile_shape.ndim) for idx, size in enumerate(tile_shape): transform.trans[idx, idx] = size - + print(self) + print("ndim" , tile_shape.ndim, "offset", self._offset, "tile_shape", self._tile_shape) lo = Shape((0,) * tile_shape.ndim) + self._offset hi = self._tile_shape - 1 + self._offset - extent = Rect(hi, lo, exclusive=False) + print("extent", extent) color_space = self._runtime.find_or_create_index_space( self.color_shape diff --git a/legate/core/runtime.py b/legate/core/runtime.py index a30893a3b..d48ee6092 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -43,15 +43,20 @@ from .solver import Partitioner, Strategy from .store import RegionField, Store, FusionMetadata -debugPrint = False +debugPrint = True +futureBugPrint = True -#debug printing def zprint(*args): return if debugPrint: - dprint = print + drint = print else: - dprint = zprint + drint = zprint + +if futureBugPrint: + frint = print +else: + frint = zprint # A Field holds a reference to a field in a region tree @@ -709,11 +714,15 @@ def can_fuse(self): # so the calls to Partitioner() and partition_stores done kill perf partitioner = Partitioner(self.runtime, [op], must_be_single=must_be_single) self.partitioners.append( partitioner ) + print(op._inputs) + import pdb; pdb.set_trace() strategy = partitioner.partition_stores() + if len(op.inputs)>1: + proj = strategy.get_projection(op._inputs[1]) self.strategies.append(strategy) results = [constraint.apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) for constraint in self.constraints] - dprint("fuse results", results) + drint("fuse results", results) all_fusable = [result[0] for result in results] interval_sets = [result[1] for result in results] @@ -732,12 +741,13 @@ def can_fuse(self): newset.append((news, newe)) curr_set=newset fusable,final_set = self.supress_small_fusions(curr_set, self.runtime._fusion_threshold) - dprint("curset", curr_set) + drint("curset", curr_set) - dprint("final_set", final_set) - dprint("all fusable", fusable) - dprint("intervals", interval_sets) - return reduce(lambda x,y: x and y, all_fusable), final_set, self.strategies + drint("final_set", final_set) + drint("all fusable", fusable) + drint("intervals", interval_sets) + #return reduce(lambda x,y: x and y, all_fusable), final_set, self.strategies + return fusable, final_set, self.strategies class FusionConstraint(object): def apply(self, contexts, runtime, ops, partitioners, strategies): @@ -755,33 +765,6 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): else: return False, [(0,0)] """ - NUMPY_BINARY_OP = 400000, - NUMPY_SCALAR_BINARY_OP = 400002, - NUMPY_FILL = 400003, - NUMPY_SCALAR_UNARY_RED = 400004, - NUMPY_UNARY_RED = 400005, - NUMPY_UNARY_OP = 400006, - NUMPY_SCALAR_UNARY_OP = 400007, - NUMPY_BINARY_RED = 400008, - NUMPY_CONVERT = 400010, - NUMPY_SCALAR_CONVERT = 400011, - NUMPY_WHERE = 400012, - NUMPY_SCALAR_WHERE = 400013, - NUMPY_READ = 400014, - NUMPY_WRITE = 400015, - NUMPY_DIAG = 400016, - NUMPY_MATMUL = 400017, - NUMPY_MATVECMUL = 400018, - NUMPY_DOT = 400019, - NUMPY_BINCOUNT = 400020, - NUMPY_EYE = 400021, - NUMPY_RAND = 400022, - NUMPY_ARANGE = 400023, - NUMPY_TRANSPOSE = 400024, - NUMPY_TILE = 400025, - NUMPY_NONZERO = 400026, - NUMPY_DOUBLE_BINARY_OP = 400027, - NUMPY_FUSED_OP = 400028, enum NumPyOpCode { NUMPY_ARANGE = 1, NUMPY_BINARY_OP = 2, @@ -819,7 +802,9 @@ def __init__(self): #these ops are always fusable self.validIDs.add(2) #Binary op - self.validIDs.add(18) #Unary op + #self.validIDs.add(5) #convert op + #self.validIDs.add(18) #Unary op + #self.validIDs.add(9) #Fill op # the following are conditionally fusable # they will be processed in the a subsequent level of filtering @@ -828,17 +813,17 @@ def __init__(self): # are NOT consumed by a subsequent op in the window # however they can be printed, which we cannot detect in the runtime # without static analysis, so consider these terminal fusable - self.validIDs.add(400004) #Scalar unary red - self.validIDs.add(400005) #Unary red + #self.validIDs.add(400004) #Scalar unary red + #self.validIDs.add(400005) #Unary red # as all scalars are futures, # so we can just check if both Futures are "ready" # more powerfully, we can also create a dependency tree # of ops, and assuming they're all scalar ops, # and the "roots" are ready, we can fuse - self.validIDs.add(400002) #Scalar Binary op - self.validIDs.add(400007) #Scalar Unary op - self.validIDs.add(400008) #Scalar binary red + #self.validIDs.add(400002) #Scalar Binary op + #self.validIDs.add(400007) #Scalar Unary op + #self.validIDs.add(400008) #Scalar binary red #a matmul is valid if it is the last op in the sequence #unless if it followed by a matmul of the exact same size @@ -851,9 +836,9 @@ def __init__(self): def apply(self, contexts, runtime, ops, partitioners, strategies): results = [int(op._task_id) in self.validIDs for op in ops] + drint("valids", results) fusable_intervals = [] start, end =0,0 - rolling=False while end1: first = matrices[0] + print(store, matrices) for matrix in matrices: if not (matrix==first).all(): - return False, [(0,0)] + indices = linkset[store] + print("must split", indices) + return True, [(0,indices[1]), (indices[1],len(ops))] return True, [(0,len(ops))] + class IdenticalLaunchShapes(FusionConstraint): """Fusion rule that only ops with identical launch shapes can be fused""" @@ -926,12 +931,11 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): launch_shapes = [] for i in range(len(ops)): launch_shapes.append(strategies[i]._launch_shape) - dprint(strategies[3].__dict__) - dprint('launch shapes', launch_shapes) + drint('launch shapes', launch_shapes) first_shape = launch_shapes[0] for launch_shape in launch_shapes: if launch_shape!=first_shape: - return False, [(0,0)] + return True, [(0,1),(1,len(ops))] return True, [(0,len(ops))] @@ -979,7 +983,7 @@ def __init__(self, core_library): # Legate operations. self._outstanding_ops = [] self._window_size =10 - self._fusion_threshold =4 + self._fusion_threshold =2 self._clearing_pipe = False # Now we initialize managers @@ -1134,18 +1138,13 @@ def serialize_multiop_metadata(self, numpy_context, ops): meta_arrs = (input_starts, output_starts, offset_starts, offsets, reduction_starts, scalar_starts, future_starts, op_ids) fusion_metadata = FusionMetadata(*meta_arrs) - - #TODO: remove me - #inst, oust, offst, offs = map(npo.array, (input_starts, output_starts, offset_starts, offsets)) - #meta_arrs_np = map(npo.array, meta_arrs) - #def make_deferred(inst): - # return numpy_runtime.find_or_create_array_thunk(inst, stacklevel=0, defer=True) - #meta_maps = map(make_deferred, meta_arrs_np) meta_maps=None return meta_maps, fusion_metadata def build_fused_op(self,ops): + for i in range(len(ops)): + self.propogateFuture(ops[i]) fusion_checker = FusionChecker(ops, self._contexts, self) fusion_checker.register_constraint(NumpyContextExists()) fusion_checker.register_constraint(AllValidOps()) @@ -1155,8 +1154,8 @@ def build_fused_op(self,ops): #short circuit if not can_fuse: - dprint("CANNOT FUSE!") - return None + drint("CANNOT FUSE!") + return False, partitions super_strats = [] super_fspaces = [] @@ -1168,7 +1167,7 @@ def build_fused_op(self,ops): super_fspace = {} super_keystore = set() start,end = fusable_set - dprint("creating fusable set for", start, end) + drint("creating fusable set for", start, end) for j in range(start,end): super_strat = {**(super_strat.copy()), **partitions[j]._strategy} super_fspace = {**(super_fspace.copy()), **partitions[j]._fspaces} @@ -1177,7 +1176,7 @@ def build_fused_op(self,ops): super_fspaces.append(super_fspace) super_keystores.append(super_keystore) super_strategies.append(Strategy(partitions[start]._launch_shape, super_strat, super_fspace, super_keystore)) - dprint("lens", len(super_strats), len(super_fspaces), len(super_strategies), len(super_keystore)) + drint("lens", len(super_strats), len(super_fspaces), len(super_strategies), len(super_keystore)) """ super_strat = {} super_fspace = {} @@ -1211,61 +1210,117 @@ def build_fused_op(self,ops): fused_task.add_fusion_metadata(fusion_metadata) #sets fused_task._is_fused to true #add typical inputs and outputs of all subtasks to fused task - for op in op_subset: + for j,op in enumerate(op_subset): for scalar in op._scalar_args: fused_task.add_scalar_arg(scalar[0], ty.int32) for reduction in op._reductions: fused_task.add_reduction(reduction) + for input in op._inputs: + #if input._storage is None: + if i==1: + frint("building fused in", i,j,op._task_id, input) fused_task.add_input(input) for output in op._outputs: + if i==1: + frint("building fused out", i,j,op._task_id, output) fused_task.add_output(output) + self.propogateFuture(op) for future in op._futures: fused_task.add_future(future) new_op_list.append(fused_task) - dprint("new op list", new_op_list) - return new_op_list + return new_op_list, True - def _launch_outstanding(self): - print("launching final outstanding ops") + def _launch_outstanding(self, force_eval=True): + print("launching final outstanding ops", [op._task_id for op in self._outstanding_ops]) if len(self._outstanding_ops): ops = self._outstanding_ops self._outstanding_ops = [] - #self._schedule(ops, force_eval=True) - + self._schedule(ops, force_eval) + + def _launch_one(self): + if len(self._outstanding_ops): + op = self._outstanding_ops[0] + self._outstanding_ops = self._outstanding_ops[1:] + self._schedule([op], force_eval=True) + + def propogateFuture(self,op): + return + for input in op._inputs: + start = input + if input._storage is None: + print("needs healing", input, op._task_id) + while start._storage is None and start._parent: + start=start._parent + input._storage = start._storage + + def _schedule(self, ops, force_eval=False): ids = [op._task_id for op in ops] - dprint("ids", ids) - #try fusing tasks + print(force_eval, "ids", ids) + #case 1: try fusing current window of tasks + #if partially or fully fusable, + #schedule the new set of tasks + strats = False if len(ops)>=2 and (not force_eval): - fused_task_list = self.build_fused_op(ops) + fused_task_list,strats = self.build_fused_op(ops) if fused_task_list: - dprint("start clearing pipe") + frint("created fused list", [op._task_id for op in fused_task_list]) + drint("start clearing pipe") self._clearing_pipe = True for task in fused_task_list: task.execute() self._clearing_pipe = False - dprint("stop clearing pipe") + drint("stop clearing pipe") return - #if we cann't fuse op launch them individually - - # tasks processed for fusion already have - # their strategy "baked in" + # case 2: tasks processed for fusion already have + # their strategy "baked in", as we already partitioned + # them when testing fusion legality (in case 1) if len(ops)==1 and self._clearing_pipe: strategy = ops[0].strategy - else: #else do to the partition - for op in ops: - must_be_single = any(len(op.scalar_outputs) > 0 for op in [op]) - partitioner = Partitioner(self, [op], must_be_single=must_be_single) - strategy = partitioner.partition_stores() - op.launch(strategy) + for input in ops[0]._inputs: + #if input._storage is None: + frint("launch fused input", ops[0]._task_id, input) + proj = ops[0].strategy.get_projection(input) + if hasattr(proj, 'part'): + frint("strat", proj.part.index_partition.functor.transform.trans) + frint("strat1", proj.part.index_partition.functor.__dict__) + self.propogateFuture(ops[0]) + + for output in ops[0]._outputs: + #if output._storage is None: + frint("launch used output", ops[0]._task_id, output) + ops[0].launch(strategy) + + # case 3: execute the ops normally + # if we already checked the ops for fusability, + # then the ops' buffers have already been partitioned + else: + if not strats: #ops were not check for fusability, so partition them + for op in ops: + must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op]) + partitioner = Partitioner(self, [op], must_be_single=must_be_single) + strategy = partitioner.partition_stores() + op.strategy = strategy + else: #strategies already calculated during failed attempt to fuse + for i,op in enumerate(ops): + op.strategy = strats[i] + for i,op in enumerate(ops): + for input in op._inputs: + print("in", input) + if input._storage is None: + frint("launch ufused input", op._task_id, input) + self.propogateFuture(op) + op.launch(op.strategy) + def submit(self, op): #always launch ops that've been processed for fusion #do not re-add to the window #as the these ops already waited in the window + #print(op.__dict__) if self._clearing_pipe: self._schedule([op]) else: diff --git a/legate/core/store.py b/legate/core/store.py index 7140a0f70..c5dceca29 100644 --- a/legate/core/store.py +++ b/legate/core/store.py @@ -423,6 +423,13 @@ def __init__( @property def shape(self): + if self._shape is None: + # If someone wants to access the shape of an unbound + # store before it is set, that means the producer task is + # sitting in the queue, so we should flush the queue. + self._runtime._launch_outstanding(False) + # At this point, we should have the shape set. + assert self._shape is not None return self._shape @property @@ -462,6 +469,28 @@ def storage(self): Store. These will have exactly the type specified by `.kind`. """ if self._storage is None: + print("store none, launching", [op._task_id for op in self._runtime._outstanding_ops]) + self._runtime._launch_outstanding(False) + """ + if self._kind ==Future: + print("future") + while(self._storage is None and len(self._runtime._outstanding_ops)): + print("launch_one") + print([op._task_id for op in self._runtime._outstanding_ops]) + #self._runtime._launch_outstanding() + self._runtime._launch_one() + """ + """ + if True: + import pdb; pdb.set_trace() + start = self + while start._storage is None and start._parent: + start=start._parent + if start._storage: + self._storage = start._storage + else: + self._runtime._launch_outstanding() + """ if self.unbound: raise RuntimeError( "Storage of a variable size store cannot be retrieved " @@ -471,6 +500,7 @@ def storage(self): # if necessary if self._parent is None: if self._kind is Future: + print("supressing in store.py") raise ValueError( "Illegal to access the storage of an uninitialized " "Legate store of volume 1 with scalar optimization" diff --git a/src/core/utilities/makeshift_serializer.cc b/src/core/utilities/makeshift_serializer.cc index 8b0e47dc0..7dacd29ce 100644 --- a/src/core/utilities/makeshift_serializer.cc +++ b/src/core/utilities/makeshift_serializer.cc @@ -119,6 +119,7 @@ namespace legate{ pack((uint32_t) dom.dim); for (int32_t i =0; i Date: Mon, 25 Oct 2021 14:34:46 -0700 Subject: [PATCH 14/44] op registry working --- legate/core/constraints.py | 4 +- legate/core/partition.py | 3 - legate/core/runtime.py | 151 +++++++++++++-------- legate/core/solver.py | 9 +- legate/core/store.py | 5 +- src/core/runtime/context.cc | 11 ++ src/core/runtime/context.h | 14 +- src/core/runtime/runtime.cc | 5 + src/core/runtime/runtime.h | 7 +- src/core/task/task.cc | 6 + src/core/task/task.h | 3 + src/core/utilities/makeshift_serializer.cc | 10 +- 12 files changed, 156 insertions(+), 72 deletions(-) diff --git a/legate/core/constraints.py b/legate/core/constraints.py index cd81306e5..93f89bd36 100644 --- a/legate/core/constraints.py +++ b/legate/core/constraints.py @@ -56,7 +56,7 @@ def reduce(self): class PartSym(Expr): def __init__(self, op, store, id, disjoint, complete): - self._op = op + #self._op = op self._store = store self._id = id self._disjoint = disjoint @@ -73,9 +73,11 @@ def closed(self): def __repr__(self): disj = "D" if self._disjoint else "A" comp = "C" if self._complete else "I" + return f"X{self._id}({disj},{comp})" return f"X{self._id}({disj},{comp})@{self._op.get_name()}" def __hash__(self): + return hash(self._id) return hash((self._op, self._id)) def subst(self, mapping): diff --git a/legate/core/partition.py b/legate/core/partition.py index 51d9be6d6..25463054c 100644 --- a/legate/core/partition.py +++ b/legate/core/partition.py @@ -180,12 +180,9 @@ def construct(self, region, complete=False): transform = Transform(tile_shape.ndim, tile_shape.ndim) for idx, size in enumerate(tile_shape): transform.trans[idx, idx] = size - print(self) - print("ndim" , tile_shape.ndim, "offset", self._offset, "tile_shape", self._tile_shape) lo = Shape((0,) * tile_shape.ndim) + self._offset hi = self._tile_shape - 1 + self._offset extent = Rect(hi, lo, exclusive=False) - print("extent", extent) color_space = self._runtime.find_or_create_index_space( self.color_shape diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 0328da914..639bb9f98 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -42,9 +42,9 @@ from .shape import Shape from .solver import Partitioner, Strategy from .store import RegionField, Store, FusionMetadata - -debugPrint = True -futureBugPrint = True +import numpy as np +debugPrint = False +futureBugPrint = False def zprint(*args): return @@ -708,17 +708,20 @@ def supress_small_fusions(self, intervals, threshold): return fusable, final_set def can_fuse(self): - must_be_single = any(len(op.scalar_outputs) > 0 for op in self.ops) for op in self.ops: - # TODO: cache as much as of the partitioner results as possible - # so the calls to Partitioner() and partition_stores done kill perf + must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op]) partitioner = Partitioner(self.runtime, [op], must_be_single=must_be_single) self.partitioners.append( partitioner ) - print(op._inputs) - import pdb; pdb.set_trace() strategy = partitioner.partition_stores() + for output, part, in zip(op._outputs, op._output_parts): + partition = strategy.get_partition(part) + output.set_key_partition(partition) + key_part = partition + for input in op._inputs: + if input.shape==output.shape: + input.set_key_partition(key_part) if len(op.inputs)>1: - proj = strategy.get_projection(op._inputs[1]) + proj = strategy.get_projection(op._input_parts[1]) self.strategies.append(strategy) results = [constraint.apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) for constraint in self.constraints] @@ -803,8 +806,9 @@ def __init__(self): #these ops are always fusable self.validIDs.add(2) #Binary op #self.validIDs.add(5) #convert op - #self.validIDs.add(18) #Unary op - #self.validIDs.add(9) #Fill op + self.validIDs.add(18) #Unary op + self.validIDs.add(9) #Fill op + self.validIDs.add(14) #Fill op # the following are conditionally fusable # they will be processed in the a subsequent level of filtering @@ -871,54 +875,65 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): store_to_ops = {} base_window = [0, len(ops)] - for i, op in enumerate(ops): + intervals = [] + start=0 + end = len(ops) + i=0 + #for i, op in enumerate(ops): + while i1: first = matrices[0] - print(store, matrices) for matrix in matrices: if not (matrix==first).all(): indices = linkset[store] - print("must split", indices) return True, [(0,indices[1]), (indices[1],len(ops))] return True, [(0,len(ops))] @@ -931,11 +946,29 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): launch_shapes = [] for i in range(len(ops)): launch_shapes.append(strategies[i]._launch_shape) - drint('launch shapes', launch_shapes) + #print(launch_shapes) + """ first_shape = launch_shapes[0] for launch_shape in launch_shapes: if launch_shape!=first_shape: return True, [(0,1),(1,len(ops))] + """ + intervals =[] + i=1 + start=0 + end = len(launch_shapes) + while i 1: #initialize fused task fused_task = numpy_context.create_task(fused_id) - fused_task.strategy = super_strategies[i] - - #serialize necessary metadata on all encapsulated ops + + #serialize necessary metadata on all encapsulated ops #this metadata will be fed into the fused op as inputs meta_maps, fusion_metadata = self.serialize_multiop_metadata(numpy_context, op_subset) fused_task.add_fusion_metadata(fusion_metadata) #sets fused_task._is_fused to true - #add typical inputs and outputs of all subtasks to fused task + key_part = None for j,op in enumerate(op_subset): for scalar in op._scalar_args: fused_task.add_scalar_arg(scalar[0], ty.int32) @@ -1225,21 +1257,27 @@ def build_fused_op(self,ops): for input in op._inputs: #if input._storage is None: - if i==1: - frint("building fused in", i,j,op._task_id, input) fused_task.add_input(input) - for output in op._outputs: - if i==1: - frint("building fused out", i,j,op._task_id, output) + for output,part in zip(op._outputs, op._output_parts): fused_task.add_output(output) - self.propogateFuture(op) + if key_part==None: + key_part = partitions[z].get_partition(part) + + self.propogateFuture(fused_task) for future in op._futures: fused_task.add_future(future) + z+=1 new_op_list.append(fused_task) + for i,fused_task in enumerate(new_op_list): + must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [fused_task]) + partitioner = Partitioner(self, [fused_task], must_be_single=must_be_single) + strategy = partitioner.partition_stores() + #fused_task.strategy = super_strategies[i] + fused_task.strategy = strategy + return new_op_list, True def _launch_outstanding(self, force_eval=True): - print("launching final outstanding ops", [op._task_id for op in self._outstanding_ops]) if len(self._outstanding_ops): ops = self._outstanding_ops self._outstanding_ops = [] @@ -1252,11 +1290,10 @@ def _launch_one(self): self._schedule([op], force_eval=True) def propogateFuture(self,op): - return + return for input in op._inputs: start = input - if input._storage is None: - print("needs healing", input, op._task_id) + if input._kind is Future and input._storage is None: while start._storage is None and start._parent: start=start._parent input._storage = start._storage @@ -1270,6 +1307,11 @@ def _schedule(self, ops, force_eval=False): #if partially or fully fusable, #schedule the new set of tasks strats = False + #for op in ops: + # must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op]) + # partitioner = Partitioner(self, [op], must_be_single=must_be_single) + # strategy = partitioner.partition_stores() + if len(ops)>=2 and (not force_eval): fused_task_list,strats = self.build_fused_op(ops) if fused_task_list: @@ -1287,13 +1329,9 @@ def _schedule(self, ops, force_eval=False): # them when testing fusion legality (in case 1) if len(ops)==1 and self._clearing_pipe: strategy = ops[0].strategy - for input in ops[0]._inputs: - #if input._storage is None: + for input, part in zip(ops[0]._inputs, ops[0]._input_parts): frint("launch fused input", ops[0]._task_id, input) - proj = ops[0].strategy.get_projection(input) - if hasattr(proj, 'part'): - frint("strat", proj.part.index_partition.functor.transform.trans) - frint("strat1", proj.part.index_partition.functor.__dict__) + proj = ops[0].strategy.get_projection(part) self.propogateFuture(ops[0]) for output in ops[0]._outputs: @@ -1316,7 +1354,6 @@ def _schedule(self, ops, force_eval=False): op.strategy = strats[i] for i,op in enumerate(ops): for input in op._inputs: - print("in", input) if input._storage is None: frint("launch ufused input", op._task_id, input) self.propogateFuture(op) diff --git a/legate/core/solver.py b/legate/core/solver.py index d539999d0..77628df1a 100644 --- a/legate/core/solver.py +++ b/legate/core/solver.py @@ -271,6 +271,7 @@ def cost(unknown): store = unknown._store return ( store.comm_volume(), + store._key_partition is None, not store.has_key_partition(all_restrictions[unknown]), ) @@ -278,6 +279,7 @@ def cost(unknown): key_parts = set() prev_part = None + #import pdb; pdb.set_trace() for unknown in unknowns: if unknown in partitions: continue @@ -290,7 +292,10 @@ def cost(unknown): if isinstance(prev_part, NoPartition): partition = prev_part else: - partition = store.compute_key_partition(restrictions) + if store._key_partition is not None: + partition=store._key_partition + else: + partition = store.compute_key_partition(restrictions) key_parts.add(unknown) cls = constraints.find(unknown) @@ -298,7 +303,7 @@ def cost(unknown): if to_align in partitions: continue partitions[to_align] = partition - + #print("ptype", to_align, (partition)) prev_part = partition for lhs, rhs in dependent.items(): diff --git a/legate/core/store.py b/legate/core/store.py index b8ed049c8..ce097db41 100644 --- a/legate/core/store.py +++ b/legate/core/store.py @@ -472,10 +472,9 @@ def storage(self): # If someone is trying to retreive the storage of a store, # we need to execute outstanding operations so that we know # it has been initialized correctly. - self._runtime.flush_scheduling_window() + self._runtime._launch_outstanding(False) if self._storage is None: - print("store none, launching", [op._task_id for op in self._runtime._outstanding_ops]) - self._runtime._launch_outstanding(False) + #print("store none, launching", [op._task_id for op in self._runtime._outstanding_ops]) """ if self._kind ==Future: print("future") diff --git a/src/core/runtime/context.cc b/src/core/runtime/context.cc index 79bfbf6a4..4884c0110 100644 --- a/src/core/runtime/context.cc +++ b/src/core/runtime/context.cc @@ -154,6 +154,17 @@ TaskContext::TaskContext(const Legion::Task* task, scalars_ = dez.unpack>(); } +/* + TaskContext::TaskContext(std::vector& inputs, std::vector& outputs, + std::vector& reductions, std::vector& scalars) + : inputs_(inputs), outputs_(outputs), reductions_(reductions), scalars_(scalars) +{ + regions_ = NULL; + context_ = NULL; + runtime_ = NULL; + task_ = NULL; +} +*/ ReturnValues TaskContext::pack_return_values() const { diff --git a/src/core/runtime/context.h b/src/core/runtime/context.h index 891452cc9..c9cb29462 100644 --- a/src/core/runtime/context.h +++ b/src/core/runtime/context.h @@ -110,11 +110,23 @@ class LibraryContext { // of the Legion API. class TaskContext { public: + TaskContext() = default; + TaskContext(const Legion::Task* task, const std::vector& regions, Legion::Context context, Legion::Runtime* runtime); + TaskContext(const Legion::Task* task, const std::vector regions) +// std::vector& inputs, std::vector& outputs, std::vector& scalars) + : task_(task), regions_(regions) + { + //inputs_=inputs; + //outputs_=outputs; + //scalars_=scalars; +} + + public: std::vector& inputs() { return inputs_; } std::vector& outputs() { return outputs_; } @@ -131,7 +143,7 @@ class TaskContext { Legion::Runtime* runtime_; FusionMetadata fusionMetadata; - private: + public: std::vector inputs_, outputs_, reductions_; std::vector scalars_; }; diff --git a/src/core/runtime/runtime.cc b/src/core/runtime/runtime.cc index aab251955..4f4c679ed 100644 --- a/src/core/runtime/runtime.cc +++ b/src/core/runtime/runtime.cc @@ -32,6 +32,11 @@ Logger log_legate("legate"); // This is the unique string name for our library which can be used // from both C++ and Python to generate IDs + +using LegateVariantImpl = void (*)(TaskContext&); +/*static */ std::vector > Core::opIDs = *(new std::vector >()); +/*static */ std::unordered_map Core::cpuDescriptors = *(new std::unordered_map()); + static const char* const core_library_name = "legate.core"; /*static*/ bool Core::show_progress = false; diff --git a/src/core/runtime/runtime.h b/src/core/runtime/runtime.h index 03b62e0c8..e5ba54b35 100644 --- a/src/core/runtime/runtime.h +++ b/src/core/runtime/runtime.h @@ -19,9 +19,12 @@ #include "legion.h" #include "core/utilities/typedefs.h" - +#include "core/runtime/context.h" +#include namespace legate { +using LegateVariantImpl = void (*)(TaskContext&); + extern uint32_t extract_env(const char* env_name, const uint32_t default_value, const uint32_t test_value); @@ -30,6 +33,8 @@ class Core { public: static void parse_config(void); static void shutdown(void); + static std::unordered_map cpuDescriptors; + static std::vector > opIDs; public: // Configuration settings diff --git a/src/core/task/task.cc b/src/core/task/task.cc index 1cc9f1e43..301bfa2a7 100644 --- a/src/core/task/task.cc +++ b/src/core/task/task.cc @@ -35,6 +35,7 @@ void LegateTaskRegistrar::record_variant(TaskID tid, assert((kind == Processor::LOC_PROC) || (kind == Processor::TOC_PROC) || (kind == Processor::OMP_PROC)); + // Buffer these up until we can do our actual registration with the runtime pending_task_variants_.push_back(PendingTaskVariant( tid, @@ -56,6 +57,11 @@ void LegateTaskRegistrar::record_variant(TaskID tid, void LegateTaskRegistrar::register_all_tasks(Runtime* runtime, LibraryContext& context) { + for (auto& taskIdx : Core::opIDs){ + auto newID = context.get_task_id(taskIdx.first); + Core::cpuDescriptors.insert(std::pair((int64_t) newID, taskIdx.second)); + } + // Do all our registrations for (auto& task : pending_task_variants_) { task.task_id = diff --git a/src/core/task/task.h b/src/core/task/task.h index 06befc089..3d464bbab 100644 --- a/src/core/task/task.h +++ b/src/core/task/task.h @@ -127,6 +127,8 @@ class LegateTask { bool inner = false, bool idempotent = false) { + + // Construct the code descriptor for this task so that the library // can register it later when it is ready Legion::CodeDescriptor desc( @@ -134,6 +136,7 @@ class LegateTask { legion_task_wrapper::template legate_task_wrapper>); auto task_id = T::TASK_ID; + Core::opIDs.push_back(std::pair((int64_t)task_id, TASK_PTR)); T::Registrar::record_variant(task_id, T::task_name(), desc, diff --git a/src/core/utilities/makeshift_serializer.cc b/src/core/utilities/makeshift_serializer.cc index 7dacd29ce..06f2d3997 100644 --- a/src/core/utilities/makeshift_serializer.cc +++ b/src/core/utilities/makeshift_serializer.cc @@ -108,18 +108,20 @@ namespace legate{ //if _isfuture if(buffer.is_future_) { - + //std::cout<<"packing future"< Date: Wed, 27 Oct 2021 14:18:39 -0700 Subject: [PATCH 15/44] Change the pip package name to match the conda package and update version. --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 749f42373..ac6f7dc02 100755 --- a/setup.py +++ b/setup.py @@ -69,8 +69,8 @@ def run(self): # Remove the recurse argument from the list sys.argv.remove("--recurse") setup( - name="legate.core", - version="0.1", + name="legate-core", + version="21.10.00", packages=["legate", "legate.core", "legate.timing"], cmdclass={"build_py": my_build_py}, ) From ef677e6448b11b76d8b91e8ebd17652dc8d992a6 Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Wed, 27 Oct 2021 23:46:45 -0700 Subject: [PATCH 16/44] Fix the version of Legion to a particular commit --- install.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/install.py b/install.py index 8f9a32db8..0b3ca61b5 100755 --- a/install.py +++ b/install.py @@ -142,6 +142,7 @@ def git_clone(repo_dir, url, branch=None, tag=None, commit=None): verbose_check_call( ["git", "submodule", "update", "--init"], cwd=repo_dir ) + git_reset(repo_dir, commit) else: verbose_check_call( [ @@ -162,10 +163,13 @@ def git_reset(repo_dir, refspec): verbose_check_call(["git", "reset", "--hard", refspec], cwd=repo_dir) -def git_update(repo_dir, branch=None): - verbose_check_call(["git", "pull", "--ff-only"], cwd=repo_dir) +def git_update(repo_dir, branch=None, tag=None, commit=None): if branch is not None: verbose_check_call(["git", "checkout", branch], cwd=repo_dir) + verbose_check_call(["git", "pull", "--ff-only"], cwd=repo_dir) + else: + verbose_check_call(["git", "fetch"], cwd=repo_dir) + verbose_check_call(["git", "checkout", commit or tag], cwd=repo_dir) def load_json_config(filename): @@ -209,13 +213,14 @@ def install_gasnet(gasnet_dir, conduit, thread_count): shutil.rmtree(temp_dir) -def install_legion(legion_src_dir, branch="legate_stable"): +def install_legion(legion_src_dir, branch, commit="3141d7c0"): print("Legate is installing Legion into a local directory...") # For now all we have to do is clone legion since we build it with Legate git_clone( legion_src_dir, url="https://gitlab.com/StanfordLegion/legion.git", branch=branch, + commit=commit ) @@ -228,9 +233,9 @@ def install_thrust(thrust_dir): ) -def update_legion(legion_src_dir, branch="legate_stable"): +def update_legion(legion_src_dir, branch, commit="3141d7c0"): # Make sure we are on the right branch for single/multi-node - git_update(legion_src_dir, branch=branch) + git_update(legion_src_dir, branch=branch, commit=commit) def build_legion( From ba955e280ef575bcb93181bc5ce22787f68625f3 Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Thu, 28 Oct 2021 00:24:15 -0700 Subject: [PATCH 17/44] Do not find a default branch for the release --- install.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/install.py b/install.py index 0b3ca61b5..f6ddbb75c 100755 --- a/install.py +++ b/install.py @@ -220,7 +220,7 @@ def install_legion(legion_src_dir, branch, commit="3141d7c0"): legion_src_dir, url="https://gitlab.com/StanfordLegion/legion.git", branch=branch, - commit=commit + commit=commit, ) @@ -562,8 +562,10 @@ def install( legate_core_dir = os.path.dirname(os.path.realpath(__file__)) - if legion_branch is None: - legion_branch = find_default_legion_branch(legate_core_dir) + # For the release, we will use a hardcoded commit unless user asks for + # a branch + # if legion_branch is None: + # legion_branch = find_default_legion_branch(legate_core_dir) cmake_config = os.path.join(legate_core_dir, ".cmake.json") dump_json_config(cmake_config, cmake) From de337cfe033d5c332dda6bf8ac69b63956fcb2b4 Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Thu, 28 Oct 2021 01:03:19 -0700 Subject: [PATCH 18/44] Change the Legion checkout target --- install.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/install.py b/install.py index f6ddbb75c..d18a44a42 100755 --- a/install.py +++ b/install.py @@ -213,7 +213,7 @@ def install_gasnet(gasnet_dir, conduit, thread_count): shutil.rmtree(temp_dir) -def install_legion(legion_src_dir, branch, commit="3141d7c0"): +def install_legion(legion_src_dir, branch, commit="d0907f4c"): print("Legate is installing Legion into a local directory...") # For now all we have to do is clone legion since we build it with Legate git_clone( @@ -233,7 +233,7 @@ def install_thrust(thrust_dir): ) -def update_legion(legion_src_dir, branch, commit="3141d7c0"): +def update_legion(legion_src_dir, branch, commit="d0907f4c"): # Make sure we are on the right branch for single/multi-node git_update(legion_src_dir, branch=branch, commit=commit) From 78335cc7682b961910119d2b899d6e49328db3e3 Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Thu, 28 Oct 2021 09:13:48 -0700 Subject: [PATCH 19/44] Bumped up the version of pyarrow --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 80f3fbd55..d72135b2e 100644 --- a/README.md +++ b/README.md @@ -221,7 +221,7 @@ no support for Windows. The Legate Core currently requires Python >= 3.7 and the following packages: - - `pyarrow=1.0.1` + - `pyarrow=5.0.0` - `numpy` - `cffi` - [CUDA](https://developer.nvidia.com/cuda-downloads) >= 8.0 From 369909dbfa11ce0cfb3cff3f3e049443a5306a9b Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Sun, 31 Oct 2021 23:50:16 -0700 Subject: [PATCH 20/44] gpu descriptors --- install.py | 4 ++-- legate/core/runtime.py | 7 +++---- src/core/runtime/runtime.cc | 2 ++ src/core/runtime/runtime.h | 2 ++ src/core/task/task.cc | 7 +++++++ src/core/task/task.h | 6 +++++- 6 files changed, 21 insertions(+), 7 deletions(-) diff --git a/install.py b/install.py index 7c45e3ece..b6f9132a5 100755 --- a/install.py +++ b/install.py @@ -806,8 +806,8 @@ def driver(): ) parser.add_argument( "--cuda", - action=BooleanFlag, - default=os.environ.get("USE_CUDA", "0") == "1", + action= BooleanFlag, + default=True,#os.environ.get("USE_CUDA", "0") == "1", help="Build Legate with CUDA support.", ) parser.add_argument( diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 639bb9f98..ce34986e7 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -807,8 +807,8 @@ def __init__(self): self.validIDs.add(2) #Binary op #self.validIDs.add(5) #convert op self.validIDs.add(18) #Unary op - self.validIDs.add(9) #Fill op - self.validIDs.add(14) #Fill op + #self.validIDs.add(9) #Fill op + #self.validIDs.add(14) #Fill op # the following are conditionally fusable # they will be processed in the a subsequent level of filtering @@ -1302,7 +1302,7 @@ def propogateFuture(self,op): def _schedule(self, ops, force_eval=False): ids = [op._task_id for op in ops] - print(force_eval, "ids", ids) + #print(force_eval, "ids", ids) #case 1: try fusing current window of tasks #if partially or fully fusable, #schedule the new set of tasks @@ -1311,7 +1311,6 @@ def _schedule(self, ops, force_eval=False): # must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op]) # partitioner = Partitioner(self, [op], must_be_single=must_be_single) # strategy = partitioner.partition_stores() - if len(ops)>=2 and (not force_eval): fused_task_list,strats = self.build_fused_op(ops) if fused_task_list: diff --git a/src/core/runtime/runtime.cc b/src/core/runtime/runtime.cc index 4f4c679ed..eb57f46d8 100644 --- a/src/core/runtime/runtime.cc +++ b/src/core/runtime/runtime.cc @@ -35,7 +35,9 @@ Logger log_legate("legate"); using LegateVariantImpl = void (*)(TaskContext&); /*static */ std::vector > Core::opIDs = *(new std::vector >()); +/*static */ std::vector > Core::gpuOpIDs = *(new std::vector >()); /*static */ std::unordered_map Core::cpuDescriptors = *(new std::unordered_map()); +/*static */ std::unordered_map Core::gpuDescriptors = *(new std::unordered_map()); static const char* const core_library_name = "legate.core"; diff --git a/src/core/runtime/runtime.h b/src/core/runtime/runtime.h index e5ba54b35..6d1dc8341 100644 --- a/src/core/runtime/runtime.h +++ b/src/core/runtime/runtime.h @@ -34,7 +34,9 @@ class Core { static void parse_config(void); static void shutdown(void); static std::unordered_map cpuDescriptors; + static std::unordered_map gpuDescriptors; static std::vector > opIDs; + static std::vector > gpuOpIDs; public: // Configuration settings diff --git a/src/core/task/task.cc b/src/core/task/task.cc index 301bfa2a7..6ee5fb69b 100644 --- a/src/core/task/task.cc +++ b/src/core/task/task.cc @@ -62,6 +62,13 @@ void LegateTaskRegistrar::register_all_tasks(Runtime* runtime, LibraryContext& c Core::cpuDescriptors.insert(std::pair((int64_t) newID, taskIdx.second)); } + for (auto& taskIdx : Core::gpuOpIDs){ + auto newID = context.get_task_id(taskIdx.first); + Core::gpuDescriptors.insert(std::pair((int64_t) newID, taskIdx.second)); + } + + + // Do all our registrations for (auto& task : pending_task_variants_) { task.task_id = diff --git a/src/core/task/task.h b/src/core/task/task.h index 3d464bbab..dc18b9f0f 100644 --- a/src/core/task/task.h +++ b/src/core/task/task.h @@ -136,7 +136,11 @@ class LegateTask { legion_task_wrapper::template legate_task_wrapper>); auto task_id = T::TASK_ID; - Core::opIDs.push_back(std::pair((int64_t)task_id, TASK_PTR)); + if (kind ==Legion::Processor::LOC_PROC){ + Core::opIDs.push_back(std::pair((int64_t)task_id, TASK_PTR)); + }else if (kind ==Legion::Processor::TOC_PROC){ + Core::gpuOpIDs.push_back(std::pair((int64_t)task_id, TASK_PTR)); + } T::Registrar::record_variant(task_id, T::task_name(), desc, From ab0c0448474058bdd71e525a261affa049a2b3d8 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Mon, 1 Nov 2021 11:35:03 -0700 Subject: [PATCH 21/44] Remove back edges from partition symbols back to operations to avoid object cycles --- legate/core/constraints.py | 13 +++++++++---- legate/core/operation.py | 3 ++- legate/core/solver.py | 18 +++++++++--------- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/legate/core/constraints.py b/legate/core/constraints.py index cd81306e5..39e85bdcc 100644 --- a/legate/core/constraints.py +++ b/legate/core/constraints.py @@ -55,8 +55,9 @@ def reduce(self): class PartSym(Expr): - def __init__(self, op, store, id, disjoint, complete): - self._op = op + def __init__(self, op_hash, op_name, store, id, disjoint, complete): + self._op_hash = op_hash + self._op_name = op_name self._store = store self._id = id self._disjoint = disjoint @@ -66,6 +67,10 @@ def __init__(self, op, store, id, disjoint, complete): def ndim(self): return self._store.ndim + @property + def store(self): + return self._store + @property def closed(self): return False @@ -73,10 +78,10 @@ def closed(self): def __repr__(self): disj = "D" if self._disjoint else "A" comp = "C" if self._complete else "I" - return f"X{self._id}({disj},{comp})@{self._op.get_name()}" + return f"X{self._id}({disj},{comp})@{self._op_name}" def __hash__(self): - return hash((self._op, self._id)) + return hash((self._op_hash, self._id)) def subst(self, mapping): return Lit(mapping[self]) diff --git a/legate/core/operation.py b/legate/core/operation.py index c2f13c524..5e8283dd5 100644 --- a/legate/core/operation.py +++ b/legate/core/operation.py @@ -158,7 +158,8 @@ def _get_symbol_id(self): def declare_partition(self, store, disjoint=True, complete=True): sym = PartSym( - self, + self._op_id, + self.get_name(), store, self._get_symbol_id(), disjoint=disjoint, diff --git a/legate/core/solver.py b/legate/core/solver.py index 58321c00f..b4d5c5e80 100644 --- a/legate/core/solver.py +++ b/legate/core/solver.py @@ -114,16 +114,16 @@ def launch_domain(self): def get_projection(self, part): partition = self.get_partition(part) - return partition.get_requirement(self._launch_shape, part._store) + return partition.get_requirement(self._launch_shape, part.store) def get_partition(self, part): - assert not part._store.unbound + assert not part.store.unbound if part not in self._strategy: raise ValueError(f"No strategy is found for {part}") return self._strategy[part] def get_field_space(self, part): - assert part._store.unbound + assert part.store.unbound if part not in self._fspaces: raise ValueError(f"No strategy is found for {part}") return self._fspaces[part] @@ -160,7 +160,7 @@ def _solve_broadcast_constraints( ): to_remove = OrderedSet() for unknown in unknowns: - store = unknown._store + store = unknown.store if not (store.kind is Future or unknown in broadcasts): continue @@ -183,7 +183,7 @@ def _solve_unbound_constraints( ): to_remove = OrderedSet() for unknown in unknowns: - store = unknown._store + store = unknown.store if not store.unbound: continue @@ -193,7 +193,7 @@ def _solve_unbound_constraints( continue cls = constraints.find(unknown) - assert all(to_align._store.unbound for to_align in cls) + assert all(to_align.store.unbound for to_align in cls) fspace = self._runtime.create_field_space() for to_align in cls: @@ -206,7 +206,7 @@ def _solve_unbound_constraints( def _find_restrictions(cls): merged = None for unknown in cls: - store = unknown._store + store = unknown.store restrictions = store.find_restrictions() if merged is None: merged = restrictions @@ -268,7 +268,7 @@ def partition_stores(self): all_restrictions = self._find_all_restrictions(unknowns, constraints) def cost(unknown): - store = unknown._store + store = unknown.store return ( -store.comm_volume(), not store.has_key_partition(all_restrictions[unknown]), @@ -284,7 +284,7 @@ def cost(unknown): elif unknown in dependent: continue - store = unknown._store + store = unknown.store restrictions = all_restrictions[unknown] if isinstance(prev_part, NoPartition): From 54d3bb8fc20baf186246bd7b97271f1a01142459 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Mon, 1 Nov 2021 15:39:06 -0700 Subject: [PATCH 22/44] Make sure we don't create cycles between region fields and attachments --- legate/core/runtime.py | 60 +++++++++++++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 15 deletions(-) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 6bd9a5839..c06de76e1 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -16,6 +16,7 @@ import gc import math import struct +import weakref from collections import deque from functools import reduce @@ -319,11 +320,19 @@ def __init__(self, ptr, extent, region_field): self.ptr = ptr self.extent = extent self.end = ptr + extent - 1 - self.region_field = region_field + self._region_field = weakref.ref(region_field) def overlaps(self, other): return not (self.end < other.ptr or other.end < self.ptr) + @property + def region_field(self): + return self._region_field() + + @region_field.setter + def region_field(self, region_field): + self._region_field = weakref.ref(region_field) + class AttachmentManager(object): def __init__(self, runtime): @@ -359,22 +368,35 @@ def attachment_key(alloc): def has_attachment(self, alloc): key = self.attachment_key(alloc) - return key in self._attachments + attachment = self._attachments.get(key, None) + return attachment is not None and attachment.region_field def reuse_existing_attachment(self, alloc): key = self.attachment_key(alloc) - if key not in self._attachments: + attachment = self._attachments.get(key, None) + if attachment is None: return None - attachment = self._attachments[key] - return attachment.region_field + rf = attachment.region_field + # If the region field is already collected, we don't need to keep + # track of it for de-duplication. + if rf is None: + del self._attachments[key] + return rf def attach_external_allocation(self, alloc, region_field): key = self.attachment_key(alloc) - if key in self._attachments: + attachment = self._attachments.get(key, None) + if not (attachment is None or attachment.region_field is None): raise RuntimeError( "Cannot attach two different RegionFields to the same buffer" ) - attachment = Attachment(*key, region_field) + if attachment is None: + attachment = Attachment(*key, region_field) + else: + attachment.region_field = region_field + # We temporary remove the attachment from the map for + # the following alias checking + del self._attachments[key] for other in self._attachments.values(): if other.overlaps(attachment): raise RuntimeError( @@ -382,7 +404,19 @@ def attach_external_allocation(self, alloc, region_field): ) self._attachments[key] = attachment - def detach_external_allocation(self, alloc, detach, defer): + def _remove_allocation(self, alloc): + key = self.attachment_key(alloc) + if key not in self._attachments: + raise RuntimeError("Unable to find attachment to remove") + del self._attachments[key] + + def detach_external_allocation( + self, alloc, detach, defer=False, previously_deferred=False + ): + # If the detachment was previously deferred, then we don't + # need to remove the allocation from the map again. + if not previously_deferred: + self._remove_allocation(alloc) if defer: # If we need to defer this until later do that now self._deferred_detachments.append((alloc, detach)) @@ -391,12 +425,6 @@ def detach_external_allocation(self, alloc, detach, defer): # Dangle a reference to the field off the future to prevent the # field from being recycled until the detach is done future.field_reference = detach.field - # We also need to tell the core legate library that this buffer - # is no longer attached - key = self.attachment_key(alloc) - if key not in self._attachments: - raise RuntimeError("Unable to find attachment to remove") - del self._attachments[key] # If the future is already ready, then no need to track it if future.is_ready(): return @@ -417,7 +445,9 @@ def perform_detachments(self): detachments = self._deferred_detachments self._deferred_detachments = list() for alloc, detach in detachments: - self.detach_external_allocation(alloc, detach, defer=False) + self.detach_external_allocation( + alloc, detach, defer=False, previously_deferred=True + ) def prune_detachments(self): to_remove = [] From d6ccdd2747d9f2bb72de21658f0a7906810991aa Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Wed, 3 Nov 2021 15:25:17 -0700 Subject: [PATCH 23/44] Handle cases where one instance is used by multiple mappings --- src/core/mapping/base_mapper.cc | 46 ++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/src/core/mapping/base_mapper.cc b/src/core/mapping/base_mapper.cc index fd4cfa04c..9f679de9e 100644 --- a/src/core/mapping/base_mapper.cc +++ b/src/core/mapping/base_mapper.cc @@ -429,7 +429,7 @@ void BaseMapper::map_task(const MapperContext ctx, // Map each field separately for each of the logical regions std::vector needed_acquires; - std::map instances_to_mappings; + std::map> instances_to_mappings; for (uint32_t mapping_idx = 0; mapping_idx < mappings.size(); ++mapping_idx) { auto& mapping = mappings[mapping_idx]; auto req_indices = mapping.requirement_indices(); @@ -457,7 +457,7 @@ void BaseMapper::map_task(const MapperContext ctx, needed_acquires.push_back(result); for (auto req_idx : req_indices) output.chosen_instances[req_idx].push_back(result); - instances_to_mappings[result] = mapping_idx; + instances_to_mappings[result].insert(mapping_idx); } // Do an acquire on all the instances so we have our result @@ -471,27 +471,31 @@ void BaseMapper::map_task(const MapperContext ctx, filter_failed_acquires(needed_acquires, failed_acquires); for (auto failed_acquire : failed_acquires) { - auto mapping_idx = instances_to_mappings[failed_acquire]; - auto& mapping = mappings[mapping_idx]; - auto req_indices = mapping.requirement_indices(); - - std::vector> reqs; - for (auto req_idx : req_indices) reqs.push_back(std::cref(task.regions[req_idx])); - - for (auto req_idx : req_indices) { - auto& instances = output.chosen_instances[req_idx]; - uint32_t inst_idx = 0; - for (; inst_idx < instances.size(); ++inst_idx) - if (instances[inst_idx] == failed_acquire) break; - instances.erase(instances.begin() + inst_idx); - } + auto affected_mappings = instances_to_mappings[failed_acquire]; + instances_to_mappings.erase(failed_acquire); + + for (auto& mapping_idx : affected_mappings) { + auto& mapping = mappings[mapping_idx]; + auto req_indices = mapping.requirement_indices(); + + std::vector> reqs; + for (auto req_idx : req_indices) reqs.push_back(std::cref(task.regions[req_idx])); + + for (auto req_idx : req_indices) { + auto& instances = output.chosen_instances[req_idx]; + uint32_t inst_idx = 0; + for (; inst_idx < instances.size(); ++inst_idx) + if (instances[inst_idx] == failed_acquire) break; + instances.erase(instances.begin() + inst_idx); + } - PhysicalInstance result; - if (map_legate_store(ctx, task, mapping, reqs, task.target_proc, result)) - needed_acquires.push_back(result); + PhysicalInstance result; + if (map_legate_store(ctx, task, mapping, reqs, task.target_proc, result)) + needed_acquires.push_back(result); - for (auto req_idx : req_indices) output.chosen_instances[req_idx].push_back(result); - instances_to_mappings[result] = mapping_idx; + for (auto req_idx : req_indices) output.chosen_instances[req_idx].push_back(result); + instances_to_mappings[result].insert(mapping_idx); + } } } } From 51dd00f6acf0ea32569498a131ceebd44f76ccaf Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Fri, 5 Nov 2021 13:29:35 -0700 Subject: [PATCH 24/44] Fix import of legion CFFI --- legate/core/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/legate/core/__init__.py b/legate/core/__init__.py index 8c607466f..b3fa32b9c 100644 --- a/legate/core/__init__.py +++ b/legate/core/__init__.py @@ -19,10 +19,10 @@ # Perform a check to see if we're running inside of Legion Python # If we're not then we should raise an error message try: - from legion_cffi import ffi, lib as legion + from legion_cffi import lib as _legion # Now confirm that we are actually inside of a task - if legion.legion_runtime_has_context(): + if _legion.legion_runtime_has_context(): using_legion_python = True else: using_legion_python = False @@ -115,6 +115,10 @@ ReductionOp, ) +# NOTE: This needs to come after the imports from legate.core.legion, as we +# are overriding that module's name. +from legion_cffi import ffi, lib as legion + # Import the PyArrow type system from pyarrow import ( DataType, From f388f07bb36594d9359676184a9e64c911b2309c Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Fri, 5 Nov 2021 16:25:15 -0700 Subject: [PATCH 25/44] Make sure we flush deferred detachments --- legate/core/runtime.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index c06de76e1..957019d4e 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -851,6 +851,8 @@ def destroy(self): self.destroyed = True def dispatch(self, op, redop=None): + self._attachment_manager.perform_detachments() + self._attachment_manager.prune_detachments() if redop: return op.launch(self.legion_runtime, self.legion_context, redop) else: From 345f275c48330cf612353c18e24f4439d0d126ac Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Fri, 5 Nov 2021 18:13:36 -0700 Subject: [PATCH 26/44] reduction fix --- legate/core/runtime.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index ce34986e7..f151ff426 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -807,8 +807,8 @@ def __init__(self): self.validIDs.add(2) #Binary op #self.validIDs.add(5) #convert op self.validIDs.add(18) #Unary op - #self.validIDs.add(9) #Fill op - #self.validIDs.add(14) #Fill op + self.validIDs.add(9) #Fill op + #self.validIDs.add(14) #read op # the following are conditionally fusable # they will be processed in the a subsequent level of filtering @@ -969,7 +969,6 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): if start= self._window_size: + self._opLens.append(len(self._outstanding_ops)) ops = self._outstanding_ops self._outstanding_ops = [] self._schedule(ops) @@ -1391,6 +1397,7 @@ def _scheduleNew(self, ops): def flush_scheduling_window(self): if len(self._outstanding_ops) == 0: return + self._opLens.append(len(self._outstanding_ops)) ops = self._outstanding_ops self._outstanding_ops = [] self._schedule(ops) From 1f9a655b3efc473217fa3f243af031a9bdc12d31 Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Sat, 13 Nov 2021 15:14:36 -0800 Subject: [PATCH 27/44] put new constraint stuff back in --- legate/core/constraints.py | 11 ++-- legate/core/operation.py | 17 ++++- legate/core/runtime.py | 124 ++++++++++++++++++++++++++----------- legate/core/solver.py | 4 ++ 4 files changed, 115 insertions(+), 41 deletions(-) diff --git a/legate/core/constraints.py b/legate/core/constraints.py index 93f89bd36..0609b8980 100644 --- a/legate/core/constraints.py +++ b/legate/core/constraints.py @@ -55,13 +55,15 @@ def reduce(self): class PartSym(Expr): - def __init__(self, op, store, id, disjoint, complete): - #self._op = op + def __init__(self, op_hash, op_name, store, id, disjoint, complete): + self._op_hash = op_hash + self._op_name = op_name self._store = store self._id = id self._disjoint = disjoint self._complete = complete + @property def ndim(self): return self._store.ndim @@ -73,11 +75,12 @@ def closed(self): def __repr__(self): disj = "D" if self._disjoint else "A" comp = "C" if self._complete else "I" - return f"X{self._id}({disj},{comp})" + #return f"X{self._id}({disj},{comp})" return f"X{self._id}({disj},{comp})@{self._op.get_name()}" def __hash__(self): - return hash(self._id) + #return hash(self._id) + return hash((self._op_hash, self._id)) return hash((self._op, self._id)) def subst(self, mapping): diff --git a/legate/core/operation.py b/legate/core/operation.py index fc1b9ae18..d3424e4ff 100644 --- a/legate/core/operation.py +++ b/legate/core/operation.py @@ -19,7 +19,11 @@ from .launcher import CopyLauncher, TaskLauncher from .store import Store from .utils import OrderedSet - +from .legion import ( + FieldSpace, + Future +) + class Operation(object): def __init__(self, context, mapper_id=0, op_id=0): @@ -165,7 +169,8 @@ def _get_symbol_id(self): def declare_partition(self, store, disjoint=True, complete=True): sym = PartSym( - self, + self._op_id, + self.get_name(), store, self._get_symbol_id(), disjoint=disjoint, @@ -179,6 +184,7 @@ def declare_partition(self, store, disjoint=True, complete=True): return sym + class Task(Operation): def __init__(self, context, task_id, mapper_id=0, op_id=0): Operation.__init__(self, context, mapper_id=mapper_id, op_id=op_id) @@ -222,19 +228,26 @@ def launch(self, strategy): # We update the key partition of a store only when it gets updated temp.set_key_partition(partition) """ + #print("inputs") for input, input_part in zip(self._inputs, self._input_parts): proj = strategy.get_projection(input_part) + #if (input._kind==Future): + # print(input, proj) tag = self.get_tag(strategy, input_part) launcher.add_input(input, proj, tag=tag) + #print("outputs", len(self._outputs)) for output, output_part in zip(self._outputs, self._output_parts): if output.unbound: continue proj = strategy.get_projection(output_part) + #if (output._kind==Future): + # print(output, proj) tag = self.get_tag(strategy, output_part) launcher.add_output(output, proj, tag=tag) partition = strategy.get_partition(output_part) # We update the key partition of a store only when it gets updated output.set_key_partition(partition) + #print() for ((reduction, redop), reduction_part) in zip( self._reductions, self._reduction_parts ): diff --git a/legate/core/runtime.py b/legate/core/runtime.py index f151ff426..d3c68b31e 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -708,7 +708,7 @@ def supress_small_fusions(self, intervals, threshold): return fusable, final_set def can_fuse(self): - for op in self.ops: + for op in reversed(self.ops): must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op]) partitioner = Partitioner(self.runtime, [op], must_be_single=must_be_single) self.partitioners.append( partitioner ) @@ -723,6 +723,7 @@ def can_fuse(self): if len(op.inputs)>1: proj = strategy.get_projection(op._input_parts[1]) self.strategies.append(strategy) + self.strategies.reverse() results = [constraint.apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) for constraint in self.constraints] drint("fuse results", results) @@ -763,7 +764,7 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): class NumpyContextExists(FusionConstraint): def apply(self, contexts, runtime, ops, partitioners, strategies): - if "legate.numpy" in contexts: + if "cunumeric" in contexts: return True, [(0, len(ops))] else: return False, [(0,0)] @@ -803,12 +804,13 @@ class AllValidOps(FusionConstraint): def __init__(self): self.validIDs = set() - #these ops are always fusable + #these ops are almost always fusable self.validIDs.add(2) #Binary op - #self.validIDs.add(5) #convert op self.validIDs.add(18) #Unary op self.validIDs.add(9) #Fill op + self.validIDs.add(20) #Where op #self.validIDs.add(14) #read op + #self.validIDs.add(5) #convert op # the following are conditionally fusable # they will be processed in the a subsequent level of filtering @@ -924,20 +926,6 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): intervals.append((start,end)) return True, intervals - #TODO: remove me - # for each buffer, check all it's associated transforms/partitions - # across ops are equivalent - seperators = [] - for store, matrices in store_to_ops.items(): - if len(matrices)>1: - first = matrices[0] - for matrix in matrices: - if not (matrix==first).all(): - indices = linkset[store] - return True, [(0,indices[1]), (indices[1],len(ops))] - return True, [(0,len(ops))] - - class IdenticalLaunchShapes(FusionConstraint): """Fusion rule that only ops with identical @@ -971,6 +959,52 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): return True, intervals + +class ValidProducerConsumer(FusionConstraint): + """In a fused op, there cannot be a producer consumer + relationship between different views of the same buffers""" + + def apply(self, contexts, runtime, ops, partitioners, strategies): + childMap = {} + intervals = [] + i, start=0, 0 + end = len(ops) + def getRoot(store): + while store._parent: + store = store._parent + return store + + while i 0 for gop in [fused_task]) partitioner = Partitioner(self, [fused_task], must_be_single=must_be_single) strategy = partitioner.partition_stores() #fused_task.strategy = super_strategies[i] fused_task.strategy = strategy - - return new_op_list, True + strats.append(strategy) + #print("star",strategy) + #print("flag 3") + return new_op_list, strats def _launch_outstanding(self, force_eval=True): if len(self._outstanding_ops): #print("launching outstanding", ops) - self._opLens.append(len(self._outstanding_ops)) + #self._opLens.append(len(self._outstanding_ops)) ops = self._outstanding_ops self._outstanding_ops = [] self._schedule(ops, force_eval) @@ -1318,6 +1370,7 @@ def _schedule(self, ops, force_eval=False): # strategy = partitioner.partition_stores() if len(ops)>=2 and (not force_eval): fused_task_list,strats = self.build_fused_op(ops) + #print("flist", fused_task_list) if fused_task_list: frint("created fused list", [op._task_id for op in fused_task_list]) drint("start clearing pipe") @@ -1347,6 +1400,7 @@ def _schedule(self, ops, force_eval=False): # if we already checked the ops for fusability, # then the ops' buffers have already been partitioned else: + #print("normal execution", ids) if not strats: #ops were not check for fusability, so partition them for op in ops: must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op]) @@ -1374,7 +1428,7 @@ def submit(self, op): else: self._outstanding_ops.append(op) if len(self._outstanding_ops) >= self._window_size: - self._opLens.append(len(self._outstanding_ops)) + #self._opLens.append(len(self._outstanding_ops)) ops = self._outstanding_ops self._outstanding_ops = [] self._schedule(ops) @@ -1397,7 +1451,7 @@ def _scheduleNew(self, ops): def flush_scheduling_window(self): if len(self._outstanding_ops) == 0: return - self._opLens.append(len(self._outstanding_ops)) + #self._opLens.append(len(self._outstanding_ops)) ops = self._outstanding_ops self._outstanding_ops = [] self._schedule(ops) diff --git a/legate/core/solver.py b/legate/core/solver.py index 77628df1a..fb6ba0ed6 100644 --- a/legate/core/solver.py +++ b/legate/core/solver.py @@ -269,6 +269,10 @@ def partition_stores(self): def cost(unknown): store = unknown._store + return ( + -store.comm_volume(), + not store.has_key_partition(all_restrictions[unknown]), + ) return ( store.comm_volume(), store._key_partition is None, From e2afa73edf31198846f2664c3086d56507309c33 Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Mon, 15 Nov 2021 15:05:56 -0800 Subject: [PATCH 28/44] constant optimization --- legate/core/constraints.py | 2 +- legate/core/launcher.py | 2 ++ legate/core/runtime.py | 61 ++++++++++++--------------------- legate/core/solver.py | 10 +++--- legate/core/store.py | 1 - src/core/data/store.cc | 4 +-- src/core/mapping/core_mapper.cc | 2 +- 7 files changed, 33 insertions(+), 49 deletions(-) diff --git a/legate/core/constraints.py b/legate/core/constraints.py index 0609b8980..26db3e8cc 100644 --- a/legate/core/constraints.py +++ b/legate/core/constraints.py @@ -76,7 +76,7 @@ def __repr__(self): disj = "D" if self._disjoint else "A" comp = "C" if self._complete else "I" #return f"X{self._id}({disj},{comp})" - return f"X{self._id}({disj},{comp})@{self._op.get_name()}" + return f"X{self._id}({disj},{comp})@{self._op_name}" def __hash__(self): #return hash(self._id) diff --git a/legate/core/launcher.py b/legate/core/launcher.py index 78924acfe..c97ff6967 100644 --- a/legate/core/launcher.py +++ b/legate/core/launcher.py @@ -263,6 +263,8 @@ def __init__(self, region, permission, proj, tag, flags): self.region = region self.permission = permission self.proj = proj + #print(proj.__dict__) + #print(region) self.tag = tag self.flags = flags diff --git a/legate/core/runtime.py b/legate/core/runtime.py index d3c68b31e..90d4d1af3 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -807,9 +807,10 @@ def __init__(self): #these ops are almost always fusable self.validIDs.add(2) #Binary op self.validIDs.add(18) #Unary op + #self.validIDs.add(5) #Convert op self.validIDs.add(9) #Fill op self.validIDs.add(20) #Where op - #self.validIDs.add(14) #read op + self.validIDs.add(14) #read op #self.validIDs.add(5) #convert op # the following are conditionally fusable @@ -934,13 +935,6 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): launch_shapes = [] for i in range(len(ops)): launch_shapes.append(strategies[i]._launch_shape) - #print(launch_shapes) - """ - first_shape = launch_shapes[0] - for launch_shape in launch_shapes: - if launch_shape!=first_shape: - return True, [(0,1),(1,len(ops))] - """ intervals =[] i=1 start=0 @@ -948,6 +942,11 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): while i 1: #initialize fused task @@ -1305,21 +1303,18 @@ def build_fused_op(self,ops): fused_task.add_scalar_arg(scalar[0], ty.int32) for reduction in op._reductions: fused_task.add_reduction(reduction) - + isScalarConversion = len(op._outputs) ==1 and len(op._inputs)==1 + if int(op._task_id)==14 and isScalarConversion: #for handling scalars + op._outputs[0]._storage = op._inputs[0]._storage for input in op._inputs: - #if input._storage is None: fused_task.add_input(input) for output,part in zip(op._outputs, op._output_parts): fused_task.add_output(output) - #if key_part==None: - # key_part = partitions[z].get_partition(part) - #self.propogateFuture(fused_task) for future in op._futures: fused_task.add_future(future) z+=1 new_op_list.append(fused_task) - #print("flag 2") strats=[] for i,fused_task in enumerate(new_op_list): must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [fused_task]) @@ -1328,8 +1323,7 @@ def build_fused_op(self,ops): #fused_task.strategy = super_strategies[i] fused_task.strategy = strategy strats.append(strategy) - #print("star",strategy) - #print("flag 3") + #strats.append( super_strategies[i]) return new_op_list, strats def _launch_outstanding(self, force_eval=True): @@ -1339,12 +1333,7 @@ def _launch_outstanding(self, force_eval=True): ops = self._outstanding_ops self._outstanding_ops = [] self._schedule(ops, force_eval) - - def _launch_one(self): - if len(self._outstanding_ops): - op = self._outstanding_ops[0] - self._outstanding_ops = self._outstanding_ops[1:] - self._schedule([op], force_eval=True) + def propogateFuture(self,op): return @@ -1354,8 +1343,6 @@ def propogateFuture(self,op): while start._storage is None and start._parent: start=start._parent input._storage = start._storage - - def _schedule(self, ops, force_eval=False): ids = [op._task_id for op in ops] @@ -1364,10 +1351,6 @@ def _schedule(self, ops, force_eval=False): #if partially or fully fusable, #schedule the new set of tasks strats = False - #for op in ops: - # must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op]) - # partitioner = Partitioner(self, [op], must_be_single=must_be_single) - # strategy = partitioner.partition_stores() if len(ops)>=2 and (not force_eval): fused_task_list,strats = self.build_fused_op(ops) #print("flist", fused_task_list) diff --git a/legate/core/solver.py b/legate/core/solver.py index fb6ba0ed6..a600dd1dc 100644 --- a/legate/core/solver.py +++ b/legate/core/solver.py @@ -273,11 +273,11 @@ def cost(unknown): -store.comm_volume(), not store.has_key_partition(all_restrictions[unknown]), ) - return ( - store.comm_volume(), - store._key_partition is None, - not store.has_key_partition(all_restrictions[unknown]), - ) + #return ( + # store.comm_volume(), + # store._key_partition is None, + # not store.has_key_partition(all_restrictions[unknown]), + #) unknowns = sorted(unknowns, key=cost) diff --git a/legate/core/store.py b/legate/core/store.py index ce097db41..600aae9c5 100644 --- a/legate/core/store.py +++ b/legate/core/store.py @@ -474,7 +474,6 @@ def storage(self): # it has been initialized correctly. self._runtime._launch_outstanding(False) if self._storage is None: - #print("store none, launching", [op._task_id for op in self._runtime._outstanding_ops]) """ if self._kind ==Future: print("future") diff --git a/src/core/data/store.cc b/src/core/data/store.cc index f114ed4de..fd40a4dd9 100644 --- a/src/core/data/store.cc +++ b/src/core/data/store.cc @@ -127,8 +127,8 @@ Domain FutureWrapper::domain() const { return domain_; } ReturnValue FutureWrapper::pack() const { if (nullptr == rawptr_) { - fprintf(stderr, "Found an uninitialized Legate store\n"); - assert(false); + //fprintf(stderr, "Found an uninitialized Legate store\n"); + //assert(false); } return ReturnValue(rawptr_, field_size_); } diff --git a/src/core/mapping/core_mapper.cc b/src/core/mapping/core_mapper.cc index 6d8d57ad7..4138d670a 100644 --- a/src/core/mapping/core_mapper.cc +++ b/src/core/mapping/core_mapper.cc @@ -202,7 +202,7 @@ Mapper::MapperSyncModel CoreMapper::get_mapper_sync_model(void) const void CoreMapper::select_task_options(const MapperContext ctx, const Task& task, TaskOptions& output) { - std::cout<<"task_id "< Date: Sun, 21 Nov 2021 18:27:30 -0800 Subject: [PATCH 29/44] better constant opt --- legate/core/runtime.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 90d4d1af3..109fb4266 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -807,7 +807,7 @@ def __init__(self): #these ops are almost always fusable self.validIDs.add(2) #Binary op self.validIDs.add(18) #Unary op - #self.validIDs.add(5) #Convert op + self.validIDs.add(5) #Convert op self.validIDs.add(9) #Fill op self.validIDs.add(20) #Where op self.validIDs.add(14) #read op @@ -1049,7 +1049,7 @@ def __init__(self, core_library): # to be dispatched. This list allows cross library introspection for # Legate operations. self._outstanding_ops = [] - self._window_size=1 + self._window_size=50 self._fusion_threshold =2 self._opLens = [] self._fusedOpLens = [] @@ -1248,6 +1248,7 @@ def build_fused_op(self,ops): super_fspaces = [] super_strategies = [] super_keystores = [] + z=0 for fusable_set in fusable_sets: #create super strategy for this fusable set super_strat = {} @@ -1299,6 +1300,12 @@ def build_fused_op(self,ops): #add typical inputs and outputs of all subtasks to fused task key_part = None for j,op in enumerate(op_subset): + #if int(op._task_id) == 5: + #fused_task.add_output(op._outputs[0]) + #fused_task.add_input(op._inputs[0]) + #fused_task.add_broadcast(op._inputs[0]) + #fused_task.add_broadcast(op._outputs[0]) + #continue for scalar in op._scalar_args: fused_task.add_scalar_arg(scalar[0], ty.int32) for reduction in op._reductions: @@ -1322,6 +1329,7 @@ def build_fused_op(self,ops): strategy = partitioner.partition_stores() #fused_task.strategy = super_strategies[i] fused_task.strategy = strategy + #print("\t q ",i, strategy) strats.append(strategy) #strats.append( super_strategies[i]) return new_op_list, strats @@ -1371,7 +1379,7 @@ def _schedule(self, ops, force_eval=False): strategy = ops[0].strategy for input, part in zip(ops[0]._inputs, ops[0]._input_parts): frint("launch fused input", ops[0]._task_id, input) - proj = ops[0].strategy.get_projection(part) + #proj = ops[0].strategy.get_projection(part) self.propogateFuture(ops[0]) for output in ops[0]._outputs: From f9eb11963e3281e17b6b3624504b20e30abae80e Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Mon, 22 Nov 2021 13:57:58 -0800 Subject: [PATCH 30/44] terminal dots --- legate/core/runtime.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 109fb4266..75be54545 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -803,16 +803,19 @@ class AllValidOps(FusionConstraint): """ def __init__(self): self.validIDs = set() - + self.terminals = set() #these ops are almost always fusable self.validIDs.add(2) #Binary op self.validIDs.add(18) #Unary op self.validIDs.add(5) #Convert op self.validIDs.add(9) #Fill op self.validIDs.add(20) #Where op - self.validIDs.add(14) #read op + self.validIDs.add(7) + #self.validIDs.add(14) #read op #self.validIDs.add(5) #convert op + self.terminals.add(7) + # the following are conditionally fusable # they will be processed in the a subsequent level of filtering @@ -843,18 +846,24 @@ def __init__(self): def apply(self, contexts, runtime, ops, partitioners, strategies): results = [int(op._task_id) in self.validIDs for op in ops] + drint("valids", results) fusable_intervals = [] start, end =0,0 while end Date: Wed, 1 Dec 2021 09:39:54 -0800 Subject: [PATCH 31/44] reuse partitions --- legate/core/operation.py | 35 +++---- legate/core/runtime.py | 198 ++++++++++++++++++--------------------- 2 files changed, 105 insertions(+), 128 deletions(-) diff --git a/legate/core/operation.py b/legate/core/operation.py index 1dba97e90..8b13c92e8 100644 --- a/legate/core/operation.py +++ b/legate/core/operation.py @@ -161,6 +161,7 @@ def execute(self): def get_tag(self, strategy, part): if strategy.is_key_part(part): + return 0 return 1 # LEGATE_CORE_KEY_STORE_TAG else: return 0 @@ -217,40 +218,30 @@ def launch(self, strategy): if self._is_fused: launcher.add_fusion_metadata(self._is_fused, self._fusion_metadata) - """ - for input in self._inputs: - proj = strategy.get_projection(input) - tag = self.get_tag(strategy, input) - launcher.add_input(input, proj, tag=tag) - for temp in self._temps: - proj = strategy.get_projection(temp) - launcher.add_temp(temp, proj) - partition = strategy.get_partition(temp) - # We update the key partition of a store only when it gets updated - temp.set_key_partition(partition) - """ - #print("inputs") - for input, input_part in zip(self._inputs, self._input_parts): + if self._is_fused: #fused ops re-use encapsulated unfused partitions + input_parts = self._unfused_input_parts + output_parts = self._unfused_output_parts + reduction_parts = self._unfused_reduction_parts + else: + input_parts = self._input_parts + output_parts = self._output_parts + reduction_parts = self._reduction_parts + + for input, input_part in zip(self._inputs, input_parts): proj = strategy.get_projection(input_part) - #if (input._kind==Future): - # print(input, proj) tag = self.get_tag(strategy, input_part) launcher.add_input(input, proj, tag=tag) - #print("outputs", len(self._outputs)) - for output, output_part in zip(self._outputs, self._output_parts): + for output, output_part in zip(self._outputs, output_parts): if output.unbound: continue proj = strategy.get_projection(output_part) - #if (output._kind==Future): - # print(output, proj) tag = self.get_tag(strategy, output_part) launcher.add_output(output, proj, tag=tag) partition = strategy.get_partition(output_part) # We update the key partition of a store only when it gets updated output.set_key_partition(partition) - #print() for ((reduction, redop), reduction_part) in zip( - self._reductions, self._reduction_parts + self._reductions, reduction_parts ): partition = strategy.get_partition(reduction_part) can_read_write = partition.is_disjoint_for(strategy, reduction) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 8f6e3ea16..6f3029ab5 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -25,6 +25,10 @@ from legate.core import types as ty +import datetime +import cProfile +pr = cProfile.Profile() + from .context import Context from .corelib import CoreLib from .launcher import TaskLauncher @@ -756,6 +760,7 @@ def supress_small_fusions(self, intervals, threshold): return fusable, final_set def can_fuse(self): + #starttime = datetime.datetime.now() for op in reversed(self.ops): must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op]) partitioner = Partitioner(self.runtime, [op], must_be_single=must_be_single) @@ -768,20 +773,32 @@ def can_fuse(self): for input in op._inputs: if input.shape==output.shape: input.set_key_partition(key_part) - if len(op.inputs)>1: - proj = strategy.get_projection(op._input_parts[1]) + #if len(op.inputs)>1: + # proj = strategy.get_projection(op._input_parts[1]) self.strategies.append(strategy) self.strategies.reverse() - + """ + stoptime = datetime.datetime.now() + delta=stoptime-starttime + total = delta.total_seconds() * 1000.0 + print("partime", total, len(self.ops)) + """ + #starttime = datetime.datetime.now() results = [constraint.apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) for constraint in self.constraints] + """ + stoptime = datetime.datetime.now() + delta=stoptime-starttime + total = delta.total_seconds() * 1000.0 + print("applytime", total) + """ + #starttime = datetime.datetime.now() drint("fuse results", results) all_fusable = [result[0] for result in results] interval_sets = [result[1] for result in results] #intersect intervals - #this is a very, very bad way of doing this, - # in the future I'll just "intersect" in place - # as we apply constraints + #this is an inefficent way of doing this, + #but it takes little time in practice curr_set = interval_sets[0] for interval_set in interval_sets[1:]: newset = [] @@ -793,6 +810,14 @@ def can_fuse(self): newset.append((news, newe)) curr_set=newset fusable,final_set = self.supress_small_fusions(curr_set, self.runtime._fusion_threshold) + + """ + stoptime = datetime.datetime.now() + delta=stoptime-starttime + total = delta.total_seconds() * 1000.0 + print("filtertime", total) + """ + drint("curset", curr_set) drint("final_set", final_set) @@ -816,34 +841,8 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): return True, [(0, len(ops))] else: return False, [(0,0)] -""" -enum NumPyOpCode { - NUMPY_ARANGE = 1, - NUMPY_BINARY_OP = 2, - NUMPY_BINARY_RED = 3, - NUMPY_BINCOUNT = 4, - NUMPY_CONVERT = 5, - NUMPY_DIAG = 6, - NUMPY_DOT = 7, - NUMPY_EYE = 8, - NUMPY_FILL = 9, - NUMPY_MATMUL = 10, - NUMPY_MATVECMUL = 11, - NUMPY_NONZERO = 12, - NUMPY_RAND = 13, - NUMPY_READ = 14, - NUMPY_SCALAR_UNARY_RED = 15, - NUMPY_TILE = 16, - NUMPY_TRANSPOSE = 17, - NUMPY_UNARY_OP = 18, - NUMPY_UNARY_RED = 19, - NUMPY_WHERE = 20, - NUMPY_WRITE = 21, - NUMPY_DOUBLE_BINARY_OP = 23, - NUMPY_FUSED_OP = 24, -} - -""" + + class AllValidOps(FusionConstraint): """ Class for only fusing only potentially fusable ops. @@ -854,43 +853,12 @@ def __init__(self): self.terminals = set() #these ops are almost always fusable self.validIDs.add(2) #Binary op - self.validIDs.add(18) #Unary op - self.validIDs.add(5) #Convert op - self.validIDs.add(9) #Fill op + self.validIDs.add(10) #Fill op self.validIDs.add(20) #Where op - self.validIDs.add(7) - #self.validIDs.add(14) #read op - #self.validIDs.add(5) #convert op - - self.terminals.add(7) - - # the following are conditionally fusable - # they will be processed in the a subsequent level of filtering - - # scalar producing ops are valid if the scalars they produce - # are NOT consumed by a subsequent op in the window - # however they can be printed, which we cannot detect in the runtime - # without static analysis, so consider these terminal fusable - #self.validIDs.add(400004) #Scalar unary red - #self.validIDs.add(400005) #Unary red - - # as all scalars are futures, - # so we can just check if both Futures are "ready" - # more powerfully, we can also create a dependency tree - # of ops, and assuming they're all scalar ops, - # and the "roots" are ready, we can fuse - #self.validIDs.add(400002) #Scalar Binary op - #self.validIDs.add(400007) #Scalar Unary op - #self.validIDs.add(400008) #Scalar binary red - - #a matmul is valid if it is the last op in the sequence - #unless if it followed by a matmul of the exact same size - #so it is terminal fusable - #self.validIDs.add(400017) #Matmul - - #vector dot is binary op + scalar producing reduction - #it is thus terminal fusable - #self.validIDs.add(400019) #dot + #self.validIDs.add(6) #Where op + #self.validIDs.add(5) #Convert op + self.validIDs.add(7) #dot op + self.terminals.add(7) #dot op is only fusable as a terminal op in a window def apply(self, contexts, runtime, ops, partitioners, strategies): results = [int(op._task_id) in self.validIDs for op in ops] @@ -948,7 +916,6 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): if input not in bufferSet: proj = strategies[i].get_projection(part) if hasattr(proj, 'part'): - #bufferSet[input]=proj bufferSet[input]=proj if input not in linkset: linkset[input] = [i] @@ -982,6 +949,7 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): i+=1 if start 1: #initialize fused task fused_task = numpy_context.create_task(fused_id) - - #serialize necessary metadata on all encapsulated ops + #serialize necessary metadata on all encapsulated ops #this metadata will be fed into the fused op as inputs meta_maps, fusion_metadata = self.serialize_multiop_metadata(numpy_context, op_subset) fused_task.add_fusion_metadata(fusion_metadata) #sets fused_task._is_fused to true #add typical inputs and outputs of all subtasks to fused task key_part = None + fused_task._unfused_input_parts = [] + fused_task._unfused_output_parts = [] + fused_task._unfused_reduction_parts = [] + #isScalarConversion = len(op._outputs) ==1 and len(op._inputs)==1 for j,op in enumerate(op_subset): - #if int(op._task_id) == 5: - #fused_task.add_output(op._outputs[0]) - #fused_task.add_input(op._inputs[0]) - #fused_task.add_broadcast(op._inputs[0]) - #fused_task.add_broadcast(op._outputs[0]) - #continue for scalar in op._scalar_args: fused_task.add_scalar_arg(scalar[0], ty.int32) - for (reduction, redop) in op._reductions: + for (reduction, redop), part in zip(op._reductions, op._reduction_parts): fused_task.add_reduction(reduction, redop) - isScalarConversion = len(op._outputs) ==1 and len(op._inputs)==1 - #if int(op._task_id)==14 and isScalarConversion: #for handling scalars - # op._outputs[0]._storage = op._inputs[0]._storage - for input in op._inputs: + fused_task._unfused_reduction_parts.append(part) + for input,part in zip(op._inputs, op._input_parts): + #print(j, part) fused_task.add_input(input) + fused_task._unfused_input_parts.append(part) for output,part in zip(op._outputs, op._output_parts): fused_task.add_output(output) - #self.propogateFuture(fused_task) + fused_task._unfused_output_parts.append(part) for future in op._futures: fused_task.add_future(future) z+=1 new_op_list.append(fused_task) strats=[] + """ + stoptime = datetime.datetime.now() + delta=stoptime-starttime + total = delta.total_seconds() * 1000.0 + print("buildtime", total) + """ + + #pr.enable() + redoPar=False + #starttime = datetime.datetime.now() for i,fused_task in enumerate(new_op_list): - must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [fused_task]) - partitioner = Partitioner(self, [fused_task], must_be_single=must_be_single) - strategy = partitioner.partition_stores() - #fused_task.strategy = super_strategies[i] - fused_task.strategy = strategy - #print("\t q ",i, strategy) - strats.append(strategy) - #strats.append( super_strategies[i]) + if redoPar: + must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [fused_task]) + partitioner = Partitioner(self, [fused_task], must_be_single=must_be_single) + strategy = partitioner.partition_stores() + fused_task.strategy = strategy + strats.append(strategy) + else: + fused_task.strategy = super_strategies[i] + strats.append( super_strategies[i]) + #stoptime = datetime.datetime.now() + #delta=stoptime-starttime + #total = delta.total_seconds() * 1000.0 + #print("repartime", total) + #pr.disable() return new_op_list, strats def _launch_outstanding(self, force_eval=True): if len(self._outstanding_ops): - #print("launching outstanding", ops) #self._opLens.append(len(self._outstanding_ops)) ops = self._outstanding_ops self._outstanding_ops = [] @@ -1429,8 +1409,14 @@ def _schedule(self, ops, force_eval=False): #schedule the new set of tasks strats = False if len(ops)>=2 and (not force_eval): + #start = datetime.datetime.now() + #pr.enable() fused_task_list,strats = self.build_fused_op(ops) - #print("flist", fused_task_list) + #pr.disable() + #stop = datetime.datetime.now() + #delta=stop-start + #total = delta.total_seconds() * 1000.0 + #print("time", total) if fused_task_list: frint("created fused list", [op._task_id for op in fused_task_list]) drint("start clearing pipe") From ea1237758b1e2b7888872d335ceaa5393dc3c6f7 Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Thu, 2 Dec 2021 17:37:21 -0800 Subject: [PATCH 32/44] install.py --- install.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/install.py b/install.py index 559ea7eaf..6a3313e62 100755 --- a/install.py +++ b/install.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +#/home/shiv1/pypy/pypy3.8-v7.3.7-linux64/bin/python3.8 # Copyright 2021 NVIDIA Corporation # @@ -377,6 +378,7 @@ def build_legion( legion_python_dir = os.path.join(legion_src_dir, "bindings", "python") if clean_first: + print("cleaning!\n") verbose_check_call( ["make"] + flags + ["clean"], cwd=legion_python_dir ) @@ -897,7 +899,6 @@ def driver(): "--clean", dest="clean_first", action=BooleanFlag, - #default=False, default=False, help="Clean before build, and pull latest Legion.", ) From d7a8dabddd6f7817e417630da3a7f632dd50f94b Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Sun, 5 Dec 2021 22:07:47 -0800 Subject: [PATCH 33/44] new way of applying constraints --- legate/core/runtime.py | 139 +++++++++++++++++++++++++++++++---------- 1 file changed, 106 insertions(+), 33 deletions(-) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 6f3029ab5..862f9e263 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -783,33 +783,44 @@ def can_fuse(self): total = delta.total_seconds() * 1000.0 print("partime", total, len(self.ops)) """ + streamApply = False #starttime = datetime.datetime.now() - results = [constraint.apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) for constraint in self.constraints] + if not streamApply: + results = [constraint.apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) for constraint in self.constraints] + all_fusable = [result[0] for result in results] + interval_sets = [result[1] for result in results] + else: + alpha = self.constraints[0].apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) + beta = self.constraints[1].apply2(self.contexts, self.runtime, self.ops, alpha[1],self.partitioners, self.strategies) + beta = self.constraints[2].apply2(self.contexts, self.runtime, self.ops, beta[1], self.partitioners, self.strategies) + #print("beta", beta) """ stoptime = datetime.datetime.now() delta=stoptime-starttime total = delta.total_seconds() * 1000.0 print("applytime", total) """ - #starttime = datetime.datetime.now() - drint("fuse results", results) - all_fusable = [result[0] for result in results] - interval_sets = [result[1] for result in results] + starttime = datetime.datetime.now() + #drint("fuse results", results) #intersect intervals #this is an inefficent way of doing this, - #but it takes little time in practice - curr_set = interval_sets[0] - for interval_set in interval_sets[1:]: - newset = [] - for aset in curr_set: - for bset in interval_set: - if not (aset[0] > bset[1] or bset[0] > aset[1]): - news = max(aset[0], bset[0]) - newe = min(aset[1], bset[1]) - newset.append((news, newe)) - curr_set=newset - fusable,final_set = self.supress_small_fusions(curr_set, self.runtime._fusion_threshold) + #""" + if not streamApply: + curr_set = interval_sets[0] + for interval_set in interval_sets[1:]: + newset = [] + for aset in curr_set: + for bset in interval_set: + if not (aset[0] > bset[1] or bset[0] > aset[1]): + news = max(aset[0], bset[0]) + newe = min(aset[1], bset[1]) + newset.append((news, newe)) + curr_set=newset + #""" + fusable,final_set = self.supress_small_fusions(curr_set, self.runtime._fusion_threshold) + else: + fusable,final_set = self.supress_small_fusions(beta[1], self.runtime._fusion_threshold) """ stoptime = datetime.datetime.now() @@ -818,11 +829,11 @@ def can_fuse(self): print("filtertime", total) """ - drint("curset", curr_set) + #drint("curset", curr_set) - drint("final_set", final_set) - drint("all fusable", fusable) - drint("intervals", interval_sets) + #drint("final_set", final_set) + #drint("all fusable", fusable) + #drint("intervals", interval_sets) #return reduce(lambda x,y: x and y, all_fusable), final_set, self.strategies return fusable, final_set, self.strategies @@ -857,8 +868,8 @@ def __init__(self): self.validIDs.add(20) #Where op #self.validIDs.add(6) #Where op #self.validIDs.add(5) #Convert op - self.validIDs.add(7) #dot op - self.terminals.add(7) #dot op is only fusable as a terminal op in a window + #self.validIDs.add(7) #dot op + #self.terminals.add(7) #dot op is only fusable as a terminal op in a window def apply(self, contexts, runtime, ops, partitioners, strategies): results = [int(op._task_id) in self.validIDs for op in ops] @@ -917,19 +928,19 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): proj = strategies[i].get_projection(part) if hasattr(proj, 'part'): bufferSet[input]=proj - if input not in linkset: - linkset[input] = [i] - else: - linkset[input].append(i) + #if input not in linkset: + # linkset[input] = [i] + #else: + # linkset[input].append(i) for output, part in zip(op._outputs, op._output_parts): if output not in bufferSet: proj = strategies[i].get_projection(part) if hasattr(proj, 'part'): bufferSet[output]=proj - if output not in linkset: - linkset[output] = [i] - else: - linkset[output].append(i) + #if output not in linkset: + # linkset[output] = [i] + #else: + # linkset[output].append(i) if i==0: #we only iterate from i==1 onwards i+=1 continue @@ -984,6 +995,30 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): #print(intervals) return True, intervals + def apply2(self, contexts, runtime, ops, baseIntervals, partitioners, strategies): + launch_shapes = [] + for i in range(len(ops)): + launch_shapes.append(strategies[i]._launch_shape) + #print("ls", launch_shapes) + intervals =[] + for baseInterval in baseIntervals: + start=baseInterval[0] + i=start+1 + end = baseInterval[1] + while i Date: Tue, 7 Dec 2021 14:53:19 -0800 Subject: [PATCH 34/44] minor cleanup --- legate/core/__init__.py | 4 -- legate/core/constraints.py | 1 - legate/core/launcher.py | 16 +---- legate/core/legion.py | 1 - legate/core/operation.py | 6 -- legate/core/runtime.py | 119 ++++--------------------------------- legate/core/solver.py | 2 - legate/core/store.py | 33 ---------- 8 files changed, 14 insertions(+), 168 deletions(-) diff --git a/legate/core/__init__.py b/legate/core/__init__.py index 526355308..19933da35 100644 --- a/legate/core/__init__.py +++ b/legate/core/__init__.py @@ -120,10 +120,6 @@ # are overriding that module's name. from legion_cffi import ffi, lib as legion -# NOTE: This needs to come after the imports from legate.core.legion, as we -# are overriding that module's name. -from legion_cffi import ffi, lib as legion - # Import the PyArrow type system from pyarrow import ( DataType, diff --git a/legate/core/constraints.py b/legate/core/constraints.py index 276631d2a..39e85bdcc 100644 --- a/legate/core/constraints.py +++ b/legate/core/constraints.py @@ -63,7 +63,6 @@ def __init__(self, op_hash, op_name, store, id, disjoint, complete): self._disjoint = disjoint self._complete = complete - @property def ndim(self): return self._store.ndim diff --git a/legate/core/launcher.py b/legate/core/launcher.py index ffde8e126..e396b0894 100644 --- a/legate/core/launcher.py +++ b/legate/core/launcher.py @@ -263,8 +263,6 @@ def __init__(self, region, permission, proj, tag, flags): self.region = region self.permission = permission self.proj = proj - #print(proj.__dict__) - #print(region) self.tag = tag self.flags = flags @@ -572,13 +570,12 @@ def add_store(self, args, store, proj, perm, tag, flags): if store.kind is Future: if store.has_storage: self.add_future(store.storage) - elif (perm == Permission.READ or perm == Permission.REDUCTION): + elif perm == Permission.READ or perm == Permission.REDUCTION: raise RuntimeError( "Read access to an uninitialized store is disallowed" ) read_only = perm == Permission.READ args.append(FutureStoreArg(store, read_only, store.has_storage)) - #args.append(FutureStoreArg(store, perm, store.has_storage)) else: region = store.storage.region @@ -605,12 +602,6 @@ def add_output(self, store, proj, tag=0, flags=0): self._outputs, store, proj, Permission.WRITE, tag, flags ) - # currently this is adding to outputs but we can have a seperate "temps" array in the core - def add_temp(self, store, proj, tag=0, flags=0): - self.add_store( - self._outputs, store, proj, Permission.WRITE, tag, flags - ) - def add_reduction(self, store, proj, tag=0, flags=0, read_write=False): if read_write and store.kind is not Future: self.add_store( @@ -674,11 +665,6 @@ def pack_fusion_metadata(argbuf, is_fused, fusion_metadata): def build_task(self, launch_domain, argbuf): self._req_analyzer.analyze_requirements() - #print("building task id", self._task_id) - #for req in self._req_analyzer._requirements: - # print(req) - # print(req[0].__dict__) - # print() self._out_analyzer.analyze_requirements() #pack fusion metadata diff --git a/legate/core/legion.py b/legate/core/legion.py index 7127bfe4d..a3cd653c9 100644 --- a/legate/core/legion.py +++ b/legate/core/legion.py @@ -157,7 +157,6 @@ def legate_task_postamble(runtime, context): # This is a decorator for wrapping the launch method on launchers # to dispatch any unordered deletions while the task is live def dispatch(func): - #print("dispatching") def launch(launcher, runtime, context, *args): # This context should always be in the dictionary legate_task_progress(runtime, context) diff --git a/legate/core/operation.py b/legate/core/operation.py index 8b13c92e8..39806ad81 100644 --- a/legate/core/operation.py +++ b/legate/core/operation.py @@ -36,7 +36,6 @@ def __init__(self, context, mapper_id=0, op_id=0): self._reductions = [] self._is_fused = False self._temps = [] - self._input_parts = [] self._output_parts = [] self._reduction_parts = [] @@ -123,10 +122,6 @@ def add_output(self, store, partition=None): self._outputs.append(store) self._output_parts.append(partition) - def add_temp(self, store): - self._check_store(store) - self._temps.append(store) #this may not be necessary - def add_reduction(self, store, redop, partition=None): self._check_store(store) if store.kind is Future: @@ -186,7 +181,6 @@ def declare_partition(self, store, disjoint=True, complete=True): return sym - class Task(Operation): def __init__(self, context, task_id, mapper_id=0, op_id=0): Operation.__init__(self, context, mapper_id=mapper_id, op_id=op_id) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 862f9e263..195f16072 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -48,20 +48,8 @@ from .solver import Partitioner, Strategy from .store import RegionField, Store, FusionMetadata import numpy as np -debugPrint = False -futureBugPrint = False -def zprint(*args): - return -if debugPrint: - drint = print -else: - drint = zprint -if futureBugPrint: - frint = print -else: - frint = zprint # A Field holds a reference to a field in a region tree @@ -782,7 +770,9 @@ def can_fuse(self): delta=stoptime-starttime total = delta.total_seconds() * 1000.0 print("partime", total, len(self.ops)) - """ + """ + #TODO: have all constraints use "streamApply" + # this is a more efficient way/interface for generating fusable intervals streamApply = False #starttime = datetime.datetime.now() if not streamApply: @@ -793,7 +783,6 @@ def can_fuse(self): alpha = self.constraints[0].apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) beta = self.constraints[1].apply2(self.contexts, self.runtime, self.ops, alpha[1],self.partitioners, self.strategies) beta = self.constraints[2].apply2(self.contexts, self.runtime, self.ops, beta[1], self.partitioners, self.strategies) - #print("beta", beta) """ stoptime = datetime.datetime.now() delta=stoptime-starttime @@ -801,11 +790,9 @@ def can_fuse(self): print("applytime", total) """ starttime = datetime.datetime.now() - #drint("fuse results", results) #intersect intervals #this is an inefficent way of doing this, - #""" if not streamApply: curr_set = interval_sets[0] for interval_set in interval_sets[1:]: @@ -817,7 +804,6 @@ def can_fuse(self): newe = min(aset[1], bset[1]) newset.append((news, newe)) curr_set=newset - #""" fusable,final_set = self.supress_small_fusions(curr_set, self.runtime._fusion_threshold) else: fusable,final_set = self.supress_small_fusions(beta[1], self.runtime._fusion_threshold) @@ -828,13 +814,6 @@ def can_fuse(self): total = delta.total_seconds() * 1000.0 print("filtertime", total) """ - - #drint("curset", curr_set) - - #drint("final_set", final_set) - #drint("all fusable", fusable) - #drint("intervals", interval_sets) - #return reduce(lambda x,y: x and y, all_fusable), final_set, self.strategies return fusable, final_set, self.strategies class FusionConstraint(object): @@ -862,19 +841,13 @@ class AllValidOps(FusionConstraint): def __init__(self): self.validIDs = set() self.terminals = set() - #these ops are almost always fusable self.validIDs.add(2) #Binary op self.validIDs.add(10) #Fill op - self.validIDs.add(20) #Where op - #self.validIDs.add(6) #Where op - #self.validIDs.add(5) #Convert op - #self.validIDs.add(7) #dot op - #self.terminals.add(7) #dot op is only fusable as a terminal op in a window + self.validIDs.add(20) #Unary op def apply(self, contexts, runtime, ops, partitioners, strategies): results = [int(op._task_id) in self.validIDs for op in ops] - drint("valids", results) fusable_intervals = [] start, end =0,0 while end=2 and (not force_eval): - #start = datetime.datetime.now() - #pr.enable() fused_task_list,strats = self.build_fused_op(ops) - #pr.disable() - #stop = datetime.datetime.now() - #delta=stop-start - #total = delta.total_seconds() * 1000.0 - #print("time", total) if fused_task_list: - frint("created fused list", [op._task_id for op in fused_task_list]) - drint("start clearing pipe") self._clearing_pipe = True for task in fused_task_list: task.execute() self._clearing_pipe = False - drint("stop clearing pipe") return # case 2: tasks processed for fusion already have @@ -1505,21 +1425,12 @@ def _schedule(self, ops, force_eval=False): # them when testing fusion legality (in case 1) if len(ops)==1 and self._clearing_pipe: strategy = ops[0].strategy - for input, part in zip(ops[0]._inputs, ops[0]._input_parts): - frint("launch fused input", ops[0]._task_id, input) - #proj = ops[0].strategy.get_projection(part) - self.propogateFuture(ops[0]) - - for output in ops[0]._outputs: - #if output._storage is None: - frint("launch used output", ops[0]._task_id, output) ops[0].launch(strategy) # case 3: execute the ops normally # if we already checked the ops for fusability, # then the ops' buffers have already been partitioned else: - #print("normal execution", ids) if not strats: #ops were not check for fusability, so partition them for op in ops: must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op]) @@ -1530,10 +1441,6 @@ def _schedule(self, ops, force_eval=False): for i,op in enumerate(ops): op.strategy = strats[i] for i,op in enumerate(ops): - for input in op._inputs: - if input._storage is None: - frint("launch ufused input", op._task_id, input) - self.propogateFuture(op) op.launch(op.strategy) diff --git a/legate/core/solver.py b/legate/core/solver.py index 262497bf1..d5b9bdf27 100644 --- a/legate/core/solver.py +++ b/legate/core/solver.py @@ -283,7 +283,6 @@ def cost(unknown): key_parts = set() prev_part = None - #import pdb; pdb.set_trace() for unknown in unknowns: if unknown in partitions: continue @@ -307,7 +306,6 @@ def cost(unknown): if to_align in partitions: continue partitions[to_align] = partition - #print("ptype", to_align, (partition)) prev_part = partition for lhs, rhs in dependent.items(): diff --git a/legate/core/store.py b/legate/core/store.py index ef2e58084..c8063f72f 100644 --- a/legate/core/store.py +++ b/legate/core/store.py @@ -536,26 +536,6 @@ def storage(self): # it has been initialized correctly. self._runtime._launch_outstanding(False) if self._storage is None: - """ - if self._kind ==Future: - print("future") - while(self._storage is None and len(self._runtime._outstanding_ops)): - print("launch_one") - print([op._task_id for op in self._runtime._outstanding_ops]) - #self._runtime._launch_outstanding() - self._runtime._launch_one() - """ - """ - if True: - import pdb; pdb.set_trace() - start = self - while start._storage is None and start._parent: - start=start._parent - if start._storage: - self._storage = start._storage - else: - self._runtime._launch_outstanding() - """ if self.unbound: raise RuntimeError( "Storage of a variable size store cannot be retrieved " @@ -565,7 +545,6 @@ def storage(self): # if necessary if self._parent is None: if self._kind is Future: - print("supressing in store.py") raise ValueError( "Illegal to access the storage of an uninitialized " "Legate store of volume 1 with scalar optimization" @@ -957,13 +936,8 @@ def packList(self, meta_list, buf): # aggregate the ints when packing # much faster than individually packing each int buf.pack_32bit_int_arr(meta_list) - #for elem in meta_list: - # buf.pack_32bit_int(elem) def pack(self, buf): - - #buf.pack_32bit_int(len(self._opIDs)) #nOps - #buf.pack_32bit_int(len(self._buffer_offsets)) #nIOBuffers+1 superbuff = [len(self._opIDs)]+[len(self._buffer_offsets)] superbuff += self._input_starts superbuff += self._output_starts @@ -974,11 +948,4 @@ def pack(self, buf): superbuff += self._future_starts superbuff += self._opIDs self.packList(superbuff, buf) - #self.packList(self._input_starts, buf) - #self.packList(self._output_starts, buf) - #self.packList(self._offset_starts, buf) - #self.packList(self._buffer_offsets, buf) - #self.packList(self._reduction_starts, buf) - #self.packList(self._scalar_starts, buf) - #self.packList(self._opIDs, buf) From a73dec165788442c53e5e0a33f93d73a58f82f58 Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Tue, 7 Dec 2021 15:12:23 -0800 Subject: [PATCH 35/44] more cleanup --- install.py | 6 +---- legate/core/launcher.py | 1 - legate/core/runtime.py | 58 +++++------------------------------------ 3 files changed, 7 insertions(+), 58 deletions(-) diff --git a/install.py b/install.py index 6a3313e62..aabd00824 100755 --- a/install.py +++ b/install.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -#/home/shiv1/pypy/pypy3.8-v7.3.7-linux64/bin/python3.8 # Copyright 2021 NVIDIA Corporation # @@ -126,7 +125,6 @@ def git_clone(repo_dir, url, branch=None, tag=None, commit=None): verbose_check_call( ["git", "submodule", "update", "--init"], cwd=repo_dir ) - git_reset(repo_dir, commit) else: verbose_check_call( [ @@ -201,7 +199,6 @@ def install_legion(legion_src_dir, branch): legion_src_dir, url="https://gitlab.com/StanfordLegion/legion.git", branch=branch, - commit=commit, ) @@ -216,7 +213,7 @@ def install_thrust(thrust_dir): def update_legion(legion_src_dir, branch): # Make sure we are on the right branch for single/multi-node - git_update(legion_src_dir, branch=branch, commit=commit) + git_update(legion_src_dir, branch=branch) def build_legion( @@ -378,7 +375,6 @@ def build_legion( legion_python_dir = os.path.join(legion_src_dir, "bindings", "python") if clean_first: - print("cleaning!\n") verbose_check_call( ["make"] + flags + ["clean"], cwd=legion_python_dir ) diff --git a/legate/core/launcher.py b/legate/core/launcher.py index e396b0894..837e53068 100644 --- a/legate/core/launcher.py +++ b/legate/core/launcher.py @@ -369,7 +369,6 @@ def coalesce(self): # promote them to read write permission. if len(all_perms - set([Permission.NO_ACCESS])) > 1: perm = Permission.READ_WRITE - #perm = Permission.WRITE # When the field requires read write permission, # all projections must be the same diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 195f16072..bf625b901 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -26,8 +26,6 @@ from legate.core import types as ty import datetime -import cProfile -pr = cProfile.Profile() from .context import Context from .corelib import CoreLib @@ -824,15 +822,6 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): """ raise NotImplementedError("Implement in derived classes") - -class NumpyContextExists(FusionConstraint): - def apply(self, contexts, runtime, ops, partitioners, strategies): - if "cunumeric" in contexts: - return True, [(0, len(ops))] - else: - return False, [(0,0)] - - class AllValidOps(FusionConstraint): """ Class for only fusing only potentially fusable ops. @@ -873,9 +862,6 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): fusability_exists = reduce(lambda x,y: x or y,[int(op._task_id) in self.validIDs for op in ops]) return (fusability_exists, fusable_intervals) -class ValidScalarProducers(FusionConstraint): - """Checks all scalar producing are terminal ops""" - class IdenticalProjection(FusionConstraint): """Fusion rule that only ops with identical projection functors can be fused""" @@ -899,19 +885,11 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): proj = strategies[i].get_projection(part) if hasattr(proj, 'part'): bufferSet[input]=proj - #if input not in linkset: - # linkset[input] = [i] - #else: - # linkset[input].append(i) for output, part in zip(op._outputs, op._output_parts): if output not in bufferSet: proj = strategies[i].get_projection(part) if hasattr(proj, 'part'): bufferSet[output]=proj - #if output not in linkset: - # linkset[output] = [i] - #else: - # linkset[output].append(i) if i==0: #we only iterate from i==1 onwards i+=1 continue @@ -980,9 +958,6 @@ def apply2(self, contexts, runtime, ops, baseIntervals, partitioners, strategies intervals.append((start, end)) return True, intervals - - - class ValidProducerConsumer(FusionConstraint): """In a fused op, there cannot be a producer consumer relationship between different views of the same buffers""" @@ -1115,8 +1090,6 @@ def __init__(self, core_library): self._window_size=50 self._fusion_threshold =2 #used for logging window and fusion lengths - self._opLens = [] - self._fusedOpLens = [] self._clearing_pipe = False # Now we initialize managers @@ -1287,7 +1260,7 @@ def serialize_multiop_metadata(self, numpy_context, ops): future_starts, op_ids) fusion_metadata = FusionMetadata(*meta_arrs) meta_maps=None - return meta_maps, fusion_metadata + return fusion_metadata def build_fused_op(self,ops): @@ -1300,10 +1273,7 @@ def build_fused_op(self,ops): fusion_checker.register_constraint(ValidProducerConsumer()) can_fuse,fusable_sets, partitions = fusion_checker.can_fuse() - super_strats = [] - super_fspaces = [] super_strategies = [] - super_keystores = [] z=0 for fusable_set in fusable_sets: #create super strategy for this fusable set @@ -1315,40 +1285,28 @@ def build_fused_op(self,ops): super_strat = {**(super_strat.copy()), **partitions[j]._strategy} super_fspace = {**(super_fspace.copy()), **partitions[j]._fspaces} super_keystore = super_keystore.union(partitions[j]._key_parts) - super_strats.append(super_strat) - super_fspaces.append(super_fspace) - super_keystores.append(super_keystore) super_strategies.append(Strategy(partitions[start]._launch_shape, super_strat, super_fspace, super_keystore)) - super_strat = {} - super_fspace = {} - for partition in partitions: - super_strat = {**(super_strat.copy()), **partition._strategy} - super_fspace = {**(super_fspace.copy()), **partition._fspaces} - - #hacky way to get numpy context and designated fused task id fused_id = self._contexts["cunumeric"].fused_id numpy_context = self._contexts["cunumeric"] numpy_runtime = numpy_context._library.runtime - z=0 + opID=0 new_op_list = [] for i,fusable_set in enumerate(fusable_sets): start, end = fusable_set op_subset = ops[start:end] #if nothing to fuse, just use the original op - #self._fusedOpLens.append(len(op_subset)) if end-start==1: normal_op = ops[start] - normal_op.strategy = partitions[z]._strategy#uper_strategies[i] + normal_op.strategy = partitions[opID]._strategy#uper_strategies[i] new_op_list.append(normal_op) - z+=1 + opID+=1 elif end-start > 1: #initialize fused task fused_task = numpy_context.create_task(fused_id) #serialize necessary metadata on all encapsulated ops - #this metadata will be fed into the fused op as inputs - meta_maps, fusion_metadata = self.serialize_multiop_metadata(numpy_context, op_subset) + fusion_metadata = self.serialize_multiop_metadata(numpy_context, op_subset) fused_task.add_fusion_metadata(fusion_metadata) #sets fused_task._is_fused to true #add typical inputs and outputs of all subtasks to fused task @@ -1370,7 +1328,7 @@ def build_fused_op(self,ops): fused_task._unfused_output_parts.append(part) for future in op._futures: fused_task.add_future(future) - z+=1 + opID+=1 new_op_list.append(fused_task) strats=[] """ @@ -1401,7 +1359,6 @@ def build_fused_op(self,ops): def _launch_outstanding(self, force_eval=True): if len(self._outstanding_ops): - #self._opLens.append(len(self._outstanding_ops)) ops = self._outstanding_ops self._outstanding_ops = [] self._schedule(ops, force_eval) @@ -1448,13 +1405,11 @@ def submit(self, op): #always launch ops that've been processed for fusion #do not re-add to the window #as the these ops already waited in the window - #print(op.__dict__) if self._clearing_pipe: self._schedule([op]) else: self._outstanding_ops.append(op) if len(self._outstanding_ops) >= self._window_size: - #self._opLens.append(len(self._outstanding_ops)) ops = self._outstanding_ops self._outstanding_ops = [] self._schedule(ops) @@ -1477,7 +1432,6 @@ def _scheduleNew(self, ops): def flush_scheduling_window(self): if len(self._outstanding_ops) == 0: return - #self._opLens.append(len(self._outstanding_ops)) ops = self._outstanding_ops self._outstanding_ops = [] self._schedule(ops) From 437e67eef6d239786d2c03dd10c0c70681daa893 Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Sun, 12 Dec 2021 11:37:24 -0800 Subject: [PATCH 36/44] use alignment info when fusing --- legate/core/operation.py | 6 +++ legate/core/runtime.py | 110 ++++++++++++++++++++++++++++++++------ legate/core/solver.py | 5 -- src/core/utilities/span.h | 2 - 4 files changed, 101 insertions(+), 22 deletions(-) diff --git a/legate/core/operation.py b/legate/core/operation.py index 39806ad81..861b042b5 100644 --- a/legate/core/operation.py +++ b/legate/core/operation.py @@ -151,6 +151,12 @@ def add_broadcast(self, store): def add_constraint(self, constraint): self._constraints.append(constraint) + def has_constraint(self, store1, store2): + part1 = self._get_unique_partition(store1) + part2 = self._get_unique_partition(store2) + cons = [str(con) for con in self._constraints] + return (str(part1 == part2) in cons) or (str(part2==part1) in cons) + def execute(self): self._context.runtime.submit(self) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index bf625b901..6bea2b5c3 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -757,10 +757,9 @@ def can_fuse(self): output.set_key_partition(partition) key_part = partition for input in op._inputs: - if input.shape==output.shape: + #check if input and output should be aligned + if op.has_constraint(input, output): input.set_key_partition(key_part) - #if len(op.inputs)>1: - # proj = strategy.get_projection(op._input_parts[1]) self.strategies.append(strategy) self.strategies.reverse() """ @@ -771,16 +770,16 @@ def can_fuse(self): """ #TODO: have all constraints use "streamApply" # this is a more efficient way/interface for generating fusable intervals - streamApply = False + streamApply = True #starttime = datetime.datetime.now() if not streamApply: results = [constraint.apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) for constraint in self.constraints] all_fusable = [result[0] for result in results] interval_sets = [result[1] for result in results] else: - alpha = self.constraints[0].apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) - beta = self.constraints[1].apply2(self.contexts, self.runtime, self.ops, alpha[1],self.partitioners, self.strategies) - beta = self.constraints[2].apply2(self.contexts, self.runtime, self.ops, beta[1], self.partitioners, self.strategies) + windows = [(0, len(self.ops))] + for constraint in self.constraints: + windows = constraint.apply2(self.contexts, self.runtime, self.ops, windows, self.partitioners, self.strategies) """ stoptime = datetime.datetime.now() delta=stoptime-starttime @@ -804,7 +803,7 @@ def can_fuse(self): curr_set=newset fusable,final_set = self.supress_small_fusions(curr_set, self.runtime._fusion_threshold) else: - fusable,final_set = self.supress_small_fusions(beta[1], self.runtime._fusion_threshold) + fusable,final_set = self.supress_small_fusions(windows, self.runtime._fusion_threshold) """ stoptime = datetime.datetime.now() @@ -862,14 +861,42 @@ def apply(self, contexts, runtime, ops, partitioners, strategies): fusability_exists = reduce(lambda x,y: x or y,[int(op._task_id) in self.validIDs for op in ops]) return (fusability_exists, fusable_intervals) + def apply2(self, contexts, runtime, ops, baseIntervals, partitioners, strategies): + fusable_intervals = [] + results = [int(op._task_id) in self.validIDs for op in ops] + for baseInterval in baseIntervals: + start, end = baseInterval[0], baseInterval[0] + while end=2 and (not force_eval): diff --git a/legate/core/solver.py b/legate/core/solver.py index d5b9bdf27..51639f17d 100644 --- a/legate/core/solver.py +++ b/legate/core/solver.py @@ -273,11 +273,6 @@ def cost(unknown): -store.comm_volume(), not store.has_key_partition(all_restrictions[unknown]), ) - #return ( - # store.comm_volume(), - # store._key_partition is None, - # not store.has_key_partition(all_restrictions[unknown]), - #) unknowns = sorted(unknowns, key=cost) diff --git a/src/core/utilities/span.h b/src/core/utilities/span.h index c839bb365..c0a20c5a8 100644 --- a/src/core/utilities/span.h +++ b/src/core/utilities/span.h @@ -35,7 +35,6 @@ struct Span { public: decltype(auto) operator[](size_t pos) { - //std::cout<<"pos "< Date: Sun, 12 Dec 2021 11:50:05 -0800 Subject: [PATCH 37/44] new apply methods --- legate/core/runtime.py | 224 ++++------------------------------------- 1 file changed, 18 insertions(+), 206 deletions(-) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 6bea2b5c3..21793b891 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -746,7 +746,6 @@ def supress_small_fusions(self, intervals, threshold): return fusable, final_set def can_fuse(self): - #starttime = datetime.datetime.now() for op in reversed(self.ops): must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op]) partitioner = Partitioner(self.runtime, [op], must_be_single=must_be_single) @@ -756,71 +755,30 @@ def can_fuse(self): partition = strategy.get_partition(part) output.set_key_partition(partition) key_part = partition + #check if input and output should be aligned for input in op._inputs: - #check if input and output should be aligned if op.has_constraint(input, output): input.set_key_partition(key_part) self.strategies.append(strategy) self.strategies.reverse() - """ - stoptime = datetime.datetime.now() - delta=stoptime-starttime - total = delta.total_seconds() * 1000.0 - print("partime", total, len(self.ops)) - """ - #TODO: have all constraints use "streamApply" - # this is a more efficient way/interface for generating fusable intervals - streamApply = True - #starttime = datetime.datetime.now() - if not streamApply: - results = [constraint.apply(self.contexts, self.runtime, self.ops, self.partitioners, self.strategies) for constraint in self.constraints] - all_fusable = [result[0] for result in results] - interval_sets = [result[1] for result in results] - else: - windows = [(0, len(self.ops))] - for constraint in self.constraints: - windows = constraint.apply2(self.contexts, self.runtime, self.ops, windows, self.partitioners, self.strategies) - """ - stoptime = datetime.datetime.now() - delta=stoptime-starttime - total = delta.total_seconds() * 1000.0 - print("applytime", total) - """ - starttime = datetime.datetime.now() + + windows = [(0, len(self.ops))] + for constraint in self.constraints: + windows = constraint.apply(self.contexts, self.runtime, self.ops, windows, self.partitioners, self.strategies) - #intersect intervals - #this is an inefficent way of doing this, - if not streamApply: - curr_set = interval_sets[0] - for interval_set in interval_sets[1:]: - newset = [] - for aset in curr_set: - for bset in interval_set: - if not (aset[0] > bset[1] or bset[0] > aset[1]): - news = max(aset[0], bset[0]) - newe = min(aset[1], bset[1]) - newset.append((news, newe)) - curr_set=newset - fusable,final_set = self.supress_small_fusions(curr_set, self.runtime._fusion_threshold) - else: - fusable,final_set = self.supress_small_fusions(windows, self.runtime._fusion_threshold) - - """ - stoptime = datetime.datetime.now() - delta=stoptime-starttime - total = delta.total_seconds() * 1000.0 - print("filtertime", total) - """ + fusable,final_set = self.supress_small_fusions(windows, self.runtime._fusion_threshold) return fusable, final_set, self.strategies + class FusionConstraint(object): - def apply(self, contexts, runtime, ops, partitioners, strategies): + def apply(self, contexts, runtime, ops, baseIntervals, partitioners, strategies): """" Abstract class for determining a rule that constrains which legate operations can be fused """ raise NotImplementedError("Implement in derived classes") + class AllValidOps(FusionConstraint): """ Class for only fusing only potentially fusable ops. @@ -833,35 +791,7 @@ def __init__(self): self.validIDs.add(10) #Fill op self.validIDs.add(20) #Unary op - def apply(self, contexts, runtime, ops, partitioners, strategies): - results = [int(op._task_id) in self.validIDs for op in ops] - - fusable_intervals = [] - start, end =0,0 - while end 0 for gop in [fused_task]) @@ -1429,11 +1246,6 @@ def build_fused_op(self,ops): else: fused_task.strategy = super_strategies[i] strats.append( super_strategies[i]) - #stoptime = datetime.datetime.now() - #delta=stoptime-starttime - #total = delta.total_seconds() * 1000.0 - #print("repartime", total) - #pr.disable() return new_op_list, strats def _launch_outstanding(self, force_eval=True): From 239ae351fcc38683490be05efe0ea8d9cece8fcf Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Sun, 12 Dec 2021 17:01:17 -0800 Subject: [PATCH 38/44] removing serializer code --- src/core/data/scalar.h | 2 -- src/core/data/scalar.inl | 1 - src/core/data/store.cc | 13 ++++--------- src/core/data/store.h | 14 ++------------ src/core/data/transform.cc | 25 ------------------------- src/core/data/transform.h | 14 -------------- src/core/runtime/context.cc | 11 ----------- src/core/runtime/context.h | 10 +++------- src/core/utilities/deserializer.cc | 4 ++-- 9 files changed, 11 insertions(+), 83 deletions(-) diff --git a/src/core/data/scalar.h b/src/core/data/scalar.h index 205e51a51..852121a19 100644 --- a/src/core/data/scalar.h +++ b/src/core/data/scalar.h @@ -19,7 +19,6 @@ #include "core/utilities/span.h" #include "core/utilities/type_traits.h" #include "core/utilities/typedefs.h" -#include "core/utilities/makeshift_serializer.h" namespace legate { @@ -59,7 +58,6 @@ class Scalar { LegateTypeCode code_{MAX_TYPE_NUMBER}; const void* data_; - friend class MakeshiftSerializer; }; } // namespace legate diff --git a/src/core/data/scalar.inl b/src/core/data/scalar.inl index ec629708c..892ce414e 100644 --- a/src/core/data/scalar.inl +++ b/src/core/data/scalar.inl @@ -44,7 +44,6 @@ VAL Scalar::value() const template Span Scalar::values() const { - if (tuple_) { auto size = *static_cast(data_); auto data = static_cast(data_) + sizeof(uint32_t); diff --git a/src/core/data/store.cc b/src/core/data/store.cc index fd40a4dd9..ed21a6957 100644 --- a/src/core/data/store.cc +++ b/src/core/data/store.cc @@ -21,8 +21,8 @@ namespace legate { using namespace Legion; -RegionField::RegionField(int32_t dim, const PhysicalRegion& pr, FieldID fid, unsigned reqIdx) - : dim_(dim), pr_(pr), fid_(fid), reqIdx_(reqIdx) +RegionField::RegionField(int32_t dim, const PhysicalRegion& pr, FieldID fid) + : dim_(dim), pr_(pr), fid_(fid) { auto priv = pr.get_privilege(); readable_ = static_cast(priv & LEGION_READ_PRIV); @@ -34,7 +34,6 @@ RegionField::RegionField(RegionField&& other) noexcept : dim_(other.dim_), pr_(other.pr_), fid_(other.fid_), - reqIdx_(other.reqIdx_), readable_(other.readable_), writable_(other.writable_), reducible_(other.reducible_) @@ -46,7 +45,6 @@ RegionField& RegionField::operator=(RegionField&& other) noexcept dim_ = other.dim_; pr_ = other.pr_; fid_ = other.fid_; - reqIdx_ = other.reqIdx_; readable_ = other.readable_; writable_ = other.writable_; @@ -56,15 +54,14 @@ RegionField& RegionField::operator=(RegionField&& other) noexcept Domain RegionField::domain() const { return dim_dispatch(dim_, get_domain_fn{}, pr_); } -OutputRegionField::OutputRegionField(const OutputRegion& out, FieldID fid, unsigned reqIdx) : out_(out), fid_(fid), reqIdx_(reqIdx) {} +OutputRegionField::OutputRegionField(const OutputRegion& out, FieldID fid) : out_(out), fid_(fid) {} OutputRegionField::OutputRegionField(OutputRegionField&& other) noexcept - : bound_(other.bound_), out_(other.out_), fid_(other.fid_), reqIdx_(other.reqIdx_) + : bound_(other.bound_), out_(other.out_), fid_(other.fid_) { other.bound_ = false; other.out_ = OutputRegion(); other.fid_ = -1; - //TODO, how should we invalidate reqIdx } OutputRegionField& OutputRegionField::operator=(OutputRegionField&& other) noexcept @@ -72,12 +69,10 @@ OutputRegionField& OutputRegionField::operator=(OutputRegionField&& other) noexc bound_ = other.bound_; out_ = other.out_; fid_ = other.fid_; - reqIdx_= other.reqIdx_; other.bound_ = false; other.out_ = OutputRegion(); other.fid_ = -1; - //TODO, how should we invalidate reqIdx return *this; } diff --git a/src/core/data/store.h b/src/core/data/store.h index eef2896d4..308c50cd1 100644 --- a/src/core/data/store.h +++ b/src/core/data/store.h @@ -22,14 +22,13 @@ #include "core/task/return.h" #include "core/utilities/machine.h" #include "core/utilities/typedefs.h" -#include "core/utilities/makeshift_serializer.h" namespace legate { class RegionField { public: RegionField() {} - RegionField(int32_t dim, const Legion::PhysicalRegion& pr, Legion::FieldID fid, unsigned reqIdx); + RegionField(int32_t dim, const Legion::PhysicalRegion& pr, Legion::FieldID fid); public: RegionField(RegionField&& other) noexcept; @@ -139,7 +138,6 @@ class RegionField { template Legion::Rect shape() const; Legion::Domain domain() const; - unsigned getReqIdx() const {return reqIdx_; } public: bool is_readable() const { return readable_; } @@ -150,20 +148,18 @@ class RegionField { int32_t dim_{-1}; Legion::PhysicalRegion pr_{}; Legion::FieldID fid_{-1U}; - unsigned reqIdx_; //this gets packed as an unsigned private: bool readable_{false}; bool writable_{false}; bool reducible_{false}; - friend class MakeshiftSerializer; }; class OutputRegionField { public: OutputRegionField() {} - OutputRegionField(const Legion::OutputRegion& out, Legion::FieldID fid, unsigned reqIdx); + OutputRegionField(const Legion::OutputRegion& out, Legion::FieldID fid); public: OutputRegionField(OutputRegionField&& other) noexcept; @@ -181,9 +177,7 @@ class OutputRegionField { bool bound_{false}; Legion::OutputRegion out_{}; Legion::FieldID fid_{-1U}; - unsigned reqIdx_; //this gets packed as an unsigned - friend class MakeshiftSerializer; }; class FutureWrapper { @@ -246,7 +240,6 @@ class FutureWrapper { mutable bool uninitialized_{true}; mutable void* rawptr_{nullptr}; - friend class MakeshiftSerializer; }; class Store { @@ -275,7 +268,6 @@ class Store { public: int32_t dim() const { return dim_; } - bool is_future2() const { return is_future_; } LegateTypeCode code() const { return code_; } public: @@ -302,7 +294,6 @@ class Store { template Legion::Rect shape() const; Legion::Domain domain() const; - unsigned getReqIdx() const {return region_field_.getReqIdx(); } public: bool is_readable() const { return readable_; } @@ -341,7 +332,6 @@ class Store { bool writable_{false}; bool reducible_{false}; - friend class MakeshiftSerializer; }; //containts prefix sums for a sub-op diff --git a/src/core/data/transform.cc b/src/core/data/transform.cc index 4ca2b071e..0f2fddc66 100644 --- a/src/core/data/transform.cc +++ b/src/core/data/transform.cc @@ -45,11 +45,6 @@ Shift::Shift(int32_t dim, int64_t offset, StoreTransformP parent) { } -int32_t Shift::getTransformCode() const -{ - return LEGATE_CORE_TRANSFORM_SHIFT; -} - Domain Shift::transform(const Domain& input) const { auto result = nullptr != parent_ ? parent_->transform(input) : input; @@ -103,11 +98,6 @@ Promote::Promote(int32_t extra_dim, int64_t dim_size, StoreTransformP parent) { } -int32_t Promote::getTransformCode() const -{ - return LEGATE_CORE_TRANSFORM_PROMOTE; -} - Domain Promote::transform(const Domain& input) const { auto promote = [](int32_t extra_dim, int64_t dim_size, const Domain& input) { @@ -174,11 +164,6 @@ Project::Project(int32_t dim, int64_t coord, StoreTransformP parent) { } -int32_t Project::getTransformCode() const -{ - return LEGATE_CORE_TRANSFORM_PROJECT; -} - Domain Project::transform(const Domain& input) const { auto project = [](int32_t collapsed_dim, const Domain& input) { @@ -247,11 +232,6 @@ Transpose::Transpose(std::vector&& axes, StoreTransformP parent) { } -int32_t Transpose::getTransformCode() const -{ - return LEGATE_CORE_TRANSFORM_TRANSPOSE; -} - Domain Transpose::transform(const Domain& input) const { auto transpose = [](const auto& axes, const Domain& input) { @@ -335,11 +315,6 @@ Delinearize::Delinearize(int32_t dim, std::vector&& sizes, StoreTransfo for (auto size : sizes_) volume_ *= size; } -int32_t Delinearize::getTransformCode() const -{ - return LEGATE_CORE_TRANSFORM_DELINEARIZE; -} - Domain Delinearize::transform(const Domain& input) const { auto delinearize = [](const auto dim, const auto ndim, const auto& strides, const Domain& input) { diff --git a/src/core/data/transform.h b/src/core/data/transform.h index 6c272b735..e39b75962 100644 --- a/src/core/data/transform.h +++ b/src/core/data/transform.h @@ -19,9 +19,7 @@ #include #include "legion.h" -#include "core/legate_c.h" -class MakeshiftSerializer; namespace legate { class StoreTransform { @@ -33,11 +31,9 @@ class StoreTransform { public: virtual Legion::Domain transform(const Legion::Domain& input) const = 0; virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const = 0; - virtual int32_t getTransformCode() const =0; virtual void print(std::ostream& out) const = 0; protected: - friend class MakeshiftSerializer; std::shared_ptr parent_{nullptr}; }; @@ -51,13 +47,11 @@ class Shift : public StoreTransform { public: virtual Legion::Domain transform(const Legion::Domain& input) const override; virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override; - virtual int32_t getTransformCode() const override; virtual void print(std::ostream& out) const override; private: int32_t dim_; int64_t offset_; - friend class MakeshiftSerializer; }; class Promote : public StoreTransform { @@ -68,13 +62,11 @@ class Promote : public StoreTransform { public: virtual Legion::Domain transform(const Legion::Domain& input) const override; virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override; - virtual int32_t getTransformCode() const override; virtual void print(std::ostream& out) const override; private: int32_t extra_dim_; int64_t dim_size_; - friend class MakeshiftSerializer; }; class Project : public StoreTransform { @@ -85,13 +77,11 @@ class Project : public StoreTransform { public: virtual Legion::Domain transform(const Legion::Domain& domain) const override; virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override; - virtual int32_t getTransformCode() const override; virtual void print(std::ostream& out) const override; private: int32_t dim_; int64_t coord_; - friend class MakeshiftSerializer; }; class Transpose : public StoreTransform { @@ -102,12 +92,10 @@ class Transpose : public StoreTransform { public: virtual Legion::Domain transform(const Legion::Domain& domain) const override; virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override; - virtual int32_t getTransformCode() const override; virtual void print(std::ostream& out) const override; private: std::vector axes_; - friend class MakeshiftSerializer; }; class Delinearize : public StoreTransform { @@ -120,7 +108,6 @@ class Delinearize : public StoreTransform { public: virtual Legion::Domain transform(const Legion::Domain& domain) const override; virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override; - virtual int32_t getTransformCode() const override; virtual void print(std::ostream& out) const override; private: @@ -128,7 +115,6 @@ class Delinearize : public StoreTransform { std::vector sizes_; std::vector strides_; int64_t volume_; - friend class MakeshiftSerializer; }; } // namespace legate diff --git a/src/core/runtime/context.cc b/src/core/runtime/context.cc index 4884c0110..79bfbf6a4 100644 --- a/src/core/runtime/context.cc +++ b/src/core/runtime/context.cc @@ -154,17 +154,6 @@ TaskContext::TaskContext(const Legion::Task* task, scalars_ = dez.unpack>(); } -/* - TaskContext::TaskContext(std::vector& inputs, std::vector& outputs, - std::vector& reductions, std::vector& scalars) - : inputs_(inputs), outputs_(outputs), reductions_(reductions), scalars_(scalars) -{ - regions_ = NULL; - context_ = NULL; - runtime_ = NULL; - task_ = NULL; -} -*/ ReturnValues TaskContext::pack_return_values() const { diff --git a/src/core/runtime/context.h b/src/core/runtime/context.h index c9cb29462..245024e10 100644 --- a/src/core/runtime/context.h +++ b/src/core/runtime/context.h @@ -18,6 +18,7 @@ #include "legion.h" #include "core/data/scalar.h" +#include "core/data/store.h" #include "core/task/return.h" @@ -25,7 +26,7 @@ namespace legate { class Store; class Scalar; -struct FusionMetadata; +//struct FusionMetadata; struct ResourceConfig { int64_t max_tasks{1000000}; @@ -118,13 +119,8 @@ class TaskContext { Legion::Runtime* runtime); TaskContext(const Legion::Task* task, const std::vector regions) -// std::vector& inputs, std::vector& outputs, std::vector& scalars) : task_(task), regions_(regions) - { - //inputs_=inputs; - //outputs_=outputs; - //scalars_=scalars; -} + {} public: diff --git a/src/core/utilities/deserializer.cc b/src/core/utilities/deserializer.cc index 3aa297811..9a0813013 100644 --- a/src/core/utilities/deserializer.cc +++ b/src/core/utilities/deserializer.cc @@ -145,7 +145,7 @@ void TaskDeserializer::_unpack(RegionField& value) auto dim = unpack(); auto idx = unpack(); auto fid = unpack(); - value = RegionField(dim, regions_[idx], fid, idx); + value = RegionField(dim, regions_[idx], fid); } void TaskDeserializer::_unpack(OutputRegionField& value) @@ -155,7 +155,7 @@ void TaskDeserializer::_unpack(OutputRegionField& value) auto idx = unpack(); auto fid = unpack(); - value = OutputRegionField(outputs_[idx], fid, idx); + value = OutputRegionField(outputs_[idx], fid); } namespace mapping { From b49557bfa69008cd9254bfe55c75dc978f8d3026 Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Sun, 12 Dec 2021 17:08:42 -0800 Subject: [PATCH 39/44] more cleanup --- legate/core/operation.py | 1 - src/core/data/store.cc | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/legate/core/operation.py b/legate/core/operation.py index 861b042b5..5f148a5d0 100644 --- a/legate/core/operation.py +++ b/legate/core/operation.py @@ -35,7 +35,6 @@ def __init__(self, context, mapper_id=0, op_id=0): self._outputs = [] self._reductions = [] self._is_fused = False - self._temps = [] self._input_parts = [] self._output_parts = [] self._reduction_parts = [] diff --git a/src/core/data/store.cc b/src/core/data/store.cc index ed21a6957..4fd19b34c 100644 --- a/src/core/data/store.cc +++ b/src/core/data/store.cc @@ -122,8 +122,8 @@ Domain FutureWrapper::domain() const { return domain_; } ReturnValue FutureWrapper::pack() const { if (nullptr == rawptr_) { - //fprintf(stderr, "Found an uninitialized Legate store\n"); - //assert(false); + fprintf(stderr, "Found an uninitialized Legate store\n"); + assert(false); } return ReturnValue(rawptr_, field_size_); } From e05e3ffd5171425cf2593101877a772de3dbf7a8 Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Sun, 12 Dec 2021 18:01:04 -0800 Subject: [PATCH 40/44] remove fusion reference from core --- legate/core/corelib.py | 3 ++- legate/core/runtime.py | 30 ++++++++++++++++++------------ src/core.mk | 12 +++++++----- src/core/legate_c.h | 1 + src/core/mapping/core_mapper.cc | 3 +-- src/core/runtime/runtime.cc | 25 +++++++++++++++++++++++++ src/core/runtime/runtime.h | 2 ++ src/core/task/task.cc | 6 +++++- src/core/task/task.h | 3 +++ src/legate.h | 2 ++ 10 files changed, 66 insertions(+), 21 deletions(-) diff --git a/legate/core/corelib.py b/legate/core/corelib.py index ff35071a6..77800d20d 100644 --- a/legate/core/corelib.py +++ b/legate/core/corelib.py @@ -24,7 +24,7 @@ class CoreLib(Library): def __init__(self): self._lib = None - + def get_name(self): return "legate.core" @@ -38,6 +38,7 @@ def get_c_header(self): def initialize(self, shared_lib): self._lib = shared_lib shared_lib.legate_parse_config() + #self.fused_id = self._lib.LEGATE_CORE_FUSED_TASK_ID def get_registration_callback(self): return "legate_core_perform_registration" diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 21793b891..b0e2c53bf 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -778,6 +778,17 @@ def apply(self, contexts, runtime, ops, baseIntervals, partitioners, strategies) """ raise NotImplementedError("Implement in derived classes") +class cuNumericContextExists(FusionConstraint): + """ + Fusion currently exists as a cuNumeric operation + This can be removed once fusion becomes a core task + """ + def apply(self, contexts, runtime, ops, baseIntervals, partitioners, strategies): + if "cunumeric" in contexts: + return baseIntervals + else: + return [(i, i+1) for i in range(len(ops))] + class AllValidOps(FusionConstraint): """ @@ -789,7 +800,7 @@ def __init__(self): self.terminals = set() self.validIDs.add(2) #Binary op self.validIDs.add(10) #Fill op - self.validIDs.add(20) #Unary op + self.validIDs.add(21) #Unary op def apply(self, contexts, runtime, ops, baseIntervals, partitioners, strategies): fusable_intervals = [] @@ -1165,6 +1176,7 @@ def serialize_multiop_metadata(self, numpy_context, ops): def build_fused_op(self,ops): fusion_checker = FusionChecker(ops, self._contexts, self) + fusion_checker.register_constraint(cuNumericContextExists()) fusion_checker.register_constraint(AllValidOps()) fusion_checker.register_constraint(IdenticalLaunchShapes()) fusion_checker.register_constraint(IdenticalProjection()) @@ -1254,10 +1266,8 @@ def _launch_outstanding(self, force_eval=True): self._outstanding_ops = [] self._schedule(ops, force_eval) - def _schedule(self, ops, force_eval=False): ids = [op._task_id for op in ops] - #print(ids) #case 1: try fusing current window of tasks strats = False if len(ops)>=2 and (not force_eval): @@ -1267,30 +1277,26 @@ def _schedule(self, ops, force_eval=False): for task in fused_task_list: task.execute() self._clearing_pipe = False - return # case 2: tasks processed for fusion already have # their strategy "baked in", as we already partitioned # them when testing fusion legality (in case 1) - if len(ops)==1 and self._clearing_pipe: + elif len(ops)==1 and self._clearing_pipe: strategy = ops[0].strategy ops[0].launch(strategy) # case 3: execute the ops normally - # if we already checked the ops for fusability, - # then the ops' buffers have already been partitioned + # partition if op wasn't checked for fusability else: if not strats: #ops were not check for fusability, so partition them + strats = [] for op in ops: must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [op]) partitioner = Partitioner(self, [op], must_be_single=must_be_single) strategy = partitioner.partition_stores() - op.strategy = strategy - else: #strategies already calculated during failed attempt to fuse - for i,op in enumerate(ops): - op.strategy = strats[i] + strats.append(strategy) for i,op in enumerate(ops): - op.launch(op.strategy) + op.launch(strats[i]) def submit(self, op): diff --git a/src/core.mk b/src/core.mk index 988d6741d..3aa885b2c 100644 --- a/src/core.mk +++ b/src/core.mk @@ -31,12 +31,13 @@ GEN_CPU_SRC = core/legate_c.cc \ core/task/return.cc \ core/task/task.cc \ core/utilities/deserializer.cc \ - core/utilities/makeshift_serializer.cc \ core/utilities/machine.cc \ - core/utilities/linearize.cc + core/utilities/linearize.cc # \ + #core/fused/fused_op_gpu.cc \ + #core/fused/fused_op.cc ifeq ($(strip $(USE_CUDA)),1) -GEN_CPU_SRC += core/gpu/cudalibs.cc +GEN_CPU_SRC += core/gpu/cudalibs.cc endif # Header files that we need to have installed for client legate libraries @@ -63,6 +64,7 @@ INSTALL_HEADERS = legate.h \ core/utilities/dispatch.h \ core/utilities/machine.h \ core/utilities/span.h \ - core/utilities/makeshift_serializer.h \ core/utilities/type_traits.h \ - core/utilities/typedefs.h + core/utilities/typedefs.h #\ + #core/fused/fused_op.h \ + #core/fused/fused_op_wrapper.h diff --git a/src/core/legate_c.h b/src/core/legate_c.h index d3e12a502..132328cf7 100644 --- a/src/core/legate_c.h +++ b/src/core/legate_c.h @@ -21,6 +21,7 @@ typedef enum legate_core_task_id_t { LEGATE_CORE_INITIALIZE_TASK_ID, LEGATE_CORE_FINALIZE_TASK_ID, LEGATE_CORE_EXTRACT_SCALAR_TASK_ID, + //LEGATE_CORE_FUSED_TASK_ID, LEGATE_CORE_NUM_TASK_IDS, // must be last } legate_core_task_id_t; diff --git a/src/core/mapping/core_mapper.cc b/src/core/mapping/core_mapper.cc index 7f0c8ae0a..97a059166 100644 --- a/src/core/mapping/core_mapper.cc +++ b/src/core/mapping/core_mapper.cc @@ -215,8 +215,7 @@ Mapper::MapperSyncModel CoreMapper::get_mapper_sync_model(void) const void CoreMapper::select_task_options(const MapperContext ctx, const Task& task, TaskOptions& output) { - //std::cout<<"task_id "< > Core::opIDs = *(new std::vector >()); /*static */ std::vector > Core::gpuOpIDs = *(new std::vector >()); +/*static */ std::vector > Core::ompOpIDs = *(new std::vector >()); /*static */ std::unordered_map Core::cpuDescriptors = *(new std::unordered_map()); /*static */ std::unordered_map Core::gpuDescriptors = *(new std::unordered_map()); +/*static */ std::unordered_map Core::ompDescriptors = *(new std::unordered_map()); static const char* const core_library_name = "legate.core"; @@ -180,6 +183,12 @@ void register_legate_core_tasks(Machine machine, Runtime* runtime, const Library runtime->attach_name( extract_scalar_task_id, extract_scalar_task_name, false /*mutable*/, true /*local only*/); + //const TaskID fused_task_id = context.get_task_id(LEGATE_CORE_FUSED_TASK2_ID); + //const char* fused_task_name = "Legate Core Task Fusion"; + //runtime->attach_name( + // fused_task_id, fused_task_name, false /*mutable*/, true /*local only*/); + + auto make_registrar = [&](auto task_id, auto* task_name, auto proc_kind) { TaskVariantRegistrar registrar(task_id, task_name); registrar.add_constraint(ProcessorConstraint(proc_kind)); @@ -203,6 +212,19 @@ void register_legate_core_tasks(Machine machine, Runtime* runtime, const Library runtime->register_task_variant(registrar, LEGATE_CPU_VARIANT); } + /* + { + auto registrar = make_registrar(fused_task_id, fused_task_name, Processor::LOC_PROC); + runtime->register_task_variant(registrar, LEGATE_CPU_VARIANT); + } + { + auto registrar = make_registrar(fused_task_id, fused_task_name, Processor::TOC_PROC); + runtime->register_task_variant(registrar, LEGATE_GPU_VARIANT); + } + */ + + + #ifdef LEGATE_USE_CUDA { auto registrar = make_registrar(initialize_task_id, initialize_task_name, Processor::TOC_PROC); @@ -244,6 +266,9 @@ void register_legate_core_tasks(Machine machine, Runtime* runtime, const Library register_legate_core_projection_functors(runtime, context); register_legate_core_sharding_functors(runtime, context); + + std::cout<<"performing legate core registration callback"< cpuDescriptors; static std::unordered_map gpuDescriptors; + static std::unordered_map ompDescriptors; static std::vector > opIDs; static std::vector > gpuOpIDs; + static std::vector > ompOpIDs; public: // Configuration settings diff --git a/src/core/task/task.cc b/src/core/task/task.cc index 4749aba8f..64adf10b9 100644 --- a/src/core/task/task.cc +++ b/src/core/task/task.cc @@ -35,7 +35,6 @@ void LegateTaskRegistrar::record_variant(TaskID tid, assert((kind == Processor::LOC_PROC) || (kind == Processor::TOC_PROC) || (kind == Processor::OMP_PROC)); - // Buffer these up until we can do our actual registration with the runtime pending_task_variants_.push_back(PendingTaskVariant(tid, false /*global*/, @@ -68,6 +67,11 @@ void LegateTaskRegistrar::register_all_tasks(Runtime* runtime, LibraryContext& c Core::gpuDescriptors.insert(std::pair((int64_t) newID, taskIdx.second)); } + for (auto& taskIdx : Core::ompOpIDs){ + auto newID = context.get_task_id(taskIdx.first); + Core::ompDescriptors.insert(std::pair((int64_t) newID, taskIdx.second)); + } + // Do all our registrations diff --git a/src/core/task/task.h b/src/core/task/task.h index cc5d08628..a5ccb25df 100644 --- a/src/core/task/task.h +++ b/src/core/task/task.h @@ -140,6 +140,9 @@ class LegateTask { }else if (kind ==Legion::Processor::TOC_PROC){ Core::gpuOpIDs.push_back(std::pair((int64_t)task_id, TASK_PTR)); } + else if (kind ==Legion::Processor::OMP_PROC){ + Core::ompOpIDs.push_back(std::pair((int64_t)task_id, TASK_PTR)); + } T::Registrar::record_variant(task_id, T::task_name(), desc, diff --git a/src/legate.h b/src/legate.h index e786b5f87..8001a95ac 100644 --- a/src/legate.h +++ b/src/legate.h @@ -27,4 +27,6 @@ #include "core/utilities/dispatch.h" #include "core/utilities/type_traits.h" #include "core/utilities/typedefs.h" +//#include "core/fused/fused_op.h" +//#include "core/fused/fused_op_wrapper.h" #include "legate_defines.h" From a59e142cba0b8b27bdf06e35139370253ecf7999 Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Sun, 12 Dec 2021 18:06:57 -0800 Subject: [PATCH 41/44] remove comments --- src/core.mk | 8 ++------ src/core/legate_c.h | 1 - src/core/runtime/runtime.cc | 19 ------------------- 3 files changed, 2 insertions(+), 26 deletions(-) diff --git a/src/core.mk b/src/core.mk index 3aa885b2c..337925308 100644 --- a/src/core.mk +++ b/src/core.mk @@ -32,9 +32,7 @@ GEN_CPU_SRC = core/legate_c.cc \ core/task/task.cc \ core/utilities/deserializer.cc \ core/utilities/machine.cc \ - core/utilities/linearize.cc # \ - #core/fused/fused_op_gpu.cc \ - #core/fused/fused_op.cc + core/utilities/linearize.cc ifeq ($(strip $(USE_CUDA)),1) GEN_CPU_SRC += core/gpu/cudalibs.cc @@ -65,6 +63,4 @@ INSTALL_HEADERS = legate.h \ core/utilities/machine.h \ core/utilities/span.h \ core/utilities/type_traits.h \ - core/utilities/typedefs.h #\ - #core/fused/fused_op.h \ - #core/fused/fused_op_wrapper.h + core/utilities/typedefs.h diff --git a/src/core/legate_c.h b/src/core/legate_c.h index 132328cf7..d3e12a502 100644 --- a/src/core/legate_c.h +++ b/src/core/legate_c.h @@ -21,7 +21,6 @@ typedef enum legate_core_task_id_t { LEGATE_CORE_INITIALIZE_TASK_ID, LEGATE_CORE_FINALIZE_TASK_ID, LEGATE_CORE_EXTRACT_SCALAR_TASK_ID, - //LEGATE_CORE_FUSED_TASK_ID, LEGATE_CORE_NUM_TASK_IDS, // must be last } legate_core_task_id_t; diff --git a/src/core/runtime/runtime.cc b/src/core/runtime/runtime.cc index 85c21307c..d869ec794 100644 --- a/src/core/runtime/runtime.cc +++ b/src/core/runtime/runtime.cc @@ -18,7 +18,6 @@ #include "core/runtime/context.h" #include "core/runtime/projection.h" #include "core/runtime/shard.h" -#include "core/fused/fused_op.h" #include "core/utilities/deserializer.h" #include "legate.h" #ifdef LEGATE_USE_CUDA @@ -183,11 +182,6 @@ void register_legate_core_tasks(Machine machine, Runtime* runtime, const Library runtime->attach_name( extract_scalar_task_id, extract_scalar_task_name, false /*mutable*/, true /*local only*/); - //const TaskID fused_task_id = context.get_task_id(LEGATE_CORE_FUSED_TASK2_ID); - //const char* fused_task_name = "Legate Core Task Fusion"; - //runtime->attach_name( - // fused_task_id, fused_task_name, false /*mutable*/, true /*local only*/); - auto make_registrar = [&](auto task_id, auto* task_name, auto proc_kind) { TaskVariantRegistrar registrar(task_id, task_name); @@ -212,17 +206,6 @@ void register_legate_core_tasks(Machine machine, Runtime* runtime, const Library runtime->register_task_variant(registrar, LEGATE_CPU_VARIANT); } - /* - { - auto registrar = make_registrar(fused_task_id, fused_task_name, Processor::LOC_PROC); - runtime->register_task_variant(registrar, LEGATE_CPU_VARIANT); - } - { - auto registrar = make_registrar(fused_task_id, fused_task_name, Processor::TOC_PROC); - runtime->register_task_variant(registrar, LEGATE_GPU_VARIANT); - } - */ - #ifdef LEGATE_USE_CUDA @@ -267,8 +250,6 @@ void register_legate_core_tasks(Machine machine, Runtime* runtime, const Library register_legate_core_sharding_functors(runtime, context); - std::cout<<"performing legate core registration callback"< Date: Sun, 12 Dec 2021 18:12:09 -0800 Subject: [PATCH 42/44] more cleanup --- src/core/data/scalar.h | 1 - src/core/data/store.h | 5 +---- src/core/runtime/context.h | 2 -- src/core/runtime/runtime.cc | 4 ---- src/core/task/task.h | 2 -- src/legate.h | 2 -- 6 files changed, 1 insertion(+), 15 deletions(-) diff --git a/src/core/data/scalar.h b/src/core/data/scalar.h index 852121a19..428c75386 100644 --- a/src/core/data/scalar.h +++ b/src/core/data/scalar.h @@ -57,7 +57,6 @@ class Scalar { bool tuple_{false}; LegateTypeCode code_{MAX_TYPE_NUMBER}; const void* data_; - }; } // namespace legate diff --git a/src/core/data/store.h b/src/core/data/store.h index 308c50cd1..b14a5d850 100644 --- a/src/core/data/store.h +++ b/src/core/data/store.h @@ -17,6 +17,7 @@ #pragma once #include "legion.h" + #include "core/data/buffer.h" #include "core/data/transform.h" #include "core/task/return.h" @@ -153,7 +154,6 @@ class RegionField { bool readable_{false}; bool writable_{false}; bool reducible_{false}; - }; class OutputRegionField { @@ -177,7 +177,6 @@ class OutputRegionField { bool bound_{false}; Legion::OutputRegion out_{}; Legion::FieldID fid_{-1U}; - }; class FutureWrapper { @@ -239,7 +238,6 @@ class FutureWrapper { private: mutable bool uninitialized_{true}; mutable void* rawptr_{nullptr}; - }; class Store { @@ -331,7 +329,6 @@ class Store { bool readable_{false}; bool writable_{false}; bool reducible_{false}; - }; //containts prefix sums for a sub-op diff --git a/src/core/runtime/context.h b/src/core/runtime/context.h index 245024e10..d93d1662f 100644 --- a/src/core/runtime/context.h +++ b/src/core/runtime/context.h @@ -26,7 +26,6 @@ namespace legate { class Store; class Scalar; -//struct FusionMetadata; struct ResourceConfig { int64_t max_tasks{1000000}; @@ -36,7 +35,6 @@ struct ResourceConfig { int64_t max_shardings{0}; }; - class ResourceScope { public: ResourceScope() = default; diff --git a/src/core/runtime/runtime.cc b/src/core/runtime/runtime.cc index d869ec794..a39159c66 100644 --- a/src/core/runtime/runtime.cc +++ b/src/core/runtime/runtime.cc @@ -182,7 +182,6 @@ void register_legate_core_tasks(Machine machine, Runtime* runtime, const Library runtime->attach_name( extract_scalar_task_id, extract_scalar_task_name, false /*mutable*/, true /*local only*/); - auto make_registrar = [&](auto task_id, auto* task_name, auto proc_kind) { TaskVariantRegistrar registrar(task_id, task_name); registrar.add_constraint(ProcessorConstraint(proc_kind)); @@ -206,8 +205,6 @@ void register_legate_core_tasks(Machine machine, Runtime* runtime, const Library runtime->register_task_variant(registrar, LEGATE_CPU_VARIANT); } - - #ifdef LEGATE_USE_CUDA { auto registrar = make_registrar(initialize_task_id, initialize_task_name, Processor::TOC_PROC); @@ -249,7 +246,6 @@ void register_legate_core_tasks(Machine machine, Runtime* runtime, const Library register_legate_core_projection_functors(runtime, context); register_legate_core_sharding_functors(runtime, context); - } } // namespace legate diff --git a/src/core/task/task.h b/src/core/task/task.h index a5ccb25df..e83019a08 100644 --- a/src/core/task/task.h +++ b/src/core/task/task.h @@ -126,8 +126,6 @@ class LegateTask { bool inner = false, bool idempotent = false) { - - // Construct the code descriptor for this task so that the library // can register it later when it is ready Legion::CodeDescriptor desc( diff --git a/src/legate.h b/src/legate.h index 8001a95ac..e786b5f87 100644 --- a/src/legate.h +++ b/src/legate.h @@ -27,6 +27,4 @@ #include "core/utilities/dispatch.h" #include "core/utilities/type_traits.h" #include "core/utilities/typedefs.h" -//#include "core/fused/fused_op.h" -//#include "core/fused/fused_op_wrapper.h" #include "legate_defines.h" From 5bb4df59cc9f7793719970d5dda60a0f7bf638b1 Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Wed, 20 Apr 2022 15:35:57 -0700 Subject: [PATCH 43/44] partitioning fix --- legate/core/operation.py | 11 +++------ legate/core/runtime.py | 51 +++++++--------------------------------- legate/core/solver.py | 2 +- 3 files changed, 13 insertions(+), 51 deletions(-) diff --git a/legate/core/operation.py b/legate/core/operation.py index 5f148a5d0..a5a5e53e4 100644 --- a/legate/core/operation.py +++ b/legate/core/operation.py @@ -217,14 +217,9 @@ def launch(self, strategy): if self._is_fused: launcher.add_fusion_metadata(self._is_fused, self._fusion_metadata) - if self._is_fused: #fused ops re-use encapsulated unfused partitions - input_parts = self._unfused_input_parts - output_parts = self._unfused_output_parts - reduction_parts = self._unfused_reduction_parts - else: - input_parts = self._input_parts - output_parts = self._output_parts - reduction_parts = self._reduction_parts + input_parts = self._input_parts + output_parts = self._output_parts + reduction_parts = self._reduction_parts for input, input_part in zip(self._inputs, input_parts): proj = strategy.get_projection(input_part) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index b0e2c53bf..d7762b7ce 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -46,6 +46,7 @@ from .solver import Partitioner, Strategy from .store import RegionField, Store, FusionMetadata import numpy as np +from .constraints import Alignment @@ -751,14 +752,6 @@ def can_fuse(self): partitioner = Partitioner(self.runtime, [op], must_be_single=must_be_single) self.partitioners.append( partitioner ) strategy = partitioner.partition_stores() - for output, part, in zip(op._outputs, op._output_parts): - partition = strategy.get_partition(part) - output.set_key_partition(partition) - key_part = partition - #check if input and output should be aligned - for input in op._inputs: - if op.has_constraint(input, output): - input.set_key_partition(key_part) self.strategies.append(strategy) self.strategies.reverse() @@ -1183,28 +1176,11 @@ def build_fused_op(self,ops): fusion_checker.register_constraint(ValidProducerConsumer()) can_fuse,fusable_sets, partitions = fusion_checker.can_fuse() - super_strategies = [] - z=0 - for fusable_set in fusable_sets: - #create super strategy for this fusable set - super_strat = {} - super_fspace = {} - super_keystore = set() - start,end = fusable_set - for j in range(start,end): - super_strat = {**(super_strat.copy()), **partitions[j]._strategy} - super_fspace = {**(super_fspace.copy()), **partitions[j]._fspaces} - super_keystore = super_keystore.union(partitions[j]._key_parts) - super_strategies.append(Strategy(partitions[start]._launch_shape, super_strat, super_fspace, super_keystore)) - - #once fusion in the core is playing nicely with the mapepr #the following two lines will be removed, and be replaced #with the 2 subsequent (commented out) lines fused_id = self._contexts["cunumeric"].fused_id numpy_context = self._contexts["cunumeric"] - #fused_id = self._contexts["legate.core"]._library.fused_id - #numpy_context = self._contexts["legate.core"] opID=0 new_op_list = [] @@ -1225,39 +1201,30 @@ def build_fused_op(self,ops): fused_task.add_fusion_metadata(fusion_metadata) #sets fused_task._is_fused to true #add typical inputs and outputs of all subtasks to fused task - key_part = None - fused_task._unfused_input_parts = [] - fused_task._unfused_output_parts = [] - fused_task._unfused_reduction_parts = [] for j,op in enumerate(op_subset): for scalar in op._scalar_args: fused_task.add_scalar_arg(scalar[0], ty.int32) for (reduction, redop), part in zip(op._reductions, op._reduction_parts): fused_task.add_reduction(reduction, redop) - fused_task._unfused_reduction_parts.append(part) for input,part in zip(op._inputs, op._input_parts): fused_task.add_input(input) - fused_task._unfused_input_parts.append(part) for output,part in zip(op._outputs, op._output_parts): fused_task.add_output(output) - fused_task._unfused_output_parts.append(part) for future in op._futures: fused_task.add_future(future) + for constraint in op._constraints: + if (isinstance(constraint, Alignment)): + fused_task.add_alignment(constraint._lhs.store, constraint._rhs.store) opID+=1 new_op_list.append(fused_task) strats=[] - redoPar=False for i,fused_task in enumerate(new_op_list): - if redoPar: - must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [fused_task]) - partitioner = Partitioner(self, [fused_task], must_be_single=must_be_single) - strategy = partitioner.partition_stores() - fused_task.strategy = strategy - strats.append(strategy) - else: - fused_task.strategy = super_strategies[i] - strats.append( super_strategies[i]) + must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [fused_task]) + partitioner = Partitioner(self, [fused_task], must_be_single=must_be_single) + strategy = partitioner.partition_stores() + fused_task.strategy = strategy + strats.append(strategy) return new_op_list, strats def _launch_outstanding(self, force_eval=True): diff --git a/legate/core/solver.py b/legate/core/solver.py index 51639f17d..84aa8180c 100644 --- a/legate/core/solver.py +++ b/legate/core/solver.py @@ -35,7 +35,7 @@ def empty(self): def _add(self, var1, var2): cls = set([var1, var2]) cls_id = self._next_class_id - self._next_class_id + 1 + self._next_class_id += 1 self._classes[cls_id] = cls self._class_ids[var1] = cls_id self._class_ids[var2] = cls_id From 36c40cb1ebf7a809fe1196569c81be44a41d4586 Mon Sep 17 00:00:00 2001 From: Shiv Sundram Date: Mon, 13 Jun 2022 19:13:18 -0700 Subject: [PATCH 44/44] choose midpoint partition --- legate/core/runtime.py | 57 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index d7762b7ce..3d433637b 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -758,9 +758,55 @@ def can_fuse(self): windows = [(0, len(self.ops))] for constraint in self.constraints: windows = constraint.apply(self.contexts, self.runtime, self.ops, windows, self.partitioners, self.strategies) - + + #for i,strategy in enumerate(self.strategies): + # print(i,"i", strategy) + old_strategies = self.strategies[:] + old_strategies.reverse() + self.strategies = [] + #for i,strategy in enumerate(old_strategies): + # print(i,strategy) + ist=0 + keyps = [] + for window in reversed(windows): + fusable,final_set = self.supress_small_fusions(windows, self.runtime._fusion_threshold) + local_partitions = [] + if window[0] == window[1]: + continue + for op in reversed(self.ops[ window[0]:window[1] ]): + strategy = old_strategies[ist] + #print("looking in", strategy) + + for output, part, in zip(op._outputs, op._output_parts): + #print("need", part) + partition = strategy.get_partition(part) + local_partitions.append(partition) + ist+=1 + midpoint = int(len(local_partitions)/2) + partition = local_partitions[midpoint] + keyps.append(partition) + #print("selected", midpoint, len(local_partitions), partition) + for op in reversed(self.ops[ window[0]:window[1] ]): + #print("selected", partition) + for output, part, in zip(op._outputs, op._output_parts): + output.reset_key_partition() + output.set_key_partition(partition) + strategy._strategy[part] = partition + key_part = partition + #check if input and output should be aligned + for input, ipart in zip(op._inputs, op._input_parts): + if input.shape== output.shape: + input.reset_key_partition() + input.set_key_partition(partition) + strategy._strategy[ipart] = partition + self.strategies.append(strategy) + #return fusable, final_set, self.strategies + self.strategies.reverse() + keyps.reverse() + + fusable,final_set = self.supress_small_fusions(windows, self.runtime._fusion_threshold) - return fusable, final_set, self.strategies + return fusable, final_set, self.strategies, keyps class FusionConstraint(object): @@ -1174,7 +1220,7 @@ def build_fused_op(self,ops): fusion_checker.register_constraint(IdenticalLaunchShapes()) fusion_checker.register_constraint(IdenticalProjection()) fusion_checker.register_constraint(ValidProducerConsumer()) - can_fuse,fusable_sets, partitions = fusion_checker.can_fuse() + can_fuse,fusable_sets, partitions, keyps = fusion_checker.can_fuse() #once fusion in the core is playing nicely with the mapepr #the following two lines will be removed, and be replaced @@ -1221,10 +1267,15 @@ def build_fused_op(self,ops): for i,fused_task in enumerate(new_op_list): must_be_single = any(len(gop.scalar_outputs) > 0 for gop in [fused_task]) + for output, part, in zip(fused_task._outputs, fused_task._output_parts): + output.set_key_partition(keyps[i]) + for input, part, in zip(fused_task._inputs, fused_task._input_parts): + input.set_key_partition(keyps[i]) partitioner = Partitioner(self, [fused_task], must_be_single=must_be_single) strategy = partitioner.partition_stores() fused_task.strategy = strategy strats.append(strategy) + #print(i, strategy) return new_op_list, strats def _launch_outstanding(self, force_eval=True):