diff --git a/conftest.py b/conftest.py
index c19f7f1b8b..cd74558788 100644
--- a/conftest.py
+++ b/conftest.py
@@ -163,7 +163,7 @@ def parallel(item):
     # OpenMPI requires an explicit flag for oversubscription. We need it as some
     # of the MPI tests will spawn lots of processes
     if mpi_distro == 'OpenMPI':
-        call = [mpi_exec, '--oversubscribe', '--timeout', '150'] + args
+        call = [mpi_exec, '--oversubscribe', '--timeout', '300'] + args
     else:
         call = [mpi_exec] + args
diff --git a/devito/core/operator.py b/devito/core/operator.py
index 2de585a11e..887c79978a 100644
--- a/devito/core/operator.py
+++ b/devito/core/operator.py
@@ -329,9 +329,9 @@ class OptOption(object):

 class ParTileArg(tuple):

-    def __new__(cls, items, shm=0, tag=None):
+    def __new__(cls, items, rule=None, tag=None):
         obj = super().__new__(cls, items)
-        obj.shm = shm
+        obj.rule = rule
         obj.tag = tag
         return obj
@@ -371,14 +371,15 @@ def __new__(cls, items, default=None):

         try:
             y = items[1]
-            if is_integer(y):
-                # E.g., ((32, 4, 8), 1)
-                # E.g., ((32, 4, 8), 1, 'tag')
+            if is_integer(y) or isinstance(y, str) or y is None:
+                # E.g., ((32, 4, 8), 'rule')
+                # E.g., ((32, 4, 8), 'rule', 'tag')
                 items = (ParTileArg(*items),)
             else:
                 try:
-                    # E.g., (((32, 4, 8), 1), ((32, 4, 4), 2))
-                    # E.g., (((32, 4, 8), 1, 'tag0'), ((32, 4, 4), 2, 'tag1'))
+                    # E.g., (((32, 4, 8), 'rule'), ((32, 4, 4), 'rule'))
+                    # E.g., (((32, 4, 8), 'rule0', 'tag0'),
+                    #        ((32, 4, 4), 'rule1', 'tag1'))
                     items = tuple(ParTileArg(*i) for i in items)
                 except TypeError:
                     # E.g., ((32, 4, 8), (32, 4, 4))
diff --git a/devito/finite_differences/differentiable.py b/devito/finite_differences/differentiable.py
index 1ca7e29506..aa29d9c1d7 100644
--- a/devito/finite_differences/differentiable.py
+++ b/devito/finite_differences/differentiable.py
@@ -7,6 +7,7 @@
 import sympy
 from sympy.core.add import _addsort
 from sympy.core.mul import _keep_coeff, _mulsort
+from sympy.core.core import ordering_of_classes
 from sympy.core.decorators import call_highest_priority
 from sympy.core.evalf import evalf_table
@@ -556,6 +557,9 @@ def __repr__(self):

     __str__ = __repr__

+    def _sympystr(self, printer):
+        return str(self)
+
     def _hashable_content(self):
         return super()._hashable_content() + (self.dimensions,)
@@ -621,7 +625,7 @@ def __eq__(self, other):
     __hash__ = sympy.Basic.__hash__

     def _hashable_content(self):
-        return (self.name, self.dimension, hash(tuple(self.weights)))
+        return (self.name, self.dimension, str(self.weights))

     @property
     def dimension(self):
@@ -665,6 +669,20 @@ def __new__(cls, expr, mapper, **kwargs):
     def _hashable_content(self):
         return super()._hashable_content() + (self.mapper,)

+    def compare(self, other):
+        if self is other:
+            return 0
+        n1 = self.__class__
+        n2 = other.__class__
+        if n1.__name__ == n2.__name__:
+            return self.base.compare(other.base)
+        else:
+            return super().compare(other)
+
+    @cached_property
+    def base(self):
+        return self.expr.func(*[a for a in self.expr.args if a is not self.weights])
+
     @property
     def weights(self):
         return self._weights
@@ -693,6 +711,10 @@ def _evaluate(self, **kwargs):
         return expr


+ordering_of_classes.insert(ordering_of_classes.index('Derivative') + 1,
+                           'IndexDerivative')
+
+
 class EvalDerivative(DifferentiableOp, sympy.Add):

     is_commutative = True
diff --git a/devito/finite_differences/finite_difference.py b/devito/finite_differences/finite_difference.py
index e6e35f03fd..5749a82ae4 100644
--- a/devito/finite_differences/finite_difference.py
+++ b/devito/finite_differences/finite_difference.py
@@ -207,9 +207,11 @@ def generic_derivative(expr, dim, fd_order, deriv_order, matvec=direct, x0=None,
                            matvec, x0, symbolic, expand)


-def make_derivative(expr, dim, fd_order, deriv_order, side, matvec, x0, symbolic, expand):
+def make_derivative(expr, dim, fd_order, deriv_order, side, matvec, x0, symbolic,
+                    expand):
     # The stencil indices
-    indices, x0 = generate_indices(expr, dim, fd_order, side=side, matvec=matvec, x0=x0)
+    indices, x0 = generate_indices(expr, dim, fd_order, side=side, matvec=matvec,
+                                   x0=x0)

     # Finite difference weights from Taylor approximation given these positions
     if symbolic:
@@ -218,15 +220,19 @@ def make_derivative(expr, dim, fd_order, deriv_order, side, matvec, x0, symbolic
         weights = numeric_weights(deriv_order, indices, x0)

     # Enforce fixed precision FD coefficients to avoid variations in results
-    weights = [sympify(w).evalf(_PRECISION) for w in weights]
+    weights = [sympify(w).evalf(_PRECISION) for w in weights][::matvec.val]

     # Transpose the FD, if necessary
-    if matvec:
-        indices = indices.scale(matvec.val)
+    if matvec == transpose:
+        indices = indices.transpose()

     # Shift index due to staggering, if any
     indices = indices.shift(-(expr.indices_ref[dim] - dim))

+    # The user may wish to restrict expansion to selected derivatives
+    if callable(expand):
+        expand = expand(dim)
+
     if not expand and indices.expr is not None:
         weights = Weights(name='w', dimensions=indices.free_dim, initvalue=weights)
diff --git a/devito/finite_differences/tools.py b/devito/finite_differences/tools.py
index ba112a3116..7ab19e7664 100644
--- a/devito/finite_differences/tools.py
+++ b/devito/finite_differences/tools.py
@@ -175,14 +175,14 @@ def __repr__(self):
     def spacing(self):
         return self.dim.spacing

-    def scale(self, v):
+    def transpose(self):
         """
-        Construct a new IndexSet with all indices scaled by `v`.
+        Transpose the IndexSet.
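+
+        For instance, a one-sided set of indices `(x, x + h_x, x + 2*h_x)`
+        is reversed and has its spacing negated, roughly yielding
+        `(x - 2*h_x, x - h_x, x)`.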
""" - mapper = {self.spacing: v*self.spacing} + mapper = {self.spacing: -self.spacing} indices = [] - for i in self: + for i in reversed(self): try: iloc = i.xreplace(mapper) except AttributeError: @@ -190,12 +190,18 @@ def scale(self, v): iloc = sympify(i).xreplace(mapper) indices.append(iloc) + try: + free_dim = self.free_dim.transpose() + mapper.update({self.free_dim: -free_dim}) + except AttributeError: + free_dim = self.free_dim + try: expr = self.expr.xreplace(mapper) except AttributeError: expr = None - return IndexSet(self.dim, indices, expr=expr, fd=self.free_dim) + return IndexSet(self.dim, indices, expr=expr, fd=free_dim) def shift(self, v): """ diff --git a/devito/ir/clusters/algorithms.py b/devito/ir/clusters/algorithms.py index 19d31d5173..91a145d966 100644 --- a/devito/ir/clusters/algorithms.py +++ b/devito/ir/clusters/algorithms.py @@ -6,7 +6,7 @@ import sympy from devito.exceptions import InvalidOperator -from devito.ir.support import (Any, Backward, Forward, IterationSpace, +from devito.ir.support import (Any, Backward, Forward, IterationSpace, erange, pull_dims) from devito.ir.clusters.analysis import analyze from devito.ir.clusters.cluster import Cluster, ClusterGroup @@ -14,7 +14,7 @@ from devito.mpi.halo_scheme import HaloScheme, HaloTouch from devito.symbolics import retrieve_indexed, uxreplace, xreplace_indices from devito.tools import (DefaultOrderedDict, Stamp, as_mapper, flatten, - is_integer, timed_pass) + is_integer, timed_pass, toposort) from devito.types import Array, Eq, Symbol from devito.types.dimension import BOTTOM, ModuloDimension @@ -29,6 +29,7 @@ def clusterize(exprs, **kwargs): clusters = [Cluster(e, e.ispace) for e in exprs] # Setup the IterationSpaces based on data dependence analysis + clusters = impose_total_ordering(clusters) clusters = Schedule().process(clusters) # Handle SteppingDimensions @@ -49,6 +50,29 @@ def clusterize(exprs, **kwargs): return ClusterGroup(clusters) +def impose_total_ordering(clusters): + """ + Create a new sequence of Clusters whose IterationSpaces are totally ordered + according to a global set of relations. 
+ """ + global_relations = set().union(*[c.ispace.relations for c in clusters]) + ordering = toposort(global_relations) + + processed = [] + for c in clusters: + key = lambda d: ordering.index(d) + try: + relations = {tuple(sorted(c.ispace.itdims, key=key))} + except ValueError: + # See issue #2204 + relations = c.ispace.relations + ispace = c.ispace.reorder(relations=relations, mode='total') + + processed.append(c.rebuild(ispace=ispace)) + + return processed + + class Schedule(QueueStateful): """ @@ -121,10 +145,12 @@ def callback(self, clusters, prefix, backlog=None, known_break=None): require_break = scope.d_flow.cause & maybe_break if require_break: backlog = [clusters[-1]] + backlog - # Try with increasingly smaller ClusterGroups until the ambiguity is gone + # Try with increasingly smaller ClusterGroups until the + # ambiguity is gone return self.callback(clusters[:-1], prefix, backlog, require_break) - # Schedule Clusters over different IterationSpaces if this increases parallelism + # Schedule Clusters over different IterationSpaces if this increases + # parallelism for i in range(1, len(clusters)): if self._break_for_parallelism(scope, candidates, i): return self.callback(clusters[:i], prefix, clusters[i:] + backlog, @@ -146,8 +172,8 @@ def callback(self, clusters, prefix, backlog=None, known_break=None): if not backlog: return processed - # Handle the backlog -- the Clusters characterized by flow- and anti-dependences - # along one or more Dimensions + # Handle the backlog -- the Clusters characterized by flow- and + # anti-dependences along one or more Dimensions idir = {d: Any for d in known_break} stamp = Stamp() for i, c in enumerate(list(backlog)): @@ -278,7 +304,11 @@ def callback(self, clusters, prefix): size = i.function.shape_allocated[d] assert is_integer(size) - mapper[size][si].add(iaf) + # Resolve StencilDimensions in case of unexpanded expressions + # E.g. `i0 + t` -> `(t - 1, t, t + 1)` + iafs = erange(iaf) + + mapper[size][si].update(iafs) # Construct the ModuloDimensions mds = [] @@ -288,7 +318,8 @@ def callback(self, clusters, prefix): # SymPy's index ordering (t, t-1, t+1) afer modulo replacement so # that associativity errors are consistent. This corresponds to # sorting offsets {-1, 0, 1} as {0, -1, 1} assigning -inf to 0 - siafs = sorted(iafs, key=lambda i: -np.inf if i - si == 0 else (i - si)) + key = lambda i: -np.inf if i - si == 0 else (i - si) + siafs = sorted(iafs, key=key) for iaf in siafs: name = '%s%d' % (si.name, len(mds)) @@ -452,7 +483,7 @@ def normalize_reductions_dense(cluster, sregistry, options): """ opt_mapify_reduce = options['mapify-reduce'] - dims = [d for d in cluster.properties.dimensions + dims = [d for d in cluster.ispace.itdims if cluster.properties.is_parallel_atomic(d)] if not dims: diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index e7354b8693..bd5a820f30 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -8,7 +8,7 @@ Forward, Interval, IntervalGroup, IterationSpace, DataSpace, Guards, Properties, Scope, detect_accesses, detect_io, normalize_properties, normalize_syncs, - sdims_min, sdims_max) + minimum, maximum, null_ispace) from devito.mpi.halo_scheme import HaloScheme, HaloTouch from devito.symbolics import estimate_cost from devito.tools import as_tuple, flatten, frozendict, infer_dtype @@ -41,10 +41,8 @@ class Cluster(object): The halo exchanges required by the Cluster. 
""" - def __init__(self, exprs, ispace=None, guards=None, properties=None, syncs=None, - halo_scheme=None): - ispace = ispace or IterationSpace([]) - + def __init__(self, exprs, ispace=null_ispace, guards=None, properties=None, + syncs=None, halo_scheme=None): self._exprs = tuple(ClusterizedEq(e, ispace=ispace) for e in as_tuple(exprs)) self._ispace = ispace self._guards = Guards(guards or {}) @@ -52,13 +50,7 @@ def __init__(self, exprs, ispace=None, guards=None, properties=None, syncs=None, # Normalize properties properties = Properties(properties or {}) - for d in ispace.itdimensions: - properties = properties.add(d) - for i in properties: - for d in as_tuple(i): - if d not in ispace.itdimensions: - properties = properties.drop(d) - self._properties = properties + self._properties = tailor_properties(properties, ispace) self._halo_scheme = halo_scheme @@ -85,10 +77,7 @@ def from_clusters(cls, *clusters): guards = root.guards - properties = {} - for c in clusters: - for d, v in c.properties.items(): - properties[d] = normalize_properties(properties.get(d, v), v) + properties = reduce_properties(clusters) try: syncs = normalize_syncs(*[c.syncs for c in clusters]) @@ -213,12 +202,10 @@ def is_dense(self): # at most PARALLEL_IF_PVT). This is a quick and easy check so we try it first try: pset = {PARALLEL, PARALLEL_IF_PVT} - grid = self.grid - for d in grid.dimensions: - if not any(pset & v for k, v in self.properties.items() - if d in k._defines): - raise ValueError - return True + target = set(self.grid.dimensions) + dims = {d for d in self.properties if d._defines & target} + if any(pset & self.properties[d] for d in dims): + return True except ValueError: pass @@ -280,8 +267,8 @@ def dspace(self): continue intervals = [Interval(d, - min([sdims_min(i) for i in offs]), - max([sdims_max(i) for i in offs])) + min([minimum(i) for i in offs]), + max([maximum(i) for i in offs])) for d, offs in v.items()] intervals = IntervalGroup(intervals) @@ -312,7 +299,8 @@ def dspace(self): # Special case: if the factor of a ConditionalDimension has value 1, # then we can safely resort to the parent's Interval - intervals = intervals.promote(lambda d: d.is_Conditional and d.factor == 1) + key = lambda d: d.is_Conditional and d.factor == 1 + intervals = intervals.promote(key) parts[f] = intervals @@ -339,6 +327,7 @@ def dspace(self): # Construct the `intervals` of the DataSpace, that is a global, # Dimension-centric view of the data space intervals = IntervalGroup.generate('union', *parts.values()) + # E.g., `db0 -> time`, but `xi NOT-> x` intervals = intervals.promote(lambda d: not d.is_Sub) intervals = intervals.zero(set(intervals.dimensions) - oobs) @@ -363,6 +352,10 @@ def traffic(self): reads, writes = detect_io(self.exprs, relax=True) accesses = [(i, 'r') for i in reads] + [(i, 'w') for i in writes] + # Ordering isn't important at this point, so returning an unordered + # collection makes the caller's life easier + uispace = self.ispace.reorder(mode='unordered') + ret = {} for i, mode in accesses: if not i.is_AbstractFunction: @@ -372,7 +365,8 @@ def traffic(self): intervals = self.dspace.parts[i] # Assume that invariant dimensions always cause new loads/stores - invariants = self.ispace.intervals.drop(intervals.dimensions) + invariants = uispace.intervals.drop(intervals.dimensions) + intervals = intervals.generate('union', invariants, intervals) # Bundles impact depends on the number of components @@ -384,7 +378,8 @@ def traffic(self): for n in range(v): ret[(i, mode, n)] = intervals else: - ret[(i, 
mode, 0)] = self.ispace.intervals + ret[(i, mode, 0)] = uispace.intervals + return ret @@ -422,6 +417,10 @@ def scope(self): def ispace(self): return self._ispace + @cached_property + def properties(self): + return tailor_properties(reduce_properties(self), self.ispace) + @cached_property def guards(self): """The guards of each Cluster in self.""" @@ -429,8 +428,10 @@ def guards(self): @cached_property def syncs(self): - """The synchronization operations of each Cluster in self.""" - return tuple(i.syncs for i in self) + """ + A view of the ClusterGroup's synchronization operations. + """ + return normalize_syncs(*[c.syncs for c in self]) @cached_property def dspace(self): @@ -465,3 +466,34 @@ def meta(self): The data type and the data space of the ClusterGroup. """ return (self.dtype, self.dspace) + + +# *** Utils + +def reduce_properties(clusters): + """ + The normalized intersection (basically, a reduction) of the Properties in + `clusters`. + """ + properties = {} + for c in clusters: + for d, v in c.properties.items(): + properties[d] = normalize_properties(properties.get(d, v), v) + + return Properties(properties) + + +def tailor_properties(properties, ispace): + """ + Create a new Properties object off `properties` that retains all and only + the iteration dimensions in `ispace`. + """ + for i in properties: + for d in as_tuple(i): + if d not in ispace.itdims: + properties = properties.drop(d) + + for d in ispace.itdims: + properties = properties.add(d) + + return properties diff --git a/devito/ir/clusters/visitors.py b/devito/ir/clusters/visitors.py index f9bb36d6e3..95aecf1a86 100644 --- a/devito/ir/clusters/visitors.py +++ b/devito/ir/clusters/visitors.py @@ -3,7 +3,7 @@ from itertools import groupby -from devito.ir.support import IterationSpace, Scope +from devito.ir.support import IterationSpace, Scope, null_ispace from devito.tools import as_tuple, flatten, timed_pass __all__ = ['Queue', 'QueueStateful', 'cluster_pass'] @@ -50,7 +50,7 @@ def _make_key(self, cluster, level): guards = None if self._q_properties_in_key: - properties = cluster.properties.drop(cluster.ispace[level:].itdimensions) + properties = cluster.properties.drop(cluster.ispace[level:].itdims) else: properties = None @@ -73,12 +73,10 @@ def _make_key(self, cluster, level): def _make_key_hook(self, cluster, level): return () - def _process_fdta(self, clusters, level, prefix=None, **kwargs): + def _process_fdta(self, clusters, level, prefix=null_ispace, **kwargs): """ fdta -> First Divide Then Apply """ - prefix = prefix or IterationSpace([]) - # Divide part processed = [] for k, g in groupby(clusters, key=lambda i: self._make_key(i, level)): diff --git a/devito/ir/equations/algorithms.py b/devito/ir/equations/algorithms.py index 0c0185055e..c27ef5c54f 100644 --- a/devito/ir/equations/algorithms.py +++ b/devito/ir/equations/algorithms.py @@ -3,8 +3,7 @@ from sympy import sympify from devito.symbolics import retrieve_indexed, uxreplace, retrieve_dimensions -from devito.tools import (PartialOrderTuple, as_tuple, flatten, - filter_sorted, filter_ordered) +from devito.tools import Ordering, as_tuple, flatten, filter_sorted, filter_ordered from devito.types import Dimension, IgnoreDimSort from devito.types.basic import AbstractFunction @@ -85,7 +84,7 @@ def handle_indexed(indexed): implicit_relations.update({tuple(filter_ordered(dims))}) - ordering = PartialOrderTuple(extra, relations=implicit_relations) + ordering = Ordering(extra, relations=implicit_relations, mode='partial') return ordering diff --git 
a/devito/ir/equations/equation.py b/devito/ir/equations/equation.py index ecef8b82fb..0fbfff9cf9 100644 --- a/devito/ir/equations/equation.py +++ b/devito/ir/equations/equation.py @@ -178,8 +178,8 @@ def __new__(cls, *args, **kwargs): iterators.setdefault(d, set()) # Construct the IterationSpace - intervals = IntervalGroup([Interval(d, 0, 0) for d in iterators], - relations=ordering.relations) + intervals = IntervalGroup([Interval(d) for d in iterators], + relations=ordering.relations, mode='partial') ispace = IterationSpace(intervals, iterators) # Construct the conditionals and replace the ConditionalDimensions in `expr` diff --git a/devito/ir/iet/nodes.py b/devito/ir/iet/nodes.py index 989f557a0b..793cd6b14d 100644 --- a/devito/ir/iet/nodes.py +++ b/devito/ir/iet/nodes.py @@ -363,8 +363,9 @@ def __init__(self, expr, pragmas=None, init=False, operation=None): self.operation = operation def __repr__(self): - return "<%s::%s>" % (self.__class__.__name__, - filter_ordered([f.func for f in self.functions])) + return "<%s::%s=%s>" % (self.__class__.__name__, + type(self.write), + ','.join('%s' % type(f) for f in self.functions)) @property def dtype(self): diff --git a/devito/ir/stree/algorithms.py b/devito/ir/stree/algorithms.py index 0f7e46bcfd..9383501ea3 100644 --- a/devito/ir/stree/algorithms.py +++ b/devito/ir/stree/algorithms.py @@ -142,7 +142,7 @@ def preprocess(clusters, options=None, **kwargs): hs = HaloScheme.union(e.rhs.halo_scheme for e in c.exprs) queue.append(c.rebuild(halo_scheme=hs)) else: - dims = set(c.ispace.promote(lambda d: d.is_Block).itdimensions) + dims = set(c.ispace.promote(lambda d: d.is_Block).itdims) found = [] for c1 in list(queue): diff --git a/devito/ir/support/__init__.py b/devito/ir/support/__init__.py index 2b20e3ab20..8a8b50bc67 100644 --- a/devito/ir/support/__init__.py +++ b/devito/ir/support/__init__.py @@ -1,5 +1,5 @@ -from .utils import * # noqa from .vector import * # noqa +from .utils import * # noqa from .basic import * # noqa from .space import * # noqa from .guards import * # noqa diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index 0ca1da8796..21b3527c06 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -3,16 +3,17 @@ from cached_property import cached_property from sympy import S -from devito.ir.support.space import Backward, IterationSpace -from devito.ir.support.utils import AccessMode +from devito.ir.support.space import Backward, null_ispace +from devito.ir.support.utils import AccessMode, extrema from devito.ir.support.vector import LabeledVector, Vector -from devito.symbolics import (retrieve_terminals, q_constant, q_affine, q_routine, - q_terminal) -from devito.tools import (Tag, as_tuple, is_integer, filter_sorted, flatten, - memoized_meth, memoized_generator) -from devito.types import Barrier, Dimension, DimensionTuple, Jump, Symbol +from devito.symbolics import (compare_ops, retrieve_indexed, retrieve_terminals, + q_constant, q_affine, q_routine, search, uxreplace) +from devito.tools import (Tag, as_mapper, as_tuple, is_integer, filter_sorted, + flatten, memoized_meth, memoized_generator) +from devito.types import (Barrier, ComponentAccess, Dimension, DimensionTuple, + Function, Jump, Symbol, Temp, TempArray, TBArray) -__all__ = ['IterationInstance', 'TimedAccess', 'Scope'] +__all__ = ['IterationInstance', 'TimedAccess', 'Scope', 'ExprGeometry'] class IndexMode(Tag): @@ -217,12 +218,12 @@ def __new__(cls, access, mode, timestamp, ispace=None): AccessMode.__init__(obj, mode=mode) return obj 
-    def __init__(self, access, mode, timestamp, ispace=None):
+    def __init__(self, access, mode, timestamp, ispace=null_ispace):
         assert is_integer(timestamp)

         self.access = access
         self.timestamp = timestamp
-        self.ispace = ispace or IterationSpace([])
+        self.ispace = ispace

     def __repr__(self):
         mode = '\033[1;37;31mW\033[0m' if self.is_write else '\033[1;37;32mR\033[0m'
@@ -236,8 +237,9 @@ def __eq__(self, other):
         # which might require expensive comparisons of Vector entries (i.e.,
         # SymPy expressions)

-        return (self.access is other.access and  # => self.function is other.function
-                self.mode == other.mode and
+        return (self.mode == other.mode and
+                self.timestamp == other.timestamp and
+                self.access == other.access and
                 self.ispace == other.ispace)

     def __hash__(self):
@@ -322,6 +324,12 @@ def distance(self, other):
         other : TimedAccess
             The TimedAccess w.r.t. which the distance is computed.
         """
+        if isinstance(self.access, ComponentAccess) and \
+           isinstance(other.access, ComponentAccess) and \
+           self.access.index != other.access.index:
+            # E.g., `uv(x).x` and `uv(x).y` -- not a real dependence!
+            return Vector(S.ImaginaryUnit)
+
         ret = []
         for sit, oit in zip(self.itintervals, other.itintervals):
             n = len(ret)
@@ -812,77 +820,146 @@ def __init__(self, exprs, rules=None):
         A Scope enables data dependence analysis on a totally ordered sequence
         of expressions.
         """
-        exprs = as_tuple(exprs)
-
-        self.reads = {}
-        self.writes = {}
+        self.exprs = as_tuple(exprs)

-        self.initialized = set()
+        # A set of rules to drive the collection of dependencies
+        self.rules = as_tuple(rules)
+        assert all(callable(i) for i in self.rules)

-        for i, e in enumerate(exprs):
-            # Reads
-            terminals = retrieve_terminals(e.rhs, deep=True, mode='unique')
-            try:
-                terminals.update(retrieve_terminals(e.lhs.indices))
-            except AttributeError:
-                pass
-            for j in terminals:
-                v = self.reads.setdefault(j.function, [])
-                mode = 'RR' if e.is_Reduction and j.function is e.lhs.function else 'R'
-                v.append(TimedAccess(j, mode, i, e.ispace))
-
-            # Write
-            terminals = []
-            if q_terminal(e.lhs):
-                terminals.append(e.lhs)
+    @memoized_generator
+    def writes_gen(self):
+        """
+        Generate all write accesses.
+        """
+        for i, e in enumerate(self.exprs):
+            terminals = retrieve_accesses(e.lhs)
             if q_routine(e.rhs):
                 try:
-                    terminals.extend(e.rhs.writes)
+                    terminals.update(e.rhs.writes)
                 except AttributeError:
                     # E.g., foreign routines, such as `cos` or `sin`
                     pass
             for j in terminals:
-                v = self.writes.setdefault(j.function, [])
-                mode = 'WR' if e.is_Reduction else 'W'
-                v.append(TimedAccess(j, mode, i, e.ispace))
+                if e.is_Reduction:
+                    mode = 'WR'
+                else:
+                    mode = 'W'
+                yield TimedAccess(j, mode, i, e.ispace)
+
+        # Objects altering the control flow (e.g., synchronization barriers,
+        # break statements, ...) are converted into mock dependences
+        for i, e in enumerate(self.exprs):
+            if isinstance(e.rhs, (Barrier, Jump)):
+                yield TimedAccess(mocksym, 'W', i, e.ispace)
+
+    @cached_property
+    def writes(self):
+        """
+        Create a mapper from functions to write accesses.
+        """
+        return as_mapper(self.writes_gen(), key=lambda i: i.function)
+
+    @memoized_generator
+    def reads_explicit_gen(self):
+        """
+        Generate all explicit reads. These are the read accesses to the
+        AbstractFunctions and Symbols appearing in the Scope's symbolic
+        expressions.
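+
+        For instance, for `Eq(u[t + 1, x], u[t, x] + f[x])`, the explicit
+        reads would include `u[t, x]` and `f[x]`, as well as any terminals
+        appearing in the left-hand side indices.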
+ """ + for i, e in enumerate(self.exprs): + # Reads + terminals = retrieve_accesses(e.rhs, deep=True) + try: + terminals.update(retrieve_accesses(e.lhs.indices)) + except AttributeError: + pass + for j in terminals: + if j.function is e.lhs.function and e.is_Reduction: + mode = 'RR' + else: + mode = 'R' + yield TimedAccess(j, mode, i, e.ispace) # If a reduction, we got one implicit read if e.is_Reduction: - v = self.reads.setdefault(e.lhs.function, []) - v.append(TimedAccess(e.lhs, 'RR', i, e.ispace)) - - # If writing to a scalar, we have an initialization - if not e.is_Reduction and e.is_scalar: - self.initialized.add(e.lhs.function) + yield TimedAccess(e.lhs, 'RR', i, e.ispace) # Look up ConditionalDimensions for v in e.conditionals.values(): - for j in retrieve_terminals(v): - v = self.reads.setdefault(j.function, []) - v.append(TimedAccess(j, 'R', -1, e.ispace)) + for j in retrieve_accesses(v): + yield TimedAccess(j, 'R', -1, e.ispace) - # The iteration symbols too - dimensions = set().union(*[e.dimensions for e in exprs]) + @memoized_generator + def reads_implicit_gen(self): + """ + Generate all implicit reads. These are for examples the reads accesses + to the iteration symbols bounded to the Dimensions used in the Scope's + symbolic expressions. + """ + # The iteration symbols + dimensions = set().union(*[e.dimensions for e in self.exprs]) + symbols = set() for d in dimensions: - for i in d.free_symbols | d.bound_symbols: - v = self.reads.setdefault(i.function, []) - v.append(TimedAccess(i, 'R', -1)) + symbols.update(d.free_symbols | d.bound_symbols) + for i in symbols: + yield TimedAccess(i, 'R', -1) # Objects altering the control flow (e.g., synchronization barriers, # break statements, ...) are converted into mock dependences - for i, e in enumerate(exprs): - if e.find(Barrier) | e.find(Jump): - self.writes.setdefault(mocksym, []).append( - TimedAccess(mocksym, 'W', i, e.ispace) - ) - self.reads.setdefault(mocksym, []).extend([ - TimedAccess(mocksym, 'R', max(i, 0), e.ispace), - TimedAccess(mocksym, 'R', i+1, e.ispace), - ]) + for i, e in enumerate(self.exprs): + if isinstance(e.rhs, (Barrier, Jump)): + yield TimedAccess(mocksym, 'R', max(i, 0), e.ispace) + yield TimedAccess(mocksym, 'R', i+1, e.ispace) - # A set of rules to drive the collection of dependencies - self.rules = as_tuple(rules) - assert all(callable(i) for i in self.rules) + @memoized_generator + def reads_gen(self): + """ + Generate all read accesses. + """ + # NOTE: The reason to keep the explicit and implict reads separated + # is efficiency. Sometimes we wish to extract all reads to a given + # AbstractFunction, and we know that by construction these can't + # appear among the implicit reads + return chain(self.reads_explicit_gen(), self.reads_implicit_gen()) + + @memoized_generator + def reads_smart_gen(self, f): + """ + Generate all read access to a given function. + + StencilDimensions, if any, are replaced with their extrema. + + Notes + ----- + The implementation is smart, in the sense that, depending on the + given function type, it will not necessarily look everywhere inside + the scope to retrieve the corresponding read accesses. Instead, it + will only look in the places where the given type is expected to + be found. For example, a DiscreteFunction would never appear among + the iteration symbols. 
+ """ + if isinstance(f, (Function, Temp, TempArray, TBArray)): + for i in self.reads_explicit_gen(): + if f is i.function: + for j in extrema(i.access): + yield TimedAccess(j, i.mode, i.timestamp, i.ispace) + + else: + for i in self.reads_gen(): + if f is i.function: + yield i + + @cached_property + def reads(self): + """ + Create a mapper from functions to read accesses. + """ + return as_mapper(self.reads_gen(), key=lambda i: i.function) + + @cached_property + def initialized(self): + return frozenset(e.lhs.function for e in self.exprs + if not e.is_Reduction and e.is_scalar) def getreads(self, function): return as_tuple(self.reads.get(function)) @@ -894,7 +971,8 @@ def __getitem__(self, function): return self.getwrites(function) + self.getreads(function) def __repr__(self): - tracked = filter_sorted(set(self.reads) | set(self.writes), key=lambda i: i.name) + tracked = filter_sorted(set(self.reads) | set(self.writes), + key=lambda i: i.name) maxlen = max(1, max([len(i.name) for i in tracked])) out = "{:>%d} => W : {}\n{:>%d} R : {}" % (maxlen, maxlen) pad = " "*(maxlen + 9) @@ -944,10 +1022,13 @@ def d_flow_gen(self): """Generate the flow (or "read-after-write") dependences.""" for k, v in self.writes.items(): for w in v: - for r in self.reads.get(k, []): + for r in self.reads_smart_gen(k): + if any(not rule(w, r) for rule in self.rules): + continue + dependence = Dependence(w, r) - if any(not rule(dependence) for rule in self.rules): + if dependence.is_imaginary: continue distance = dependence.distance @@ -957,8 +1038,7 @@ def d_flow_gen(self): # Non-integer vectors are not comparable. # Conservatively, we assume it is a dependence, unless # it's a read-for-reduction - is_flow = (not dependence.is_imaginary and - not r.is_read_reduction) + is_flow = not r.is_read_reduction if is_flow: yield dependence @@ -972,10 +1052,13 @@ def d_anti_gen(self): """Generate the anti (or "write-after-read") dependences.""" for k, v in self.writes.items(): for w in v: - for r in self.reads.get(k, []): + for r in self.reads_smart_gen(k): + if any(not rule(r, w) for rule in self.rules): + continue + dependence = Dependence(r, w) - if any(not rule(dependence) for rule in self.rules): + if dependence.is_imaginary: continue distance = dependence.distance @@ -985,8 +1068,7 @@ def d_anti_gen(self): # Non-integer vectors are not comparable. # Conservatively, we assume it is a dependence, unless # it's a read-for-reduction - is_anti = (not dependence.is_imaginary and - not r.is_read_reduction) + is_anti = not r.is_read_reduction if is_anti: yield dependence @@ -1001,9 +1083,12 @@ def d_output_gen(self): for k, v in self.writes.items(): for w1 in v: for w2 in self.writes.get(k, []): + if any(not rule(w2, w1) for rule in self.rules): + continue + dependence = Dependence(w2, w1) - if any(not rule(dependence) for rule in self.rules): + if dependence.is_imaginary: continue distance = dependence.distance @@ -1012,7 +1097,7 @@ def d_output_gen(self): except TypeError: # Non-integer vectors are not comparable. # Conservatively, we assume it is a dependence - is_output = not dependence.is_imaginary + is_output = True if is_output: yield dependence @@ -1039,7 +1124,7 @@ def d_from_access_gen(self, accesses): accesses = as_tuple(accesses) for d in self.d_all_gen(): for i in accesses: - if d.source is i or d.sink is i: + if d.source == i or d.sink == i: yield d break @@ -1075,3 +1160,147 @@ def r_all(self): All Relations of the Scope. 
""" return list(self.r_gen()) + + +class ExprGeometry(object): + + """ + Geometric representation of an expression by abstracting Indexeds as + LabeledVectors. + """ + + def __init__(self, expr, indexeds=None, bases=None, offsets=None): + self.expr = expr + + if indexeds is not None: + self.indexeds = indexeds + self.bases = bases + self.offsets = offsets + return + + self.indexeds = retrieve_indexed(expr) + + bases = [] + offsets = [] + for ii in self.iinstances: + base = [] + offset = [] + for e, fi, ai in zip(ii, ii.findices, ii.aindices): + if ai is None: + base.append((fi, e)) + else: + base.append((fi, ai)) + offset.append((ai, e - ai)) + bases.append(LabeledVector(base)) + offsets.append(LabeledVector(offset)) + + self.bases = bases + self.offsets = offsets + + def __repr__(self): + return "ExprGeometry(expr=%s)" % self.expr + + def translated(self, other, dims=None): + """ + True if `self` is translated w.r.t. `other`, False otherwise. + + Examples + -------- + Two expressions are translated if they perform the same operations, + their bases are the same and their offsets are pairwise translated. + + c := A[i,j] op A[i,j+1] -> Toffsets = {i: [0,0], j: [0,1]} + u := A[i+1,j] op A[i+1,j+1] -> Toffsets = {i: [1,1], j: [0,1]} + + Then `c` is translated w.r.t. `u` with distance `{i: 1, j: 0}` + + The test may be strengthen by imposing that a translation occurs + only along a specific set of Dimensions through the kwarg `dims`. + """ + # Check mathematical structure + if not compare_ops(self.expr, other.expr): + return False + + # Use a suitable value for `dims` if not provided by user + if dims is None: + if self.aindices != other.aindices: + return False + dims = self.aindices + dims = set(as_tuple(dims)) + + # Check bases and offsets + for i in ['Tbases', 'Toffsets']: + Ti0 = getattr(self, i) + Ti1 = getattr(other, i) + + m0 = dict(Ti0) + m1 = dict(Ti1) + + # The only hope in presence of Dimensions appearing only in either + # `self` or `other` is that they have been projected away by the caller + for d in set(m0).symmetric_difference(set(m1)): + if not d._defines & dims: + return False + + for d in set(m0).union(set(m1)): + try: + o0 = m0[d] + o1 = m1[d] + except KeyError: + continue + + distance = set(o0 - o1) + if len(distance) != 1: + return False + + if not d._defines & dims: + if distance.pop() != 0: + return False + + return True + + @cached_property + def iinstances(self): + return tuple(IterationInstance(i) for i in self.indexeds) + + @cached_property + def Tbases(self): + return LabeledVector.transpose(*self.bases) + + @cached_property + def Toffsets(self): + return LabeledVector.transpose(*self.offsets) + + @cached_property + def dimensions(self): + return frozenset(i for i, _ in self.Toffsets) + + @cached_property + def aindices(self): + try: + return tuple(zip(*self.Toffsets))[0] + except IndexError: + return () + + @property + def is_regular(self): + return all(i.is_regular for i in self.iinstances) + + +# *** Utils + +def retrieve_accesses(exprs, **kwargs): + """ + Like retrieve_terminals, but ensure that if a ComponentAccess is found, + the ComponentAccess itself is returned, while the wrapped Indexed is discarded. 
+ """ + kwargs['mode'] = 'unique' + + compaccs = search(exprs, ComponentAccess) + if not compaccs: + return retrieve_terminals(exprs, **kwargs) + + subs = {i: Symbol('dummy%d' % n) for n, i in enumerate(compaccs)} + exprs1 = uxreplace(exprs, subs) + + return compaccs | retrieve_terminals(exprs1, **kwargs) - set(subs.values()) diff --git a/devito/ir/support/guards.py b/devito/ir/support/guards.py index dd1bcc01a5..54bd90bfa0 100644 --- a/devito/ir/support/guards.py +++ b/devito/ir/support/guards.py @@ -8,7 +8,7 @@ from devito.ir.support.space import Forward, IterationDirection from devito.symbolics import CondEq, CondNe -from devito.tools import Pickable, as_tuple, frozendict +from devito.tools import Pickable, as_tuple, frozendict, split from devito.types import Dimension __all__ = ['GuardFactor', 'GuardBound', 'GuardBoundNext', 'BaseGuardBound', @@ -228,7 +228,7 @@ def andg(self, d, guard): return Guards(m) try: - m[d] = And(m[d], guard) + m[d] = simplify_and(m[d], guard) except KeyError: m[d] = guard @@ -269,3 +269,40 @@ def filter(self, key): m = {d: v for d, v in self.items() if key(d)} return Guards(m) + + +# *** Utils + +def simplify_and(relation, v): + """ + Given `x = And(*relation.args, v)`, return `relation` if `x ≡ relation`, + `x` otherwise. + + SymPy doesn't have a builtin function to simplify boolean inequalities; here, + we run a set of simple checks to at least discard the most obvious (and thus + annoying to see in the generated code) redundancies. + """ + if isinstance(relation, And): + candidates, other = split(list(relation.args), lambda a: type(a) is type(v)) + elif type(relation) is type(v): + candidates, other = [relation], [] + else: + candidates, other = [], [relation, v] + + covered = False + new_args = [] + for a in candidates: + if a.lhs is v.lhs: + covered = True + if type(a) in (Gt, Ge) and v.rhs > a.rhs: + new_args.append(v) + elif type(a) in (Lt, Le) and v.rhs < a.rhs: + new_args.append(v) + else: + new_args.append(a) + else: + new_args.append(a) + if not covered: + new_args.append(v) + + return And(*(new_args + other)) diff --git a/devito/ir/support/properties.py b/devito/ir/support/properties.py index c3c808356e..e9317ba1be 100644 --- a/devito/ir/support/properties.py +++ b/devito/ir/support/properties.py @@ -209,12 +209,12 @@ def is_parallel(self, dims): return any(len(self[d] & {PARALLEL, PARALLEL_INDEP}) > 0 for d in as_tuple(dims)) - def is_parallel_atomic(self, dims): - return any(len(self[d] & {PARALLEL_IF_ATOMIC}) > 0 for d in as_tuple(dims)) - def is_parallel_relaxed(self, dims): return any(len(self[d] & PARALLELS) > 0 for d in as_tuple(dims)) + def is_parallel_atomic(self, dims): + return any(PARALLEL_IF_ATOMIC in self.get(d, ()) for d in as_tuple(dims)) + def is_affine(self, dims): return any(AFFINE in self.get(d, ()) for d in as_tuple(dims)) diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py index b85b017b4a..7abc68ce08 100644 --- a/devito/ir/support/space.py +++ b/devito/ir/support/space.py @@ -6,14 +6,15 @@ from cached_property import cached_property from sympy import Expr -from devito.ir.support.utils import sdims_min, sdims_max +from devito.ir.support.utils import minimum, maximum from devito.ir.support.vector import Vector, vmin, vmax -from devito.tools import (PartialOrderTuple, Stamp, as_list, as_tuple, filter_ordered, +from devito.tools import (Ordering, Stamp, as_list, as_tuple, filter_ordered, flatten, frozendict, is_integer, toposort) from devito.types import Dimension, ModuloDimension __all__ = ['NullInterval', 
'Interval', 'IntervalGroup', 'IterationSpace', - 'IterationInterval', 'DataSpace', 'Forward', 'Backward', 'Any'] + 'IterationInterval', 'DataSpace', 'Forward', 'Backward', 'Any', + 'null_ispace'] # The default Stamp, used by all new Intervals @@ -286,30 +287,40 @@ def promote(self, cond): return self def expand(self): - return Interval(self.dim, sdims_min(self.lower), sdims_max(self.upper), - self.stamp) + return Interval( + self.dim, minimum(self.lower), maximum(self.upper), self.stamp + ) -class IntervalGroup(PartialOrderTuple): +class IntervalGroup(Ordering): """ - A partially-ordered sequence of Intervals equipped with set-like - operations. + A sequence of Intervals equipped with set-like operations. """ @classmethod def reorder(cls, items, relations): if not all(isinstance(i, AbstractInterval) for i in items): - raise ValueError("Cannot create an IntervalGroup from objects of type [%s]" % + raise ValueError("Cannot create IntervalGroup from objs of type [%s]" % ', '.join(str(type(i)) for i in items)) - # The relations are between dimensions, not intervals. So we take - # care of that here - ordering = filter_ordered(toposort(relations) + [i.dim for i in items]) + + if len(relations) == 1: + # Special case: avoid expensive topological sorting if possible + relation, = relations + ordering = filter_ordered(list(relation) + [i.dim for i in items]) + else: + ordering = filter_ordered(toposort(relations) + [i.dim for i in items]) + return sorted(items, key=lambda i: ordering.index(i.dim)) + @classmethod + def simplify_relations(cls, relations, items, mode): + if mode == 'total': + return [tuple(i.dim for i in items)] + else: + return super().simplify_relations(relations, items, mode) + def __eq__(self, o): - # No need to look at the relations -- if the partial ordering is the same, - # then the IntervalGroups are considered equal return len(self) == len(o) and all(i == j for i, j in zip(self, o)) def __contains__(self, d): @@ -341,7 +352,7 @@ def is_well_defined(self): return len(self.dimensions) == len(set(self.dimensions)) @classmethod - def generate(self, op, *interval_groups, relations=None): + def generate(cls, op, *interval_groups, relations=None): """ Create a new IntervalGroup from the iterative application of an operation to some IntervalGroups. @@ -354,8 +365,7 @@ def generate(self, op, *interval_groups, relations=None): *interval_groups Input IntervalGroups. relations : tuple, optional - Relations to be used in the newly constructed IntervalGroup, in addition - to the ones inherited via each element in `interval_groups`. + Additional relations to build the new IntervalGroup. Examples -------- @@ -383,12 +393,19 @@ def generate(self, op, *interval_groups, relations=None): relations = set(as_tuple(relations)) relations.update(set().union(*[ig.relations for ig in interval_groups])) - return IntervalGroup(intervals, relations=relations) + modes = set(ig.mode for ig in interval_groups) + assert len(modes) <= 1 + try: + mode = modes.pop() + except KeyError: + mode = 'total' + + return IntervalGroup(intervals, relations=relations, mode=mode) def is_compatible(self, o): """ - Two IntervalGroups are compatible iff they can be ordered according - to some common partial ordering. + Two IntervalGroups are compatible iff their elements are not subjected + to sets of relations that would prevent a reordering. 
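+
+        For instance, two IntervalGroups over `{x, y}` carrying the relations
+        `{(x, y)}` and `{(y, x)}` respectively are not compatible, since the
+        union of their relations contains a cycle.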
""" if set(self) != set(o): return False @@ -399,7 +416,7 @@ def is_compatible(self, o): self.add(o) return True except ValueError: - # Cyclic dependence detected, there is no common partial ordering + # Cyclic dependence detected return False def _normalize(func): @@ -410,38 +427,42 @@ def _normalize(func): def wrapper(self, o): if not isinstance(o, IntervalGroup): o = IntervalGroup(as_tuple(o)) - return func(self, o) + if self.mode != o.mode: + raise ValueError("Incompatible ordering modes") + relations = self.relations | o.relations + return func(self, o, relations, self.mode) return wrapper @_normalize - def intersection(self, o): + def intersection(self, o, relations, mode): mapper = OrderedDict([(i.dim, i) for i in o]) intervals = [i.intersection(mapper.get(i.dim, i)) for i in self] - return IntervalGroup(intervals, relations=(self.relations | o.relations)) + return IntervalGroup(intervals, relations=relations, mode=mode) @_normalize - def add(self, o): + def add(self, o, relations, mode): mapper = OrderedDict([(i.dim, i) for i in o]) intervals = [i.add(mapper.get(i.dim, NullInterval(i.dim))) for i in self] - return IntervalGroup(intervals, relations=(self.relations | o.relations)) + return IntervalGroup(intervals, relations=relations, mode=mode) @_normalize - def subtract(self, o): + def subtract(self, o, relations, mode): mapper = OrderedDict([(i.dim, i) for i in o]) intervals = [i.subtract(mapper.get(i.dim, NullInterval(i.dim))) for i in self] - return IntervalGroup(intervals, relations=(self.relations | o.relations)) + return IntervalGroup(intervals, relations=relations, mode=mode) def relaxed(self, d=None): d = set(self.dimensions if d is None else as_tuple(d)) intervals = [i.relaxed if i.dim in d else i for i in self] - return IntervalGroup(intervals, relations=self.relations) + return IntervalGroup(intervals, relations=self.relations, mode=self.mode) def promote(self, cond): - intervals = IntervalGroup([i.promote(cond) for i in self], - relations=self.relations) + intervals = [i.promote(cond) for i in self] + intervals = IntervalGroup(intervals, relations=None, mode='unordered') - # There could be duplicate Dimensions at this point, so we sum up the Intervals - # defined over the same Dimension to produce a well-defined IntervalGroup + # There could be duplicate Dimensions at this point, so we sum up the + # Intervals defined over the same Dimension to produce a well-defined + # IntervalGroup intervals = IntervalGroup.generate('add', intervals) return intervals @@ -458,23 +479,29 @@ def drop(self, d, strict=False): # Clean up relations relations = [tuple(i for i in r if i not in dims) for r in self.relations] - return IntervalGroup(intervals, relations=relations) + return IntervalGroup(intervals, relations=relations, mode=self.mode) def negate(self): - return IntervalGroup([i.negate() for i in self], relations=self.relations) + intervals = [i.negate() for i in self] + + return IntervalGroup(intervals, relations=self.relations, mode=self.mode) def zero(self, d=None): d = self.dimensions if d is None else as_tuple(d) - return IntervalGroup([i.zero() if i.dim in d else i for i in self], - relations=self.relations) + intervals = [i.zero() if i.dim in d else i for i in self] + + return IntervalGroup(intervals, relations=self.relations, mode=self.mode) def lift(self, d=None, v=None): d = set(self.dimensions if d is None else as_tuple(d)) - return IntervalGroup([i.lift(v) if i.dim._defines & d else i for i in self], - relations=self.relations) + intervals = [i.lift(v) if 
i.dim._defines & d else i for i in self] + + return IntervalGroup(intervals, relations=self.relations, mode=self.mode) def reset(self): - return IntervalGroup([i.reset() for i in self], relations=self.relations) + intervals = [i.reset() for i in self] + + return IntervalGroup(intervals, relations=self.relations, mode=self.mode) def switch(self, d0, d1): intervals = [i.switch(d1) if i.dim is d0 else i for i in self] @@ -482,17 +509,20 @@ def switch(self, d0, d1): # Update relations too relations = {tuple(d1 if i is d0 else i for i in r) for r in self.relations} - return IntervalGroup(intervals, relations=relations) + return IntervalGroup(intervals, relations=relations, mode=self.mode) def translate(self, d, v0=0, v1=None): - intervals = [i.translate(v0, v1) if i.dim in as_tuple(d) else i for i in self] - return IntervalGroup(intervals, relations=self.relations) + dims = as_tuple(d) + intervals = [i.translate(v0, v1) if i.dim in dims else i for i in self] + + return IntervalGroup(intervals, relations=self.relations, mode=self.mode) def expand(self, d=None): if d is None: d = self.dimensions intervals = [i.expand() if i.dim in as_tuple(d) else i for i in self] - return IntervalGroup(intervals, relations=self.relations) + + return IntervalGroup(intervals, relations=self.relations, mode=self.mode) def index(self, key): if isinstance(key, Interval): @@ -506,7 +536,7 @@ def __getitem__(self, key): return super(IntervalGroup, self).__getitem__(key) elif isinstance(key, slice): retval = super(IntervalGroup, self).__getitem__(key) - return IntervalGroup(retval, relations=self.relations) + return IntervalGroup(retval, relations=self.relations, mode=self.mode) if not self.is_well_defined: raise ValueError("Cannot fetch Interval from ill defined Space") @@ -627,6 +657,14 @@ def dimensions(self): def size(self): return self.intervals.size + @property + def mode(self): + return self.intervals.mode + + @property + def relations(self): + return self.intervals.relations + @property def dimension_map(self): """ @@ -650,14 +688,15 @@ class DataSpace(Space): """ def __init__(self, intervals, parts=None): - super(DataSpace, self).__init__(intervals) + super().__init__(intervals) parts = {k: v.expand() for k, v in (parts or {}).items()} self._parts = frozendict(parts) def __eq__(self, other): - return isinstance(other, DataSpace) and\ - self.intervals == other.intervals and self.parts == other.parts + return (isinstance(other, DataSpace) and + self.intervals == other.intervals and + self.parts == other.parts) def __hash__(self): return hash((super(DataSpace, self).__hash__(), self.parts)) @@ -771,7 +810,7 @@ def __getitem__(self, key): def generate(self, op, *others, relations=None): if not others: return IterationSpace(IntervalGroup()) - elif len(others) == 1: + elif len(others) == 1 and not relations: return others[0] intervals = [i.intervals for i in others] @@ -858,8 +897,12 @@ def project(self, cond, strict=True): dims = [i.dim for i in self if not func(i.dim)] intervals = self.intervals.drop(dims, strict=strict) - sub_iterators = {k: v for k, v in self.sub_iterators.items() if func(k)} - directions = {k: v for k, v in self.directions.items() if func(k)} + sub_iterators = {} + directions = {} + for i in intervals: + d = i.dim + sub_iterators[d] = self.sub_iterators[d] + directions[d] = self.directions[d] return IterationSpace(intervals, sub_iterators, directions) @@ -904,6 +947,25 @@ def promote(self, cond): return IterationSpace(intervals, sub_iterators, directions) + def prefix(self, key): + """ + Return 
the IterationSpace up to and including the last Interval + such that `key(interval)` is True. + """ + try: + i = self.project(key)[-1] + except IndexError: + return None + + return self[:self.index(i.dim) + 1] + + def reorder(self, relations=None, mode=None): + relations = relations or self.relations + mode = mode or self.mode + intervals = IntervalGroup(self.intervals, relations=relations, mode=mode) + + return IterationSpace(intervals, self.sub_iterators, self.directions) + def is_compatible(self, other): """ A relaxed version of ``__eq__``, in which only non-derived dimensions @@ -913,13 +975,9 @@ def is_compatible(self, other): self.nonderived_directions == other.nonderived_directions) @property - def itdimensions(self): + def itdims(self): return self.intervals.dimensions - @property - def relations(self): - return self.intervals.relations - @property def sub_iterators(self): return self._sub_iterators @@ -930,13 +988,16 @@ def directions(self): @cached_property def itintervals(self): - return tuple(self[d] for d in self.itdimensions) + return tuple(self[d] for d in self.itdims) @cached_property def dimensions(self): sub_dims = flatten(i._defines for v in self.sub_iterators.values() for i in v) - return tuple(filter_ordered(self.itdimensions + tuple(sub_dims))) + return tuple(filter_ordered(self.itdims + tuple(sub_dims))) @cached_property def nonderived_directions(self): return {k: v for k, v in self.directions.items() if not k.is_Derived} + + +null_ispace = IterationSpace([]) diff --git a/devito/ir/support/utils.py b/devito/ir/support/utils.py index 3750b08a0e..a2dc2b4c73 100644 --- a/devito/ir/support/utils.py +++ b/devito/ir/support/utils.py @@ -1,12 +1,16 @@ -from collections import defaultdict +from collections import defaultdict, namedtuple +from itertools import product -from devito.symbolics import CallFromPointer, retrieve_indexed, retrieve_terminals +from devito.finite_differences import IndexDerivative +from devito.symbolics import (CallFromPointer, retrieve_indexed, retrieve_terminals, + search) from devito.tools import DefaultOrderedDict, as_tuple, flatten, filter_sorted, split from devito.types import (Dimension, DimensionTuple, Indirection, ModuloDimension, StencilDimension) __all__ = ['AccessMode', 'Stencil', 'IMask', 'detect_accesses', 'detect_io', - 'pull_dims', 'sdims_min', 'sdims_max'] + 'pull_dims', 'unbounded', 'minimum', 'maximum', 'minmax_index', + 'extrema', 'erange'] class AccessMode(object): @@ -272,25 +276,91 @@ def pull_dims(exprs, flag=True): # *** Utility functions for expressions that potentially contain StencilDimensions -def sdims_min(expr): +def unbounded(expr): """ - Replace all StencilDimensions in `expr` with their minimum point. + Retrieve all unbounded Dimensions in `expr`. """ - try: - sdims = expr.find(StencilDimension) - except AttributeError: - return expr - mapper = {e: e._min for e in sdims} - return expr.subs(mapper) + # At the moment we only have logic to retrieve unbounded StencilDimensions, + # but in the future this might change + bound = set().union(*[i.dimensions for i in search(expr, IndexDerivative)]) + sdims = search(expr, StencilDimension, mode='unique', deep=True) + + return sdims - bound + + +Extrema = namedtuple('Extrema', 'm M') -def sdims_max(expr): +def _relational(expr, callback, udims=None): """ - Replace all StencilDimensions in `expr` with their maximum point. + Helper for `minimum`, `maximum`, and potential future utilities that share + a significant chunk of logic. 
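+
+    For instance, with `i` a StencilDimension spanning `[-2, 2]`,
+    `_relational(x + i, lambda e: e._min)` substitutes `i` with its minimum
+    point, returning `x - 2`.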
""" - try: - sdims = expr.find(StencilDimension) - except AttributeError: + if not udims: + udims = unbounded(expr) + + # Resolution rule 1: StencilDimensions + sdims = [d for d in udims if d.is_Stencil] + if not sdims: return expr - mapper = {e: e._max for e in sdims} + mapper = {e: callback(e) for e in sdims} + return expr.subs(mapper) + + +def minimum(expr, udims=None): + """ + Substitute the unbounded Dimensions in `expr` with their minimum point. + + Unbounded Dimensions whose possible minimum value is not known are ignored. + """ + return _relational(expr, lambda e: e._min, udims) + + +def maximum(expr, udims=None): + """ + Substitute the unbounded Dimensions in `expr` with their maximum point. + + Unbounded Dimensions whose possible maximum value is not known are ignored. + """ + return _relational(expr, lambda e: e._max, udims) + + +def extrema(expr): + """ + The minimum and maximum extrema assumed by `expr` once the unbounded + Dimensions are resolved. + """ + return Extrema(minimum(expr), maximum(expr)) + + +def minmax_index(expr, d): + """ + Return the minimum and maximum indices along the `d` Dimension + among all Indexeds in `expr`. + """ + indices = set() + for i in retrieve_indexed(expr): + try: + indices.add(i.indices[d]) + except KeyError: + pass + + return Extrema(min(minimum(i) for i in indices), + max(maximum(i) for i in indices)) + + +def erange(expr): + """ + All possible values that `expr` can assume once its unbounded Dimensions + are resolved. + """ + udims = unbounded(expr) + if not udims: + return (expr,) + + sdims = [d for d in udims if d.is_Stencil] + ranges = [i.range for i in sdims] + mappers = [dict(zip(sdims, i)) for i in product(*ranges)] + + return tuple(expr.subs(m) for m in mappers) diff --git a/devito/operator/operator.py b/devito/operator/operator.py index eb8b793f22..0fadc5dc47 100644 --- a/devito/operator/operator.py +++ b/devito/operator/operator.py @@ -311,8 +311,15 @@ def _lower_exprs(cls, expressions, **kwargs): # Specialization is performed on unevaluated expressions expressions = cls._specialize_dsl(expressions, **kwargs) - # Lower functional DSL + # Lower FD derivatives + # NOTE: we force expansion of derivatives along SteppingDimensions + # because it drastically simplifies the subsequent lowering into + # ModuloDimensions + if not expand: + expand = lambda d: d.is_Stepping expressions = flatten([i._evaluate(expand=expand) for i in expressions]) + + # Scalarize the tensor equations, if any expressions = [j for i in expressions for j in i._flatten] # A second round of specialization is performed on evaluated expressions @@ -930,15 +937,22 @@ def _emit_apply_profiling(self, args): # Emit local, i.e. "per-rank" performance. 
Without MPI, this is the only # thing that will be emitted - for k, v in summary.items(): - rank = "[rank%d]" % k.rank if k.rank is not None else "" + def lower_perfentry(v): if v.gflopss: oi = "OI=%.2f" % fround(v.oi) gflopss = "%.2f GFlops/s" % fround(v.gflopss) gpointss = "%.2f GPts/s" % fround(v.gpointss) - metrics = "[%s]" % ", ".join([oi, gflopss, gpointss]) + return "[%s]" % ", ".join([oi, gflopss, gpointss]) + elif v.gpointss: + gpointss = "%.2f GPts/s" % fround(v.gpointss) + return "[%s]" % gpointss else: - metrics = "" + return "" + + for k, v in summary.items(): + rank = "[rank%d]" % k.rank if k.rank is not None else "" + + metrics = lower_perfentry(v) itershapes = [",".join(str(i) for i in its) for its in v.itershapes] if len(itershapes) > 1: @@ -950,9 +964,12 @@ def _emit_apply_profiling(self, args): name = "%s%s<%s>" % (k.name, rank, itershapes) perf("%s* %s ran in %.2f s %s" % (indent, name, fround(v.time), metrics)) - for n, time in summary.subsections.get(k.name, {}).items(): - perf("%s+ %s ran in %.2f s [%.2f%%]" % - (indent*2, n, time, fround(time/v.time*100))) + for n, v1 in summary.subsections.get(k.name, {}).items(): + metrics = lower_perfentry(v1) + + perf("%s+ %s ran in %.2f s [%.2f%%] %s" % + (indent*2, n, fround(v1.time), fround(v1.time/v.time*100), + metrics)) # Emit performance mode and arguments perf_args = {} diff --git a/devito/operator/profiling.py b/devito/operator/profiling.py index 2a58677481..53f16e526f 100644 --- a/devito/operator/profiling.py +++ b/devito/operator/profiling.py @@ -105,6 +105,25 @@ def track_subsection(self, sname, name): v = self._subsections.setdefault(sname, OrderedDict()) v[name] = SectionData(S.Zero, S.Zero, S.Zero, S.Zero, []) + def group_as_subsections(self, sname, sections): + ops = sum(self._sections[i].ops for i in sections) + points = sum(self._sections[i].points for i in sections) + traffic = sum(self._sections[i].traffic for i in sections) + sectiondata = SectionData(ops, S.Zero, points, traffic, []) + + v = self._subsections.setdefault(sname, OrderedDict()) + v.update({i: self._sections[i] for i in sections}) + + new_sections = OrderedDict() + for k, v in self._sections.items(): + try: + if sections.index(k) == len(sections) - 1: + new_sections[sname] = sectiondata + except ValueError: + new_sections[k] = v + self._sections.clear() + self._sections.update(new_sections) + def instrument(self, iet, timer): """ Instrument the given IET for C-level performance profiling. 
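The new `group_as_subsections` above folds a list of existing sections into a
single aggregate entry, preserving the originals as subsections. A minimal
sketch of the intended effect, with hypothetical section names `'s0'` and
`'s1'` (and `d0`, `d1` their SectionData):

    # Before: profiler._sections == {'s0': d0, 's1': d1}
    profiler.group_as_subsections('glb', ['s0', 's1'])
    # After:  profiler._sections == {'glb': SectionData(d0.ops + d1.ops, ...)}
    #         profiler._subsections['glb'] == {'s0': d0, 's1': d1}

Only the slot of the last grouped section receives the aggregate, so any
non-grouped sections keep their relative order.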
@@ -211,6 +230,58 @@ class AdvancedProfiler(Profiler):
 
     _supports_async_sections = True
 
+    def _evaluate_section(self, name, data, args, dtype):
+        # Time to run the section
+        time = max(getattr(args[self.name]._obj, name), 10e-7)
+
+        # Number of FLOPs performed
+        try:
+            ops = int(subs_op_args(data.ops, args))
+        except (AttributeError, TypeError):
+            # E.g., a section comprising just function calls, or at least
+            # a sequence of unrecognized or non-conventional expr statements
+            ops = np.nan
+
+        try:
+            # Number of grid points computed
+            points = int(subs_op_args(data.points, args))
+
+            # Compulsory traffic
+            traffic = float(subs_op_args(data.traffic, args)*dtype().itemsize)
+        except (AttributeError, TypeError):
+            # E.g., the section has a dynamic loop size
+            points = np.nan
+
+            traffic = np.nan
+
+        # Number of FLOPs performed at each iteration
+        sops = data.sops
+
+        # Runtime itermaps/itershapes
+        try:
+            itermaps = [OrderedDict([(k, int(subs_op_args(v, args)))
+                                     for k, v in i.items()])
+                        for i in data.itermaps]
+            itershapes = tuple(tuple(i.values()) for i in itermaps)
+        except TypeError:
+            # E.g., a section comprising just function calls, or at least
+            # a sequence of unrecognized or non-conventional expr statements
+            itershapes = ()
+
+        return time, ops, points, traffic, sops, itershapes
+
+    def _allgather_from_comm(self, comm, time, ops, points, traffic, sops, itershapes):
+        times = comm.allgather(time)
+        assert comm.size == len(times)
+
+        opss = comm.allgather(ops)
+        pointss = comm.allgather(points)
+        traffics = comm.allgather(traffic)
+        sops = [sops]*comm.size
+        itershapess = comm.allgather(itershapes)
+
+        return list(zip(times, opss, pointss, traffics, sops, itershapess))
+
     # Override basic summary so that arguments other than runtime are computed.
def summary(self, args, dtype, reduce_over=None): grid = args.grid @@ -219,71 +290,30 @@ def summary(self, args, dtype, reduce_over=None): # Produce sections summary summary = PerformanceSummary() for name, data in self._sections.items(): - # Time to run the section - time = max(getattr(args[self.name]._obj, name), 10e-7) - - # Number of FLOPs performed - try: - ops = int(subs_op_args(data.ops, args)) - except (AttributeError, TypeError): - # E.g., a section comprising just function calls, or at least - # a sequence of unrecognized or non-conventional expr statements - ops = np.nan - - try: - # Number of grid points computed - points = int(subs_op_args(data.points, args)) - - # Compulsory traffic - traffic = float(subs_op_args(data.traffic, args)*dtype().itemsize) - except (AttributeError, TypeError): - # E.g., the section has a dynamic loop size - points = np.nan - - traffic = np.nan - - # Runtime itermaps/itershapes - try: - itermaps = [OrderedDict([(k, int(subs_op_args(v, args))) - for k, v in i.items()]) - for i in data.itermaps] - itershapes = tuple(tuple(i.values()) for i in itermaps) - except TypeError: - # E.g., a section comprising just function calls, or at least - # a sequence of unrecognized or non-conventional expr statements - itershapes = () + items = self._evaluate_section(name, data, args, dtype) # Add local performance data if comm is not MPI.COMM_NULL: # With MPI enabled, we add one entry per section per rank - times = comm.allgather(time) - assert comm.size == len(times) - opss = comm.allgather(ops) - pointss = comm.allgather(points) - traffics = comm.allgather(traffic) - sops = [data.sops]*comm.size - itershapess = comm.allgather(itershapes) - items = list(zip(times, opss, pointss, traffics, sops, itershapess)) + items = self._allgather_from_comm(comm, *items) for rank in range(comm.size): summary.add(name, rank, *items[rank]) else: - summary.add(name, None, time, ops, points, traffic, data.sops, itershapes) + summary.add(name, None, *items) # Enrich summary with subsections data for sname, v in self._subsections.items(): for name, data in v.items(): - # Time to run the section - time = max(getattr(args[self.name]._obj, name), 10e-7) + items = self._evaluate_section(name, data, args, dtype) # Add local performance data if comm is not MPI.COMM_NULL: # With MPI enabled, we add one entry per section per rank - times = comm.allgather(time) - assert comm.size == len(times) + items = self._allgather_from_comm(comm, *items) for rank in range(comm.size): - summary.add_subsection(sname, name, rank, time) + summary.add_subsection(sname, name, rank, *items[rank]) else: - summary.add_subsection(sname, name, None, time) + summary.add_subsection(sname, name, None, *items) # Add global performance data if reduce_over is not None: @@ -422,11 +452,11 @@ def add(self, name, rank, time, self.input[k] = PerfInput(time, ops, points, traffic, sops, itershapes) - def add_subsection(self, sname, name, rank, time): + def add_subsection(self, sname, name, rank, time, *args): k0 = PerfKey(sname, rank) assert k0 in self - self.subsections[sname][name] = time + self.subsections[sname][name] = PerfEntry(time, None, None, None, None, []) def add_glb_vanilla(self, key, time): """ diff --git a/devito/passes/clusters/__init__.py b/devito/passes/clusters/__init__.py index d4bc2e182f..c41a628e06 100644 --- a/devito/passes/clusters/__init__.py +++ b/devito/passes/clusters/__init__.py @@ -1,7 +1,7 @@ from .utils import * # noqa from .buffering import * # noqa -from .aliases import * # noqa from .cse import 
* # noqa +from .aliases import * # noqa from .factorization import * # noqa from .blocking import * # noqa from .asynchrony import * # noqa diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index e0783323fe..bfa1014748 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -6,19 +6,20 @@ import numpy as np import sympy -from devito.finite_differences import EvalDerivative +from devito.finite_differences import EvalDerivative, IndexDerivative, Weights from devito.ir import (SEQUENTIAL, PARALLEL_IF_PVT, ROUNDABLE, SEPARABLE, Forward, - IterationInstance, IterationSpace, Interval, Cluster, - Queue, IntervalGroup, LabeledVector, normalize_properties, - relax_properties, sdims_min, sdims_max) -from devito.symbolics import (Uxmapper, compare_ops, estimate_cost, q_constant, - reuse_if_untouched, retrieve_indexed, search, uxreplace, - sympy_dtype) -from devito.tools import (Stamp, as_mapper, as_tuple, flatten, frozendict, generator, - split, timed_pass) -from devito.types import (Array, TempFunction, Eq, Symbol, Temp, ModuloDimension, - CustomDimension, IncrDimension, StencilDimension, Indexed, - Hyperplane) + IterationSpace, Interval, Cluster, ExprGeometry, Queue, + IntervalGroup, LabeledVector, Vector, normalize_properties, + relax_properties, unbounded, minimum, maximum, extrema, + vmax, vmin) +from devito.passes.clusters.cse import _cse +from devito.symbolics import (Uxmapper, estimate_cost, search, reuse_if_untouched, + uxreplace, sympy_dtype) +from devito.tools import (Stamp, as_mapper, as_tuple, flatten, frozendict, + is_integer, generator, split, timed_pass) +from devito.types import (Eq, Symbol, Temp, TempArray, TempFunction, + ModuloDimension, CustomDimension, IncrDimension, + StencilDimension, Indexed, Hyperplane) from devito.types.grid import MultiSubDimension __all__ = ['cire'] @@ -83,10 +84,13 @@ def cire(clusters, mode, sregistry, options, platform): t0 = 2.0*t2[x,y,z] t1 = 3.0*t2[x,y,z+1] """ - modes = { - 'invariants': [CireInvariantsElementary, CireInvariantsDivs], - 'sops': [CireSops], - } + # NOTE: Handle prematurely expanded derivatives -- current default on + # several backends, but soon to become legacy + if mode == 'sops': + if options['expand']: + mode = 'eval-derivs' + else: + mode = 'index-derivs' for cls in modes[mode]: transformer = cls(sregistry, options, platform) @@ -121,19 +125,17 @@ def _aliases_from_clusters(self, clusters, exclude, meta): # Clusters -> AliasList found = collect(mapper.extracted, meta.ispace, self.opt_minstorage) pexprs, aliases = choose(found, exprs, mapper, self.opt_mingain) - if not aliases: - continue # AliasList -> Schedule schedule = lower_aliases(aliases, meta, self.opt_maxpar) variants.append(Variant(schedule, pexprs)) - # [Schedule]_m -> Schedule (s.t. best memory/flops trade-off) if not variants: return [] - schedule, exprs = pick_best(variants, self.opt_schedule_strategy, - self._eval_variants_delta) + + # [Schedule]_m -> Schedule (s.t. 
best memory/flops trade-off)
+        schedule, exprs = self._select(variants)
 
         # Schedule -> Schedule (optimization)
         if self.opt_rotate:
@@ -141,7 +143,8 @@ def _aliases_from_clusters(self, clusters, exclude, meta):
             schedule = optimize_schedule_padding(schedule, meta, self.platform)
 
         # Schedule -> [Clusters]_k
-        processed, subs = lower_schedule(schedule, meta, self.sregistry, self.opt_ftemps)
+        processed, subs = lower_schedule(schedule, meta, self.sregistry,
+                                         self.opt_ftemps)
 
         # [Clusters]_k -> [Clusters]_k (optimization)
         if self.opt_multisubdomain:
@@ -163,12 +166,44 @@ def _aliases_from_clusters(self, clusters, exclude, meta):
 
         return processed
 
+    def process(self, clusters):
+        raise NotImplementedError
+
+    def _generate(self, exprs, exclude):
+        """
+        Generate one or more extractions from ``exprs``. An extraction is a
+        set of CIRE candidates which may be turned into aliases. Two different
+        extractions may contain overlapping sub-expressions and, therefore,
+        should be processed and evaluated independently. An extraction won't
+        contain any of the symbols appearing in ``exclude``.
+        """
+        raise NotImplementedError
+
+    def _lookup_key(self, c):
+        """
+        Create a key for the given Cluster. Clusters with the same key may be
+        processed together in the search for CIRE candidates. Clusters should
+        have a different key if they must not be processed together, e.g.,
+        when this would lead to violation of data dependencies.
+        """
+        raise NotImplementedError
+
+    def _select(self, variants):
+        """
+        Select the best variant out of a set of variants, weighing flops
+        and working set.
+        """
+        raise NotImplementedError
+
+
+class CireTransformerLegacy(CireTransformer):
+
     def _do_generate(self, exprs, exclude, cbk_search, cbk_compose=None):
         """
         Carry out the bulk of the work of ``_generate``.
         """
         counter = generator()
-        make = lambda: Symbol(name='dummy%d' % counter())
+        make = lambda: Symbol(name='dummy%d' % counter(), dtype=np.float32)
 
         if cbk_compose is None:
             cbk_compose = lambda *args: None
@@ -193,38 +228,8 @@ def _do_generate(self, exprs, exclude, cbk_search, cbk_compose=None):
 
         return mapper
 
-    def _generate(self, exprs, exclude):
-        """
-        Generate one or more extractions from ``exprs``. An extraction is a
-        set of CIRE candidates which may be turned into aliases. Two different
-        extractions may contain overlapping sub-expressions and, therefore,
-        should be processed and evaluated indipendently. An extraction won't
-        contain any of the symbols appearing in ``exclude``.
-        """
-        raise NotImplementedError
-
-    def _lookup_key(self, c):
-        """
-        Create a key for the given Cluster. Clusters with same key may be
-        processed together in the search for CIRE candidates. Clusters should
-        have a different key if they must not be processed together, e.g.,
-        when this would lead to violation of data dependencies.
-        """
-        raise NotImplementedError
-
-    def _eval_variants_delta(self, delta_flops, delta_ws):
-        """
-        Given the deltas in flop reduction and working set size increase of two
-        Variants, return True if the second variant is estimated to deliever
-        better performance, False otherwise.
- """ - raise NotImplementedError - def process(self, clusters): - raise NotImplementedError - - -class CireInvariants(CireTransformer, Queue): +class CireInvariants(CireTransformerLegacy, Queue): def __init__(self, sregistry, options, platform): super().__init__(sregistry, options, platform) @@ -272,10 +277,8 @@ def _lookup_key(self, c, d): return AliasKey(ispace, intervals, c.dtype, c.guards, properties) - def _eval_variants_delta(self, delta_flops, delta_ws): - # Always prefer the Variant with fewer temporaries - return ((delta_ws == 0 and delta_flops < 0) or - (delta_ws > 0)) + def _select(self, variants): + return pick_best(variants) class CireInvariantsElementary(CireInvariants): @@ -310,7 +313,7 @@ def _generate(self, exprs, exclude): yield self._do_generate(exprs, exclude, cbk_search) -class CireSops(CireTransformer): +class CireDerivatives(CireTransformerLegacy): def __init__(self, sregistry, options, platform): super().__init__(sregistry, options, platform) @@ -341,55 +344,105 @@ def process(self, clusters): return processed def _generate(self, exprs, exclude): - # E.g., extract `u.dx*a*b` and `u.dx*a*c` from `[(u.dx*a*b).dy`, `(u.dx*a*c).dy]` - def cbk_search(expr): - if isinstance(expr, EvalDerivative) and not expr.base.is_Function: - return expr.args - else: - return flatten(e for e in [cbk_search(a) for a in expr.args] if e) - - cbk_compose = lambda e: split_coeff(e)[1] - basextr = self._do_generate(exprs, exclude, cbk_search, cbk_compose) + # E.g., extract `u.dx*a*b` and `u.dx*a*c` from + # `[(u.dx*a*b).dy`, `(u.dx*a*c).dy]` + basextr = self._do_generate(exprs, exclude, self._cbk_search, + self._cbk_compose) if not basextr: return yield basextr # E.g., extract `u.dx*a` from `[(u.dx*a*b).dy, (u.dx*a*c).dy]` - # That is, attempt extracting the largest common derivative-induced subexprs + # I.e., attempt extracting the largest common derivative-induced subexprs mappers = [deindexify(e) for e in basextr.extracted] counter = Counter(flatten(m.keys() for m in mappers)) groups = as_mapper(counter, key=counter.get) grank = {k: sorted(v, key=lambda e: estimate_cost(e), reverse=True) for k, v in groups.items()} - def cbk_search2(expr, rank): - ret = [] - for e in cbk_search(expr): - mapper = deindexify(e) - for i in rank: - if i in mapper: - ret.extend(mapper[i]) - break - return ret - candidates = sorted(grank, reverse=True)[:2] for i in candidates: lower_pri_elems = flatten([grank[j] for j in candidates if j != i]) - cbk_search_i = lambda e: cbk_search2(e, grank[i] + lower_pri_elems) - yield self._do_generate(exprs, exclude, cbk_search_i, cbk_compose) + cbk_search = lambda e: self._cbk_search2(e, grank[i] + lower_pri_elems) + yield self._do_generate(exprs, exclude, cbk_search, self._cbk_compose) def _lookup_key(self, c): return AliasKey(c.ispace, None, c.dtype, c.guards, c.properties) - def _eval_variants_delta(self, delta_flops, delta_ws): - # If there's a greater flop reduction using fewer temporaries, no doubts - # what's gonna be the best variant. 
But if the better flop reduction - # comes at the price of using more temporaries, then we have to apply - # heuristics, in particular we estimate how many flops would a temporary - # allow to save - return ((delta_flops >= 0 and delta_ws > 0 and delta_flops / delta_ws < 15) or - (delta_flops <= 0 and delta_ws < 0 and delta_flops / delta_ws > 15) or - (delta_flops <= 0 and delta_ws >= 0)) + def _select(self, variants): + if isinstance(self.opt_schedule_strategy, int): + try: + return variants[self.opt_schedule_strategy] + except IndexError: + raise ValueError("Illegal schedule %d; " + "generated %d schedules in total" + % (self.opt_schedule_strategy, len(variants))) + + return pick_best(variants) + + +class CireEvalDerivatives(CireDerivatives): + + def _cbk_compose(self, expr): + return split_coeff(expr)[1] + + def _cbk_search(self, expr): + if isinstance(expr, EvalDerivative) and not expr.base.is_Function: + return expr.args + else: + return flatten(e for e in [self._cbk_search(a) for a in expr.args] if e) + + def _cbk_search2(self, expr, rank): + ret = [] + for e in self._cbk_search(expr): + mapper = deindexify(e) + for i in rank: + if i in mapper: + ret.extend(mapper[i]) + break + return ret + + +class CireIndexDerivatives(CireDerivatives): + + def _cbk_compose(self, expr): + if expr.is_Pow: + return (expr,) + terms = [] + for a in expr.args: + try: + if not isinstance(a.function, Weights): + terms.append(a) + except AttributeError: + terms.append(a) + return tuple(terms) + + def _cbk_search(self, expr): + if isinstance(expr, IndexDerivative): + return (expr.expr,) + else: + return flatten(e for e in [self._cbk_search(a) for a in expr.args] if e) + + def _cbk_search2(self, expr, rank): + ret = [] + for e in self._cbk_search(expr): + mapper = deindexify(e) + for i in rank: + found = [v.expr if isinstance(v, IndexDerivative) else v + for v in mapper.get(i, [])] + if found: + ret.extend(found) + break + return ret + + +# Subpass mapper +modes = { + 'invariants': [CireInvariantsElementary, + CireInvariantsDivs], + 'eval-derivs': [CireEvalDerivatives], # NOTE: legacy pass + 'index-derivs': [CireIndexDerivatives], +} def collect(extracted, ispace, minstorage): @@ -439,28 +492,9 @@ def collect(extracted, ispace, minstorage): for expr in extracted: assert not expr.is_Equality - indexeds = retrieve_indexed(expr) - - bases = [] - offsets = [] - for i in indexeds: - ii = IterationInstance(i) - if ii.is_irregular: - break - - base = [] - offset = [] - for e, ai in zip(ii, ii.aindices): - if q_constant(e): - base.append(e) - else: - base.append(ai) - offset.append((ai, e - ai)) - bases.append(tuple(base)) - offsets.append(LabeledVector(offset)) - - if not indexeds or len(bases) == len(indexeds): - found.append(Candidate(expr, ispace, indexeds, bases, offsets)) + eg = ExprGeometry(expr) + if eg.is_regular: + found.append(eg) # Create groups of aliasing expressions mapper = OrderedDict() @@ -469,17 +503,13 @@ def collect(extracted, ispace, minstorage): c = unseen.pop(0) group = [c] for u in list(unseen): - # Is the arithmetic structure of `c` and `u` equivalent ? - if not compare_ops(c.expr, u.expr): - continue - # Is `c` translated w.r.t. `u` ? 
            if not c.translated(u):
                continue
 
            group.append(u)
            unseen.remove(u)
 
-        group = Group(group)
+        group = Group(group, ispace=ispace)
 
         if minstorage:
             k = group.dimensions_translated
@@ -562,8 +592,7 @@ def collect(extracted, ispace, minstorage):
         na = g.naliases
         nr = nredundants(ispace, pivot)
         score = estimate_cost(pivot, True)*((na - 1) + nr)
-        if score > 0:
-            aliases.add(pivot, aliaseds, list(mapper), distances, score)
+        aliases.add(pivot, aliaseds, list(mapper), distances, score)
 
     return aliases
 
@@ -576,15 +605,15 @@ def choose(aliases, exprs, mapper, mingain):
     """
     aliases = AliasList(aliases)
 
+    if not aliases:
+        return exprs, aliases
+
     # `score < m` => discarded
     # `score > M` => optimized
-    # `m <= score <= M` => maybe discarded, maybe optimized -- depends on heuristics
+    # `m <= score <= M` => maybe discarded, maybe optimized; depends on heuristics
     m = mingain
     M = mingain*3
 
-    if not aliases:
-        return exprs, aliases
-
     # Filter off the aliases with low score
     key = lambda a: a.score >= m
     aliases.filter(key)
@@ -689,8 +718,9 @@
 
         # Given the iteration `interval`, lower distances to indices
         for distance, indices in zip(a.distances, indicess):
+            v = distance[interval.dim] or 0
             try:
-                indices.append(d - interval.lower + distance[interval.dim])
+                indices.append(d - interval.lower + v)
             except TypeError:
                 indices.append(d)
 
@@ -703,8 +733,9 @@
                                      meta.ispace.directions)
         ispace = ispace.augment(sub_iterators)
 
-        processed.append(ScheduledAlias(a.pivot, writeto, ispace, a.aliaseds,
-                                        indicess, a.score))
+        processed.append(
+            ScheduledAlias(a.pivot, writeto, ispace, a.aliaseds, indicess)
+        )
 
     # The [ScheduledAliases] must be ordered so as to reuse as many of the
     # `ispace`'s IterationIntervals as possible in order to honor the
@@ -781,14 +812,15 @@ def optimize_schedule_rotations(schedule, sregistry):
             # Extend `ispace` to iterate over rotations
             d1 = writeto[ridx+1].dim  # Note: we're by construction in-bounds here
-            intervals = IntervalGroup(Interval(cd, 0, 0), relations={(d, cd, d1)})
+            intervals = IntervalGroup(Interval(cd))
             rispace = IterationSpace(intervals, {cd: dsi}, {cd: Forward})
             aispace = i.ispace.zero(d)
             aispace = aispace.augment({d: mds + [ii]})
-            ispace = IterationSpace.union(rispace, aispace)
+            ispace = IterationSpace.union(rispace, aispace, relations={(d, cd, d1)})
 
-            processed.append(ScheduledAlias(pivot, writeto, ispace, i.aliaseds,
-                                            indicess, i.score))
+            processed.append(ScheduledAlias(
+                pivot, writeto, ispace, i.aliaseds, indicess,
+            ))
 
         # Update the rotations mapper
         rmapper[d].extend(list(mapper.values()))
@@ -811,8 +843,9 @@
                 ispace = i.ispace.add(Interval(it.dim, 0, it.size % vl))
             else:
                 ispace = i.ispace
-            processed.append(ScheduledAlias(i.pivot, i.writeto, ispace, i.aliaseds,
-                                            i.indicess, i.score))
+            processed.append(ScheduledAlias(
+                i.pivot, i.writeto, ispace, i.aliaseds, i.indicess,
+            ))
         except (TypeError, KeyError, IndexError):
             processed.append(i)
 
@@ -827,16 +860,16 @@ def lower_schedule(schedule, meta, sregistry, ftemps):
         make = TempFunction
     else:
         # Typical case -- the user does *not* "see" the CIRE-created temporaries
-        make = Array
+        make = TempArray
 
     clusters = []
     subs = {}
-    for pivot, writeto, ispace, aliaseds, indicess, _ in schedule:
+    for pivot, writeto, ispace, aliaseds, indicess in schedule:
         name = sregistry.make_name()
 
         # Infer the dtype for the pivot
         # This prevents cases such as `floor(a*b)` with `a` and `b` floats
-        # that would creat a temporary `int r = b` leading to erronous numerical results
-        # Such cases happen with the positions for sparse functions for example.
+        # that would create a temporary `int r = b` leading to erroneous
+        # numerical results
        dtype = sympy_dtype(pivot, meta.dtype)
 
        if writeto:
@@ -851,7 +884,7 @@
            # Aside from ugly generated code, the reason we do not rather shift the
            # indices is that it prevents future passes to transform the loop bounds
            # (e.g., MPI's comp/comm overlap does that)
-            dimensions = [d.parent if d.is_Sub else d for d in writeto.itdimensions]
+            dimensions = [d.parent if d.is_Sub else d for d in writeto.itdims]
 
            # The halo must be set according to the size of `writeto`
            halo = [(abs(i.lower), abs(i.upper)) for i in writeto]
@@ -891,12 +924,12 @@
        for d, v in meta.properties.items():
            if any(i.is_Modulo for i in ispace.sub_iterators[d]):
                properties[d] = normalize_properties(v, {SEQUENTIAL})
-            elif d not in writeto.itdimensions:
+            elif d not in writeto.itdims:
                properties[d] = normalize_properties(v, {PARALLEL_IF_PVT}) - {ROUNDABLE}
 
        # Track star-shaped stencils for potential future optimization
        if len(writeto) > 1 and schedule.is_frame:
-            properties[Hyperplane(writeto.itdimensions)] = {SEPARABLE}
+            properties[Hyperplane(writeto.itdims)] = {SEPARABLE}
 
        # Finally, build the alias Cluster
        clusters.append(Cluster(expression, ispace, meta.guards, properties))
@@ -912,7 +945,7 @@ def optimize_clusters_msds(clusters):
    """
    processed = []
    for c in clusters:
-        msds = [d for d in c.ispace.itdimensions if isinstance(d, MultiSubDimension)]
+        msds = [d for d in c.ispace.itdims if isinstance(d, MultiSubDimension)]
 
        if msds:
            mapper = {d: d.root for d in msds}
@@ -932,106 +965,48 @@
    return processed
 
 
-def pick_best(variants, schedule_strategy, eval_variants_delta):
+def pick_best(variants):
    """
-    Return the variant with the best trade-off between operation count
-    reduction and working set increase. Heuristics may be applied.
+    Return the variant with the best theoretical performance.
""" - if type(schedule_strategy) is int: - try: - return variants[schedule_strategy] - except IndexError: - raise ValueError("Illegal schedule strategy %d; accepted `[0, %d]`" - % (schedule_strategy, len(variants) - 1)) - best = None - best_flops_score = None - best_ws_score = None - for i in variants: - i_flops_score = sum(sa.score for sa in i.schedule) - - # The working set score depends on the number and dimensionality of - # temporaries required by the Schedule - i_ws_count = Counter([len(sa.writeto) for sa in i.schedule]) - i_ws_score = tuple(i_ws_count[sa + 1] for sa in reversed(range(max(i_ws_count)))) - # TODO: For now, we assume the performance impact of an N-dimensional - # temporary is always the same regardless of the value N, but this might - # change in the future - i_ws_score = sum(i_ws_score) - - if best is None: - best, best_flops_score, best_ws_score = i, i_flops_score, i_ws_score - continue + # Flops in the two sweeps + flops0 = i.schedule.cost + flops1 = estimate_cost(i.exprs, True) - delta_flops = best_flops_score - i_flops_score - delta_ws = best_ws_score - i_ws_score - if eval_variants_delta(delta_flops, delta_ws): - best, best_flops_score, best_ws_score = i, i_flops_score, i_ws_score - - return best - - -# Utilities + flops = flops0 + flops1 + # Data movement in the two sweeps + indexeds0 = search([sa.pivot for sa in i.schedule], Indexed) + indexeds1 = search(i.exprs, Indexed) -class Candidate(object): + ntemps = len(i.schedule) + nfunctions0 = len({i.function for i in indexeds0}) + nfunctions1 = len({i.function for i in indexeds1}) - def __init__(self, expr, ispace, indexeds, bases, offsets): - self.expr = expr - self.ispace = ispace - self.indexeds = indexeds - self.bases = bases - self.offsets = offsets - - def __repr__(self): - return "Candidate(expr=%s)" % self.expr - - def translated(self, other): - """ - True if ``self`` is translated w.r.t. ``other``, False otherwise. + ws = ntemps*2 + nfunctions0 + nfunctions1 - Examples - -------- - Two candidates are translated if their bases are the same and - their offsets are pairwise translated. - - c := A[i,j] op A[i,j+1] -> Toffsets = {i: [0,0], j: [0,1]} - u := A[i+1,j] op A[i+1,j+1] -> Toffsets = {i: [1,1], j: [0,1]} - - Then `c` is translated w.r.t. 
`u` with distance `{i: 1, j: 0}`
-        """
-        if len(self.Toffsets) != len(other.Toffsets):
-            return False
-        if len(self.bases) != len(other.bases):
-            return False
-
-        # Check the bases
-        if any(b0 != b1 for b0, b1 in zip(self.bases, other.bases)):
-            return False
+        if best is None:
+            best, best_flops, best_ws = i, flops, ws
+            continue
 
-        # Check the offsets
-        for (d0, o0), (d1, o1) in zip(self.Toffsets, other.Toffsets):
-            if d0 is not d1:
-                return False
+        delta_flops = flops - best_flops
+        delta_ws = ws - best_ws
 
-            distance = set(o0 - o1)
-            if len(distance) != 1:
-                return False
+        # Magic sauce
+        # The coefficients were obtained empirically by studying the behaviour
+        # of different variants in several kernels and platforms
+        # Intuitively, it's like trading 70 operations for 1 temporary
+        ws_curve = lambda x: (-1 / 70)*x
 
-        return True
+        if delta_ws <= ws_curve(delta_flops):
+            best, best_flops, best_ws = i, flops, ws
 
-    @cached_property
-    def Toffsets(self):
-        return LabeledVector.transpose(*self.offsets)
+    return best
 
-    @cached_property
-    def dimensions(self):
-        return frozenset(i for i, _ in self.Toffsets)
 
-    @property
-    def shifts(self):
-        return self.ispace.intervals
+# Utilities
 
 
 class Group(tuple):
@@ -1040,26 +1015,30 @@ class Group(tuple):
     A collection of aliasing expressions.
     """
 
-    def __new__(cls, items):
+    def __new__(cls, items, ispace=None):
         # Expand out the StencilDimensions, if any
         processed = []
         for c in items:
-            if not c.expr.find(StencilDimension):
+            sdims = [d for d in unbounded(c.expr) if d.is_Stencil]
+            if not sdims:
                 processed.append(c)
                 continue
 
-            for f in (sdims_min, sdims_max):
+            f0 = lambda e: minimum(e, sdims)
+            f1 = lambda e: maximum(e, sdims)
+
+            for f in (f0, f1):
                 expr = f(c.expr)
                 indexeds = [f(i) for i in c.indexeds]
                 offsets = [LabeledVector([(d, f(i)) for d, i in v.items()])
                            for v in c.offsets]
 
-                processed.append(
-                    Candidate(expr, c.ispace, indexeds, c.bases, offsets)
-                )
+                processed.append(ExprGeometry(expr, indexeds, c.bases, offsets))
 
         obj = super().__new__(cls, processed)
         obj._items = items
+        obj._ispace = ispace
+
         return obj
 
     def __repr__(self):
@@ -1099,6 +1078,8 @@ def diameter(self):
         ret = defaultdict(int)
         for i in self.Toffsets:
             for d, v in i:
+                if d not in self._ispace:
+                    continue
                 try:
                     distance = int(max(v) - min(v))
                 except TypeError:
@@ -1106,7 +1087,13 @@
                     if len(set(v)) == 1:
                         continue
                     else:
-                        raise ValueError
+                        # Worst-case scenario, we rarely end up here
+                        # Resort to the fast vector-based comparison machinery
+                        # (rather than the slower sympy.simplify)
+                        items = [Vector(i) for i in v]
+                        distance, = vmax(*items) - vmin(*items)
+                        if not is_integer(distance):
+                            raise ValueError
                 ret[d] = max(ret[d], distance)
 
         return ret
@@ -1114,13 +1101,13 @@
     @property
     def pivot(self):
         """
-        A deterministically chosen Candidate for this Group.
+        A deterministically chosen reference for this Group.
""" return self[0] @property def dimensions(self): - return self.pivot.dimensions + return frozenset(self.diameter) @property def dimensions_translated(self): @@ -1130,10 +1117,11 @@ def dimensions_translated(self): def naliases(self): na = len(self._items) - sdims = set().union(*[c.expr.find(StencilDimension) for c in self._items]) - implicit = int(np.prod([i._size for i in sdims])) - 1 + udims = set().union(*[unbounded(c.expr) for c in self._items]) + sdims = [d for d in udims if d.is_Stencil] + implicit = int(np.prod([i._size for i in sdims])) - return na + implicit + return na*max(implicit, 1) @cached_property def _pivot_legal_rotations(self): @@ -1163,7 +1151,7 @@ def _pivot_legal_rotations(self): def _pivot_min_intervals(self): """ The minimum Interval along each Dimension such that by evaluating the - pivot, all Candidates are evaluated too. + pivot, all other items are evaluated too. """ c = self.pivot @@ -1196,17 +1184,23 @@ def _pivot_legal_shifts(self): ret = defaultdict(lambda: (-np.inf, np.inf)) for i, ofs in zip(c.indexeds, c.offsets): f = i.function + for l in ofs.labels: + if isinstance(l, StencilDimension): + continue + # `f`'s cumulative halo size along `l` hsize = sum(f._size_halo[l]) # Any `ofs`'s shift due to non-[0,0] iteration space - lower, upper = c.shifts[l].offsets + lower, upper = self._ispace.intervals[l].offsets + + ofs0, ofs1 = extrema(ofs[l]) try: # Assume `ofs[l]` is a number (typical case) - maxd = min(0, max(ret[l][0], -ofs[l] - lower)) - mini = max(0, min(ret[l][1], hsize - ofs[l] - upper)) + maxd = min(0, max(ret[l][0], -ofs0 - lower)) + mini = max(0, min(ret[l][1], hsize - ofs1 - upper)) ret[l] = (maxd, mini) except TypeError: @@ -1222,7 +1216,7 @@ def _pivot_legal_shifts(self): class Alias(object): - def __init__(self, pivot, aliaseds, intervals, distances, score=0): + def __init__(self, pivot, aliaseds, intervals, distances, score): self.pivot = pivot self.aliaseds = aliaseds self.intervals = intervals @@ -1282,7 +1276,7 @@ def __iter__(self): for i in self._list: yield i - def add(self, pivot, aliaseds, intervals, distances, score=0): + def add(self, pivot, aliaseds, intervals, distances, score): assert len(aliaseds) == len(distances) self._list.append(Alias(pivot, aliaseds, intervals, distances, score)) @@ -1306,7 +1300,8 @@ def is_frame(self): return all(i.is_frame for i in self._list) -ScheduledAlias = namedtuple('SchedAlias', 'pivot writeto ispace aliaseds indicess score') +ScheduledAlias = namedtuple('SchedAlias', + 'pivot writeto ispace aliaseds indicess') class Schedule(tuple): @@ -1326,6 +1321,20 @@ def rebuild(self, *items, dmapper=None, rmapper=None, is_frame=False): is_frame=is_frame or self.is_frame ) + @cached_property + def cost(self): + # Not just the sum for the individual items' cost! There might be + # redundancies, which we factor out here... 
+ counter = generator() + make = lambda: Symbol(name='dummy%d' % counter(), dtype=np.float32) + + tot = 0 + for v in as_mapper(self, lambda i: i.ispace).values(): + exprs = [i.pivot for i in v] + tot += estimate_cost(_cse(exprs, make), True) + + return tot + def make_rotations_table(d, v): """ diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py index 741c8bf558..efd3339ce9 100644 --- a/devito/passes/clusters/blocking.py +++ b/devito/passes/clusters/blocking.py @@ -1,12 +1,14 @@ from sympy import sympify +from devito.finite_differences.differentiable import IndexSum from devito.ir.clusters import Queue from devito.ir.support import (AFFINE, PARALLEL, PARALLEL_IF_ATOMIC, PARALLEL_IF_PVT, SEQUENTIAL, SKEWABLE, TILABLES, Interval, IntervalGroup, IterationSpace, Scope) from devito.passes import is_on_device -from devito.symbolics import uxreplace, xreplace_indices -from devito.tools import UnboundedMultiTuple, as_tuple, flatten, is_integer, prod +from devito.symbolics import search, uxreplace, xreplace_indices +from devito.tools import (UnboundedMultiTuple, as_tuple, filter_ordered, + flatten, is_integer, prod) from devito.types import BlockDimension __all__ = ['blocking'] @@ -100,6 +102,8 @@ def _has_data_reuse(self, cluster): # which translates into the existance of any Relation accross Indexeds if any(r.function.is_AbstractFunction for r in cluster.scope.r_gen()): return True + if search(cluster.exprs, IndexSum): + return True # If it's a reduction operation a la matrix-matrix multiply, two Indexeds # might be enough @@ -314,7 +318,7 @@ def callback(self, clusters, prefix, blk_size_gen=None): # By passing a suitable key to `next` we ensure that we pull the # next par-tile entry iff we're now blocking an unseen TILABLE nest try: - step = sympify(blk_size_gen.next(clusters, d)) + step = sympify(blk_size_gen.next(prefix, d, clusters)) except StopIteration: return clusters else: @@ -383,18 +387,28 @@ def decompose(ispace, d, block_dims): if any(i._depth < bd._depth for i in r if i.is_Block): continue - relations.append(tuple(bd if i in d._defines else i for i in r)) + # E.g., `r=(z, i0z)` -> `[i0z0_blk0, i0z0_blk0]` + v = [bd if i in d._defines else i for i in r] + + # E.g., `v=[i0z0_blk0, i0z0_blk0]` -> `v=(i0z0_blk0,)` + v = tuple(filter_ordered(v)) + + relations.append(v) # 3: Make sure BlockDimensions at same depth stick next to each other # E.g., `(t, xbb, ybb, xb, yb, x, y)`, and NOT e.g. `(t, xbb, xb, x, ybb, ...)` # NOTE: this is perfectly legal since: # TILABLE => (perfect nest & PARALLEL) => interchangeable - for i in ispace.itdimensions: + for i in ispace.itdims: if not i.is_Block: continue for bd in block_dims: if i._depth < bd._depth: + # E.g. `(zb, y)` relations.append((i, bd)) + elif i._depth == bd._depth: + # E.g. 
`(y, z)` (i.e., honour input ordering) + relations.append((bd, i)) intervals = IntervalGroup(intervals, relations=relations) @@ -418,6 +432,7 @@ class BlockSizeGenerator(object): def __init__(self, par_tile): self.umt = UnboundedMultiTuple(*par_tile) + self.tip = -1 # This is for Clusters that need a small par-tile to avoid under-utilizing # computational resources (e.g., kernels running over iteration spaces that @@ -430,15 +445,45 @@ def __init__(self, par_tile): else: self.umt_small = UnboundedMultiTuple(par_tile.default) - def next(self, clusters, d): + def next(self, prefix, d, clusters): + # If a whole new set of Dimensions, move the tip -- this means `clusters` + # at `d` represents a new loop nest or kernel + x = any(i.dim.is_Block for i in flatten(c.ispace for c in clusters)) + if not x: + self.tip += 1 + # TODO: This is for now exceptionally rudimentary if all(c.properties.is_blockable_small(d) for c in clusters): - umt = self.umt_small + if not x: + self.umt_small.iter() + return self.umt_small.next() + + if x: + item = self.umt.curitem else: + # We can't `self.umt.iter()` because we might still want to + # fallback to `self.umt_small` + item = self.umt.nextitem + + # Handle user-provided rules + # TODO: This is also rudimentary + if item.rule is None: umt = self.umt + elif is_integer(item.rule): + if item.rule == self.tip: + umt = self.umt + else: + umt = self.umt_small + else: + if item.rule in {d.name for d in prefix.itdims}: + umt = self.umt + else: + # This is like "pattern unmatched" -- fallback to `umt_small` + umt = self.umt_small - if not any(i.dim.is_Block for i in flatten(c.ispace for c in clusters)): + if not x: umt.iter() + return umt.next() diff --git a/devito/passes/clusters/buffering.py b/devito/passes/clusters/buffering.py index 72b45f8e3b..dcba3eb872 100644 --- a/devito/passes/clusters/buffering.py +++ b/devito/passes/clusters/buffering.py @@ -203,7 +203,7 @@ def callback(self, clusters, prefix, cache=None): expr = lower_exprs(Eq(lhs, rhs)) ispace = b.writeto guards[pd] = GuardBound(dim.root.symbolic_min, dim.root.symbolic_max) - properties = {d: {AFFINE, PARALLEL} for d in ispace.itdimensions} + properties = {d: {AFFINE, PARALLEL} for d in ispace.itdims} init.append(Cluster(expr, ispace, guards=guards, properties=properties)) @@ -307,7 +307,7 @@ def callback(self, clusters, prefix, cache=None): for c in processed: if b.buffer in c.functions: key1 = lambda d: d not in b.dim._defines - dims = c.ispace.project(key1).itdimensions + dims = c.ispace.project(key1).itdims ispace = c.ispace.lift(dims, key0()) processed1.append(c.rebuild(ispace=ispace)) else: @@ -614,7 +614,7 @@ def readfrom(self): ispace0 = self.written.project(lambda d: d in self.xd._defines) ispace1 = self.writeto.project(lambda d: d not in self.xd._defines) - extra = (ispace0.itdimensions + ispace1.itdimensions,) + extra = (ispace0.itdims + ispace1.itdims,) ispace = IterationSpace.union(ispace0, ispace1, relations=extra) return ispace diff --git a/devito/passes/clusters/cse.py b/devito/passes/clusters/cse.py index 85b4e379df..5f00a0341e 100644 --- a/devito/passes/clusters/cse.py +++ b/devito/passes/clusters/cse.py @@ -8,6 +8,7 @@ from devito.passes.clusters.utils import makeit_ssa from devito.symbolics import estimate_cost, q_leaf from devito.symbolics.manipulation import _uxreplace +from devito.tools import as_list from devito.types import Eq, Temp as Temp0 __all__ = ['cse'] @@ -52,13 +53,18 @@ def _cse(maybe_exprs, make, min_cost=1, mode='default'): # also ensuring some form of post-processing 
assert mode == 'default' # Only supported mode ATM - # Just for flexibility, accept either Clusters or exprs + # Accept Clusters, Eqs or even just exprs if isinstance(maybe_exprs, Cluster): processed = list(maybe_exprs.exprs) scope = maybe_exprs.scope else: - processed = list(maybe_exprs) - scope = Scope(maybe_exprs) + maybe_exprs = as_list(maybe_exprs) + if all(e.is_Equality for e in maybe_exprs): + processed = maybe_exprs + scope = Scope(maybe_exprs) + else: + processed = [Eq(make(), e) for e in maybe_exprs] + scope = Scope([]) # Some sub-expressions aren't really "common" -- that's the case of Dimension- # independent data dependences. For example: diff --git a/devito/passes/clusters/derivatives.py b/devito/passes/clusters/derivatives.py index 00e7526f88..01e5afea6a 100644 --- a/devito/passes/clusters/derivatives.py +++ b/devito/passes/clusters/derivatives.py @@ -1,5 +1,5 @@ from devito.finite_differences import IndexDerivative -from devito.ir import Interval, IterationSpace, Queue +from devito.ir import Backward, Forward, Interval, IterationSpace, Queue from devito.passes.clusters.misc import fuse from devito.symbolics import (retrieve_dimensions, reuse_if_untouched, q_leaf, uxreplace) @@ -74,7 +74,7 @@ def _core(expr, c, weights, mapper, sregistry): try: w = weights[k] except KeyError: - w = weights[k] = w0._rebuild(name=name) + w = weights[k] = w0._rebuild(name=name, dtype=expr.dtype) expr = uxreplace(expr, {w0.indexed: w.indexed}) dims = retrieve_dimensions(expr, deep=True) @@ -87,20 +87,23 @@ def _core(expr, c, weights, mapper, sregistry): dims = tuple(d for d in dims if d not in c.ispace) intervals = [Interval(d) for d in dims] - ispace0 = IterationSpace(intervals) + directions = {d: Backward if d.backward else Forward for d in dims} + ispace0 = IterationSpace(intervals, directions=directions) - extra = (c.ispace.itdimensions + dims,) + extra = (c.ispace.itdims + dims,) ispace = IterationSpace.union(c.ispace, ispace0, relations=extra) name = sregistry.make_name(prefix='r') - s = Symbol(name=name, dtype=c.dtype) + s = Symbol(name=name, dtype=w.dtype) expr0 = Eq(s, 0.) ispace1 = ispace.project(lambda d: d is not dims[-1]) processed.insert(0, c.rebuild(exprs=expr0, ispace=ispace1)) # Transform e.g. `w[i0] -> w[i0 + 2]` for alignment with the # StencilDimensions starting points - subs = {expr.weights: expr.weights.subs(d, d - d._min) for d in dims} + subs = {expr.weights: + expr.weights.subs(d, d - d._min) + for d in dims} expr1 = Inc(s, uxreplace(expr.expr, subs)) processed.append(c.rebuild(exprs=expr1, ispace=ispace)) diff --git a/devito/passes/clusters/factorization.py b/devito/passes/clusters/factorization.py index 1353bf89b3..41c1ed2ec0 100644 --- a/devito/passes/clusters/factorization.py +++ b/devito/passes/clusters/factorization.py @@ -43,6 +43,71 @@ def factorize(cluster, *args): return cluster.rebuild(processed) +def collect_special(expr): + """ + Factorize elemental functions, pows, and other special symbolic objects, + prioritizing the most expensive entities. 
+ """ + args, candidates = zip(*[_collect_nested(arg) for arg in expr.args]) + candidates = ReducerMap.fromdicts(*candidates) + + funcs = candidates.getall('funcs', []) + pows = candidates.getall('pows', []) + coeffs = candidates.getall('coeffs', []) + + # Functions/Pows are collected first, coefficients afterwards + terms = [] + w_funcs = [] + w_pows = [] + w_coeffs = [] + for i in args: + _args = i.args + if any(j in funcs for j in _args): + w_funcs.append(i) + elif any(j in pows for j in _args): + w_pows.append(i) + elif any(j in coeffs for j in _args): + w_coeffs.append(i) + else: + terms.append(i) + + # Collect common funcs + if len(w_funcs) > 1: + w_funcs = Add(*w_funcs, evaluate=False) + w_funcs = collect(w_funcs, funcs, evaluate=False) + try: + terms.extend([Mul(k, collect_const(v), evaluate=False) + for k, v in w_funcs.items()]) + except AttributeError: + assert w_funcs == 0 + else: + terms.extend(w_funcs) + + # Collect common pows + w_pows = Add(*w_pows, evaluate=False) + w_pows = collect(w_pows, pows, evaluate=False) + try: + terms.extend([Mul(k, collect_const(v), evaluate=False) + for k, v in w_pows.items()]) + except AttributeError: + assert w_pows == 0 + + # Collect common temporaries (r0, r1, ...) + w_coeffs = Add(*w_coeffs, evaluate=False) + symbols = retrieve_symbols(w_coeffs) + if symbols: + w_coeffs = collect(w_coeffs, symbols, evaluate=False) + try: + terms.extend([Mul(k, collect_const(v), evaluate=False) + for k, v in w_coeffs.items()]) + except AttributeError: + assert w_coeffs == 0 + else: + terms.append(w_coeffs) + + return Add(*terms) + + def collect_const(expr): """ *Much* faster alternative to sympy.collect_const. *Potentially* slightly less @@ -67,9 +132,10 @@ def collect_const(expr): if len(v) == 1 and not v[0].is_Add: # Special case: avoid e.g. (-2)*a mul = Mul(k, *v) - elif all(i.is_Mul and len(i.args) == 2 and i.args[0] == -1 for i in v): + elif all(i.is_Mul and i.args[0] == -1 for i in v): # Other special case: [-a, -b, -c ...] - add = Add(*[i.args[1] for i in v], evaluate=False) + operands = [Mul(*i.args[1:], evaluate=False) for i in v] + add = Add(*operands, evaluate=False) mul = Mul(-k, add, evaluate=False) elif k == 1: # 1 * (a + c) @@ -89,105 +155,54 @@ def collect_const(expr): return Add(*terms) -def collect_nested(expr): +def strategy0(expr): + rebuilt = collect_special(expr) + rebuilt = collect_const(rebuilt) + + return rebuilt + + +strategies = { + 'default': strategy0 +} + + +def _collect_nested(expr): + """ + Recursion helper for `collect_nested`. """ - Collect numeric coefficients, trascendental functions, and symbolic powers, - across all levels of the expression tree. 
+    # Return semantic (rebuilt expression, factorization candidates)
+
+    if expr.is_Number:
+        return expr, {'coeffs': expr}
+    elif expr.is_Function:
+        return expr, {'funcs': expr}
+    elif expr.is_Pow:
+        return expr, {'pows': expr}
+    elif expr.is_Symbol or expr.is_Indexed or isinstance(expr, BasicWrapperMixin):
+        return expr, {}
+    elif expr.is_Add:
+        return strategies['default'](expr), {}
+    elif expr.is_Mul:
+        args, candidates = zip(*[_collect_nested(arg) for arg in expr.args])
+        return Mul(*args), ReducerMap.fromdicts(*candidates)
+    elif expr.is_Equality:
+        args, candidates = zip(*[_collect_nested(expr.lhs),
+                                 _collect_nested(expr.rhs)])
+        return expr.func(*args, evaluate=False), ReducerMap.fromdicts(*candidates)
+    else:
+        args, candidates = zip(*[_collect_nested(arg) for arg in expr.args])
+        return expr.func(*args), ReducerMap.fromdicts(*candidates)
+
+
+def collect_nested(expr):
+    """
+    Collect numeric coefficients, transcendental functions, pows, and other
+    symbolic objects across all levels of the expression tree.
 
     Parameters
     ----------
     expr : expr-like
         The expression to be factorized.
     """
-
-    def run(expr):
-        # Return semantic (rebuilt expression, factorization candidates)
-
-        if expr.is_Number:
-            return expr, {'coeffs': expr}
-        elif expr.is_Function:
-            return expr, {'funcs': expr}
-        elif expr.is_Pow:
-            return expr, {'pows': expr}
-        elif expr.is_Symbol or expr.is_Indexed or isinstance(expr, BasicWrapperMixin):
-            return expr, {}
-        elif expr.is_Add:
-            args, candidates = zip(*[run(arg) for arg in expr.args])
-            candidates = ReducerMap.fromdicts(*candidates)
-
-            funcs = candidates.getall('funcs', [])
-            pows = candidates.getall('pows', [])
-            coeffs = candidates.getall('coeffs', [])
-
-            # Functions/Pows are collected first, coefficients afterwards
-            terms = []
-            w_funcs = []
-            w_pows = []
-            w_coeffs = []
-            for i in args:
-                _args = i.args
-                if any(j in funcs for j in _args):
-                    w_funcs.append(i)
-                elif any(j in pows for j in _args):
-                    w_pows.append(i)
-                elif any(j in coeffs for j in _args):
-                    w_coeffs.append(i)
-                else:
-                    terms.append(i)
-
-            # Collect common funcs
-            if len(w_funcs) > 1:
-                w_funcs = Add(*w_funcs, evaluate=False)
-                w_funcs = collect(w_funcs, funcs, evaluate=False)
-                try:
-                    terms.extend([Mul(k, collect_const(v), evaluate=False)
-                                  for k, v in w_funcs.items()])
-                except AttributeError:
-                    assert w_funcs == 0
-            else:
-                terms.extend(w_funcs)
-
-            # Collect common pows
-            w_pows = Add(*w_pows, evaluate=False)
-            w_pows = collect(w_pows, pows, evaluate=False)
-            try:
-                terms.extend([Mul(k, collect_const(v), evaluate=False)
-                              for k, v in w_pows.items()])
-            except AttributeError:
-                assert w_pows == 0
-
-            # Collect common temporaries (r0, r1, ...)
- w_coeffs = Add(*w_coeffs, evaluate=False) - symbols = retrieve_symbols(w_coeffs) - if symbols: - w_coeffs = collect(w_coeffs, symbols, evaluate=False) - try: - terms.extend([Mul(k, collect_const(v), evaluate=False) - for k, v in w_coeffs.items()]) - except AttributeError: - assert w_coeffs == 0 - else: - terms.append(w_coeffs) - - # Collect common coefficients - rebuilt = Add(*terms) - rebuilt = collect_const(rebuilt) - - return rebuilt, {} - elif expr.is_Mul: - args, candidates = zip(*[run(arg) for arg in expr.args]) - return Mul(*args), ReducerMap.fromdicts(*candidates) - elif expr.is_Equality: - args, candidates = zip(*[run(expr.lhs), run(expr.rhs)]) - return expr.func(*args, evaluate=False), ReducerMap.fromdicts(*candidates) - else: - args, candidates = zip(*[run(arg) for arg in expr.args]) - return expr.func(*args), ReducerMap.fromdicts(*candidates) - - return run(expr)[0] + return _collect_nested(expr)[0] diff --git a/devito/passes/clusters/implicit.py b/devito/passes/clusters/implicit.py index af02805102..a85d713b4e 100644 --- a/devito/passes/clusters/implicit.py +++ b/devito/passes/clusters/implicit.py @@ -117,11 +117,10 @@ def callback(self, clusters, prefix): # The IterationSpace induced by the MultiSubDomain if dims: intervals = [Interval(i) for i in dims] - relations = (ispace0.itdimensions + dims, dims + ispace1.itdimensions) - ispaceN = IterationSpace( - IntervalGroup(intervals, relations=relations), sub_iterators - ) - ispace = IterationSpace.union(ispace0, ispaceN) + ispaceN = IterationSpace(IntervalGroup(intervals), sub_iterators) + + relations = (ispace0.itdims + dims, dims + ispace1.itdims) + ispace = IterationSpace.union(ispace0, ispaceN, relations=relations) else: ispaceN = None ispace = ispace0 @@ -157,7 +156,7 @@ def callback(self, clusters, prefix): tip = nxt if ispaceN: - ispace = IterationSpace.union(c.ispace, ispaceN) + ispace = IterationSpace.union(c.ispace, ispaceN, relations=relations) processed.append(c.rebuild(ispace=ispace)) else: processed.append(c) diff --git a/devito/passes/clusters/misc.py b/devito/passes/clusters/misc.py index 453e8c79bc..1153342733 100644 --- a/devito/passes/clusters/misc.py +++ b/devito/passes/clusters/misc.py @@ -186,18 +186,18 @@ def _key(self, c): if isinstance(c, Cluster): syncs = (c.syncs,) else: - syncs = c.syncs + syncs = tuple(i.syncs for i in c) for i in syncs: mapper = defaultdict(set) for k, v in i.items(): for s in v: - if isinstance(s, (FetchUpdate, PrefetchUpdate)) or \ + if isinstance(s, PrefetchUpdate) or \ (not self.fusetasks and isinstance(s, WaitLock)): # NOTE: A mix of Clusters w/ and w/o WaitLocks can safely # be fused, as in the worst case scenario the WaitLocks # get "hoisted" above the first Cluster in the sequence continue - elif (isinstance(s, (WaitLock, ReleaseLock)) or + elif (isinstance(s, (FetchUpdate, WaitLock, ReleaseLock)) or (self.fusetasks and isinstance(s, WithLock))): mapper[k].add(type(s)) else: @@ -307,7 +307,7 @@ def choose_element(queue, scheduled): return ClusterGroup(dag.topological_sort(choose_element), prefix) - def _build_dag(self, cgroups, prefix, peeking=False): + def _build_dag(self, cgroups, prefix): """ A DAG representing the data dependences across the ClusterGroups within a given scope. 
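Note: `is_cross` in the hunk below now receives the dependence endpoints explicitly rather than a dependence object. A standalone sketch of the timestamp test it performs, with plain integers in place of Devito's IR objects; here `v` stands for the number of expressions in the first ClusterGroup:

    def is_cross(t0, t1, v):
        # Timestamps index expressions in program order; a dependence
        # crosses ClusterGroups iff exactly one endpoint lies past the
        # first group's expressions
        return t0 < v <= t1 or t1 < v <= t0

    assert is_cross(1, 3, v=2)      # source in group 0, sink in group 1
    assert not is_cross(0, 1, v=2)  # both endpoints in group 0
    assert not is_cross(2, 3, v=2)  # both endpoints in group 1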
@@ -317,10 +317,10 @@ def _build_dag(self, cgroups, prefix, peeking=False): dag = DAG(nodes=cgroups) for n, cg0 in enumerate(cgroups): - def is_cross(dep): + def is_cross(source, sink): # True if a cross-ClusterGroup dependence, False otherwise - t0 = dep.source.timestamp - t1 = dep.sink.timestamp + t0 = source.timestamp + t1 = sink.timestamp v = len(cg0.exprs) return t0 < v <= t1 or t1 < v <= t0 @@ -354,9 +354,6 @@ def is_cross(dep): elif any(scope.d_output_gen()): dag.add_edge(cg0, cg1) - if peeking and dag.edges: - return dag - return dag diff --git a/devito/passes/equations/linearity.py b/devito/passes/equations/linearity.py index 0950e1126f..66390f821e 100644 --- a/devito/passes/equations/linearity.py +++ b/devito/passes/equations/linearity.py @@ -23,10 +23,16 @@ def collect_derivatives(expressions): mapper = inspect(e) # E.g., 0.2*u.dx -> (0.2*u).dx - ep = aggregate_coeffs(e, mapper) + e1 = aggregate_coeffs(e, mapper) # E.g., (0.2*u).dx + (0.3*v).dx -> (0.2*u + 0.3*v).dx - processed.append(factorize_derivatives(ep)) + e2 = factorize_derivatives(e1) + if e2 == e1: + # No luck, stick to `e` to preserve e.g. the original + # coefficient factorization + processed.append(e) + else: + processed.append(e2) return processed diff --git a/devito/passes/iet/engine.py b/devito/passes/iet/engine.py index c29c625122..ed3db7a22f 100644 --- a/devito/passes/iet/engine.py +++ b/devito/passes/iet/engine.py @@ -5,6 +5,7 @@ EntryFunction, ThreadCallable, Uxreplace, derive_parameters) from devito.ir.support import SymbolRegistry +from devito.mpi.distributed import MPINeighborhood from devito.tools import DAG, as_tuple, filter_ordered, timed_pass from devito.types import (Array, CompositeObject, Lock, IncrDimension, Indirection, Temp) @@ -318,6 +319,11 @@ def _(i, mapper, sregistry): mapper[i] = v +@abstract_object.register(MPINeighborhood) +def _(i, mapper, sregistry): + mapper[i] = i._rebuild() + + @abstract_object.register(BlockDimension) def _(i, mapper, sregistry): if i._depth != 2: diff --git a/devito/passes/iet/instrument.py b/devito/passes/iet/instrument.py index db9b446d7f..b844fd9a08 100644 --- a/devito/passes/iet/instrument.py +++ b/devito/passes/iet/instrument.py @@ -1,10 +1,12 @@ -from devito.ir.iet import (BusyWait, FindNodes, FindSymbols, MapNodes, Section, - TimedList, Transformer) +from itertools import groupby + +from devito.ir.iet import (BusyWait, FindNodes, FindSymbols, MapNodes, Iteration, + Section, TimedList, Transformer) from devito.mpi.routines import (HaloUpdateCall, HaloWaitCall, MPICall, MPIList, HaloUpdateList, HaloWaitList, RemainderCall, ComputeCall) from devito.passes.iet.engine import iet_pass -from devito.types import Timer +from devito.types import TempArray, TempFunction, Timer __all__ = ['instrument'] @@ -25,10 +27,12 @@ def instrument(graph, **kwargs): @iet_pass def track_subsections(iet, **kwargs): """ - Add custom Sections to the `profiler`. Custom Sections include: + Add sub-Sections to the `profiler`. 
Sub-Sections include: * MPI Calls (e.g., HaloUpdateCall and HaloUpdateWait) * Busy-waiting on While(lock) (e.g., from host-device orchestration) + * Multi-pass implementations -- one sub-Section for each pass, within one + macro Section """ profiler = kwargs['profiler'] sregistry = kwargs['sregistry'] @@ -45,6 +49,7 @@ def track_subsections(iet, **kwargs): mapper = {} + # MPI Calls, busy-waiting for NodeType in [MPIList, MPICall, BusyWait, ComputeCall]: for k, v in MapNodes(Section, NodeType).visit(iet).items(): for i in v: @@ -57,6 +62,37 @@ def track_subsections(iet, **kwargs): iet = Transformer(mapper).visit(iet) + # Multi-pass implementations + mapper = {} + + for i in FindNodes(Iteration).visit(iet): + for k, g in groupby(i.nodes, key=lambda n: n.is_Section): + if not k: + continue + + candidates = [] + for i in g: + functions = FindSymbols().visit(i) + if any(isinstance(f, (TempArray, TempFunction)) for f in functions): + candidates.append(i) + else: + # They must be consecutive Sections + break + if len(candidates) < 2: + continue + + name = sregistry.make_name(prefix='multipass') + body = [i._rebuild(is_subsection=True) for i in candidates] + section = Section(name, body=body) + + profiler.group_as_subsections(name, [i.name for i in candidates]) + + mapper[candidates.pop(0)] = section + for i in candidates: + mapper[i] = None + + iet = Transformer(mapper).visit(iet) + return iet, {} @@ -100,6 +136,6 @@ def sync_sections(iet, lang=None, profiler=None, **kwargs): if runs_async and not unnecessary: mapper[tl] = tl._rebuild(body=tl.body + (sync,)) - iet = Transformer(mapper).visit(iet) + iet = Transformer(mapper, nested=True).visit(iet) return iet, {} diff --git a/devito/passes/iet/misc.py b/devito/passes/iet/misc.py index 4199283e54..e83aeae943 100644 --- a/devito/passes/iet/misc.py +++ b/devito/passes/iet/misc.py @@ -5,11 +5,11 @@ from devito.finite_differences import Max, Min from devito.ir import (Any, Forward, Iteration, List, Prodder, FindApplications, - FindNodes, Transformer, Uxreplace, filter_iterations, - retrieve_iteration_tree) + FindNodes, FindSymbols, Transformer, Uxreplace, + filter_iterations, retrieve_iteration_tree) from devito.passes.iet.engine import iet_pass from devito.symbolics import evalrel, has_integer_args -from devito.tools import as_mapper, split +from devito.tools import as_mapper, filter_ordered, split __all__ = ['avoid_denormals', 'hoist_prodders', 'relax_incr_dimensions', 'generate_macros', 'minimize_symbols'] @@ -178,23 +178,29 @@ def minimize_symbols(iet): def remove_redundant_moddims(iet): - subs0 = {} - subs1 = {} - for n in FindNodes(Iteration).visit(iet): - mds = [d for d in n.uindices - if d.is_Modulo and d.origin is not None] - if not mds: - continue + key = lambda d: d.is_Modulo and d.origin is not None + mds = [d for d in FindSymbols('dimensions').visit(iet) if key(d)] + if not mds: + return iet - mapper = as_mapper(mds, key=lambda md: md.origin % md.modulo) - for k, v in mapper.items(): - chosen = v.pop(0) - subs0.update({d: chosen for d in v}) + mapper = as_mapper(mds, key=lambda md: md.origin % md.modulo) - uindices = [d for d in n.uindices if d not in subs0] - subs1[n] = n._rebuild(uindices=uindices) + subs = {} + for k, v in mapper.items(): + chosen = v.pop(0) + subs.update({d: chosen for d in v}) + + body = Uxreplace(subs).visit(iet.body) + iet = iet._rebuild(body=body) + + # ModuloDimensions are defined in Iteration headers, hence they must be + # removed from there too + subs = {} + for n in FindNodes(Iteration).visit(iet): + if not 
set(n.uindices) & set(mds): + continue + subs[n] = n._rebuild(uindices=filter_ordered(n.uindices)) - iet = Transformer(subs1, nested=True).visit(iet) - iet = Uxreplace(subs0).visit(iet) + iet = Transformer(subs, nested=True).visit(iet) return iet diff --git a/devito/symbolics/inspection.py b/devito/symbolics/inspection.py index 94279db4ab..25ed8f84eb 100644 --- a/devito/symbolics/inspection.py +++ b/devito/symbolics/inspection.py @@ -111,6 +111,8 @@ def estimate_cost(exprs, estimate=False): 'pow': 50, 'div': 5, 'Abs': 5, + 'floor': 1, + 'ceil': 1 } diff --git a/devito/symbolics/printer.py b/devito/symbolics/printer.py index 8f7ef6a719..cf3ba528bc 100644 --- a/devito/symbolics/printer.py +++ b/devito/symbolics/printer.py @@ -229,10 +229,13 @@ def _print_TrigonometricFunction(self, expr): func_name += 'f' return '%s(%s)' % (func_name, self._print(*expr.args)) + def _print_DefFunction(self, expr): + arguments = [self._print(i) for i in expr.arguments] + return "%s(%s)" % (expr.name, ','.join(arguments)) + def _print_Fallback(self, expr): return expr.__str__() - _print_DefFunction = _print_Fallback _print_MacroArgument = _print_Fallback _print_IndexedBase = _print_Fallback _print_IndexSum = _print_Fallback diff --git a/devito/symbolics/search.py b/devito/symbolics/search.py index 94e533a692..7680bd462a 100644 --- a/devito/symbolics/search.py +++ b/devito/symbolics/search.py @@ -1,3 +1,5 @@ +import sympy + from devito.symbolics.queries import (q_indexed, q_function, q_terminal, q_leaf, q_symbol, q_dimension, q_derivative) from devito.tools import as_tuple @@ -48,7 +50,7 @@ def __init__(self, query, mode, deep=False): self.deep = deep def _next(self, expr): - if self.deep is True and expr.is_Indexed: + if self.deep and expr.is_Indexed: return expr.indices elif q_leaf(expr): return () @@ -110,10 +112,18 @@ def search(exprs, query, mode='unique', visit='dfs', deep=False): assert mode in Search.modes, "Unknown mode" - searcher = Search(query, mode, deep) + if isinstance(query, type): + Q = lambda obj: isinstance(obj, query) + else: + Q = query + + searcher = Search(Q, mode, deep) found = Search.modes[mode]() for e in as_tuple(exprs): + if not isinstance(e, sympy.Basic): + continue + if visit == 'dfs': found.update(searcher.dfs(e)) elif visit == 'bfs': diff --git a/devito/tools/data_structures.py b/devito/tools/data_structures.py index 61bb7f5257..803f399e3e 100644 --- a/devito/tools/data_structures.py +++ b/devito/tools/data_structures.py @@ -10,8 +10,7 @@ from devito.tools.algorithms import toposort __all__ = ['Bunch', 'EnrichedTuple', 'ReducerMap', 'DefaultOrderedDict', - 'OrderedSet', 'PartialOrderTuple', 'DAG', 'frozendict', - 'UnboundedMultiTuple'] + 'OrderedSet', 'Ordering', 'DAG', 'frozendict', 'UnboundedMultiTuple'] class Bunch(object): @@ -29,6 +28,13 @@ class Bunch(object): def __init__(self, **kwargs): self.__dict__.update(kwargs) + def __repr__(self): + return "Bunch(%s)" % ", ".join(["%s=%s" % i for i in self.__dict__.items()]) + + def __iter__(self): + for i in self.__dict__.values(): + yield i + class EnrichedTuple(tuple, Pickable): @@ -279,7 +285,7 @@ def __str__(self): union = property(lambda self: self.__or__) -class PartialOrderTuple(tuple): +class Ordering(tuple): """ A tuple whose elements are ordered according to a set of relations. @@ -289,36 +295,62 @@ class PartialOrderTuple(tuple): items : object or iterable of objects The elements of the tuple. relations : iterable of tuples, optional - One or more binary relations between elements in ``items``. 
If not - provided, then ``items`` is interpreted as a totally ordered sequence. + One or more n-ary relations between the elements in `items`. If not + provided, then `items` is interpreted as a totally ordered sequence. If provided, then a (partial) ordering is computed and all elements in - ``items`` for which a relation is not provided are appended. + `items` for which a relation is not provided are appended. + mode : str, optional + If 'total' (default), the resulting object is interpreted as a totally + ordered sequence; the object's relations are simplified away and no + subsequent operation involving the Ordering will ever be able to alter + the obtained sequence. If 'partial', the outcome is a partially ordered + sequence; the relations as provided by the user are preserved, which + leaves room for further reordering upon future operations. If 'unordered', + the `relations` are ignored and the resulting object degenerates to an + unordered collection. """ - def __new__(cls, items=None, relations=None): + def __new__(cls, items=None, relations=None, mode='total'): + assert mode in ('total', 'partial', 'unordered') + items = as_tuple(items) if relations: items = cls.reorder(items, relations) - obj = super(PartialOrderTuple, cls).__new__(cls, items) - obj._relations = set(tuple(i) for i in as_tuple(relations)) + + obj = super().__new__(cls, items) + + obj._relations = frozenset(cls.simplify_relations(relations, items, mode)) + obj._mode = mode + return obj @classmethod def reorder(cls, items, relations): return filter_ordered(toposort(relations) + list(items)) + @classmethod + def simplify_relations(cls, relations, items, mode): + if mode == 'total': + return [tuple(items)] + elif mode == 'partial': + return [tuple(i) for i in as_tuple(relations)] + else: + return [] + def __eq__(self, other): - return super(PartialOrderTuple, self).__eq__(other) and\ - self.relations == other.relations + return (super().__eq__(other) and + self.relations == other.relations and + self.mode == other.mode) def __hash__(self): - return hash(*([i for i in self] + list(self.relations))) + return hash(*([i for i in self] + list(self.relations) + [self.mode])) @property def relations(self): return self._relations - def generate_ordering(self): - raise NotImplementedError + @property + def mode(self): + return self._mode class DAG(object): @@ -333,11 +365,13 @@ class DAG(object): https://github.com/thieman/py-dag/ """ - def __init__(self, nodes=None, edges=None): + def __init__(self, nodes=None, edges=None, labels=None): self.graph = OrderedDict() self.labels = DefaultOrderedDict(dict) + for node in as_tuple(nodes): self.add_node(node) + for i in as_tuple(edges): try: ind_node, dep_node = i @@ -346,6 +380,11 @@ def __init__(self, nodes=None, edges=None): self.labels[ind_node][dep_node] = label self.add_edge(ind_node, dep_node) + for ind_node, i in (labels or {}).items(): + for dep_node, v in i.items(): + if dep_node in self.graph.get(ind_node, []): + self.labels[ind_node][dep_node] = v + def __contains__(self, key): return key in self.graph @@ -353,6 +392,10 @@ def __contains__(self, key): def nodes(self): return tuple(self.graph) + @property + def roots(self): + return tuple(n for n in self.nodes if not self.predecessors(n)) + @property def edges(self): ret = [] @@ -364,6 +407,9 @@ def edges(self): def size(self): return len(self.graph) + def clone(self): + return self.__class__(self.nodes, self.edges, self.labels) + def add_node(self, node_name, ignore_existing=False): """Add a node if it does not exist 
yet, or error out."""
        if node_name in self.graph:
@@ -412,6 +458,24 @@ def predecessors(self, node):
         """Return a list of all predecessors of the given node."""
         return [key for key in self.graph if node in self.graph[key]]

+    def all_predecessors(self, node):
+        """
+        Return the set of all direct and transitive predecessors of the
+        given node in the dependency graph.
+        """
+        found = set()
+
+        def _all_predecessors(n):
+            if n in found:
+                return
+            found.add(n)
+            for predecessor in self.predecessors(n):
+                _all_predecessors(predecessor)
+
+        _all_predecessors(node)
+
+        return found
+
     def downstream(self, node):
         """Return a list of all nodes this node has edges towards."""
         if node not in self.graph:
@@ -507,6 +571,28 @@ def connected_components(self, enumerated=False):
         else:
             return tuple(groups)

+    def find_paths(self, node):
+        """
+        Return all paths from the given node down to the leaves of the DAG.
+        """
+        if node not in self.graph:
+            raise KeyError('node %s is not in graph' % node)
+
+        paths = []
+
+        def dfs(node, path):
+            path.append(node)
+
+            if not self.graph[node]:
+                paths.append(tuple(path))
+            else:
+                for child in self.graph[node]:
+                    dfs(child, path)
+
+            # Remove the node from the path to backtrack
+            path.pop()
+
+        dfs(node, [])
+
+        return tuple(paths)
+

 class frozendict(Mapping):

     """
@@ -589,7 +675,11 @@ def __init__(self, *items):
         nitems = []
         for i in as_tuple(items):
             if isinstance(i, Iterable):
-                nitems.append(tuple(i))
+                if isinstance(i, tuple):
+                    # Honours tuple subclasses
+                    nitems.append(i)
+                else:
+                    nitems.append(tuple(i))
             else:
                 raise ValueError("Expected sequence, got %s" % type(i))

@@ -603,6 +693,17 @@ def __repr__(self):
             items[self.tip] = "*%s" % items[self.tip]
         return "%s(%s)" % (self.__class__.__name__, ", ".join(items))

+    @property
+    def curitem(self):
+        return self.items[self.tip]
+
+    @property
+    def nextitem(self):
+        return self.items[min(self.tip + 1, max(len(self.items) - 1, 0))]
+
+    def index(self, item):
+        return self.items.index(item)
+
     def iter(self):
         if not self.items:
             raise ValueError("No tuples available")
diff --git a/devito/tools/dtypes_lowering.py b/devito/tools/dtypes_lowering.py
index 0b3cd53ebf..62776eefd5 100644
--- a/devito/tools/dtypes_lowering.py
+++ b/devito/tools/dtypes_lowering.py
@@ -75,12 +75,15 @@ def add_dtype(self, field_name, count):

         self.update(build_dtypes_vector([field_name], [count]))

-    def get_base_dtype(self, v):
+    def get_base_dtype(self, v, default=None):
         for (base_dtype, count), dtype in self.items():
             if dtype is v:
                 return base_dtype

-        raise ValueError
+        if default is not None:
+            return default
+        else:
+            raise ValueError


 dtypes_vector_mapper = DTypesVectorMapper()
@@ -263,8 +266,10 @@ def infer_dtype(dtypes):
       highest precision;
     * If there's at least one floating dtype, ignore any integer dtypes.
""" - fdtypes = {i for i in dtypes if np.issubdtype(i, np.floating)} + # Resolve the vector types, if any + dtypes = {dtypes_vector_mapper.get_base_dtype(i, i) for i in dtypes} + fdtypes = {i for i in dtypes if np.issubdtype(i, np.floating)} if len(fdtypes) > 1: return max(fdtypes, key=lambda i: np.dtype(i).itemsize) elif len(fdtypes) == 1: diff --git a/devito/tools/utils.py b/devito/tools/utils.py index d99bf34b25..ff8b5608f2 100644 --- a/devito/tools/utils.py +++ b/devito/tools/utils.py @@ -3,6 +3,7 @@ from functools import reduce from itertools import chain, combinations, groupby, product, zip_longest from operator import attrgetter, mul +import sys import types import numpy as np @@ -15,6 +16,11 @@ 'humanbytes', 'contains_val'] +# Some utils run faster with Python>=3.7 +vi = sys.version_info +py_ge_37 = (vi.major, vi.minor) >= (3, 7) + + def prod(iterable, initial=1): return reduce(mul, iterable, initial) @@ -157,32 +163,49 @@ def single_or(l): return any(i) and not any(i) -def filter_ordered(elements, key=None): - """ - Filter elements in a list while preserving order. +if py_ge_37: + def filter_ordered(elements, key=None): + # This method exploits the fact that dictionary keys are unique and ordered + # (since Python 3.7). It's concise and often faster for larger lists - Parameters - ---------- - key : callable, optional - Conversion key used during equality comparison. - """ - if isinstance(elements, types.GeneratorType): - elements = list(elements) - seen = set() - if key is None: - try: - unordered, inds = np.unique(elements, return_index=True) - return unordered[np.argsort(inds)].tolist() - except: - return sorted(list(set(elements)), key=elements.index) - else: - ret = [] - for e in elements: - k = key(e) - if k not in seen: - ret.append(e) - seen.add(k) - return ret + if isinstance(elements, types.GeneratorType): + elements = list(elements) + + if key is None: + return list(dict.fromkeys(elements)) + else: + return list(dict(zip([key(i) for i in elements], elements)).values()) + +else: + def filter_ordered(elements, key=None): + if isinstance(elements, types.GeneratorType): + elements = list(elements) + + seen = set() + if key is None: + try: + unordered, inds = np.unique(elements, return_index=True) + return unordered[np.argsort(inds)].tolist() + except: + return sorted(list(set(elements)), key=elements.index) + else: + ret = [] + for e in elements: + k = key(e) + if k not in seen: + ret.append(e) + seen.add(k) + return ret + + +filter_ordered.__doc__ = """\ +Filter elements in a list while preserving order. + +Parameters +---------- +key : callable, optional + Conversion key used during equality comparison. 
+""" def filter_sorted(elements, key=None): diff --git a/devito/types/array.py b/devito/types/array.py index f12922f330..f7113c19b1 100644 --- a/devito/types/array.py +++ b/devito/types/array.py @@ -11,7 +11,8 @@ from devito.types.basic import AbstractFunction from devito.types.utils import CtypesFactory, DimensionTuple -__all__ = ['Array', 'ArrayMapped', 'ArrayObject', 'PointerArray', 'Bundle'] +__all__ = ['Array', 'ArrayMapped', 'ArrayObject', 'PointerArray', 'Bundle', + 'ComponentAccess'] class ArrayBasic(AbstractFunction): diff --git a/devito/types/basic.py b/devito/types/basic.py index 400232faed..102bfe3a75 100644 --- a/devito/types/basic.py +++ b/devito/types/basic.py @@ -358,6 +358,17 @@ def __init__(self, *args, **kwargs): def __init_finalize__(self, *args, **kwargs): self._is_const = kwargs.get('is_const', False) + def __eq__(self, other): + return (type(self) is type(other) and + self.dtype is other.dtype and + self.is_const == other.is_const and + super().__eq__(other)) + + __hash__ = sympy.Symbol.__hash__ + + def _hashable_content(self): + return super()._hashable_content() + (self.dtype, self.is_const) + @property def dtype(self): return self._dtype @@ -844,7 +855,6 @@ def __new__(cls, *args, **kwargs): # through the `function` field newobj.function = function or newobj - # Initialization newobj.__init_finalize__(*args, **kwargs) return newobj diff --git a/devito/types/dense.py b/devito/types/dense.py index 2a13fa91af..08146eaf83 100644 --- a/devito/types/dense.py +++ b/devito/types/dense.py @@ -998,7 +998,15 @@ def __init_finalize__(self, *args, **kwargs): else: raise TypeError("`space_order` must be int or 3-tuple of ints") - self._fd = self.__fd_setup__() + # Acquire derivative shortcuts + if self is self.function: + self._fd = self.__fd_setup__() + else: + # E.g., `self is f(x + i0, y)` and `self.function is f(x, y)` + # Dynamically genereating derivative shortcuts is expensive; we + # can clearly avoid that here though! + self._fd = self.function._fd + # Flag whether it is a parameter or a variable. # Used at operator evaluation to evaluate the Function at the # variable location (i.e. if the variable is staggered in x the diff --git a/devito/types/dimension.py b/devito/types/dimension.py index 3166c7503b..61d82277da 100644 --- a/devito/types/dimension.py +++ b/devito/types/dimension.py @@ -144,6 +144,9 @@ def __dtype_setup__(cls, **kwargs): def __str__(self): return self.name + def _hashable_content(self): + return tuple(getattr(self, i) for i in self.__rargs__ + self.__rkwargs__) + @property def spacing(self): """Symbol representing the physical spacing along the Dimension.""" @@ -370,6 +373,20 @@ def __new__(cls, *args, **kwargs): def __init_finalize__(self, name, spacing=None, **kwargs): self._spacing = spacing or Spacing(name='h_%s' % name, is_const=True) + def __eq__(self, other): + # Being of type Cached, Dimensions are by construction unique. But unlike + # Symbols, equality is much stricter -- we consider any two Dimensions + # equal iff they are the very same object. This has several advantages. + # First of all, it makes it much more difficult to trick the compiler + # to generate buggy code (e.g., using two different "x" Dimensions that + # actually represent the same iteration space). 
Secondly, comparison + # is much cheaper, since we avoid having to go through all of the + # __rargs__/__rkwargs__, and there can be quite a few depending on the + # specific Dimension type + return self is other + + __hash__ = Symbol.__hash__ + class DefaultDimension(Dimension, DataSymbol): @@ -1386,8 +1403,10 @@ class StencilDimension(BasicDimension): is_Stencil = True __rargs__ = BasicDimension.__rargs__ + ('_min', '_max') + __rkwargs__ = BasicDimension.__rkwargs__ + ('step',) - def __init_finalize__(self, name, _min, _max, spacing=None, **kwargs): + def __init_finalize__(self, name, _min, _max, spacing=None, step=1, + **kwargs): self._spacing = sympy.sympify(spacing) or sympy.S.One if not is_integer(_min): @@ -1396,16 +1415,25 @@ def __init_finalize__(self, name, _min, _max, spacing=None, **kwargs): raise ValueError("Expected integer `max` (got %s)" % _max) if not is_integer(self._spacing): raise ValueError("Expected integer `spacing` (got %s)" % self._spacing) + if not is_integer(step): + raise ValueError("Expected integer `step` (got %s)" % step) self._min = _min self._max = _max + self._step = step + self._size = _max - _min + 1 if self._size < 1: raise ValueError("Expected size greater than 0 (got %s)" % self._size) - def _hashable_content(self): - return super()._hashable_content() + (self._min, self._max, self._spacing) + @property + def step(self): + return self._step + + @property + def backward(self): + return self.step < 0 @cached_property def symbolic_size(self): @@ -1423,6 +1451,9 @@ def symbolic_max(self): def range(self): return range(self._min, self._max + 1) + def transpose(self): + return StencilDimension(self.name, -self._max, -self._min, step=-1) + @property def _arg_names(self): return () @@ -1552,7 +1583,7 @@ class AffineIndexAccessFunction(IndexAccessFunction): * the "main" Dimension (in practice a SpaceDimension or a TimeDimension); * an offset (number or symbolic expression, of dtype integer). 
- * a StencilDimension; + * one or more StencilDimensions; Examples -------- @@ -1561,39 +1592,47 @@ class AffineIndexAccessFunction(IndexAccessFunction): """ def __new__(cls, *args, **kwargs): + # `args` may contain arbitrarily complicated expressions, so first of all + # we let SymPy simplify it, then we process the args and see if the + # resulting expression is indeed an AffineIndexAccessFunction + add = sympy.Add(*args, **kwargs) + if not isinstance(add, sympy.Add): + # E.g., reduced to a Symbol + return add + d = 0 - sd = 0 + sds = [] ofs_items = [] - for a in args: + for a in add.args: if isinstance(a, StencilDimension): - if sd != 0: - return sympy.Add(*args, **kwargs) - sd = a + sds.append(a) elif isinstance(a, Dimension): d = cls._separate_dims(d, a, ofs_items) if d is None: - return sympy.Add(*args, **kwargs) + return add elif isinstance(a, AffineIndexAccessFunction): - if sd != 0 and a.sd != 0: - return sympy.Add(*args, **kwargs) + if sds and a.sds: + return add d = cls._separate_dims(d, a.d, ofs_items) if d is None: - return sympy.Add(*args, **kwargs) - sd = a.sd + return add + sds = list(a.sds or sds) ofs_items.append(a.ofs) else: ofs_items.append(a) ofs = sympy.Add(*[i for i in ofs_items if i is not None]) if not all(is_integer(i) or i.is_Symbol for i in ofs.free_symbols): - return sympy.Add(*args, **kwargs) + return add - obj = IndexAccessFunction.__new__(cls, d, ofs, sd) + sds = tuple(sds) + + obj = IndexAccessFunction.__new__(cls, d, ofs, *sds) if isinstance(obj, AffineIndexAccessFunction): obj.d = d obj.ofs = ofs - obj.sd = sd + obj.sds = sds else: # E.g., SymPy simplified it to Zero or something else pass @@ -1603,7 +1642,7 @@ def __new__(cls, *args, **kwargs): @classmethod def _separate_dims(cls, d0, d1, ofs_items): if d0 == 0 and d1 == 0: - return None + return 0 elif d0 == 0 and isinstance(d1, Dimension): return d1 elif d1 == 0 and isinstance(d0, Dimension): diff --git a/devito/types/misc.py b/devito/types/misc.py index c644a60614..ccbd9b7fdc 100644 --- a/devito/types/misc.py +++ b/devito/types/misc.py @@ -3,12 +3,12 @@ import numpy as np from sympy.core.core import ordering_of_classes -from devito.types import CompositeObject, Indexed, Symbol +from devito.types import Array, CompositeObject, Indexed, Symbol from devito.types.basic import IndexedData from devito.tools import Pickable, as_tuple __all__ = ['Timer', 'Pointer', 'VolatileInt', 'FIndexed', 'Wildcard', - 'Global', 'Hyperplane', 'Indirection', 'Temp', 'Jump'] + 'Global', 'Hyperplane', 'Indirection', 'Temp', 'TempArray', 'Jump'] class Timer(CompositeObject): @@ -175,8 +175,8 @@ def __new__(cls, name=None, mapped=None, dtype=np.uint64, is_const=True, class Temp(Symbol): """ - A Temp is a Symbol used by compiler passes to store locally-constructed - temporary expressions. + A Temp is a Symbol used by compiler passes to store intermediate + sub-expressions. """ # Just make sure the SymPy args ordering is the same regardless of whether @@ -184,6 +184,16 @@ class Temp(Symbol): ordering_of_classes.insert(ordering_of_classes.index('Symbol') + 1, 'Temp') +class TempArray(Array): + + """ + A TempArray is an Array used by compiler passes to store intermediate + sub-expressions. 
+ """ + + pass + + class Jump(object): """ diff --git a/devito/types/object.py b/devito/types/object.py index d5d20d65d9..1db973cce9 100644 --- a/devito/types/object.py +++ b/devito/types/object.py @@ -53,6 +53,8 @@ def __repr__(self): def _sympystr(self, printer): return str(self) + _ccode = _sympystr + def _hashable_content(self): return (self.name, self.dtype) diff --git a/devito/types/parallel.py b/devito/types/parallel.py index 285aa38257..491839779d 100644 --- a/devito/types/parallel.py +++ b/devito/types/parallel.py @@ -22,7 +22,7 @@ __all__ = ['NThreads', 'NThreadsNested', 'NThreadsNonaffine', 'NThreadsBase', 'DeviceID', 'ThreadID', 'Lock', 'PThreadArray', 'SharedData', - 'NPThreads', 'DeviceRM', 'QueueID', 'Barrier'] + 'NPThreads', 'DeviceRM', 'QueueID', 'Barrier', 'TBArray'] class NThreadsBase(Scalar): @@ -294,3 +294,15 @@ class Barrier(object): """ pass + + +class TBArray(Array): + + """ + An Array used for performance optimization within a thread block. + """ + + def __init_finalize__(self, *args, **kwargs): + kwargs['liveness'] = 'eager' + + super().__init_finalize__(*args, **kwargs) diff --git a/devito/types/tensor.py b/devito/types/tensor.py index 6c94c5fc3a..62a555ef78 100644 --- a/devito/types/tensor.py +++ b/devito/types/tensor.py @@ -173,7 +173,13 @@ def is_diagonal(self): for i in range(self.rows) if i != j]) def _evaluate(self, **kwargs): - return self.applyfunc(lambda x: getattr(x, 'evaluate', x)) + def _do_evaluate(x): + try: + expand = kwargs.get('expand', True) + return x._evaluate(expand=expand) + except AttributeError: + return x + return self.applyfunc(_do_evaluate) def values(self): if self.is_diagonal: diff --git a/examples/performance/01_gpu.ipynb b/examples/performance/01_gpu.ipynb index a3f01a55a4..1b4e29ddb9 100644 --- a/examples/performance/01_gpu.ipynb +++ b/examples/performance/01_gpu.ipynb @@ -253,7 +253,7 @@ "output_type": "stream", "text": [ "#define _POSIX_C_SOURCE 200809L\n", - "#define uL0(time, x, y) u[(time)*x_stride0 + (x)*y_stride0 + (y)]\n", + "#define uL0(time,x,y) u[(time)*x_stride0 + (x)*y_stride0 + (y)]\n", "#define START_TIMER(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", "#define STOP_TIMER(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", "\n", diff --git a/tests/test_caching.py b/tests/test_caching.py index a11c6319a3..227ac4d193 100644 --- a/tests/test_caching.py +++ b/tests/test_caching.py @@ -110,6 +110,19 @@ def test_default_dimension(self): dd1 = DefaultDimension(name='dd') assert hash(dd0) != hash(dd1) + def test_spacing(self): + """ + Test that spacing symbols from grids with different dtypes have different + hash value. 
+ """ + grid0 = Grid(shape=(4, 4), dtype=np.float32) + grid1 = Grid(shape=(4, 4), dtype=np.float64) + + h_x0 = grid0.dimensions[0].spacing + h_x1 = grid1.dimensions[0].spacing + + assert hash(h_x0) != hash(h_x1) + @pytest.mark.parametrize('FunctionType', [Function, TimeFunction]) def test_function(self, FunctionType): """Test that different Functions have different hash value.""" diff --git a/tests/test_derivatives.py b/tests/test_derivatives.py index 123975d63d..5fa11a6fa0 100644 --- a/tests/test_derivatives.py +++ b/tests/test_derivatives.py @@ -457,6 +457,16 @@ def test_all_shortcuts(self, so): for fd in g._fd: assert getattr(g, fd) + def test_transpose_simple(self): + grid = Grid(shape=(4, 4)) + + f = TimeFunction(name='f', grid=grid, space_order=4) + + assert str(f.dx.T.evaluate) == ("-0.0833333333*f(t, x - 2*h_x, y)/h_x " + "+ 0.666666667*f(t, x - h_x, y)/h_x " + "- 0.666666667*f(t, x + h_x, y)/h_x " + "+ 0.0833333333*f(t, x + 2*h_x, y)/h_x") + @pytest.mark.parametrize('so', [2, 4, 8, 12]) @pytest.mark.parametrize('ndim', [1, 2]) @pytest.mark.parametrize('derivative, adjoint_name', [ @@ -743,6 +753,29 @@ def test_dxdy_v2(self): term1 = f.dxdy._evaluate(expand=False) assert len(term1.find(IndexDerivative)) == 2 + def test_transpose(self): + grid = Grid(shape=(4, 4)) + x, _ = grid.dimensions + h_x = x.spacing + + f = TimeFunction(name='f', grid=grid, space_order=4) + + term = f.dx.T._evaluate(expand=False) + assert isinstance(term, IndexDerivative) + + i0, = term.dimensions + assert term.base == f.subs(x, x + i0*h_x) + + def test_tensor_algebra(self): + grid = Grid(shape=(4, 4)) + + f = Function(name='f', grid=grid, space_order=4) + + v = grad(f)._evaluate(expand=False) + + assert all(isinstance(i, IndexDerivative) for i in v) + assert all(zip([Add(*i.args) for i in grad(f).evaluate], v.evaluate)) + def bypass_uneval(expr): unevals = expr.find(EvalDerivative) diff --git a/tests/test_dimension.py b/tests/test_dimension.py index 7532602190..e4446900b3 100644 --- a/tests/test_dimension.py +++ b/tests/test_dimension.py @@ -27,7 +27,7 @@ def test_basic(self): assert isinstance(expr, AffineIndexAccessFunction) assert expr.d is d assert expr.ofs == 1 - assert expr.sd == 0 + assert expr.sds == () s0 = Symbol(name='s0', dtype=np.int32) s1 = Symbol(name='s1', dtype=np.int32) @@ -37,7 +37,7 @@ def test_basic(self): assert isinstance(expr, AffineIndexAccessFunction) assert expr.d is d assert expr.ofs == s0 + s1 + 1 - assert expr.sd == 0 + assert expr.sds == () def test_reversed(self): d = Dimension(name='x') @@ -47,14 +47,14 @@ def test_reversed(self): assert isinstance(expr, AffineIndexAccessFunction) assert expr.d is d assert expr.ofs == 1 - assert expr.sd == 0 + assert expr.sds == () expr = d.symbolic_max + d assert isinstance(expr, AffineIndexAccessFunction) assert expr.d is d assert expr.ofs is d.symbolic_max - assert expr.sd == 0 + assert expr.sds == () def test_non_affine(self): grid = Grid(shape=(3,)) @@ -75,8 +75,8 @@ def test_stencil_dim(self): assert isinstance(expr, AffineIndexAccessFunction) assert expr.d is d - assert expr.sd is sd assert expr.ofs == 1 + assert expr.sds == (sd,) s = Symbol(name='s') @@ -84,8 +84,36 @@ def test_stencil_dim(self): assert isinstance(expr, AffineIndexAccessFunction) assert expr.d is d - assert expr.sd is sd assert expr.ofs == 1 + s + assert expr.sds == (sd,) + + expr = sd + 1 + d + + assert isinstance(expr, AffineIndexAccessFunction) + assert expr.d is d + assert expr.ofs == 1 + assert expr.sds == (sd,) + + def test_stencil_dim_multiple(self): + d = 
Dimension(name='x') + sd0 = StencilDimension('i0', 0, 1) + sd1 = StencilDimension('i1', 0, 1) + + expr = d + sd0 + sd1 + 1 + + assert isinstance(expr, AffineIndexAccessFunction) + assert expr.d is d + assert expr.ofs == 1 + assert expr.sds == (sd0, sd1) + + s = Symbol(name='s') + + expr = sd0 + d + sd1 + 1 + s + + assert isinstance(expr, AffineIndexAccessFunction) + assert expr.d is d + assert expr.ofs == 1 + s + assert expr.sds == (sd0, sd1) def test_sub(self): d = Dimension(name='x') @@ -103,8 +131,15 @@ def test_sub(self): assert isinstance(expr, AffineIndexAccessFunction) assert expr.d is d - assert expr.sd is sd assert expr.ofs == -1 - s + assert expr.sds == (sd,) + + expr = d + 1 + sd - d + + assert isinstance(expr, AffineIndexAccessFunction) + assert expr.d == 0 + assert expr.ofs == 1 + assert expr.sds == (sd,) class TestBufferedDimension(object): @@ -1461,12 +1496,7 @@ def test_sparse_time_function(self): for i in range(12, 20): assert np.all(p.data[i] == 0) - @pytest.mark.parametrize('init_value,expected', [ - ([2, 1, 3], [2, 2, 0]), # updates f1, f2 - ([3, 3, 3], [3, 3, 0]), # updates f2 - ([1, 2, 3], [1, 2, 3]) # no updates - ]) - def test_issue_1435(self, init_value, expected): + def test_issue_1435(self): names = 't1 t2 t3 t4 t5 t6 t7 t8 t9 t10' t1, t2, t3, t4, t5, t6, t7, t8, t9, t10 = \ tuple(SpaceDimension(i) for i in names.split()) @@ -1477,10 +1507,6 @@ def test_issue_1435(self, init_value, expected): dimensions=(t5, t6, t7, t8))) f2 = Function(name='f2', grid=f1.grid) - f0.data[:] = init_value[0] - f1.data[:] = init_value[1] - f2.data[:] = init_value[2] - cd = ConditionalDimension(name='cd', parent=t10, condition=Or(Gt(f0[t5, t6, t7 + t9, t8 + t10], @@ -1497,11 +1523,6 @@ def test_issue_1435(self, init_value, expected): # Check it compiles correctly! See issue report op.cfunction - op.apply(t9_M=5, t10_M=5) - - assert np.all(f0.data[:] == expected[0]) - assert np.all(f1.data[:] == expected[1]) - assert np.all(f2.data[:] == expected[2]) @pytest.mark.parametrize('factor', [ 4, diff --git a/tests/test_dle.py b/tests/test_dle.py index 8e37947a36..bfcb732bb4 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -294,9 +294,9 @@ class TestBlockingParTile(object): ((16, 16, 16), ((16, 16, 16), (16, 16, 16))), ((32, 4, 4), ((4, 4, 32), (4, 4, 32))), (((16, 4, 4), (16, 16, 16)), ((4, 4, 16), (16, 16, 16))), - (((32, 4, 4), 1), ((4, 4, 32), (4, 4, 32))), - (((32, 4, 4), 1, 'tag0'), ((4, 4, 32), (4, 4, 32))), - ((((32, 4, 8), 1, 'tag0'), ((32, 8, 4), 2)), ((8, 4, 32), (4, 8, 32))), + (((32, 4, 4), None), ((4, 4, 32), (4, 4, 32))), + (((32, 4, 4), None, 'tag0'), ((4, 4, 32), (4, 4, 32))), + ((((32, 4, 8), None, 'tag0'), ((32, 8, 4), None)), ((8, 4, 32), (4, 8, 32))), ]) def test_structure(self, par_tile, expected): grid = Grid(shape=(8, 8, 8)) @@ -344,6 +344,62 @@ def test_structure_2p5D(self): assert iters[0].step == par_tile[1] assert iters[1].step == par_tile[0] + def test_custom_rule0(self): + grid = Grid(shape=(8, 8, 8)) + + u = TimeFunction(name="u", grid=grid, space_order=4) + v = TimeFunction(name="v", grid=grid, space_order=4) + + eqns = [Eq(u.forward, u.dz.dy + u.dx.dz + u.dy.dx), + Eq(v.forward, u.forward.dx)] + + # "Apply par-tile=(4, 4, 4) to the loop nest (kernel) with id (rule)=1, + # and use default for the rest!" + par_tile = (4, 4, 4) + rule = 1 + + op = Operator(eqns, opt=('advanced-fsg', {'par-tile': (par_tile, rule), + 'blockinner': True})) + + # Check generated code. 
By having specified "1" as rule, we expect the + # given par-tile to be applied to the kernel with id 1 + bns, _ = assert_blocking(op, {'z0_blk0', 'x1_blk0', 'z2_blk0'}) + root = bns['x1_blk0'] + iters = FindNodes(Iteration).visit(root) + iters = [i for i in iters if i.dim.is_Block and i.dim._depth == 1] + assert len(iters) == 3 + assert all(i.step == j for i, j in zip(iters, par_tile)) + + def test_custom_rule1(self): + grid = Grid(shape=(8, 8, 8)) + x, y, z = grid.dimensions + + f = Function(name='f', grid=grid) + u = TimeFunction(name="u", grid=grid, space_order=4) + v = TimeFunction(name="v", grid=grid, space_order=4) + + eqns = [Eq(u.forward, u.dz.dy + u.dx.dz + u.dy.dx + cos(f)*cos(f[x+1, y, z])), + Eq(v.forward, u.forward.dx)] + + # "Apply par-tile=(4, 4, 4) to the loop nests (kernels) embedded within + # the time loop, and use default for the rest!" + par_tile = (4, 4, 4) + rule = grid.time_dim.name # We must be able to infer it from str + + op = Operator(eqns, opt=('advanced-fsg', {'par-tile': (par_tile, rule), + 'blockinner': True, + 'blockrelax': True})) + + # Check generated code. By having specified "time" as rule, we expect the + # given par-tile to be applied to the kernel within the time loop + bns, _ = assert_blocking(op, {'x0_blk0', 'x1_blk0', 'x2_blk0'}) + for i in ['x0_blk0', 'x1_blk0', 'x2_blk0']: + root = bns[i] + iters = FindNodes(Iteration).visit(root) + iters = [i for i in iters if i.dim.is_Block and i.dim._depth == 1] + assert len(iters) == 3 + assert all(i.step == j for i, j in zip(iters, par_tile)) + @pytest.mark.parametrize("shape", [(10,), (10, 45), (20, 33), (10, 31, 45), (45, 31, 45)]) @pytest.mark.parametrize("time_order", [2]) @@ -1125,6 +1181,7 @@ def test_multiple_subnests_v0(self): _R(u[t, x+2, y+2, z+2] + u[t, x+3, y+3, z+3])*3.*f) + 1.) op = Operator(eqn, opt=('advanced', {'openmp': True, 'cire-mingain': 0, + 'cire-schedule': 1, 'par-nested': 0, 'par-collapse-ncores': 1, 'par-dynamic-work': 0})) @@ -1160,6 +1217,7 @@ def test_multiple_subnests_v1(self): _R(u[t, x+2, y+2, z+2] + u[t, x+3, y+3, z+3])*3.*f) + 1.) 
op = Operator(eqn, opt=('advanced', {'openmp': True, 'cire-mingain': 0, + 'cire-schedule': 1, 'cire-rotate': True, 'par-nested': 0, 'par-collapse-ncores': 1, diff --git a/tests/test_dse.py b/tests/test_dse.py index c111104506..009482ca43 100644 --- a/tests/test_dse.py +++ b/tests/test_dse.py @@ -527,6 +527,7 @@ def test_collection(self, exprs, expected): ispace = exprs[0].ispace aliases = collect(extracted, ispace, False) + aliases.filter(lambda a: a.score > 0) assert len(aliases) == len(expected) assert all(i.pivot in expected for i in aliases) @@ -1573,14 +1574,14 @@ def test_lazy_solve_produces_larger_temps(self): """ grid = Grid(shape=(10, 10)) - u = TimeFunction(name="u", grid=grid, space_order=4, time_order=2) + u = TimeFunction(name="u", grid=grid, space_order=8, time_order=2) pde = u.dt2 - (u.dx.dx + u.dy.dy) + u.dx.dy eq = Eq(u.forward, solve(pde, u.forward)) op = Operator(eq) assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 2 - assert op._profiler._sections['section0'].sops == 39 + assert op._profiler._sections['section0'].sops == 67 def test_hoisting_iso_ot4_akin(self): """ @@ -2062,8 +2063,11 @@ def test_tti_adjoint_akin_v2(self): eqn = Eq(p.backward, H0) - op0 = Operator(eqn, subs=grid.spacing_map, opt=('noop', {'openmp': True})) - op1 = Operator(eqn, subs=grid.spacing_map, opt=('advanced', {'openmp': True})) + op0 = Operator(eqn, subs=grid.spacing_map, + opt=('noop', {'openmp': True})) + op1 = Operator(eqn, subs=grid.spacing_map, + opt=('advanced', {'openmp': True, + 'cire-schedule': 0})) # Check code generation bns, pbs = assert_blocking(op1, {'x0_blk0'}) @@ -2154,9 +2158,9 @@ def test_nested_first_derivatives_unbalanced(self): ('v.dx.dx + p.dx.dx', (2, 2, (0, 2)), (61, 61, 25)), ('(v.dx + v.dy).dx - (v.dx + v.dy).dy + 2*f.dx.dx + f*f.dy.dy + f.dx.dx(x0=1)', - (3, 3, (0, 3)), (218, 202, 74)), + (3, 3, (0, 3)), (218, 202, 75)), ('(g*(1 + f)*v.dx).dx + (2*g*f*v.dx).dx', - (1, 2, (0, 1)), (52, 70, 20)), + (1, 1, (0, 1)), (52, 70, 20)), ('g*(f.dx.dx + g.dx.dx)', (1, 2, (0, 1)), (47, 62, 17)), ]) @@ -2233,7 +2237,8 @@ def test_derivatives_from_different_levels(self): eqn = Eq(v.forward, f*(1. + v).dx + 2.*f*((1. 
+ v).dx + f)) - op = Operator(eqn, opt=('advanced', {'cire-mingain': 0})) + op = Operator(eqn, opt=('advanced', {'cire-mingain': 0, + 'cire-schedule': 0})) # Check code generation assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 1 @@ -2729,11 +2734,10 @@ def test_fullopt(self): assert summary0[('section1', None)].ops == 44 assert np.isclose(summary0[('section0', None)].oi, 2.851, atol=0.001) - assert summary1[('section0', None)].ops == 9 - assert summary1[('section1', None)].ops == 31 - assert summary1[('section2', None)].ops == 88 - assert summary1[('section3', None)].ops == 22 - assert np.isclose(summary1[('section1', None)].oi, 1.767, atol=0.001) + assert summary1[('section0', None)].ops == 31 + assert summary1[('section1', None)].ops == 88 + assert summary1[('section2', None)].ops == 25 + assert np.isclose(summary1[('section0', None)].oi, 1.767, atol=0.001) assert np.allclose(u0.data, u1.data, atol=10e-5) assert np.allclose(rec0.data, rec1.data, atol=10e-5) @@ -2793,8 +2797,8 @@ def test_fullopt(self): assert np.allclose(self.tti_noopt[1].data, rec.data, atol=10e-1) # Check expected opcount/oi - assert summary[('section2', None)].ops == 92 - assert np.isclose(summary[('section2', None)].oi, 2.074, atol=0.001) + assert summary[('section1', None)].ops == 92 + assert np.isclose(summary[('section1', None)].oi, 2.074, atol=0.001) # With optimizations enabled, there should be exactly four BlockDimensions op = wavesolver.op_fwd() @@ -2812,7 +2816,7 @@ def test_fullopt(self): # 3 Arrays are defined globally for the sparse positions temporaries # and two additional bock-sized Arrays are defined locally arrays = [i for i in FindSymbols().visit(op) if i.is_Array] - extra_arrays = 2+3 + extra_arrays = 2 assert len(arrays) == 4 + extra_arrays assert all(i._mem_heap and not i._mem_external for i in arrays) bns, pbs = assert_blocking(op, {'x0_blk0'}) @@ -2848,18 +2852,19 @@ def test_fullopt_w_mpi(self): def test_opcounts(self, space_order, expected): op = self.tti_operator(opt='advanced', space_order=space_order) sections = list(op.op_fwd()._profiler._sections.values()) - assert sections[2].sops == expected + assert sections[1].sops == expected @switchconfig(profiling='advanced') - @pytest.mark.parametrize('space_order,expected', [ - (4, 121), + @pytest.mark.parametrize('space_order,exp_ops,exp_arrays', [ + (4, 122, 6), (8, 221, 7) ]) - def test_opcounts_adjoint(self, space_order, expected): - wavesolver = self.tti_operator(opt=('advanced', {'openmp': False})) + def test_opcounts_adjoint(self, space_order, exp_ops, exp_arrays): + wavesolver = self.tti_operator(space_order=space_order, + opt=('advanced', {'openmp': False})) op = wavesolver.op_adj() - assert op._profiler._sections['section2'].sops == expected - assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 7+3 + assert op._profiler._sections['section1'].sops == exp_ops + assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == exp_arrays class TestTTIv2(object): diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index 3bf641a120..d3164722f2 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -869,8 +869,8 @@ def test_tasking_over_compiler_generated(self): assert len(retrieve_iteration_tree(op)) == 5 assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 1 sections = FindNodes(Section).visit(op) - assert len(sections) == 3 - assert 'while(lock0[t1] == 0)' in str(sections[1].body[0].body[0].body[0]) + assert len(sections) == 4 + assert 'while(lock0[t1] == 
0)' in str(sections[2].body[0].body[0].body[0]) op0.apply(time_M=nt-1) op1.apply(time_M=nt-1, u=u1, usave=usave1) diff --git a/tests/test_linearize.py b/tests/test_linearize.py index 45347bf605..efed9f79db 100644 --- a/tests/test_linearize.py +++ b/tests/test_linearize.py @@ -479,8 +479,8 @@ def test_issue_1838(): op1.apply(time_M=3, dt=1., p0=p1) # Check generated code - assert "r0L0(x, y, z) r0[(x)*y_stride1 + (y)*z_stride1 + (z)]" in str(op1) - assert "r4L0(x, y, z) r4[(x)*y_stride2 + (y)*z_stride1 + (z)]" in str(op1) + assert "r0L0(x,y,z) r0[(x)*y_stride1 + (y)*z_stride1 + (z)]" in str(op1) + assert "r4L0(x,y,z) r4[(x)*y_stride2 + (y)*z_stride1 + (z)]" in str(op1) assert np.allclose(p0.data, p1.data, rtol=1e-6) diff --git a/tests/test_mpi.py b/tests/test_mpi.py index 46b8e18fa5..9d827a3001 100644 --- a/tests/test_mpi.py +++ b/tests/test_mpi.py @@ -2108,7 +2108,8 @@ def test_cire(self): eqn = Eq(u.forward, _R(_R(u[t, x, y] + u[t, x+1, y+1])*3.*f + _R(u[t, x+2, y+2] + u[t, x+3, y+3])*3.*f) + 1.) op0 = Operator(eqn, opt='noop') - op1 = Operator(eqn, opt=('advanced', {'cire-mingain': 0})) + op1 = Operator(eqn, opt=('advanced', {'cire-mingain': 0, + 'cire-schedule': 1})) assert len([i for i in FindSymbols().visit(op1.body) if i.is_Array]) == 1 @@ -2141,7 +2142,8 @@ def test_cire_with_shifted_diagonal_halo_touch(self): eqn = Eq(u.forward, _R(_R(u[t, x, y] + u[t, x+2, y])*3.*f + _R(u[t, x+1, y+1] + u[t, x+3, y+1])*3.*f) + 1.) op0 = Operator(eqn, opt='noop') - op1 = Operator(eqn, opt=('advanced', {'cire-mingain': 0})) + op1 = Operator(eqn, opt=('advanced', {'cire-mingain': 0, + 'cire-schedule': 1})) assert len([i for i in FindSymbols().visit(op1.body) if i.is_Array]) == 1 @@ -2499,10 +2501,8 @@ def test_adjoint_codegen(self, shape, kernel, space_order, save): op_adj = solver.op_adj() adj_calls = FindNodes(Call).visit(op_adj) - # one halo, ndim memalign and free (pos temp rec/src) - sf_calls = 2 * len(shape) - assert len(fwd_calls) == 1 + sf_calls - assert len(adj_calls) == 1 + sf_calls + assert len(fwd_calls) == 1 + assert len(adj_calls) == 1 def run_adjoint_F(self, nd): """ diff --git a/tests/test_pickle.py b/tests/test_pickle.py index 16f44bdaed..26b6ea7f63 100644 --- a/tests/test_pickle.py +++ b/tests/test_pickle.py @@ -236,7 +236,7 @@ def test_sub_dimension(self, pickle): assert di.name == new_di.name assert di.dtype == new_di.dtype - assert di.parent == new_di.parent + assert di.parent.name == new_di.parent.name assert di._thickness == new_di._thickness assert di._interval == new_di._interval @@ -249,7 +249,7 @@ def test_conditional_dimension(self, pickle): new_cd = pickle.loads(pkl_cd) assert cd.name == new_cd.name - assert cd.parent == new_cd.parent + assert cd.parent.name == new_cd.parent.name assert cd.factor == new_cd.factor assert cd.condition == new_cd.condition @@ -262,7 +262,7 @@ def test_incr_dimension(self, pickle): new_dd = pickle.loads(pkl_dd) assert dd.name == new_dd.name - assert dd.parent == new_dd.parent + assert dd.parent.name == new_dd.parent.name assert dd.symbolic_min == new_dd.symbolic_min assert dd.symbolic_max == new_dd.symbolic_max assert dd.step == new_dd.step @@ -384,7 +384,7 @@ def test_guard_factor(self, pickle): pkl_gf = pickle.dumps(gf) new_gf = pickle.loads(pkl_gf) - assert gf == new_gf + assert str(gf) == str(new_gf) def test_temp_function(self, pickle): grid = Grid(shape=(3, 3)) diff --git a/tests/test_symbolics.py b/tests/test_symbolics.py index ce5bb3cdf2..fcc7395b05 100644 --- a/tests/test_symbolics.py +++ b/tests/test_symbolics.py @@ -303,6 +303,18 @@ 
class BarCast(Cast): assert v != v1 +def test_findexed(): + grid = Grid(shape=(3, 3, 3)) + f = Function(name='f', grid=grid) + + fi = FIndexed.from_indexed(f.indexify(), "foo", strides=(1, 2)) + new_fi = fi.func(strides=(3, 4)) + + assert new_fi.name == fi.name == 'f' + assert new_fi.indices == fi.indices + assert new_fi.strides == (3, 4) + + def test_symbolic_printing(): b = Symbol('b') @@ -315,17 +327,11 @@ class MyLocalObject(LocalObject, Expr): lo = MyLocalObject(name='lo') assert str(lo + 2) == '2 + lo' - -def test_findexed(): - grid = Grid(shape=(3, 3, 3)) - f = Function(name='f', grid=grid) - + grid = Grid((10,)) + f = Function(name="f", grid=grid) fi = FIndexed.from_indexed(f.indexify(), "foo", strides=(1, 2)) - new_fi = fi.func(strides=(3, 4)) - - assert new_fi.name == fi.name == 'f' - assert new_fi.indices == fi.indices - assert new_fi.strides == (3, 4) + df = DefFunction('aaa', arguments=[fi]) + assert ccode(df) == 'aaa(foo(x))' def test_is_on_grid(): diff --git a/tests/test_unexpansion.py b/tests/test_unexpansion.py index d9d01b6fc9..fa076096a3 100644 --- a/tests/test_unexpansion.py +++ b/tests/test_unexpansion.py @@ -5,7 +5,23 @@ from devito.types import Symbol -class TestBasic(object): +class TestLoopScheduling(object): + + def test_backward_dt2(self): + grid = Grid(shape=(4, 4)) + + f = Function(name='f', grid=grid) + v = TimeFunction(name='v', grid=grid, time_order=2) + + eqns = [Eq(v.backward, v + 1.), + Eq(f, v.dt2)] + + op = Operator(eqns, opt=('advanced', {'openmp': True, + 'expand': False})) + assert_structure(op, ['t,x,y'], 't,x,y') + + +class Test1Pass(object): def test_v0(self): grid = Grid(shape=(10, 10, 10)) @@ -85,7 +101,8 @@ def test_v2(self): op0 = Operator(eqns) op1 = Operator(eqns, opt=('advanced', {'expand': False, - 'blocklevels': 0})) + 'blocklevels': 0, + 'cire-mingain': 200})) # Check generated code -- expect maximal fusion! assert_structure(op1, @@ -110,7 +127,8 @@ def test_v3(self): Eq(v.forward, (v + u.dx.dy + 1.))] op0 = Operator(eqns) - op1 = Operator(eqns, opt=('advanced', {'expand': False})) + op1 = Operator(eqns, opt=('advanced', {'expand': False, + 'cire-mingain': 200})) # Check generated code -- redundant IndexDerivatives have been caught! 
op1._profiler._sections['section0'].sops == 65 @@ -123,63 +141,12 @@ def test_v3(self): def test_v4(self): grid = Grid(shape=(16, 16, 16)) - t = grid.stepping_dim - x, y, z = grid.dimensions - - so = 4 - - a = Function(name='a', grid=grid, space_order=so) - f = Function(name='f', grid=grid, space_order=so) - e = Function(name='e', grid=grid, space_order=so) - r = Function(name='r', grid=grid, space_order=so) - p0 = TimeFunction(name='p0', grid=grid, time_order=2, space_order=so) - m0 = TimeFunction(name='m0', grid=grid, time_order=2, space_order=so) - - def g1(field, r, e): - return (cos(e) * cos(r) * field.dx(x0=x+x.spacing/2) + - cos(e) * sin(r) * field.dy(x0=y+y.spacing/2) - - sin(e) * field.dz(x0=z+z.spacing/2)) - - def g2(field, r, e): - return - (sin(r) * field.dx(x0=x+x.spacing/2) - - cos(r) * field.dy(x0=y+y.spacing/2)) - - def g3(field, r, e): - return (sin(e) * cos(r) * field.dx(x0=x+x.spacing/2) + - sin(e) * sin(r) * field.dy(x0=y+y.spacing/2) + - cos(e) * field.dz(x0=z+z.spacing/2)) - - def g1_tilde(field, r, e): - return ((cos(e) * cos(r) * field).dx(x0=x-x.spacing/2) + - (cos(e) * sin(r) * field).dy(x0=y-y.spacing/2) - - (sin(e) * field).dz(x0=z-z.spacing/2)) - - def g2_tilde(field, r, e): - return - ((sin(r) * field).dx(x0=x-x.spacing/2) - - (cos(r) * field).dy(x0=y-y.spacing/2)) - - def g3_tilde(field, r, e): - return ((sin(e) * cos(r) * field).dx(x0=x-x.spacing/2) + - (sin(e) * sin(r) * field).dy(x0=y-y.spacing/2) + - (cos(e) * field).dz(x0=z-z.spacing/2)) - - update_p = t.spacing**2 * a**2 / f * \ - (g1_tilde(f * g1(p0, r, e), r, e) + - g2_tilde(f * g2(p0, r, e), r, e) + - g3_tilde(f * g3(p0, r, e) + f * g3(m0, r, e), r, e)) + \ - (2 - t.spacing * a) - - update_m = t.spacing**2 * a**2 / f * \ - (g1_tilde(f * g1(m0, r, e), r, e) + - g2_tilde(f * g2(m0, r, e), r, e) + - g3_tilde(f * g3(m0, r, e) + f * g3(p0, r, e), r, e)) + \ - (2 - t.spacing * a) - - eqns = [Eq(p0.forward, update_p), - Eq(m0.forward, update_m)] + + eqns = tti_sa_eqns(grid) op = Operator(eqns, subs=grid.spacing_map, - opt=('advanced', {'expand': False})) + opt=('advanced', {'expand': False, + 'cire-mingain': 400})) # Check code generation assert op._profiler._sections['section1'].sops == 1442 @@ -203,7 +170,8 @@ def test_v5(self): Eq(m0.forward, m0.dx.dx + m0.backward)] op = Operator(eqns, subs=grid.spacing_map, - opt=('advanced', {'expand': False})) + opt=('advanced', {'expand': False, + 'cire-mingain': 200})) # Check code generation assert op._profiler._sections['section0'].sops == 127 @@ -227,10 +195,145 @@ def test_v6(self): Eq(m0.forward, (m0.dx + s0).dx + f*m0.backward)] op = Operator(eqns, subs=grid.spacing_map, - opt=('advanced', {'expand': False})) + opt=('advanced', {'expand': False, + 'cire-mingain': 200})) # Check code generation assert op._profiler._sections['section0'].sops == 133 assert_structure(op, ['t,x,y', 't,x,y,i1', 't,x,y,i1,i0'], 't,x,y,i1,i0') op.cfunction + + def test_transpose(self): + shape = (10, 10, 10) + grid = Grid(shape=shape) + x, _, _ = grid.dimensions + + u = TimeFunction(name='u', grid=grid, space_order=4) + u1 = TimeFunction(name='u', grid=grid, space_order=4) + + # Chessboard-like init + u.data[:] = np.indices(shape).sum(axis=0) % 10 + 1 + u1.data[:] = np.indices(shape).sum(axis=0) % 10 + 1 + + eqn = Eq(u.forward, u.dx(x0=x+x.spacing/2).T + 1.) 
+ + op0 = Operator(eqn) + op1 = Operator(eqn, opt=('advanced', {'expand': False})) + + op0.apply(time_M=10) + op1.apply(time_M=10, u=u1) + + assert np.allclose(u.data, u1.data, rtol=10e-6) + + +class Test2Pass(object): + + def test_v0(self): + grid = Grid(shape=(10, 10, 10)) + + u = TimeFunction(name='u', grid=grid, space_order=8) + v = TimeFunction(name='v', grid=grid, space_order=8) + u1 = TimeFunction(name='u', grid=grid, space_order=8) + v1 = TimeFunction(name='v', grid=grid, space_order=8) + + eqns = [Eq(u.forward, (u.dx.dy + v*u + 1.)), + Eq(v.forward, (v + u.dx.dy + 1.))] + + op0 = Operator(eqns) + op1 = Operator(eqns, opt=('advanced', {'expand': False, + 'openmp': True})) + + # Check generated code + assert op1._profiler._sections['section0'].sops == 41 + assert_structure(op1, ['t', + 't,x0_blk0,y0_blk0,x,y,z', + 't,x0_blk0,y0_blk0,x,y,z,i0', + 't,x0_blk0,y0_blk0,x,y,z', + 't,x0_blk0,y0_blk0,x,y,z,i1'], + 't,x0_blk0,y0_blk0,x,y,z,i0,y,z,i1') + + op0.apply(time_M=5) + op1.apply(time_M=5, u=u1, v=v1) + + assert np.allclose(u.data, u1.data, rtol=10e-3) + assert np.allclose(v.data, v1.data, rtol=10e-3) + + def test_v1(self): + grid = Grid(shape=(16, 16, 16)) + + eqns = tti_sa_eqns(grid) + + op = Operator(eqns, subs=grid.spacing_map, + opt=('advanced', {'expand': False, + 'openmp': False})) + + # Check code generation + assert op._profiler._sections['section1'].sops == 190 + assert_structure(op, ['x,y,z', + 't,x0_blk0,y0_blk0,x,y,z', + 't,x0_blk0,y0_blk0,x,y,z,i0', + 't,x0_blk0,y0_blk0,x,y,z', + 't,x0_blk0,y0_blk0,x,y,z,i1'], + 'x,y,z,t,x0_blk0,y0_blk0,x,y,z,i0,x,y,z,i1') + + op.cfunction + + +def tti_sa_eqns(grid): + t = grid.stepping_dim + x, y, z = grid.dimensions + + so = 4 + + a = Function(name='a', grid=grid, space_order=so) + f = Function(name='f', grid=grid, space_order=so) + e = Function(name='e', grid=grid, space_order=so) + r = Function(name='r', grid=grid, space_order=so) + p0 = TimeFunction(name='p0', grid=grid, time_order=2, space_order=so) + m0 = TimeFunction(name='m0', grid=grid, time_order=2, space_order=so) + + def g1(field, r, e): + return (cos(e) * cos(r) * field.dx(x0=x+x.spacing/2) + + cos(e) * sin(r) * field.dy(x0=y+y.spacing/2) - + sin(e) * field.dz(x0=z+z.spacing/2)) + + def g2(field, r, e): + return - (sin(r) * field.dx(x0=x+x.spacing/2) - + cos(r) * field.dy(x0=y+y.spacing/2)) + + def g3(field, r, e): + return (sin(e) * cos(r) * field.dx(x0=x+x.spacing/2) + + sin(e) * sin(r) * field.dy(x0=y+y.spacing/2) + + cos(e) * field.dz(x0=z+z.spacing/2)) + + def g1_tilde(field, r, e): + return ((cos(e) * cos(r) * field).dx(x0=x-x.spacing/2) + + (cos(e) * sin(r) * field).dy(x0=y-y.spacing/2) - + (sin(e) * field).dz(x0=z-z.spacing/2)) + + def g2_tilde(field, r, e): + return - ((sin(r) * field).dx(x0=x-x.spacing/2) - + (cos(r) * field).dy(x0=y-y.spacing/2)) + + def g3_tilde(field, r, e): + return ((sin(e) * cos(r) * field).dx(x0=x-x.spacing/2) + + (sin(e) * sin(r) * field).dy(x0=y-y.spacing/2) + + (cos(e) * field).dz(x0=z-z.spacing/2)) + + update_p = t.spacing**2 * a**2 / f * \ + (g1_tilde(f * g1(p0, r, e), r, e) + + g2_tilde(f * g2(p0, r, e), r, e) + + g3_tilde(f * g3(p0, r, e) + f * g3(m0, r, e), r, e)) + \ + (2 - t.spacing * a) + + update_m = t.spacing**2 * a**2 / f * \ + (g1_tilde(f * g1(m0, r, e), r, e) + + g2_tilde(f * g2(m0, r, e), r, e) + + g3_tilde(f * g3(m0, r, e) + f * g3(p0, r, e), r, e)) + \ + (2 - t.spacing * a) + + eqns = [Eq(p0.forward, update_p), + Eq(m0.forward, update_m)] + + return eqns
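
The reworked `search` now accepts a plain class as the query, and silently
skips anything in `exprs` that is not a SymPy object. A minimal sketch of both
behaviours, assuming `search` is imported straight from
`devito.symbolics.search`:

import sympy

from devito.symbolics.search import search

x, y = sympy.symbols('x y')

# A class is now a valid query: it is wrapped into an `isinstance` check
assert search(x + 2*y, sympy.Symbol, mode='unique', visit='dfs') == {x, y}

# Non-SymPy entries are skipped rather than raising
assert search([x + 1, None, 'foo'], sympy.Symbol) == {x}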
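
`Ordering`, which supersedes `PartialOrderTuple`, adds an explicit `mode`. A
minimal sketch of the three modes, assuming `Ordering` is re-exported from
`devito.tools` like the other data structures in `__all__`:

from devito.tools import Ordering

items = ['c', 'a', 'b']

# mode='total' (default): the relations collapse to the sequence itself
total = Ordering(items)
assert tuple(total) == ('c', 'a', 'b')
assert total.relations == frozenset({('c', 'a', 'b')})

# mode='partial': the given relations are preserved and drive a topological
# reordering; items without a relation ('c') are appended
partial = Ordering(items, relations=[('a', 'b')], mode='partial')
assert tuple(partial) == ('a', 'b', 'c')
assert partial.relations == frozenset({('a', 'b')})

# mode='unordered': the relations are dropped altogether
unordered = Ordering(items, mode='unordered')
assert unordered.relations == frozenset()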
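
A sketch of the new `DAG` helpers (`roots`, `all_predecessors`, `find_paths`;
`clone` simply rebuilds an identical graph), under the same import assumption:

from devito.tools import DAG

# a -> b -> d
# a -> c
dag = DAG(nodes=['a', 'b', 'c', 'd'],
          edges=[('a', 'b'), ('a', 'c'), ('b', 'd')])

assert dag.roots == ('a',)

# Note: the returned set includes the queried node itself
assert dag.all_predecessors('d') == {'a', 'b', 'd'}

# All node-to-leaf paths departing from 'a'
assert set(dag.find_paths('a')) == {('a', 'b', 'd'), ('a', 'c')}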
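
`UnboundedMultiTuple` gains `curitem`, `nextitem` and `index`. A sketch,
assuming `iter()` advances the tip one tuple at a time and, consistently with
the `min(...)` clamping in `nextitem`, saturates at the last tuple:

from devito.tools import UnboundedMultiTuple

utup = UnboundedMultiTuple((1, 2), (3, 4))

utup.iter()  # Move the tip onto the first tuple
assert utup.curitem == (1, 2)
assert utup.nextitem == (3, 4)
assert utup.index((3, 4)) == 1

utup.iter()  # The tip now sits on the last tuple...
assert utup.curitem == (3, 4)
assert utup.nextitem == (3, 4)  # ...where `nextitem` saturates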
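
With the `infer_dtype` tweak, vector dtypes are first resolved to their base
dtype via the new `get_base_dtype(v, default=...)`; the scalar promotion rules
restated in the docstring are unchanged:

import numpy as np

from devito.tools.dtypes_lowering import infer_dtype

# A floating dtype beats any integer dtype...
assert infer_dtype({np.float32, np.int32}) is np.float32

# ...and among floating dtypes the widest one wins
assert infer_dtype({np.float32, np.float64}) is np.float64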
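
The Python>=3.7 fast path of `filter_ordered` exploits insertion-ordered
dicts. One subtlety: with a `key`, the dict-based version retains the *last*
element per key, whereas the legacy fallback retains the first, so the two
only differ in which duplicate survives:

from devito.tools import filter_ordered

# Duplicates dropped, first-occurrence order preserved -- on Python >= 3.7
# this is just `list(dict.fromkeys(...))`
assert filter_ordered([3, 1, 3, 2, 1]) == [3, 1, 2]

# One element survives per key value ('a' and 'c' share key 1)
assert filter_ordered(['a', 'bb', 'c'], key=len) in (['c', 'bb'], ['a', 'bb'])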
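
The `__eq__`/`_hashable_content` overrides on data symbols make dtype and
constness part of a symbol's identity, which is what the new `test_spacing`
exercises:

import numpy as np

from devito import Grid

grid_f32 = Grid(shape=(4, 4), dtype=np.float32)
grid_f64 = Grid(shape=(4, 4), dtype=np.float64)

h_x_f32 = grid_f32.dimensions[0].spacing
h_x_f64 = grid_f64.dimensions[0].spacing

# Same name, different dtype => different symbols
assert h_x_f32 != h_x_f64
assert hash(h_x_f32) != hash(h_x_f64)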
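
`StencilDimension.transpose`, used when transposing derivatives, mirrors the
index range and flips the step:

from devito.types.dimension import StencilDimension

i = StencilDimension('i', -2, 1)
iT = i.transpose()

# [-2, 1] becomes [-1, 2], swept backwards
assert list(iT.range) == [-1, 0, 1, 2]
assert iT.step == -1
assert iT.backward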
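
Finally, the `par-tile` rules exercised by `test_custom_rule0/1` condense to
the sketch below; a rule of `'time'` restricts the given tile to the kernels
within the time loop, with everything else falling back to the defaults:

from devito import Eq, Grid, Operator, TimeFunction

grid = Grid(shape=(8, 8, 8))
u = TimeFunction(name='u', grid=grid, space_order=4)

op = Operator(Eq(u.forward, u.dx.dy + 1.),
              opt=('advanced-fsg', {'par-tile': ((4, 4, 4), 'time'),
                                    'blockinner': True,
                                    'blockrelax': True}))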