From 38f50fa1d95736f63e23400646e0985df817a418 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 28 Apr 2023 07:55:25 +0000 Subject: [PATCH 01/79] compiler: Emulate e.find(type) with (faster) search(e, type) --- devito/symbolics/search.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/devito/symbolics/search.py b/devito/symbolics/search.py index 94e533a692..7fdb74187b 100644 --- a/devito/symbolics/search.py +++ b/devito/symbolics/search.py @@ -48,7 +48,7 @@ def __init__(self, query, mode, deep=False): self.deep = deep def _next(self, expr): - if self.deep is True and expr.is_Indexed: + if self.deep and expr.is_Indexed: return expr.indices elif q_leaf(expr): return () @@ -110,7 +110,12 @@ def search(exprs, query, mode='unique', visit='dfs', deep=False): assert mode in Search.modes, "Unknown mode" - searcher = Search(query, mode, deep) + if isinstance(query, type): + Q = lambda obj: isinstance(obj, query) + else: + Q = query + + searcher = Search(Q, mode, deep) found = Search.modes[mode]() for e in as_tuple(exprs): From 3c2ceee13a2320906a031ef5ad558e69e3d386c8 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 3 May 2023 08:08:40 +0000 Subject: [PATCH 02/79] compiler: Add IterationSpace.prefix --- devito/ir/support/space.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py index b85b017b4a..d06a7675fa 100644 --- a/devito/ir/support/space.py +++ b/devito/ir/support/space.py @@ -904,6 +904,18 @@ def promote(self, cond): return IterationSpace(intervals, sub_iterators, directions) + def prefix(self, key): + """ + Return the IterationSpace up to and including the last Interval + such that `key(interval)` is True. + """ + try: + i = self.project(key)[-1] + except IndexError: + return None + + return self[:self.index(i.dim) + 1] + def is_compatible(self, other): """ A relaxed version of ``__eq__``, in which only non-derived dimensions From ad3287affb4289291d2a9a87f3f2600ce0becfdb Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 3 May 2023 10:51:41 +0000 Subject: [PATCH 03/79] compiler: Use internal repr for IndexSums --- devito/finite_differences/differentiable.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/devito/finite_differences/differentiable.py b/devito/finite_differences/differentiable.py index 1ca7e29506..48d22e2612 100644 --- a/devito/finite_differences/differentiable.py +++ b/devito/finite_differences/differentiable.py @@ -556,6 +556,9 @@ def __repr__(self): __str__ = __repr__ + def _sympystr(self, printer): + return str(self) + def _hashable_content(self): return super()._hashable_content() + (self.dimensions,) From f5be82638c4a8c52383dc14daa1578b9dabae96b Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 3 May 2023 15:53:43 +0000 Subject: [PATCH 04/79] compiler: Add Bunch.__repr__ --- devito/tools/data_structures.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/devito/tools/data_structures.py b/devito/tools/data_structures.py index 61bb7f5257..a386a1a0c7 100644 --- a/devito/tools/data_structures.py +++ b/devito/tools/data_structures.py @@ -29,6 +29,9 @@ class Bunch(object): def __init__(self, **kwargs): self.__dict__.update(kwargs) + def __repr__(self): + return "Bunch(%s)" % ", ".join(["%s=%s" % i for i in self.__dict__.items()]) + class EnrichedTuple(tuple, Pickable): From 1e88a0169d7dd37764fe52fd96232b37eb67add7 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 4 May 2023 08:36:35 +0000 Subject: [PATCH 05/79] compiler: Add ClusterGroup.rebuild --- devito/ir/clusters/cluster.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index e7354b8693..02c43cb88a 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -406,6 +406,14 @@ def __new__(cls, clusters, ispace=None): obj._ispace = ispace return obj + def rebuild(self, ispace=None): + processed = [] + for c in self: + ispace = IterationSpace.union(c.ispace, ispace) + processed.append(c.rebuild(ispace=ispace)) + + return self.__class__(processed, ispace) + @classmethod def concatenate(cls, *cgroups): return list(chain(*cgroups)) From 95f51b68692accd1b8f39243dbc4aa2a3ce60ef5 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 5 May 2023 08:33:06 +0000 Subject: [PATCH 06/79] compiler: Add ClusterGroup.properties --- devito/ir/clusters/cluster.py | 40 +++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index 02c43cb88a..3827228a0d 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -52,13 +52,7 @@ def __init__(self, exprs, ispace=None, guards=None, properties=None, syncs=None, # Normalize properties properties = Properties(properties or {}) - for d in ispace.itdimensions: - properties = properties.add(d) - for i in properties: - for d in as_tuple(i): - if d not in ispace.itdimensions: - properties = properties.drop(d) - self._properties = properties + self._properties = tailor_properties(properties, ispace) self._halo_scheme = halo_scheme @@ -85,10 +79,7 @@ def from_clusters(cls, *clusters): guards = root.guards - properties = {} - for c in clusters: - for d, v in c.properties.items(): - properties[d] = normalize_properties(properties.get(d, v), v) + properties = reduce_properties(clusters) try: syncs = normalize_syncs(*[c.syncs for c in clusters]) @@ -430,6 +421,10 @@ def scope(self): def ispace(self): return self._ispace + @cached_property + def properties(self): + return tailor_properties(reduce_properties(self), self.ispace) + @cached_property def guards(self): """The guards of each Cluster in self.""" @@ -473,3 +468,26 @@ def meta(self): The data type and the data space of the ClusterGroup. """ return (self.dtype, self.dspace) + + +# *** Utils + +def reduce_properties(clusters): + properties = {} + for c in clusters: + for d, v in c.properties.items(): + properties[d] = normalize_properties(properties.get(d, v), v) + + return Properties(properties) + + +def tailor_properties(properties, ispace): + for d in ispace.itdimensions: + properties = properties.add(d) + + for i in properties: + for d in as_tuple(i): + if d not in ispace.itdimensions: + properties = properties.drop(d) + + return properties From 020ad7dc9d1757a67a11ac37729e6f6b365cef23 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 12 May 2023 07:37:26 +0000 Subject: [PATCH 07/79] comiler: Add minmax_index() --- devito/ir/support/utils.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/devito/ir/support/utils.py b/devito/ir/support/utils.py index 3750b08a0e..19038e7c7a 100644 --- a/devito/ir/support/utils.py +++ b/devito/ir/support/utils.py @@ -6,7 +6,7 @@ StencilDimension) __all__ = ['AccessMode', 'Stencil', 'IMask', 'detect_accesses', 'detect_io', - 'pull_dims', 'sdims_min', 'sdims_max'] + 'pull_dims', 'sdims_min', 'sdims_max', 'minmax_index'] class AccessMode(object): @@ -294,3 +294,18 @@ def sdims_max(expr): return expr mapper = {e: e._max for e in sdims} return expr.subs(mapper) + + +def minmax_index(expr, d): + """ + Return the minimum and maximum indices along the `d` Dimension + among all Indexeds in `expr`. + """ + indices = set() + for i in retrieve_indexed(expr): + try: + indices.add(i.indices[d]) + except KeyError: + pass + + return min(indices), max(indices) From ebcee2b0fcc88e0aa103b3da80ebb5841fe058d0 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 9 Jun 2023 15:00:35 +0000 Subject: [PATCH 08/79] compiler: Add DAG.all_predecessors --- devito/tools/data_structures.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/devito/tools/data_structures.py b/devito/tools/data_structures.py index a386a1a0c7..f32fb709e1 100644 --- a/devito/tools/data_structures.py +++ b/devito/tools/data_structures.py @@ -415,6 +415,24 @@ def predecessors(self, node): """Return a list of all predecessors of the given node.""" return [key for key in self.graph if node in self.graph[key]] + def all_predecessors(self, node): + """ + Return a list of all nodes ultimately predecessors of the given node + in the dependency graph, in topological order. + """ + found = set() + + def _all_predecessors(n): + if n in found: + return + found.add(n) + for predecessor in self.predecessors(n): + _all_predecessors(predecessor) + + _all_predecessors(node) + + return found + def downstream(self, node): """Return a list of all nodes this node has edges towards.""" if node not in self.graph: From 866a66c5918db19c9ce8b56e6864af938dc3db7a Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 14 Jun 2023 09:33:52 +0000 Subject: [PATCH 09/79] tools: Add DAG.find_paths --- devito/tools/data_structures.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/devito/tools/data_structures.py b/devito/tools/data_structures.py index f32fb709e1..4ceb5f5d08 100644 --- a/devito/tools/data_structures.py +++ b/devito/tools/data_structures.py @@ -528,6 +528,28 @@ def connected_components(self, enumerated=False): else: return tuple(groups) + def find_paths(self, node): + if node not in self.graph: + raise KeyError('node %s is not in graph' % node) + + paths = [] + + def dfs(node, path): + path.append(node) + + if not self.graph[node]: + paths.append(tuple(path)) + else: + for child in self.graph[node]: + dfs(child, path) + + # Remove the node from the path to backtrack + path.pop() + + dfs(node, []) + + return tuple(paths) + class frozendict(Mapping): """ From 1db334e7c8c92f490f06b2bdde4177386302206c Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 15 Jun 2023 08:42:47 +0000 Subject: [PATCH 10/79] compiler: Generalize AffineIndexAccessFunction --- devito/types/dimension.py | 18 +++++++++--------- tests/test_dimension.py | 35 ++++++++++++++++++++++++++++------- 2 files changed, 37 insertions(+), 16 deletions(-) diff --git a/devito/types/dimension.py b/devito/types/dimension.py index 3166c7503b..f391de0740 100644 --- a/devito/types/dimension.py +++ b/devito/types/dimension.py @@ -1552,7 +1552,7 @@ class AffineIndexAccessFunction(IndexAccessFunction): * the "main" Dimension (in practice a SpaceDimension or a TimeDimension); * an offset (number or symbolic expression, of dtype integer). - * a StencilDimension; + * one or more StencilDimensions; Examples -------- @@ -1562,24 +1562,22 @@ class AffineIndexAccessFunction(IndexAccessFunction): def __new__(cls, *args, **kwargs): d = 0 - sd = 0 + sds = [] ofs_items = [] for a in args: if isinstance(a, StencilDimension): - if sd != 0: - return sympy.Add(*args, **kwargs) - sd = a + sds.append(a) elif isinstance(a, Dimension): d = cls._separate_dims(d, a, ofs_items) if d is None: return sympy.Add(*args, **kwargs) elif isinstance(a, AffineIndexAccessFunction): - if sd != 0 and a.sd != 0: + if sds and a.sds: return sympy.Add(*args, **kwargs) d = cls._separate_dims(d, a.d, ofs_items) if d is None: return sympy.Add(*args, **kwargs) - sd = a.sd + sds = list(a.sds or sds) ofs_items.append(a.ofs) else: ofs_items.append(a) @@ -1588,12 +1586,14 @@ def __new__(cls, *args, **kwargs): if not all(is_integer(i) or i.is_Symbol for i in ofs.free_symbols): return sympy.Add(*args, **kwargs) - obj = IndexAccessFunction.__new__(cls, d, ofs, sd) + sds = tuple(sds) + + obj = IndexAccessFunction.__new__(cls, d, ofs, *sds) if isinstance(obj, AffineIndexAccessFunction): obj.d = d obj.ofs = ofs - obj.sd = sd + obj.sds = sds else: # E.g., SymPy simplified it to Zero or something else pass diff --git a/tests/test_dimension.py b/tests/test_dimension.py index 7532602190..1e719532d9 100644 --- a/tests/test_dimension.py +++ b/tests/test_dimension.py @@ -27,7 +27,7 @@ def test_basic(self): assert isinstance(expr, AffineIndexAccessFunction) assert expr.d is d assert expr.ofs == 1 - assert expr.sd == 0 + assert expr.sds == () s0 = Symbol(name='s0', dtype=np.int32) s1 = Symbol(name='s1', dtype=np.int32) @@ -37,7 +37,7 @@ def test_basic(self): assert isinstance(expr, AffineIndexAccessFunction) assert expr.d is d assert expr.ofs == s0 + s1 + 1 - assert expr.sd == 0 + assert expr.sds == () def test_reversed(self): d = Dimension(name='x') @@ -47,14 +47,14 @@ def test_reversed(self): assert isinstance(expr, AffineIndexAccessFunction) assert expr.d is d assert expr.ofs == 1 - assert expr.sd == 0 + assert expr.sds == () expr = d.symbolic_max + d assert isinstance(expr, AffineIndexAccessFunction) assert expr.d is d assert expr.ofs is d.symbolic_max - assert expr.sd == 0 + assert expr.sds == () def test_non_affine(self): grid = Grid(shape=(3,)) @@ -75,8 +75,8 @@ def test_stencil_dim(self): assert isinstance(expr, AffineIndexAccessFunction) assert expr.d is d - assert expr.sd is sd assert expr.ofs == 1 + assert expr.sds == (sd,) s = Symbol(name='s') @@ -84,8 +84,29 @@ def test_stencil_dim(self): assert isinstance(expr, AffineIndexAccessFunction) assert expr.d is d - assert expr.sd is sd assert expr.ofs == 1 + s + assert expr.sds == (sd,) + + def test_stencil_dim_multiple(self): + d = Dimension(name='x') + sd0 = StencilDimension('i0', 0, 1) + sd1 = StencilDimension('i1', 0, 1) + + expr = d + sd0 + sd1 + 1 + + assert isinstance(expr, AffineIndexAccessFunction) + assert expr.d is d + assert expr.ofs == 1 + assert expr.sds == (sd0, sd1) + + s = Symbol(name='s') + + expr = sd0 + d + sd1 + 1 + s + + assert isinstance(expr, AffineIndexAccessFunction) + assert expr.d is d + assert expr.ofs == 1 + s + assert expr.sds == (sd0, sd1) def test_sub(self): d = Dimension(name='x') @@ -103,8 +124,8 @@ def test_sub(self): assert isinstance(expr, AffineIndexAccessFunction) assert expr.d is d - assert expr.sd is sd assert expr.ofs == -1 - s + assert expr.sds == (sd,) class TestBufferedDimension(object): From 023da188291d71a1b33d096577d0f29cee46a361 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 20 Jun 2023 13:02:31 +0000 Subject: [PATCH 11/79] compiler: Fix DefFunction printing --- devito/symbolics/printer.py | 5 ++++- devito/types/object.py | 2 ++ tests/test_linearize.py | 4 ++-- tests/test_symbolics.py | 26 ++++++++++++++++---------- 4 files changed, 24 insertions(+), 13 deletions(-) diff --git a/devito/symbolics/printer.py b/devito/symbolics/printer.py index 8f7ef6a719..cf3ba528bc 100644 --- a/devito/symbolics/printer.py +++ b/devito/symbolics/printer.py @@ -229,10 +229,13 @@ def _print_TrigonometricFunction(self, expr): func_name += 'f' return '%s(%s)' % (func_name, self._print(*expr.args)) + def _print_DefFunction(self, expr): + arguments = [self._print(i) for i in expr.arguments] + return "%s(%s)" % (expr.name, ','.join(arguments)) + def _print_Fallback(self, expr): return expr.__str__() - _print_DefFunction = _print_Fallback _print_MacroArgument = _print_Fallback _print_IndexedBase = _print_Fallback _print_IndexSum = _print_Fallback diff --git a/devito/types/object.py b/devito/types/object.py index d5d20d65d9..1db973cce9 100644 --- a/devito/types/object.py +++ b/devito/types/object.py @@ -53,6 +53,8 @@ def __repr__(self): def _sympystr(self, printer): return str(self) + _ccode = _sympystr + def _hashable_content(self): return (self.name, self.dtype) diff --git a/tests/test_linearize.py b/tests/test_linearize.py index 45347bf605..efed9f79db 100644 --- a/tests/test_linearize.py +++ b/tests/test_linearize.py @@ -479,8 +479,8 @@ def test_issue_1838(): op1.apply(time_M=3, dt=1., p0=p1) # Check generated code - assert "r0L0(x, y, z) r0[(x)*y_stride1 + (y)*z_stride1 + (z)]" in str(op1) - assert "r4L0(x, y, z) r4[(x)*y_stride2 + (y)*z_stride1 + (z)]" in str(op1) + assert "r0L0(x,y,z) r0[(x)*y_stride1 + (y)*z_stride1 + (z)]" in str(op1) + assert "r4L0(x,y,z) r4[(x)*y_stride2 + (y)*z_stride1 + (z)]" in str(op1) assert np.allclose(p0.data, p1.data, rtol=1e-6) diff --git a/tests/test_symbolics.py b/tests/test_symbolics.py index ce5bb3cdf2..fcc7395b05 100644 --- a/tests/test_symbolics.py +++ b/tests/test_symbolics.py @@ -303,6 +303,18 @@ class BarCast(Cast): assert v != v1 +def test_findexed(): + grid = Grid(shape=(3, 3, 3)) + f = Function(name='f', grid=grid) + + fi = FIndexed.from_indexed(f.indexify(), "foo", strides=(1, 2)) + new_fi = fi.func(strides=(3, 4)) + + assert new_fi.name == fi.name == 'f' + assert new_fi.indices == fi.indices + assert new_fi.strides == (3, 4) + + def test_symbolic_printing(): b = Symbol('b') @@ -315,17 +327,11 @@ class MyLocalObject(LocalObject, Expr): lo = MyLocalObject(name='lo') assert str(lo + 2) == '2 + lo' - -def test_findexed(): - grid = Grid(shape=(3, 3, 3)) - f = Function(name='f', grid=grid) - + grid = Grid((10,)) + f = Function(name="f", grid=grid) fi = FIndexed.from_indexed(f.indexify(), "foo", strides=(1, 2)) - new_fi = fi.func(strides=(3, 4)) - - assert new_fi.name == fi.name == 'f' - assert new_fi.indices == fi.indices - assert new_fi.strides == (3, 4) + df = DefFunction('aaa', arguments=[fi]) + assert ccode(df) == 'aaa(foo(x))' def test_is_on_grid(): From 34e30f5f59258e2efd326d9173378f3070b52860 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 22 Jun 2023 15:17:17 +0000 Subject: [PATCH 12/79] compiler: aliases.Candidate -> ir.ExprGeometry --- devito/ir/support/__init__.py | 2 +- devito/ir/support/basic.py | 105 +++++++++++++++++++++++++-- devito/ir/support/utils.py | 1 + devito/passes/clusters/aliases.py | 113 +++++------------------------- 4 files changed, 119 insertions(+), 102 deletions(-) diff --git a/devito/ir/support/__init__.py b/devito/ir/support/__init__.py index 2b20e3ab20..8a8b50bc67 100644 --- a/devito/ir/support/__init__.py +++ b/devito/ir/support/__init__.py @@ -1,5 +1,5 @@ -from .utils import * # noqa from .vector import * # noqa +from .utils import * # noqa from .basic import * # noqa from .space import * # noqa from .guards import * # noqa diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index 0ca1da8796..c5dfb76052 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -6,13 +6,14 @@ from devito.ir.support.space import Backward, IterationSpace from devito.ir.support.utils import AccessMode from devito.ir.support.vector import LabeledVector, Vector -from devito.symbolics import (retrieve_terminals, q_constant, q_affine, q_routine, - q_terminal) +from devito.symbolics import (compare_ops, retrieve_indexed, retrieve_terminals, + q_constant, q_affine, q_routine, q_terminal) from devito.tools import (Tag, as_tuple, is_integer, filter_sorted, flatten, memoized_meth, memoized_generator) -from devito.types import Barrier, Dimension, DimensionTuple, Jump, Symbol +from devito.types import (Barrier, Dimension, DimensionTuple, Jump, Symbol, + StencilDimension) -__all__ = ['IterationInstance', 'TimedAccess', 'Scope'] +__all__ = ['IterationInstance', 'TimedAccess', 'Scope', 'ExprGeometry'] class IndexMode(Tag): @@ -1075,3 +1076,99 @@ def r_all(self): All Relations of the Scope. """ return list(self.r_gen()) + + +class ExprGeometry(object): + + """ + Geometric representation of an expression by abstracting Indexeds as + LabeledVectors. + """ + + def __init__(self, expr, indexeds=None, bases=None, offsets=None): + self.expr = expr + + if indexeds is not None: + self.indexeds = indexeds + self.bases = bases + self.offsets = offsets + return + + indexeds = retrieve_indexed(expr) + + bases = [] + offsets = [] + for i in indexeds: + ii = IterationInstance(i) + if ii.is_irregular: + raise ValueError("Cannot understand expression geometry") + + base = [] + offset = [] + for e, ai in zip(ii, ii.aindices): + if q_constant(e): + base.append(e) + else: + base.append(ai) + offset.append((ai, e - ai)) + bases.append(tuple(base)) + offsets.append(LabeledVector(offset)) + + self.indexeds = indexeds + self.bases = bases + self.offsets = offsets + + def __repr__(self): + return "ExprGeometry(expr=%s)" % self.expr + + def translated(self, other, dims=None): + """ + True if `self` is translated w.r.t. `other`, False otherwise. + + Examples + -------- + Two expressions are translated if they perform the same operations, + their bases are the same and their offsets are pairwise translated. + + c := A[i,j] op A[i,j+1] -> Toffsets = {i: [0,0], j: [0,1]} + u := A[i+1,j] op A[i+1,j+1] -> Toffsets = {i: [1,1], j: [0,1]} + + Then `c` is translated w.r.t. `u` with distance `{i: 1, j: 0}` + + The test may be strengthen by imposing that a translation occurs + only along a specific set of Dimensions through the kwarg `dims`. + """ + if not compare_ops(self.expr, other.expr): + return False + + if len(self.Toffsets) != len(other.Toffsets): + return False + if len(self.bases) != len(other.bases): + return False + + # Check the bases + if any(b0 != b1 for b0, b1 in zip(self.bases, other.bases)): + return False + + # Check the offsets + for (d0, o0), (d1, o1) in zip(self.Toffsets, other.Toffsets): + if d0 is not d1: + return False + + distance = set(o0 - o1) + if len(distance) != 1: + return False + + if dims and not d0._defines & set(as_tuple(dims)): + if distance.pop() != 0: + return False + + return True + + @cached_property + def Toffsets(self): + return LabeledVector.transpose(*self.offsets) + + @cached_property + def dimensions(self): + return frozenset(i for i, _ in self.Toffsets) diff --git a/devito/ir/support/utils.py b/devito/ir/support/utils.py index 19038e7c7a..36da9001c3 100644 --- a/devito/ir/support/utils.py +++ b/devito/ir/support/utils.py @@ -1,5 +1,6 @@ from collections import defaultdict +from devito.ir.support.vector import LabeledVector from devito.symbolics import CallFromPointer, retrieve_indexed, retrieve_terminals from devito.tools import DefaultOrderedDict, as_tuple, flatten, filter_sorted, split from devito.types import (Dimension, DimensionTuple, Indirection, ModuloDimension, diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index e0783323fe..73f3af7b43 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -8,11 +8,11 @@ from devito.finite_differences import EvalDerivative from devito.ir import (SEQUENTIAL, PARALLEL_IF_PVT, ROUNDABLE, SEPARABLE, Forward, - IterationInstance, IterationSpace, Interval, Cluster, - Queue, IntervalGroup, LabeledVector, normalize_properties, + IterationSpace, Interval, Cluster, ExprGeometry, Queue, + IntervalGroup, LabeledVector, normalize_properties, relax_properties, sdims_min, sdims_max) -from devito.symbolics import (Uxmapper, compare_ops, estimate_cost, q_constant, - reuse_if_untouched, retrieve_indexed, search, uxreplace, +from devito.symbolics import (Uxmapper, estimate_cost, q_constant, search, + reuse_if_untouched, retrieve_indexed, uxreplace, sympy_dtype) from devito.tools import (Stamp, as_mapper, as_tuple, flatten, frozendict, generator, split, timed_pass) @@ -439,28 +439,10 @@ def collect(extracted, ispace, minstorage): for expr in extracted: assert not expr.is_Equality - indexeds = retrieve_indexed(expr) - - bases = [] - offsets = [] - for i in indexeds: - ii = IterationInstance(i) - if ii.is_irregular: - break - - base = [] - offset = [] - for e, ai in zip(ii, ii.aindices): - if q_constant(e): - base.append(e) - else: - base.append(ai) - offset.append((ai, e - ai)) - bases.append(tuple(base)) - offsets.append(LabeledVector(offset)) - - if not indexeds or len(bases) == len(indexeds): - found.append(Candidate(expr, ispace, indexeds, bases, offsets)) + try: + found.append(ExprGeometry(expr)) + except ValueError: + pass # Create groups of aliasing expressions mapper = OrderedDict() @@ -469,17 +451,13 @@ def collect(extracted, ispace, minstorage): c = unseen.pop(0) group = [c] for u in list(unseen): - # Is the arithmetic structure of `c` and `u` equivalent ? - if not compare_ops(c.expr, u.expr): - continue - # Is `c` translated w.r.t. `u` ? if not c.translated(u): continue group.append(u) unseen.remove(u) - group = Group(group) + group = Group(group, ispace=ispace) if minstorage: k = group.dimensions_translated @@ -975,72 +953,13 @@ def pick_best(variants, schedule_strategy, eval_variants_delta): # Utilities -class Candidate(object): - - def __init__(self, expr, ispace, indexeds, bases, offsets): - self.expr = expr - self.ispace = ispace - self.indexeds = indexeds - self.bases = bases - self.offsets = offsets - - def __repr__(self): - return "Candidate(expr=%s)" % self.expr - - def translated(self, other): - """ - True if ``self`` is translated w.r.t. ``other``, False otherwise. - - Examples - -------- - Two candidates are translated if their bases are the same and - their offsets are pairwise translated. - - c := A[i,j] op A[i,j+1] -> Toffsets = {i: [0,0], j: [0,1]} - u := A[i+1,j] op A[i+1,j+1] -> Toffsets = {i: [1,1], j: [0,1]} - - Then `c` is translated w.r.t. `u` with distance `{i: 1, j: 0}` - """ - if len(self.Toffsets) != len(other.Toffsets): - return False - if len(self.bases) != len(other.bases): - return False - - # Check the bases - if any(b0 != b1 for b0, b1 in zip(self.bases, other.bases)): - return False - - # Check the offsets - for (d0, o0), (d1, o1) in zip(self.Toffsets, other.Toffsets): - if d0 is not d1: - return False - - distance = set(o0 - o1) - if len(distance) != 1: - return False - - return True - - @cached_property - def Toffsets(self): - return LabeledVector.transpose(*self.offsets) - - @cached_property - def dimensions(self): - return frozenset(i for i, _ in self.Toffsets) - - @property - def shifts(self): - return self.ispace.intervals - - class Group(tuple): """ A collection of aliasing expressions. """ - def __new__(cls, items): + def __new__(cls, items, ispace=None): # Expand out the StencilDimensions, if any processed = [] for c in items: @@ -1054,12 +973,12 @@ def __new__(cls, items): offsets = [LabeledVector([(d, f(i)) for d, i in v.items()]) for v in c.offsets] - processed.append( - Candidate(expr, c.ispace, indexeds, c.bases, offsets) - ) + processed.append(ExprGeometry(expr, indexeds, c.bases, offsets)) obj = super().__new__(cls, processed) obj._items = items + obj._ispace = ispace + return obj def __repr__(self): @@ -1114,7 +1033,7 @@ def diameter(self): @property def pivot(self): """ - A deterministically chosen Candidate for this Group. + A deterministically chosen reference for this Group. """ return self[0] @@ -1163,7 +1082,7 @@ def _pivot_legal_rotations(self): def _pivot_min_intervals(self): """ The minimum Interval along each Dimension such that by evaluating the - pivot, all Candidates are evaluated too. + pivot, all other items are evaluated too. """ c = self.pivot @@ -1201,7 +1120,7 @@ def _pivot_legal_shifts(self): hsize = sum(f._size_halo[l]) # Any `ofs`'s shift due to non-[0,0] iteration space - lower, upper = c.shifts[l].offsets + lower, upper = self._ispace.intervals[l].offsets try: # Assume `ofs[l]` is a number (typical case) From 26dbac223f3c16a0b33a3f3efad309990eef72bc Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 26 Jun 2023 13:13:02 +0000 Subject: [PATCH 13/79] compiler: Generalize and enhance ExprGeometry --- devito/ir/support/basic.py | 85 +++++++++++++++++++++---------- devito/passes/clusters/aliases.py | 7 ++- 2 files changed, 60 insertions(+), 32 deletions(-) diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index c5dfb76052..1a08bdbe48 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -1094,27 +1094,22 @@ def __init__(self, expr, indexeds=None, bases=None, offsets=None): self.offsets = offsets return - indexeds = retrieve_indexed(expr) + self.indexeds = retrieve_indexed(expr) bases = [] offsets = [] - for i in indexeds: - ii = IterationInstance(i) - if ii.is_irregular: - raise ValueError("Cannot understand expression geometry") - + for ii in self.iinstances: base = [] offset = [] - for e, ai in zip(ii, ii.aindices): - if q_constant(e): - base.append(e) + for e, fi, ai in zip(ii, ii.findices, ii.aindices): + if ai is None: + base.append((fi, e)) else: - base.append(ai) + base.append((fi, ai)) offset.append((ai, e - ai)) - bases.append(tuple(base)) + bases.append(LabeledVector(base)) offsets.append(LabeledVector(offset)) - self.indexeds = indexeds self.bases = bases self.offsets = offsets @@ -1138,33 +1133,56 @@ def translated(self, other, dims=None): The test may be strengthen by imposing that a translation occurs only along a specific set of Dimensions through the kwarg `dims`. """ + # Check mathematical structure if not compare_ops(self.expr, other.expr): return False - if len(self.Toffsets) != len(other.Toffsets): - return False - if len(self.bases) != len(other.bases): - return False + # Use a suitable value for `dims` if not provided by user + if dims is None: + if self.aindices != other.aindices: + return False + dims = self.aindices + dims = set(as_tuple(dims)) - # Check the bases - if any(b0 != b1 for b0, b1 in zip(self.bases, other.bases)): - return False + # Check bases and offsets + for i in ['Tbases', 'Toffsets']: + Ti0 = getattr(self, i) + Ti1 = getattr(other, i) - # Check the offsets - for (d0, o0), (d1, o1) in zip(self.Toffsets, other.Toffsets): - if d0 is not d1: - return False + m0 = dict(Ti0) + m1 = dict(Ti1) - distance = set(o0 - o1) - if len(distance) != 1: - return False + # The only hope in presence of Dimensions appearing only in either + # `self` or `other` is that they have been projected away by the caller + for d in set(m0).symmetric_difference(set(m1)): + if not d._defines & dims: + return False + + for d in set(m0).union(set(m1)): + try: + o0 = m0[d] + o1 = m1[d] + except KeyError: + continue - if dims and not d0._defines & set(as_tuple(dims)): - if distance.pop() != 0: + distance = set(o0 - o1) + if len(distance) != 1: return False + if not d._defines & dims: + if distance.pop() != 0: + return False + return True + @cached_property + def iinstances(self): + return tuple(IterationInstance(i) for i in self.indexeds) + + @cached_property + def Tbases(self): + return LabeledVector.transpose(*self.bases) + @cached_property def Toffsets(self): return LabeledVector.transpose(*self.offsets) @@ -1172,3 +1190,14 @@ def Toffsets(self): @cached_property def dimensions(self): return frozenset(i for i, _ in self.Toffsets) + + @cached_property + def aindices(self): + try: + return tuple(zip(*self.Toffsets))[0] + except IndexError: + return () + + @property + def is_regular(self): + return all(i.is_regular for i in self.iinstances) diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index 73f3af7b43..5f30ecbc7c 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -439,10 +439,9 @@ def collect(extracted, ispace, minstorage): for expr in extracted: assert not expr.is_Equality - try: - found.append(ExprGeometry(expr)) - except ValueError: - pass + eg = ExprGeometry(expr) + if eg.is_regular: + found.append(eg) # Create groups of aliasing expressions mapper = OrderedDict() From 62b341cdd42dd09d5952df73a1f695aed4006c02 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 27 Jun 2023 14:14:51 +0000 Subject: [PATCH 14/79] compiler: Tweak minmax_index --- devito/ir/support/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/devito/ir/support/utils.py b/devito/ir/support/utils.py index 36da9001c3..2a2c8c4df7 100644 --- a/devito/ir/support/utils.py +++ b/devito/ir/support/utils.py @@ -309,4 +309,5 @@ def minmax_index(expr, d): except KeyError: pass - return min(indices), max(indices) + return (min(sdims_min(i) for i in indices), + max(sdims_max(i) for i in indices)) From ed576e5cac716a491d8f8b7035568eacf47cbb02 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 4 Jul 2023 07:41:23 +0000 Subject: [PATCH 15/79] compiler: Add and_smart for guards auto-simplification --- devito/ir/support/guards.py | 41 +++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/devito/ir/support/guards.py b/devito/ir/support/guards.py index dd1bcc01a5..a378929fbc 100644 --- a/devito/ir/support/guards.py +++ b/devito/ir/support/guards.py @@ -8,7 +8,7 @@ from devito.ir.support.space import Forward, IterationDirection from devito.symbolics import CondEq, CondNe -from devito.tools import Pickable, as_tuple, frozendict +from devito.tools import Pickable, as_tuple, frozendict, split from devito.types import Dimension __all__ = ['GuardFactor', 'GuardBound', 'GuardBoundNext', 'BaseGuardBound', @@ -228,7 +228,7 @@ def andg(self, d, guard): return Guards(m) try: - m[d] = And(m[d], guard) + m[d] = and_smart(m[d], guard) except KeyError: m[d] = guard @@ -269,3 +269,40 @@ def filter(self, key): m = {d: v for d, v in self.items() if key(d)} return Guards(m) + + +# *** Utils + +def and_smart(relation, v): + """ + Given `x = And(*relation.args, v)`, return `relation` if `x ≡ relation`, + `x` otherwise. + + SymPy doesn't have a builtin function to simplify boolean inequalities; here, + we run a set of simple checks to at least discard the most obvious (and thus + annoying to see in the generated code) redundancies. + """ + if isinstance(relation, And): + candidates, other = split(list(relation.args), lambda a: type(a) is type(v)) + elif type(relation) is type(v): + candidates, other = [relation], [] + else: + candidates, other = [], [relation, v] + + covered = False + new_args = [] + for a in candidates: + if a.lhs is v.lhs: + covered = True + if type(a) in (Gt, Ge) and v.rhs > a.rhs: + new_args.append(v) + elif type(a) in (Lt, Le) and v.rhs < a.rhs: + new_args.append(v) + else: + new_args.append(a) + else: + new_args.append(a) + if not covered: + new_args.append(v) + + return And(*(new_args + other)) From 8e89323dc0100a397e317e28c2d4ff8890a835cd Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 18 Jul 2023 08:37:19 +0000 Subject: [PATCH 16/79] compiler: Improve Cluster.is_dense --- devito/ir/clusters/cluster.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index 3827228a0d..919785104d 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -204,12 +204,10 @@ def is_dense(self): # at most PARALLEL_IF_PVT). This is a quick and easy check so we try it first try: pset = {PARALLEL, PARALLEL_IF_PVT} - grid = self.grid - for d in grid.dimensions: - if not any(pset & v for k, v in self.properties.items() - if d in k._defines): - raise ValueError - return True + target = set(self.grid.dimensions) + dims = {d for d in self.properties if d._defines & target} + if any(pset & self.properties[d] for d in dims): + return True except ValueError: pass From 8e95763c422b0df928cb39885dbfa8fca9ed2313 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 18 Jul 2023 12:34:36 +0000 Subject: [PATCH 17/79] compiler: Add IndexDerivative.base --- devito/finite_differences/differentiable.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/devito/finite_differences/differentiable.py b/devito/finite_differences/differentiable.py index 48d22e2612..fdebf282dc 100644 --- a/devito/finite_differences/differentiable.py +++ b/devito/finite_differences/differentiable.py @@ -668,6 +668,10 @@ def __new__(cls, expr, mapper, **kwargs): def _hashable_content(self): return super()._hashable_content() + (self.mapper,) + @cached_property + def base(self): + return self.expr.func(*[a for a in self.expr.args if a is not self.weights]) + @property def weights(self): return self._weights From f1e25ed6174c99150333468bc04fa3809b964593 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 19 Jul 2023 10:56:35 +0000 Subject: [PATCH 18/79] compiler: Patch AffineIndexAccessFunction --- devito/types/dimension.py | 20 ++++++++++++++------ tests/test_dimension.py | 14 ++++++++++++++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/devito/types/dimension.py b/devito/types/dimension.py index f391de0740..d3f9a00285 100644 --- a/devito/types/dimension.py +++ b/devito/types/dimension.py @@ -1561,22 +1561,30 @@ class AffineIndexAccessFunction(IndexAccessFunction): """ def __new__(cls, *args, **kwargs): + # `args` may contain arbitrarily complicated expressions, so first of all + # we let SymPy simplify it, then we process the args and see if the + # resulting expression is indeed an AffineIndexAccessFunction + add = sympy.Add(*args, **kwargs) + if not isinstance(add, sympy.Add): + # E.g., reduced to a Symbol + return add + d = 0 sds = [] ofs_items = [] - for a in args: + for a in add.args: if isinstance(a, StencilDimension): sds.append(a) elif isinstance(a, Dimension): d = cls._separate_dims(d, a, ofs_items) if d is None: - return sympy.Add(*args, **kwargs) + return add elif isinstance(a, AffineIndexAccessFunction): if sds and a.sds: - return sympy.Add(*args, **kwargs) + return add d = cls._separate_dims(d, a.d, ofs_items) if d is None: - return sympy.Add(*args, **kwargs) + return add sds = list(a.sds or sds) ofs_items.append(a.ofs) else: @@ -1584,7 +1592,7 @@ def __new__(cls, *args, **kwargs): ofs = sympy.Add(*[i for i in ofs_items if i is not None]) if not all(is_integer(i) or i.is_Symbol for i in ofs.free_symbols): - return sympy.Add(*args, **kwargs) + return add sds = tuple(sds) @@ -1603,7 +1611,7 @@ def __new__(cls, *args, **kwargs): @classmethod def _separate_dims(cls, d0, d1, ofs_items): if d0 == 0 and d1 == 0: - return None + return 0 elif d0 == 0 and isinstance(d1, Dimension): return d1 elif d1 == 0 and isinstance(d0, Dimension): diff --git a/tests/test_dimension.py b/tests/test_dimension.py index 1e719532d9..d61ddb5233 100644 --- a/tests/test_dimension.py +++ b/tests/test_dimension.py @@ -87,6 +87,13 @@ def test_stencil_dim(self): assert expr.ofs == 1 + s assert expr.sds == (sd,) + expr = sd + 1 + d + + assert isinstance(expr, AffineIndexAccessFunction) + assert expr.d is d + assert expr.ofs == 1 + assert expr.sds == (sd,) + def test_stencil_dim_multiple(self): d = Dimension(name='x') sd0 = StencilDimension('i0', 0, 1) @@ -127,6 +134,13 @@ def test_sub(self): assert expr.ofs == -1 - s assert expr.sds == (sd,) + expr = d + 1 + sd - d + + assert isinstance(expr, AffineIndexAccessFunction) + assert expr.d == 0 + assert expr.ofs == 1 + assert expr.sds == (sd,) + class TestBufferedDimension(object): From 5188194a866aa0b9b75de3b82c2ab634243facf2 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 18 Jul 2023 16:18:00 +0000 Subject: [PATCH 19/79] compiler: Support 2-pass impls w unexpasion --- devito/ir/support/utils.py | 35 ++++-- devito/passes/clusters/aliases.py | 51 ++++++-- devito/symbolics/search.py | 5 + tests/test_unexpansion.py | 186 ++++++++++++++++++++---------- 4 files changed, 195 insertions(+), 82 deletions(-) diff --git a/devito/ir/support/utils.py b/devito/ir/support/utils.py index 2a2c8c4df7..40cd843c8b 100644 --- a/devito/ir/support/utils.py +++ b/devito/ir/support/utils.py @@ -1,13 +1,15 @@ from collections import defaultdict +from devito.finite_differences import IndexDerivative from devito.ir.support.vector import LabeledVector -from devito.symbolics import CallFromPointer, retrieve_indexed, retrieve_terminals +from devito.symbolics import (CallFromPointer, retrieve_indexed, retrieve_terminals, + search) from devito.tools import DefaultOrderedDict, as_tuple, flatten, filter_sorted, split from devito.types import (Dimension, DimensionTuple, Indirection, ModuloDimension, StencilDimension) __all__ = ['AccessMode', 'Stencil', 'IMask', 'detect_accesses', 'detect_io', - 'pull_dims', 'sdims_min', 'sdims_max', 'minmax_index'] + 'pull_dims', 'sdims_free', 'sdims_min', 'sdims_max', 'minmax_index'] class AccessMode(object): @@ -273,25 +275,34 @@ def pull_dims(exprs, flag=True): # *** Utility functions for expressions that potentially contain StencilDimensions -def sdims_min(expr): +def sdims_free(expr): """ - Replace all StencilDimensions in `expr` with their minimum point. + Retrieve all unbounded StencilDimensions in `expr`. """ - try: - sdims = expr.find(StencilDimension) - except AttributeError: + bound = set().union(*[i.dimensions for i in search(expr, IndexDerivative)]) + sdims = search(expr, StencilDimension, mode='unique', deep=True) + return sdims - bound + + +def sdims_min(expr, sdims=None): + """ + Replace StencilDimensions in `expr` with their minimum point. + """ + if not sdims: + sdims = sdims_free(expr) + if not sdims: return expr mapper = {e: e._min for e in sdims} return expr.subs(mapper) -def sdims_max(expr): +def sdims_max(expr, sdims=None): """ - Replace all StencilDimensions in `expr` with their maximum point. + Replace StencilDimensions in `expr` with their maximum point. """ - try: - sdims = expr.find(StencilDimension) - except AttributeError: + if not sdims: + sdims = sdims_free(expr) + if not sdims: return expr mapper = {e: e._max for e in sdims} return expr.subs(mapper) diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index 5f30ecbc7c..98927f2ea8 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -6,11 +6,11 @@ import numpy as np import sympy -from devito.finite_differences import EvalDerivative +from devito.finite_differences import EvalDerivative, IndexDerivative from devito.ir import (SEQUENTIAL, PARALLEL_IF_PVT, ROUNDABLE, SEPARABLE, Forward, IterationSpace, Interval, Cluster, ExprGeometry, Queue, IntervalGroup, LabeledVector, normalize_properties, - relax_properties, sdims_min, sdims_max) + relax_properties, sdims_free, sdims_min, sdims_max) from devito.symbolics import (Uxmapper, estimate_cost, q_constant, search, reuse_if_untouched, retrieve_indexed, uxreplace, sympy_dtype) @@ -85,7 +85,8 @@ def cire(clusters, mode, sregistry, options, platform): """ modes = { 'invariants': [CireInvariantsElementary, CireInvariantsDivs], - 'sops': [CireSops], + 'sops': [CireSops, # TODO: one day, we'll get rid of this legacy pass + CireIndexDerivatives], } for cls in modes[mode]: @@ -385,13 +386,29 @@ def _eval_variants_delta(self, delta_flops, delta_ws): # If there's a greater flop reduction using fewer temporaries, no doubts # what's gonna be the best variant. But if the better flop reduction # comes at the price of using more temporaries, then we have to apply - # heuristics, in particular we estimate how many flops would a temporary + # heuristics, in particular we estimate how many flops a temporary would # allow to save return ((delta_flops >= 0 and delta_ws > 0 and delta_flops / delta_ws < 15) or (delta_flops <= 0 and delta_ws < 0 and delta_flops / delta_ws > 15) or (delta_flops <= 0 and delta_ws >= 0)) +class CireIndexDerivatives(CireSops): + + def _generate(self, exprs, exclude): + # E.g., extract `u.dx*a*b` and `u.dx*a*c` from `[(u.dx*a*b).dy`, `(u.dx*a*c).dy]` + def cbk_search(expr): + if isinstance(expr, IndexDerivative): + return (expr.base,) + else: + return flatten(e for e in [cbk_search(a) for a in expr.args] if e) + + basextr = self._do_generate(exprs, exclude, cbk_search) + if not basextr: + return + yield basextr + + def collect(extracted, ispace, minstorage): """ Find groups of aliasing expressions. @@ -666,8 +683,9 @@ def lower_aliases(aliases, meta, maxpar): # Given the iteration `interval`, lower distances to indices for distance, indices in zip(a.distances, indicess): + v = distance[interval.dim] or 0 try: - indices.append(d - interval.lower + distance[interval.dim]) + indices.append(d - interval.lower + v) except TypeError: indices.append(d) @@ -962,11 +980,15 @@ def __new__(cls, items, ispace=None): # Expand out the StencilDimensions, if any processed = [] for c in items: - if not c.expr.find(StencilDimension): + sdims = sdims_free(c.expr) + if not sdims: processed.append(c) continue - for f in (sdims_min, sdims_max): + f0 = lambda e: sdims_min(e, sdims) + f1 = lambda e: sdims_max(e, sdims) + + for f in (f0, f1): expr = f(c.expr) indexeds = [f(i) for i in c.indexeds] offsets = [LabeledVector([(d, f(i)) for d, i in v.items()]) @@ -1017,6 +1039,8 @@ def diameter(self): ret = defaultdict(int) for i in self.Toffsets: for d, v in i: + if d not in self._ispace: + continue try: distance = int(max(v) - min(v)) except TypeError: @@ -1048,7 +1072,7 @@ def dimensions_translated(self): def naliases(self): na = len(self._items) - sdims = set().union(*[c.expr.find(StencilDimension) for c in self._items]) + sdims = set().union(*[sdims_free(c.expr) for c in self._items]) implicit = int(np.prod([i._size for i in sdims])) - 1 return na + implicit @@ -1114,17 +1138,24 @@ def _pivot_legal_shifts(self): ret = defaultdict(lambda: (-np.inf, np.inf)) for i, ofs in zip(c.indexeds, c.offsets): f = i.function + for l in ofs.labels: + if isinstance(l, StencilDimension): + continue + # `f`'s cumulative halo size along `l` hsize = sum(f._size_halo[l]) # Any `ofs`'s shift due to non-[0,0] iteration space lower, upper = self._ispace.intervals[l].offsets + ofs0 = sdims_min(ofs[l]) + ofs1 = sdims_max(ofs[l]) + try: # Assume `ofs[l]` is a number (typical case) - maxd = min(0, max(ret[l][0], -ofs[l] - lower)) - mini = max(0, min(ret[l][1], hsize - ofs[l] - upper)) + maxd = min(0, max(ret[l][0], -ofs0 - lower)) + mini = max(0, min(ret[l][1], hsize - ofs1 - upper)) ret[l] = (maxd, mini) except TypeError: diff --git a/devito/symbolics/search.py b/devito/symbolics/search.py index 7fdb74187b..7680bd462a 100644 --- a/devito/symbolics/search.py +++ b/devito/symbolics/search.py @@ -1,3 +1,5 @@ +import sympy + from devito.symbolics.queries import (q_indexed, q_function, q_terminal, q_leaf, q_symbol, q_dimension, q_derivative) from devito.tools import as_tuple @@ -119,6 +121,9 @@ def search(exprs, query, mode='unique', visit='dfs', deep=False): found = Search.modes[mode]() for e in as_tuple(exprs): + if not isinstance(e, sympy.Basic): + continue + if visit == 'dfs': found.update(searcher.dfs(e)) elif visit == 'bfs': diff --git a/tests/test_unexpansion.py b/tests/test_unexpansion.py index d9d01b6fc9..1a0b264ae9 100644 --- a/tests/test_unexpansion.py +++ b/tests/test_unexpansion.py @@ -5,7 +5,7 @@ from devito.types import Symbol -class TestBasic(object): +class Test1Pass(object): def test_v0(self): grid = Grid(shape=(10, 10, 10)) @@ -85,7 +85,8 @@ def test_v2(self): op0 = Operator(eqns) op1 = Operator(eqns, opt=('advanced', {'expand': False, - 'blocklevels': 0})) + 'blocklevels': 0, + 'cire-mingain': 200})) # Check generated code -- expect maximal fusion! assert_structure(op1, @@ -110,7 +111,8 @@ def test_v3(self): Eq(v.forward, (v + u.dx.dy + 1.))] op0 = Operator(eqns) - op1 = Operator(eqns, opt=('advanced', {'expand': False})) + op1 = Operator(eqns, opt=('advanced', {'expand': False, + 'cire-mingain': 200})) # Check generated code -- redundant IndexDerivatives have been caught! op1._profiler._sections['section0'].sops == 65 @@ -123,63 +125,12 @@ def test_v3(self): def test_v4(self): grid = Grid(shape=(16, 16, 16)) - t = grid.stepping_dim - x, y, z = grid.dimensions - - so = 4 - - a = Function(name='a', grid=grid, space_order=so) - f = Function(name='f', grid=grid, space_order=so) - e = Function(name='e', grid=grid, space_order=so) - r = Function(name='r', grid=grid, space_order=so) - p0 = TimeFunction(name='p0', grid=grid, time_order=2, space_order=so) - m0 = TimeFunction(name='m0', grid=grid, time_order=2, space_order=so) - - def g1(field, r, e): - return (cos(e) * cos(r) * field.dx(x0=x+x.spacing/2) + - cos(e) * sin(r) * field.dy(x0=y+y.spacing/2) - - sin(e) * field.dz(x0=z+z.spacing/2)) - - def g2(field, r, e): - return - (sin(r) * field.dx(x0=x+x.spacing/2) - - cos(r) * field.dy(x0=y+y.spacing/2)) - - def g3(field, r, e): - return (sin(e) * cos(r) * field.dx(x0=x+x.spacing/2) + - sin(e) * sin(r) * field.dy(x0=y+y.spacing/2) + - cos(e) * field.dz(x0=z+z.spacing/2)) - - def g1_tilde(field, r, e): - return ((cos(e) * cos(r) * field).dx(x0=x-x.spacing/2) + - (cos(e) * sin(r) * field).dy(x0=y-y.spacing/2) - - (sin(e) * field).dz(x0=z-z.spacing/2)) - - def g2_tilde(field, r, e): - return - ((sin(r) * field).dx(x0=x-x.spacing/2) - - (cos(r) * field).dy(x0=y-y.spacing/2)) - - def g3_tilde(field, r, e): - return ((sin(e) * cos(r) * field).dx(x0=x-x.spacing/2) + - (sin(e) * sin(r) * field).dy(x0=y-y.spacing/2) + - (cos(e) * field).dz(x0=z-z.spacing/2)) - - update_p = t.spacing**2 * a**2 / f * \ - (g1_tilde(f * g1(p0, r, e), r, e) + - g2_tilde(f * g2(p0, r, e), r, e) + - g3_tilde(f * g3(p0, r, e) + f * g3(m0, r, e), r, e)) + \ - (2 - t.spacing * a) - - update_m = t.spacing**2 * a**2 / f * \ - (g1_tilde(f * g1(m0, r, e), r, e) + - g2_tilde(f * g2(m0, r, e), r, e) + - g3_tilde(f * g3(m0, r, e) + f * g3(p0, r, e), r, e)) + \ - (2 - t.spacing * a) - - eqns = [Eq(p0.forward, update_p), - Eq(m0.forward, update_m)] + + eqns = tti_sa_eqns(grid) op = Operator(eqns, subs=grid.spacing_map, - opt=('advanced', {'expand': False})) + opt=('advanced', {'expand': False, + 'cire-mingain': 400})) # Check code generation assert op._profiler._sections['section1'].sops == 1442 @@ -203,7 +154,8 @@ def test_v5(self): Eq(m0.forward, m0.dx.dx + m0.backward)] op = Operator(eqns, subs=grid.spacing_map, - opt=('advanced', {'expand': False})) + opt=('advanced', {'expand': False, + 'cire-mingain': 200})) # Check code generation assert op._profiler._sections['section0'].sops == 127 @@ -227,10 +179,124 @@ def test_v6(self): Eq(m0.forward, (m0.dx + s0).dx + f*m0.backward)] op = Operator(eqns, subs=grid.spacing_map, - opt=('advanced', {'expand': False})) + opt=('advanced', {'expand': False, + 'cire-mingain': 200})) # Check code generation assert op._profiler._sections['section0'].sops == 133 assert_structure(op, ['t,x,y', 't,x,y,i1', 't,x,y,i1,i0'], 't,x,y,i1,i0') op.cfunction + + +class Test2Pass(object): + + def test_v0(self): + grid = Grid(shape=(10, 10, 10)) + + u = TimeFunction(name='u', grid=grid, space_order=4) + v = TimeFunction(name='v', grid=grid, space_order=4) + u1 = TimeFunction(name='u', grid=grid, space_order=4) + v1 = TimeFunction(name='v', grid=grid, space_order=4) + + eqns = [Eq(u.forward, (u.dx.dy + v*u + 1.)), + Eq(v.forward, (v + u.dx.dy + 1.))] + + op0 = Operator(eqns) + op1 = Operator(eqns, opt=('advanced', {'expand': False, + 'openmp': True})) + + # Check generated code + op1._profiler._sections['section0'].sops == 65 + assert_structure(op1, ['t', + 't,x0_blk0,y0_blk0,x,y,z', + 't,x0_blk0,y0_blk0,x,y,z,i0', + 't,x0_blk0,y0_blk0,x,y,z', + 't,x0_blk0,y0_blk0,x,y,z,i1'], + 't,x0_blk0,y0_blk0,x,y,z,i0,y,z,i1') + + op0.apply(time_M=5) + op1.apply(time_M=5, u=u1, v=v1) + + assert np.allclose(u.data, u1.data, rtol=10e-3) + assert np.allclose(v.data, v1.data, rtol=10e-3) + + def test_v1(self): + grid = Grid(shape=(16, 16, 16)) + + eqns = tti_sa_eqns(grid) + + op = Operator(eqns, subs=grid.spacing_map, + opt=('advanced', {'expand': False, + 'openmp': False})) + + # Check code generation + assert op._profiler._sections['section1'].sops == 190 + assert_structure(op, ['x,y,z', + 't,x0_blk0,y0_blk0,x,y,z', + 't,x0_blk0,y0_blk0,x,y,z,i0', + 't,x0_blk0,y0_blk0,x,y,z', + 't,x0_blk0,y0_blk0,x,y,z,i1'], + 'x,y,z,t,x0_blk0,y0_blk0,x,y,z,i0,x,y,z,i1') + + op.cfunction + + +def tti_sa_eqns(grid): + t = grid.stepping_dim + x, y, z = grid.dimensions + + so = 4 + + a = Function(name='a', grid=grid, space_order=so) + f = Function(name='f', grid=grid, space_order=so) + e = Function(name='e', grid=grid, space_order=so) + r = Function(name='r', grid=grid, space_order=so) + p0 = TimeFunction(name='p0', grid=grid, time_order=2, space_order=so) + m0 = TimeFunction(name='m0', grid=grid, time_order=2, space_order=so) + + def g1(field, r, e): + return (cos(e) * cos(r) * field.dx(x0=x+x.spacing/2) + + cos(e) * sin(r) * field.dy(x0=y+y.spacing/2) - + sin(e) * field.dz(x0=z+z.spacing/2)) + + def g2(field, r, e): + return - (sin(r) * field.dx(x0=x+x.spacing/2) - + cos(r) * field.dy(x0=y+y.spacing/2)) + + def g3(field, r, e): + return (sin(e) * cos(r) * field.dx(x0=x+x.spacing/2) + + sin(e) * sin(r) * field.dy(x0=y+y.spacing/2) + + cos(e) * field.dz(x0=z+z.spacing/2)) + + def g1_tilde(field, r, e): + return ((cos(e) * cos(r) * field).dx(x0=x-x.spacing/2) + + (cos(e) * sin(r) * field).dy(x0=y-y.spacing/2) - + (sin(e) * field).dz(x0=z-z.spacing/2)) + + def g2_tilde(field, r, e): + return - ((sin(r) * field).dx(x0=x-x.spacing/2) - + (cos(r) * field).dy(x0=y-y.spacing/2)) + + def g3_tilde(field, r, e): + return ((sin(e) * cos(r) * field).dx(x0=x-x.spacing/2) + + (sin(e) * sin(r) * field).dy(x0=y-y.spacing/2) + + (cos(e) * field).dz(x0=z-z.spacing/2)) + + update_p = t.spacing**2 * a**2 / f * \ + (g1_tilde(f * g1(p0, r, e), r, e) + + g2_tilde(f * g2(p0, r, e), r, e) + + g3_tilde(f * g3(p0, r, e) + f * g3(m0, r, e), r, e)) + \ + (2 - t.spacing * a) + + update_m = t.spacing**2 * a**2 / f * \ + (g1_tilde(f * g1(m0, r, e), r, e) + + g2_tilde(f * g2(m0, r, e), r, e) + + g3_tilde(f * g3(m0, r, e) + f * g3(p0, r, e), r, e)) + \ + (2 - t.spacing * a) + + eqns = [Eq(p0.forward, update_p), + Eq(m0.forward, update_m)] + + + return eqns From 8c38c88f784314d87d3b48ea0fa036fe73c79c4e Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 20 Jul 2023 08:38:21 +0000 Subject: [PATCH 20/79] compiler: Improve profiling of multipass implementations --- devito/operator/operator.py | 24 ++++-- devito/operator/profiling.py | 130 ++++++++++++++++++------------ devito/passes/clusters/aliases.py | 8 +- devito/passes/iet/instrument.py | 44 +++++++++- devito/types/misc.py | 18 ++++- tests/test_gpu_common.py | 4 +- 6 files changed, 157 insertions(+), 71 deletions(-) diff --git a/devito/operator/operator.py b/devito/operator/operator.py index eb8b793f22..cdc327f057 100644 --- a/devito/operator/operator.py +++ b/devito/operator/operator.py @@ -930,15 +930,22 @@ def _emit_apply_profiling(self, args): # Emit local, i.e. "per-rank" performance. Without MPI, this is the only # thing that will be emitted - for k, v in summary.items(): - rank = "[rank%d]" % k.rank if k.rank is not None else "" + def lower_perfentry(v): if v.gflopss: oi = "OI=%.2f" % fround(v.oi) gflopss = "%.2f GFlops/s" % fround(v.gflopss) gpointss = "%.2f GPts/s" % fround(v.gpointss) - metrics = "[%s]" % ", ".join([oi, gflopss, gpointss]) + return "[%s]" % ", ".join([oi, gflopss, gpointss]) + elif v.gpointss: + gpointss = "%.2f GPts/s" % fround(v.gpointss) + return "[%s]" % gpointss else: - metrics = "" + return "" + + for k, v in summary.items(): + rank = "[rank%d]" % k.rank if k.rank is not None else "" + + metrics = lower_perfentry(v) itershapes = [",".join(str(i) for i in its) for its in v.itershapes] if len(itershapes) > 1: @@ -950,9 +957,12 @@ def _emit_apply_profiling(self, args): name = "%s%s<%s>" % (k.name, rank, itershapes) perf("%s* %s ran in %.2f s %s" % (indent, name, fround(v.time), metrics)) - for n, time in summary.subsections.get(k.name, {}).items(): - perf("%s+ %s ran in %.2f s [%.2f%%]" % - (indent*2, n, time, fround(time/v.time*100))) + for n, v1 in summary.subsections.get(k.name, {}).items(): + metrics = lower_perfentry(v1) + + perf("%s+ %s ran in %.2f s [%.2f%%] %s" % + (indent*2, n, fround(v1.time), fround(v1.time/v.time*100), + metrics)) # Emit performance mode and arguments perf_args = {} diff --git a/devito/operator/profiling.py b/devito/operator/profiling.py index 2a58677481..53f16e526f 100644 --- a/devito/operator/profiling.py +++ b/devito/operator/profiling.py @@ -105,6 +105,25 @@ def track_subsection(self, sname, name): v = self._subsections.setdefault(sname, OrderedDict()) v[name] = SectionData(S.Zero, S.Zero, S.Zero, S.Zero, []) + def group_as_subsections(self, sname, sections): + ops = sum(self._sections[i].ops for i in sections) + points = sum(self._sections[i].points for i in sections) + traffic = sum(self._sections[i].traffic for i in sections) + sectiondata = SectionData(ops, S.Zero, points, traffic, []) + + v = self._subsections.setdefault(sname, OrderedDict()) + v.update({i: self._sections[i] for i in sections}) + + new_sections = OrderedDict() + for k, v in self._sections.items(): + try: + if sections.index(k) == len(sections) - 1: + new_sections[sname] = sectiondata + except ValueError: + new_sections[k] = v + self._sections.clear() + self._sections.update(new_sections) + def instrument(self, iet, timer): """ Instrument the given IET for C-level performance profiling. @@ -211,6 +230,58 @@ class AdvancedProfiler(Profiler): _supports_async_sections = True + def _evaluate_section(self, name, data, args, dtype): + # Time to run the section + time = max(getattr(args[self.name]._obj, name), 10e-7) + + # Number of FLOPs performed + try: + ops = int(subs_op_args(data.ops, args)) + except (AttributeError, TypeError): + # E.g., a section comprising just function calls, or at least + # a sequence of unrecognized or non-conventional expr statements + ops = np.nan + + try: + # Number of grid points computed + points = int(subs_op_args(data.points, args)) + + # Compulsory traffic + traffic = float(subs_op_args(data.traffic, args)*dtype().itemsize) + except (AttributeError, TypeError): + # E.g., the section has a dynamic loop size + points = np.nan + + traffic = np.nan + + # Nmber of FLOPs performed at each iteration + sops = data.sops + + # Runtime itermaps/itershapes + try: + itermaps = [OrderedDict([(k, int(subs_op_args(v, args))) + for k, v in i.items()]) + for i in data.itermaps] + itershapes = tuple(tuple(i.values()) for i in itermaps) + except TypeError: + # E.g., a section comprising just function calls, or at least + # a sequence of unrecognized or non-conventional expr statements + itershapes = () + + return time, ops, points, traffic, sops, itershapes + + def _allgather_from_comm(self, comm, time, ops, points, traffic, sops, itershapes): + times = comm.allgather(time) + assert comm.size == len(times) + + opss = comm.allgather(ops) + pointss = comm.allgather(points) + traffics = comm.allgather(traffic) + sops = [sops]*comm.size + itershapess = comm.allgather(itershapes) + + return list(zip(times, opss, pointss, traffics, sops, itershapess)) + # Override basic summary so that arguments other than runtime are computed. def summary(self, args, dtype, reduce_over=None): grid = args.grid @@ -219,71 +290,30 @@ def summary(self, args, dtype, reduce_over=None): # Produce sections summary summary = PerformanceSummary() for name, data in self._sections.items(): - # Time to run the section - time = max(getattr(args[self.name]._obj, name), 10e-7) - - # Number of FLOPs performed - try: - ops = int(subs_op_args(data.ops, args)) - except (AttributeError, TypeError): - # E.g., a section comprising just function calls, or at least - # a sequence of unrecognized or non-conventional expr statements - ops = np.nan - - try: - # Number of grid points computed - points = int(subs_op_args(data.points, args)) - - # Compulsory traffic - traffic = float(subs_op_args(data.traffic, args)*dtype().itemsize) - except (AttributeError, TypeError): - # E.g., the section has a dynamic loop size - points = np.nan - - traffic = np.nan - - # Runtime itermaps/itershapes - try: - itermaps = [OrderedDict([(k, int(subs_op_args(v, args))) - for k, v in i.items()]) - for i in data.itermaps] - itershapes = tuple(tuple(i.values()) for i in itermaps) - except TypeError: - # E.g., a section comprising just function calls, or at least - # a sequence of unrecognized or non-conventional expr statements - itershapes = () + items = self._evaluate_section(name, data, args, dtype) # Add local performance data if comm is not MPI.COMM_NULL: # With MPI enabled, we add one entry per section per rank - times = comm.allgather(time) - assert comm.size == len(times) - opss = comm.allgather(ops) - pointss = comm.allgather(points) - traffics = comm.allgather(traffic) - sops = [data.sops]*comm.size - itershapess = comm.allgather(itershapes) - items = list(zip(times, opss, pointss, traffics, sops, itershapess)) + items = self._allgather_from_comm(comm, *items) for rank in range(comm.size): summary.add(name, rank, *items[rank]) else: - summary.add(name, None, time, ops, points, traffic, data.sops, itershapes) + summary.add(name, None, *items) # Enrich summary with subsections data for sname, v in self._subsections.items(): for name, data in v.items(): - # Time to run the section - time = max(getattr(args[self.name]._obj, name), 10e-7) + items = self._evaluate_section(name, data, args, dtype) # Add local performance data if comm is not MPI.COMM_NULL: # With MPI enabled, we add one entry per section per rank - times = comm.allgather(time) - assert comm.size == len(times) + items = self._allgather_from_comm(comm, *items) for rank in range(comm.size): - summary.add_subsection(sname, name, rank, time) + summary.add_subsection(sname, name, rank, *items[rank]) else: - summary.add_subsection(sname, name, None, time) + summary.add_subsection(sname, name, None, *items) # Add global performance data if reduce_over is not None: @@ -422,11 +452,11 @@ def add(self, name, rank, time, self.input[k] = PerfInput(time, ops, points, traffic, sops, itershapes) - def add_subsection(self, sname, name, rank, time): + def add_subsection(self, sname, name, rank, time, *args): k0 = PerfKey(sname, rank) assert k0 in self - self.subsections[sname][name] = time + self.subsections[sname][name] = PerfEntry(time, None, None, None, None, []) def add_glb_vanilla(self, key, time): """ diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index 98927f2ea8..07d100f156 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -16,9 +16,9 @@ sympy_dtype) from devito.tools import (Stamp, as_mapper, as_tuple, flatten, frozendict, generator, split, timed_pass) -from devito.types import (Array, TempFunction, Eq, Symbol, Temp, ModuloDimension, - CustomDimension, IncrDimension, StencilDimension, Indexed, - Hyperplane) +from devito.types import (Eq, Symbol, Temp, TempArray, TempFunction, + ModuloDimension, CustomDimension, IncrDimension, + StencilDimension, Indexed, Hyperplane) from devito.types.grid import MultiSubDimension __all__ = ['cire'] @@ -822,7 +822,7 @@ def lower_schedule(schedule, meta, sregistry, ftemps): make = TempFunction else: # Typical case -- the user does *not* "see" the CIRE-created temporaries - make = Array + make = TempArray clusters = [] subs = {} diff --git a/devito/passes/iet/instrument.py b/devito/passes/iet/instrument.py index db9b446d7f..9ba8a8751e 100644 --- a/devito/passes/iet/instrument.py +++ b/devito/passes/iet/instrument.py @@ -1,10 +1,12 @@ -from devito.ir.iet import (BusyWait, FindNodes, FindSymbols, MapNodes, Section, - TimedList, Transformer) +from itertools import groupby + +from devito.ir.iet import (BusyWait, FindNodes, FindSymbols, MapNodes, Iteration, + Section, TimedList, Transformer) from devito.mpi.routines import (HaloUpdateCall, HaloWaitCall, MPICall, MPIList, HaloUpdateList, HaloWaitList, RemainderCall, ComputeCall) from devito.passes.iet.engine import iet_pass -from devito.types import Timer +from devito.types import TempArray, TempFunction, Timer __all__ = ['instrument'] @@ -25,10 +27,12 @@ def instrument(graph, **kwargs): @iet_pass def track_subsections(iet, **kwargs): """ - Add custom Sections to the `profiler`. Custom Sections include: + Add sub-Sections to the `profiler`. Sub-Sections include: * MPI Calls (e.g., HaloUpdateCall and HaloUpdateWait) * Busy-waiting on While(lock) (e.g., from host-device orchestration) + * Multi-pass implementations -- one sub-Section for each pass, within one + macro Section """ profiler = kwargs['profiler'] sregistry = kwargs['sregistry'] @@ -45,6 +49,7 @@ def track_subsections(iet, **kwargs): mapper = {} + # MPI Calls, busy-waiting for NodeType in [MPIList, MPICall, BusyWait, ComputeCall]: for k, v in MapNodes(Section, NodeType).visit(iet).items(): for i in v: @@ -57,6 +62,37 @@ def track_subsections(iet, **kwargs): iet = Transformer(mapper).visit(iet) + # Multi-pass implementations + mapper = {} + + for i in FindNodes(Iteration).visit(iet): + for k, g in groupby(i.nodes, key=lambda n: n.is_Section): + if not k: + continue + + candidates = [] + for i in g: + functions = FindSymbols().visit(i) + if any(isinstance(f, (TempArray, TempFunction)) for f in functions): + candidates.append(i) + else: + # They must be consecutive Sections + break + if len(candidates) < 2: + continue + + name = sregistry.make_name(prefix='multipass') + body = [i._rebuild(is_subsection=True) for i in candidates] + section = Section(name, body=body) + + profiler.group_as_subsections(name, [i.name for i in candidates]) + + mapper[candidates.pop(0)] = section + for i in candidates: + mapper[i] = None + + iet = Transformer(mapper).visit(iet) + return iet, {} diff --git a/devito/types/misc.py b/devito/types/misc.py index c644a60614..ccbd9b7fdc 100644 --- a/devito/types/misc.py +++ b/devito/types/misc.py @@ -3,12 +3,12 @@ import numpy as np from sympy.core.core import ordering_of_classes -from devito.types import CompositeObject, Indexed, Symbol +from devito.types import Array, CompositeObject, Indexed, Symbol from devito.types.basic import IndexedData from devito.tools import Pickable, as_tuple __all__ = ['Timer', 'Pointer', 'VolatileInt', 'FIndexed', 'Wildcard', - 'Global', 'Hyperplane', 'Indirection', 'Temp', 'Jump'] + 'Global', 'Hyperplane', 'Indirection', 'Temp', 'TempArray', 'Jump'] class Timer(CompositeObject): @@ -175,8 +175,8 @@ def __new__(cls, name=None, mapped=None, dtype=np.uint64, is_const=True, class Temp(Symbol): """ - A Temp is a Symbol used by compiler passes to store locally-constructed - temporary expressions. + A Temp is a Symbol used by compiler passes to store intermediate + sub-expressions. """ # Just make sure the SymPy args ordering is the same regardless of whether @@ -184,6 +184,16 @@ class Temp(Symbol): ordering_of_classes.insert(ordering_of_classes.index('Symbol') + 1, 'Temp') +class TempArray(Array): + + """ + A TempArray is an Array used by compiler passes to store intermediate + sub-expressions. + """ + + pass + + class Jump(object): """ diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index 3bf641a120..d3164722f2 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -869,8 +869,8 @@ def test_tasking_over_compiler_generated(self): assert len(retrieve_iteration_tree(op)) == 5 assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 1 sections = FindNodes(Section).visit(op) - assert len(sections) == 3 - assert 'while(lock0[t1] == 0)' in str(sections[1].body[0].body[0].body[0]) + assert len(sections) == 4 + assert 'while(lock0[t1] == 0)' in str(sections[2].body[0].body[0].body[0]) op0.apply(time_M=nt-1) op1.apply(time_M=nt-1, u=u1, usave=usave1) From 60dee8c4eaac24379c6946f06d42356a3b3be24c Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 21 Jul 2023 13:30:06 +0000 Subject: [PATCH 21/79] compiler: Patch sync_sections --- devito/passes/iet/instrument.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devito/passes/iet/instrument.py b/devito/passes/iet/instrument.py index 9ba8a8751e..b844fd9a08 100644 --- a/devito/passes/iet/instrument.py +++ b/devito/passes/iet/instrument.py @@ -136,6 +136,6 @@ def sync_sections(iet, lang=None, profiler=None, **kwargs): if runs_async and not unnecessary: mapper[tl] = tl._rebuild(body=tl.body + (sync,)) - iet = Transformer(mapper).visit(iet) + iet = Transformer(mapper, nested=True).visit(iet) return iet, {} From 4a22575ec3632ad3a30fd15fb9040375bb33f1e9 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 21 Jul 2023 16:46:39 +0000 Subject: [PATCH 22/79] compiler: Enhance CireIndexDerivatives --- devito/passes/clusters/aliases.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index 07d100f156..b503673c68 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -408,6 +408,18 @@ def cbk_search(expr): return yield basextr + # E.g., extract `u.dx` from `u.dx*a*b`, which in turn was extracted + # from `(u.dx*a*b).dy`. That is, the inner derivatives only + def cbk_search2(expr): + if isinstance(expr, IndexDerivative): + return (expr,) + else: + return flatten(e for e in [cbk_search2(a) for a in expr.args] if e) + + sndxtr = self._do_generate(list(basextr), exclude, cbk_search2) + if sndxtr: + yield sndxtr + def collect(extracted, ispace, minstorage): """ From cfc926c004864bfcd63e538f1509dc4b1df71f50 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 24 Jul 2023 10:49:32 +0000 Subject: [PATCH 23/79] compiler: Patch has_data_reuse to account for StencilDimension --- devito/passes/clusters/blocking.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py index 741c8bf558..91ce60a291 100644 --- a/devito/passes/clusters/blocking.py +++ b/devito/passes/clusters/blocking.py @@ -1,11 +1,12 @@ from sympy import sympify +from devito.finite_differences.differentiable import IndexSum from devito.ir.clusters import Queue from devito.ir.support import (AFFINE, PARALLEL, PARALLEL_IF_ATOMIC, PARALLEL_IF_PVT, SEQUENTIAL, SKEWABLE, TILABLES, Interval, IntervalGroup, IterationSpace, Scope) from devito.passes import is_on_device -from devito.symbolics import uxreplace, xreplace_indices +from devito.symbolics import search, uxreplace, xreplace_indices from devito.tools import UnboundedMultiTuple, as_tuple, flatten, is_integer, prod from devito.types import BlockDimension @@ -100,6 +101,8 @@ def _has_data_reuse(self, cluster): # which translates into the existance of any Relation accross Indexeds if any(r.function.is_AbstractFunction for r in cluster.scope.r_gen()): return True + if search(cluster.exprs, IndexSum): + return True # If it's a reduction operation a la matrix-matrix multiply, two Indexeds # might be enough From 94cdb406c2d261fb560ca006e525d128e768b2fd Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 24 Jul 2023 14:00:29 +0000 Subject: [PATCH 24/79] pep8 happiness --- devito/ir/support/basic.py | 3 +-- devito/ir/support/utils.py | 1 - devito/passes/clusters/aliases.py | 5 ++--- tests/test_unexpansion.py | 1 - 4 files changed, 3 insertions(+), 7 deletions(-) diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index 1a08bdbe48..99185b2e1d 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -10,8 +10,7 @@ q_constant, q_affine, q_routine, q_terminal) from devito.tools import (Tag, as_tuple, is_integer, filter_sorted, flatten, memoized_meth, memoized_generator) -from devito.types import (Barrier, Dimension, DimensionTuple, Jump, Symbol, - StencilDimension) +from devito.types import Barrier, Dimension, DimensionTuple, Jump, Symbol __all__ = ['IterationInstance', 'TimedAccess', 'Scope', 'ExprGeometry'] diff --git a/devito/ir/support/utils.py b/devito/ir/support/utils.py index 40cd843c8b..73e0cc206c 100644 --- a/devito/ir/support/utils.py +++ b/devito/ir/support/utils.py @@ -1,7 +1,6 @@ from collections import defaultdict from devito.finite_differences import IndexDerivative -from devito.ir.support.vector import LabeledVector from devito.symbolics import (CallFromPointer, retrieve_indexed, retrieve_terminals, search) from devito.tools import DefaultOrderedDict, as_tuple, flatten, filter_sorted, split diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index b503673c68..93575f3d56 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -11,9 +11,8 @@ IterationSpace, Interval, Cluster, ExprGeometry, Queue, IntervalGroup, LabeledVector, normalize_properties, relax_properties, sdims_free, sdims_min, sdims_max) -from devito.symbolics import (Uxmapper, estimate_cost, q_constant, search, - reuse_if_untouched, retrieve_indexed, uxreplace, - sympy_dtype) +from devito.symbolics import (Uxmapper, estimate_cost, search, reuse_if_untouched, + uxreplace, sympy_dtype) from devito.tools import (Stamp, as_mapper, as_tuple, flatten, frozendict, generator, split, timed_pass) from devito.types import (Eq, Symbol, Temp, TempArray, TempFunction, diff --git a/tests/test_unexpansion.py b/tests/test_unexpansion.py index 1a0b264ae9..5cadaf0d9f 100644 --- a/tests/test_unexpansion.py +++ b/tests/test_unexpansion.py @@ -298,5 +298,4 @@ def g3_tilde(field, r, e): eqns = [Eq(p0.forward, update_p), Eq(m0.forward, update_m)] - return eqns From c7f660561c5c533e8c342e9412fc74e79ffc2a7d Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 27 Jul 2023 07:31:37 +0000 Subject: [PATCH 25/79] compiler: Patch infer_dtype to support vector types --- devito/tools/dtypes_lowering.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/devito/tools/dtypes_lowering.py b/devito/tools/dtypes_lowering.py index 0b3cd53ebf..c06f97a220 100644 --- a/devito/tools/dtypes_lowering.py +++ b/devito/tools/dtypes_lowering.py @@ -75,12 +75,15 @@ def add_dtype(self, field_name, count): self.update(build_dtypes_vector([field_name], [count])) - def get_base_dtype(self, v): + def get_base_dtype(self, v, default=None): for (base_dtype, count), dtype in self.items(): if dtype is v: return base_dtype - raise ValueError + if default is not None: + return default + else: + raise ValueError dtypes_vector_mapper = DTypesVectorMapper() @@ -263,8 +266,10 @@ def infer_dtype(dtypes): highest precision; * If there's at least one floating dtype, ignore any integer dtypes. """ - fdtypes = {i for i in dtypes if np.issubdtype(i, np.floating)} + # Resolve the vector types, if any + dtypes = {dtypes_vector_mapper.get_base_dtype(i, i) for i in dtypes } + fdtypes = {i for i in dtypes if np.issubdtype(i, np.floating)} if len(fdtypes) > 1: return max(fdtypes, key=lambda i: np.dtype(i).itemsize) elif len(fdtypes) == 1: From 45d9ae7c6b1e1975e7c261a4238fad2be2e75fa6 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 28 Jul 2023 09:44:32 +0000 Subject: [PATCH 26/79] compiler: Fix DDA involving ComponentAccesses --- devito/ir/support/basic.py | 59 +++++++++++++++++++++++++-------- devito/passes/clusters/misc.py | 5 +-- devito/tools/dtypes_lowering.py | 2 +- devito/types/array.py | 3 +- 4 files changed, 49 insertions(+), 20 deletions(-) diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index 99185b2e1d..8eb87adbe7 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -7,10 +7,11 @@ from devito.ir.support.utils import AccessMode from devito.ir.support.vector import LabeledVector, Vector from devito.symbolics import (compare_ops, retrieve_indexed, retrieve_terminals, - q_constant, q_affine, q_routine, q_terminal) + q_constant, q_affine, q_routine, search, uxreplace) from devito.tools import (Tag, as_tuple, is_integer, filter_sorted, flatten, memoized_meth, memoized_generator) -from devito.types import Barrier, Dimension, DimensionTuple, Jump, Symbol +from devito.types import (Barrier, ComponentAccess, Dimension, DimensionTuple, + Jump, Symbol) __all__ = ['IterationInstance', 'TimedAccess', 'Scope', 'ExprGeometry'] @@ -322,6 +323,12 @@ def distance(self, other): other : TimedAccess The TimedAccess w.r.t. which the distance is computed. """ + if isinstance(self.access, ComponentAccess) and \ + isinstance(other.access, ComponentAccess) and \ + self.access.index != other.access.index: + # E.g., `uv(x).x` and `uv(x).y` -- not a real dependence! + return Vector(S.ImaginaryUnit) + ret = [] for sit, oit in zip(self.itintervals, other.itintervals): n = len(ret) @@ -821,9 +828,9 @@ def __init__(self, exprs, rules=None): for i, e in enumerate(exprs): # Reads - terminals = retrieve_terminals(e.rhs, deep=True, mode='unique') + terminals = retrieve_accesses(e.rhs, deep=True) try: - terminals.update(retrieve_terminals(e.lhs.indices)) + terminals.update(retrieve_accesses(e.lhs.indices)) except AttributeError: pass for j in terminals: @@ -832,12 +839,10 @@ def __init__(self, exprs, rules=None): v.append(TimedAccess(j, mode, i, e.ispace)) # Write - terminals = [] - if q_terminal(e.lhs): - terminals.append(e.lhs) + terminals = retrieve_accesses(e.lhs) if q_routine(e.rhs): try: - terminals.extend(e.rhs.writes) + terminals.update(e.rhs.writes) except AttributeError: # E.g., foreign routines, such as `cos` or `sin` pass @@ -857,7 +862,7 @@ def __init__(self, exprs, rules=None): # Look up ConditionalDimensions for v in e.conditionals.values(): - for j in retrieve_terminals(v): + for j in retrieve_accesses(v): v = self.reads.setdefault(j.function, []) v.append(TimedAccess(j, 'R', -1, e.ispace)) @@ -947,6 +952,9 @@ def d_flow_gen(self): for r in self.reads.get(k, []): dependence = Dependence(w, r) + if dependence.is_imaginary: + continue + if any(not rule(dependence) for rule in self.rules): continue @@ -957,8 +965,7 @@ def d_flow_gen(self): # Non-integer vectors are not comparable. # Conservatively, we assume it is a dependence, unless # it's a read-for-reduction - is_flow = (not dependence.is_imaginary and - not r.is_read_reduction) + is_flow = not r.is_read_reduction if is_flow: yield dependence @@ -975,6 +982,9 @@ def d_anti_gen(self): for r in self.reads.get(k, []): dependence = Dependence(r, w) + if dependence.is_imaginary: + continue + if any(not rule(dependence) for rule in self.rules): continue @@ -985,8 +995,7 @@ def d_anti_gen(self): # Non-integer vectors are not comparable. # Conservatively, we assume it is a dependence, unless # it's a read-for-reduction - is_anti = (not dependence.is_imaginary and - not r.is_read_reduction) + is_anti = not r.is_read_reduction if is_anti: yield dependence @@ -1003,6 +1012,9 @@ def d_output_gen(self): for w2 in self.writes.get(k, []): dependence = Dependence(w2, w1) + if dependence.is_imaginary: + continue + if any(not rule(dependence) for rule in self.rules): continue @@ -1012,7 +1024,7 @@ def d_output_gen(self): except TypeError: # Non-integer vectors are not comparable. # Conservatively, we assume it is a dependence - is_output = not dependence.is_imaginary + is_output = True if is_output: yield dependence @@ -1200,3 +1212,22 @@ def aindices(self): @property def is_regular(self): return all(i.is_regular for i in self.iinstances) + + +# *** Utils + +def retrieve_accesses(exprs, **kwargs): + """ + Like retrieve_terminals, but ensure that if a ComponentAccess is found, + the ComponentAccess itself is returned, while the wrapped Indexed is discarded. + """ + kwargs['mode'] = 'unique' + + compaccs = search(exprs, ComponentAccess) + if not compaccs: + return retrieve_terminals(exprs, **kwargs) + + subs = {i: Symbol('dummy%d' % n) for n, i in enumerate(compaccs)} + exprs1 = uxreplace(exprs, subs) + + return compaccs | retrieve_terminals(exprs1, **kwargs) - set(subs.values()) diff --git a/devito/passes/clusters/misc.py b/devito/passes/clusters/misc.py index 453e8c79bc..e90aad341d 100644 --- a/devito/passes/clusters/misc.py +++ b/devito/passes/clusters/misc.py @@ -307,7 +307,7 @@ def choose_element(queue, scheduled): return ClusterGroup(dag.topological_sort(choose_element), prefix) - def _build_dag(self, cgroups, prefix, peeking=False): + def _build_dag(self, cgroups, prefix): """ A DAG representing the data dependences across the ClusterGroups within a given scope. @@ -354,9 +354,6 @@ def is_cross(dep): elif any(scope.d_output_gen()): dag.add_edge(cg0, cg1) - if peeking and dag.edges: - return dag - return dag diff --git a/devito/tools/dtypes_lowering.py b/devito/tools/dtypes_lowering.py index c06f97a220..62776eefd5 100644 --- a/devito/tools/dtypes_lowering.py +++ b/devito/tools/dtypes_lowering.py @@ -267,7 +267,7 @@ def infer_dtype(dtypes): * If there's at least one floating dtype, ignore any integer dtypes. """ # Resolve the vector types, if any - dtypes = {dtypes_vector_mapper.get_base_dtype(i, i) for i in dtypes } + dtypes = {dtypes_vector_mapper.get_base_dtype(i, i) for i in dtypes} fdtypes = {i for i in dtypes if np.issubdtype(i, np.floating)} if len(fdtypes) > 1: diff --git a/devito/types/array.py b/devito/types/array.py index f12922f330..f7113c19b1 100644 --- a/devito/types/array.py +++ b/devito/types/array.py @@ -11,7 +11,8 @@ from devito.types.basic import AbstractFunction from devito.types.utils import CtypesFactory, DimensionTuple -__all__ = ['Array', 'ArrayMapped', 'ArrayObject', 'PointerArray', 'Bundle'] +__all__ = ['Array', 'ArrayMapped', 'ArrayObject', 'PointerArray', 'Bundle', + 'ComponentAccess'] class ArrayBasic(AbstractFunction): From 952dcf3ec8a532d590eb9207d3ab8f66e40f8276 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 31 Jul 2023 13:19:26 +0000 Subject: [PATCH 27/79] api: Support pattern-matching par-tile --- devito/core/operator.py | 15 ++++---- devito/ir/support/space.py | 4 ++ devito/passes/clusters/blocking.py | 39 +++++++++++++++++-- devito/tools/data_structures.py | 17 +++++++- tests/test_dle.py | 62 ++++++++++++++++++++++++++++-- 5 files changed, 122 insertions(+), 15 deletions(-) diff --git a/devito/core/operator.py b/devito/core/operator.py index 2de585a11e..887c79978a 100644 --- a/devito/core/operator.py +++ b/devito/core/operator.py @@ -329,9 +329,9 @@ class OptOption(object): class ParTileArg(tuple): - def __new__(cls, items, shm=0, tag=None): + def __new__(cls, items, rule=None, tag=None): obj = super().__new__(cls, items) - obj.shm = shm + obj.rule = rule obj.tag = tag return obj @@ -371,14 +371,15 @@ def __new__(cls, items, default=None): try: y = items[1] - if is_integer(y): - # E.g., ((32, 4, 8), 1) - # E.g., ((32, 4, 8), 1, 'tag') + if is_integer(y) or isinstance(y, str) or y is None: + # E.g., ((32, 4, 8), 'rule') + # E.g., ((32, 4, 8), 'rule', 'tag') items = (ParTileArg(*items),) else: try: - # E.g., (((32, 4, 8), 1), ((32, 4, 4), 2)) - # E.g., (((32, 4, 8), 1, 'tag0'), ((32, 4, 4), 2, 'tag1')) + # E.g., (((32, 4, 8), 'rule'), ((32, 4, 4), 'rule')) + # E.g., (((32, 4, 8), 'rule0', 'tag0'), + # ((32, 4, 4), 'rule1', 'tag1')) items = tuple(ParTileArg(*i) for i in items) except TypeError: # E.g., ((32, 4, 8), (32, 4, 4)) diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py index d06a7675fa..b7b6c0822d 100644 --- a/devito/ir/support/space.py +++ b/devito/ir/support/space.py @@ -928,6 +928,10 @@ def is_compatible(self, other): def itdimensions(self): return self.intervals.dimensions + @property + def itdims(self): + return self.itdimensions + @property def relations(self): return self.intervals.relations diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py index 91ce60a291..d731fa2126 100644 --- a/devito/passes/clusters/blocking.py +++ b/devito/passes/clusters/blocking.py @@ -317,7 +317,7 @@ def callback(self, clusters, prefix, blk_size_gen=None): # By passing a suitable key to `next` we ensure that we pull the # next par-tile entry iff we're now blocking an unseen TILABLE nest try: - step = sympify(blk_size_gen.next(clusters, d)) + step = sympify(blk_size_gen.next(prefix, d, clusters)) except StopIteration: return clusters else: @@ -421,6 +421,7 @@ class BlockSizeGenerator(object): def __init__(self, par_tile): self.umt = UnboundedMultiTuple(*par_tile) + self.tip = -1 # This is for Clusters that need a small par-tile to avoid under-utilizing # computational resources (e.g., kernels running over iteration spaces that @@ -433,15 +434,45 @@ def __init__(self, par_tile): else: self.umt_small = UnboundedMultiTuple(par_tile.default) - def next(self, clusters, d): + def next(self, prefix, d, clusters): + # If a whole new set of Dimensions, move the tip -- this means `clusters` + # at `d` represents a new loop nest or kernel + x = any(i.dim.is_Block for i in flatten(c.ispace for c in clusters)) + if not x: + self.tip += 1 + # TODO: This is for now exceptionally rudimentary if all(c.properties.is_blockable_small(d) for c in clusters): - umt = self.umt_small + if not x: + self.umt_small.iter() + return self.umt_small.next() + + if x: + item = self.umt.curitem else: + # We can't `self.umt.iter()` because we might still want to + # fallback to `self.umt_small` + item = self.umt.nextitem + + # Handle user-provided rules + # TODO: This is also rudimentary + if item.rule is None: umt = self.umt + elif is_integer(item.rule): + if item.rule == self.tip: + umt = self.umt + else: + umt = self.umt_small + else: + if item.rule in {d.name for d in prefix.itdims}: + umt = self.umt + else: + # This is like "pattern unmatched" -- fallback to `umt_small` + umt = self.umt_small - if not any(i.dim.is_Block for i in flatten(c.ispace for c in clusters)): + if not x: umt.iter() + return umt.next() diff --git a/devito/tools/data_structures.py b/devito/tools/data_structures.py index 4ceb5f5d08..3dc3b5f180 100644 --- a/devito/tools/data_structures.py +++ b/devito/tools/data_structures.py @@ -632,7 +632,11 @@ def __init__(self, *items): nitems = [] for i in as_tuple(items): if isinstance(i, Iterable): - nitems.append(tuple(i)) + if isinstance(i, tuple): + # Honours tuple subclasses + nitems.append(i) + else: + nitems.append(tuple(i)) else: raise ValueError("Expected sequence, got %s" % type(i)) @@ -646,6 +650,17 @@ def __repr__(self): items[self.tip] = "*%s" % items[self.tip] return "%s(%s)" % (self.__class__.__name__, ", ".join(items)) + @property + def curitem(self): + return self.items[self.tip] + + @property + def nextitem(self): + return self.items[min(self.tip + 1, max(len(self.items) - 1, 0))] + + def index(self, item): + return self.items.index(item) + def iter(self): if not self.items: raise ValueError("No tuples available") diff --git a/tests/test_dle.py b/tests/test_dle.py index 8e37947a36..23fbe0b730 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -294,9 +294,9 @@ class TestBlockingParTile(object): ((16, 16, 16), ((16, 16, 16), (16, 16, 16))), ((32, 4, 4), ((4, 4, 32), (4, 4, 32))), (((16, 4, 4), (16, 16, 16)), ((4, 4, 16), (16, 16, 16))), - (((32, 4, 4), 1), ((4, 4, 32), (4, 4, 32))), - (((32, 4, 4), 1, 'tag0'), ((4, 4, 32), (4, 4, 32))), - ((((32, 4, 8), 1, 'tag0'), ((32, 8, 4), 2)), ((8, 4, 32), (4, 8, 32))), + (((32, 4, 4), None), ((4, 4, 32), (4, 4, 32))), + (((32, 4, 4), None, 'tag0'), ((4, 4, 32), (4, 4, 32))), + ((((32, 4, 8), None, 'tag0'), ((32, 8, 4), None)), ((8, 4, 32), (4, 8, 32))), ]) def test_structure(self, par_tile, expected): grid = Grid(shape=(8, 8, 8)) @@ -344,6 +344,62 @@ def test_structure_2p5D(self): assert iters[0].step == par_tile[1] assert iters[1].step == par_tile[0] + def test_custom_rule0(self): + grid = Grid(shape=(8, 8, 8)) + + u = TimeFunction(name="u", grid=grid, space_order=4) + v = TimeFunction(name="v", grid=grid, space_order=4) + + eqns = [Eq(u.forward, u.dz.dy + u.dx.dz + u.dy.dx), + Eq(v.forward, u.forward.dx)] + + # "Apply par-tile=(4, 4, 4) to the loop nest (kernel) with id (rule)=1, + # and use default for the rest!" + par_tile = (4, 4, 4) + rule = 1 + + op = Operator(eqns, opt=('advanced-fsg', {'par-tile': (par_tile, rule), + 'blockinner': True})) + + # Check generated code. By having specified "1" as rule, we expect the + # given par-tile to be applied to the kernel with id 1 + bns, _ = assert_blocking(op, {'z0_blk0', 'x1_blk0', 'z2_blk0'}) + root = bns['x1_blk0'] + iters = FindNodes(Iteration).visit(root) + iters = [i for i in iters if i.dim.is_Block and i.dim._depth == 1] + assert len(iters) == 3 + assert all(i.step == j for i, j in zip(iters, par_tile)) + + def test_custom_rule1(self): + grid = Grid(shape=(8, 8, 8)) + x, y, z = grid.dimensions + + f = Function(name='f', grid=grid) + u = TimeFunction(name="u", grid=grid, space_order=4) + v = TimeFunction(name="v", grid=grid, space_order=4) + + eqns = [Eq(u.forward, u.dz.dy + u.dx.dz + u.dy.dx + cos(f)*cos(f[x+1, y, z])), + Eq(v.forward, u.forward.dx)] + + # "Apply par-tile=(4, 4, 4) to the loop nests (kernels) embedded within + # the time loop, and use default for the rest!" + par_tile = (4, 4, 4) + rule = grid.time_dim.name # We must be able to infer it from str + + op = Operator(eqns, opt=('advanced-fsg', {'par-tile': (par_tile, rule), + 'blockinner': True, + 'blockrelax': True})) + + # Check generated code. By having specified "1" as rule, we expect the + # given par-tile to be applied to the kernel with id 1 + bns, _ = assert_blocking(op, {'z0_blk0', 'x1_blk0', 'x2_blk0', 'x3_blk0'}) + for i in ['x1_blk0', 'x2_blk0', 'x3_blk0']: + root = bns[i] + iters = FindNodes(Iteration).visit(root) + iters = [i for i in iters if i.dim.is_Block and i.dim._depth == 1] + assert len(iters) == 3 + assert all(i.step == j for i, j in zip(iters, par_tile)) + @pytest.mark.parametrize("shape", [(10,), (10, 45), (20, 33), (10, 31, 45), (45, 31, 45)]) @pytest.mark.parametrize("time_order", [2]) From b453a16405ec3de38dea8870fa9fac83e27e4c97 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 31 Jul 2023 14:16:32 +0000 Subject: [PATCH 28/79] compiler: Patch minimize_symbols for parlang backends --- devito/passes/iet/misc.py | 42 ++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/devito/passes/iet/misc.py b/devito/passes/iet/misc.py index 4199283e54..e83aeae943 100644 --- a/devito/passes/iet/misc.py +++ b/devito/passes/iet/misc.py @@ -5,11 +5,11 @@ from devito.finite_differences import Max, Min from devito.ir import (Any, Forward, Iteration, List, Prodder, FindApplications, - FindNodes, Transformer, Uxreplace, filter_iterations, - retrieve_iteration_tree) + FindNodes, FindSymbols, Transformer, Uxreplace, + filter_iterations, retrieve_iteration_tree) from devito.passes.iet.engine import iet_pass from devito.symbolics import evalrel, has_integer_args -from devito.tools import as_mapper, split +from devito.tools import as_mapper, filter_ordered, split __all__ = ['avoid_denormals', 'hoist_prodders', 'relax_incr_dimensions', 'generate_macros', 'minimize_symbols'] @@ -178,23 +178,29 @@ def minimize_symbols(iet): def remove_redundant_moddims(iet): - subs0 = {} - subs1 = {} - for n in FindNodes(Iteration).visit(iet): - mds = [d for d in n.uindices - if d.is_Modulo and d.origin is not None] - if not mds: - continue + key = lambda d: d.is_Modulo and d.origin is not None + mds = [d for d in FindSymbols('dimensions').visit(iet) if key(d)] + if not mds: + return iet - mapper = as_mapper(mds, key=lambda md: md.origin % md.modulo) - for k, v in mapper.items(): - chosen = v.pop(0) - subs0.update({d: chosen for d in v}) + mapper = as_mapper(mds, key=lambda md: md.origin % md.modulo) - uindices = [d for d in n.uindices if d not in subs0] - subs1[n] = n._rebuild(uindices=uindices) + subs = {} + for k, v in mapper.items(): + chosen = v.pop(0) + subs.update({d: chosen for d in v}) + + body = Uxreplace(subs).visit(iet.body) + iet = iet._rebuild(body=body) + + # ModuloDimensions are defined in Iteration headers, hence they must be + # removed from there too + subs = {} + for n in FindNodes(Iteration).visit(iet): + if not set(n.uindices) & set(mds): + continue + subs[n] = n._rebuild(uindices=filter_ordered(n.uindices)) - iet = Transformer(subs1, nested=True).visit(iet) - iet = Uxreplace(subs0).visit(iet) + iet = Transformer(subs, nested=True).visit(iet) return iet From 01ff19fec0ae52207b25352097dff52e28ac1432 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 2 Aug 2023 15:19:39 +0000 Subject: [PATCH 29/79] compiler: Expand along SteppingDimensions --- devito/finite_differences/finite_difference.py | 10 ++++++++-- devito/operator/operator.py | 9 ++++++++- tests/test_unexpansion.py | 16 ++++++++++++++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/devito/finite_differences/finite_difference.py b/devito/finite_differences/finite_difference.py index e6e35f03fd..57ba6e6382 100644 --- a/devito/finite_differences/finite_difference.py +++ b/devito/finite_differences/finite_difference.py @@ -207,9 +207,11 @@ def generic_derivative(expr, dim, fd_order, deriv_order, matvec=direct, x0=None, matvec, x0, symbolic, expand) -def make_derivative(expr, dim, fd_order, deriv_order, side, matvec, x0, symbolic, expand): +def make_derivative(expr, dim, fd_order, deriv_order, side, matvec, x0, symbolic, + expand): # The stencil indices - indices, x0 = generate_indices(expr, dim, fd_order, side=side, matvec=matvec, x0=x0) + indices, x0 = generate_indices(expr, dim, fd_order, side=side, matvec=matvec, + x0=x0) # Finite difference weights from Taylor approximation given these positions if symbolic: @@ -227,6 +229,10 @@ def make_derivative(expr, dim, fd_order, deriv_order, side, matvec, x0, symbolic # Shift index due to staggering, if any indices = indices.shift(-(expr.indices_ref[dim] - dim)) + # The user may wish to restrict expansion to selected derivatives + if callable(expand): + expand = expand(dim) + if not expand and indices.expr is not None: weights = Weights(name='w', dimensions=indices.free_dim, initvalue=weights) diff --git a/devito/operator/operator.py b/devito/operator/operator.py index cdc327f057..0fadc5dc47 100644 --- a/devito/operator/operator.py +++ b/devito/operator/operator.py @@ -311,8 +311,15 @@ def _lower_exprs(cls, expressions, **kwargs): # Specialization is performed on unevaluated expressions expressions = cls._specialize_dsl(expressions, **kwargs) - # Lower functional DSL + # Lower FD derivatives + # NOTE: we force expansion of derivatives along SteppingDimensions + # because it drastically simplifies the subsequent lowering into + # ModuloDimensions + if not expand: + expand = lambda d: d.is_Stepping expressions = flatten([i._evaluate(expand=expand) for i in expressions]) + + # Scalarize the tensor equations, if any expressions = [j for i in expressions for j in i._flatten] # A second round of specialization is performed on evaluated expressions diff --git a/tests/test_unexpansion.py b/tests/test_unexpansion.py index 5cadaf0d9f..3efb8dc0ee 100644 --- a/tests/test_unexpansion.py +++ b/tests/test_unexpansion.py @@ -5,6 +5,22 @@ from devito.types import Symbol +class TestLoopScheduling(object): + + def test_backward_dt2(self): + grid = Grid(shape=(4, 4)) + + f = Function(name='f', grid=grid) + v = TimeFunction(name='v', grid=grid, time_order=2) + + eqns = [Eq(v.backward, v + 1.), + Eq(f, v.dt2)] + + op = Operator(eqns, opt=('advanced', {'openmp': True, + 'expand': False})) + assert_structure(op, ['t,x,y'], 't,x,y') + + class Test1Pass(object): def test_v0(self): From b8c3c4b3b66916042e2eb2d38d459639a1be2542 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 2 Aug 2023 15:22:14 +0000 Subject: [PATCH 30/79] compiler: Update behavior of ClusterGroup.syncs --- devito/ir/clusters/cluster.py | 6 ++++-- devito/passes/clusters/misc.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index 919785104d..92cb0fdfe4 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -430,8 +430,10 @@ def guards(self): @cached_property def syncs(self): - """The synchronization operations of each Cluster in self.""" - return tuple(i.syncs for i in self) + """ + A view of the ClusterGroup's synchronization operations. + """ + return normalize_syncs(*[c.syncs for c in self]) @cached_property def dspace(self): diff --git a/devito/passes/clusters/misc.py b/devito/passes/clusters/misc.py index e90aad341d..c14c49958c 100644 --- a/devito/passes/clusters/misc.py +++ b/devito/passes/clusters/misc.py @@ -186,7 +186,7 @@ def _key(self, c): if isinstance(c, Cluster): syncs = (c.syncs,) else: - syncs = c.syncs + syncs = tuple(i.syncs for i in c) for i in syncs: mapper = defaultdict(set) for k, v in i.items(): From ce4d407f22ba3ee4cbd8b69828c076896242a416 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 2 Aug 2023 15:23:01 +0000 Subject: [PATCH 31/79] compiler: Add and exploit properties.is_parallel_atomic --- devito/ir/clusters/algorithms.py | 2 +- devito/ir/support/properties.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/devito/ir/clusters/algorithms.py b/devito/ir/clusters/algorithms.py index 19d31d5173..af8fbb8920 100644 --- a/devito/ir/clusters/algorithms.py +++ b/devito/ir/clusters/algorithms.py @@ -452,7 +452,7 @@ def normalize_reductions_dense(cluster, sregistry, options): """ opt_mapify_reduce = options['mapify-reduce'] - dims = [d for d in cluster.properties.dimensions + dims = [d for d in cluster.ispace.itdims if cluster.properties.is_parallel_atomic(d)] if not dims: diff --git a/devito/ir/support/properties.py b/devito/ir/support/properties.py index c3c808356e..a7113a4833 100644 --- a/devito/ir/support/properties.py +++ b/devito/ir/support/properties.py @@ -215,6 +215,9 @@ def is_parallel_atomic(self, dims): def is_parallel_relaxed(self, dims): return any(len(self[d] & PARALLELS) > 0 for d in as_tuple(dims)) + def is_parallel_atomic(self, dims): + return any(PARALLEL_IF_ATOMIC in self.get(d, ()) for d in as_tuple(dims)) + def is_affine(self, dims): return any(AFFINE in self.get(d, ()) for d in as_tuple(dims)) From 066236b3a9d3cf9a0bb45d6c009fd9f75c2ba601 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 2 Aug 2023 15:23:45 +0000 Subject: [PATCH 32/79] compiler: Enhance DDA across IndexDerivatives --- devito/ir/clusters/algorithms.py | 21 +++++++++++++------- devito/ir/support/basic.py | 26 ++++++++++++++++++------ devito/ir/support/utils.py | 34 ++++++++++++++++++++++++++++---- 3 files changed, 64 insertions(+), 17 deletions(-) diff --git a/devito/ir/clusters/algorithms.py b/devito/ir/clusters/algorithms.py index af8fbb8920..1ef347cbb9 100644 --- a/devito/ir/clusters/algorithms.py +++ b/devito/ir/clusters/algorithms.py @@ -6,7 +6,7 @@ import sympy from devito.exceptions import InvalidOperator -from devito.ir.support import (Any, Backward, Forward, IterationSpace, +from devito.ir.support import (Any, Backward, Forward, IterationSpace, erange, pull_dims) from devito.ir.clusters.analysis import analyze from devito.ir.clusters.cluster import Cluster, ClusterGroup @@ -121,10 +121,12 @@ def callback(self, clusters, prefix, backlog=None, known_break=None): require_break = scope.d_flow.cause & maybe_break if require_break: backlog = [clusters[-1]] + backlog - # Try with increasingly smaller ClusterGroups until the ambiguity is gone + # Try with increasingly smaller ClusterGroups until the + # ambiguity is gone return self.callback(clusters[:-1], prefix, backlog, require_break) - # Schedule Clusters over different IterationSpaces if this increases parallelism + # Schedule Clusters over different IterationSpaces if this increases + # parallelism for i in range(1, len(clusters)): if self._break_for_parallelism(scope, candidates, i): return self.callback(clusters[:i], prefix, clusters[i:] + backlog, @@ -146,8 +148,8 @@ def callback(self, clusters, prefix, backlog=None, known_break=None): if not backlog: return processed - # Handle the backlog -- the Clusters characterized by flow- and anti-dependences - # along one or more Dimensions + # Handle the backlog -- the Clusters characterized by flow- and + # anti-dependences along one or more Dimensions idir = {d: Any for d in known_break} stamp = Stamp() for i, c in enumerate(list(backlog)): @@ -278,7 +280,11 @@ def callback(self, clusters, prefix): size = i.function.shape_allocated[d] assert is_integer(size) - mapper[size][si].add(iaf) + # Resolve StencilDimensions in case of unexpanded expressions + # E.g. `i0 + t` -> `(t - 1, t, t + 1)` + iafs = erange(iaf) + + mapper[size][si].update(iafs) # Construct the ModuloDimensions mds = [] @@ -288,7 +294,8 @@ def callback(self, clusters, prefix): # SymPy's index ordering (t, t-1, t+1) afer modulo replacement so # that associativity errors are consistent. This corresponds to # sorting offsets {-1, 0, 1} as {0, -1, 1} assigning -inf to 0 - siafs = sorted(iafs, key=lambda i: -np.inf if i - si == 0 else (i - si)) + key = lambda i: -np.inf if i - si == 0 else (i - si) + siafs = sorted(iafs, key=key) for iaf in siafs: name = '%s%d' % (si.name, len(mds)) diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index 8eb87adbe7..e17bc8d6d7 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -4,7 +4,7 @@ from sympy import S from devito.ir.support.space import Backward, IterationSpace -from devito.ir.support.utils import AccessMode +from devito.ir.support.utils import AccessMode, extrema from devito.ir.support.vector import LabeledVector, Vector from devito.symbolics import (compare_ops, retrieve_indexed, retrieve_terminals, q_constant, q_affine, q_routine, search, uxreplace) @@ -237,7 +237,7 @@ def __eq__(self, other): # which might require expensive comparisons of Vector entries (i.e., # SymPy expressions) - return (self.access is other.access and # => self.function is other.function + return (self.access is other.access and self.mode == other.mode and self.ispace == other.ispace) @@ -899,7 +899,8 @@ def __getitem__(self, function): return self.getwrites(function) + self.getreads(function) def __repr__(self): - tracked = filter_sorted(set(self.reads) | set(self.writes), key=lambda i: i.name) + tracked = filter_sorted(set(self.reads) | set(self.writes), + key=lambda i: i.name) maxlen = max(1, max([len(i.name) for i in tracked])) out = "{:>%d} => W : {}\n{:>%d} R : {}" % (maxlen, maxlen) pad = " "*(maxlen + 9) @@ -924,6 +925,19 @@ def __repr__(self): return "\n".join([out.format(i.name, w, '', r) for i, r, w in zip(tracked, reads, writes)]) + @cached_property + def reads_extremaed(self): + """ + A view of the Scope's reads in which StencilDimensions are replaced + with their extrema. + """ + ret = {f: [] for f in self.reads} + for f, v in self.reads.items(): + for i in v: + for j in set(extrema(i.access)): + ret[f].append(TimedAccess(j, i.mode, i.timestamp, i.ispace)) + return ret + @cached_property def accesses(self): groups = list(self.reads.values()) + list(self.writes.values()) @@ -949,7 +963,7 @@ def d_flow_gen(self): """Generate the flow (or "read-after-write") dependences.""" for k, v in self.writes.items(): for w in v: - for r in self.reads.get(k, []): + for r in self.reads_extremaed.get(k, []): dependence = Dependence(w, r) if dependence.is_imaginary: @@ -979,7 +993,7 @@ def d_anti_gen(self): """Generate the anti (or "write-after-read") dependences.""" for k, v in self.writes.items(): for w in v: - for r in self.reads.get(k, []): + for r in self.reads_extremaed.get(k, []): dependence = Dependence(r, w) if dependence.is_imaginary: @@ -1051,7 +1065,7 @@ def d_from_access_gen(self, accesses): accesses = as_tuple(accesses) for d in self.d_all_gen(): for i in accesses: - if d.source is i or d.sink is i: + if d.source == i or d.sink == i: yield d break diff --git a/devito/ir/support/utils.py b/devito/ir/support/utils.py index 73e0cc206c..770748bcb9 100644 --- a/devito/ir/support/utils.py +++ b/devito/ir/support/utils.py @@ -1,4 +1,5 @@ -from collections import defaultdict +from collections import defaultdict, namedtuple +from itertools import product from devito.finite_differences import IndexDerivative from devito.symbolics import (CallFromPointer, retrieve_indexed, retrieve_terminals, @@ -8,7 +9,8 @@ StencilDimension) __all__ = ['AccessMode', 'Stencil', 'IMask', 'detect_accesses', 'detect_io', - 'pull_dims', 'sdims_free', 'sdims_min', 'sdims_max', 'minmax_index'] + 'pull_dims', 'sdims_free', 'sdims_min', 'sdims_max', 'minmax_index', + 'extrema', 'erange'] class AccessMode(object): @@ -307,6 +309,9 @@ def sdims_max(expr, sdims=None): return expr.subs(mapper) +Extrema = namedtuple('Extrema', 'm M') + + def minmax_index(expr, d): """ Return the minimum and maximum indices along the `d` Dimension @@ -319,5 +324,26 @@ def minmax_index(expr, d): except KeyError: pass - return (min(sdims_min(i) for i in indices), - max(sdims_max(i) for i in indices)) + return Extrema(min(sdims_min(i) for i in indices), + max(sdims_max(i) for i in indices)) + + +def extrema(expr): + """ + The minimum and maximum extrema assumed by `expr` once the unbounded + StencilDimensions are resolved. + """ + return Extrema(sdims_min(expr), sdims_max(expr)) + + +def erange(expr): + """ + A tuple with all possible values that `expr` can assume along its + unbounded StencilDimensions. + """ + sdims = sdims_free(expr) + if not sdims: + return (expr,) + ranges = [i.range for i in sdims] + mappers = [dict(zip(sdims, i)) for i in product(*ranges)] + return tuple(expr.subs(m) for m in mappers) From bc64af5ce4e3190054cff885e5a097e8f550ce03 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 4 Aug 2023 09:56:16 +0000 Subject: [PATCH 33/79] compiler: Tidy up utilities --- devito/ir/clusters/cluster.py | 14 ++---- devito/ir/support/space.py | 7 +-- devito/ir/support/utils.py | 79 +++++++++++++++++++------------ devito/passes/clusters/aliases.py | 14 +++--- 4 files changed, 62 insertions(+), 52 deletions(-) diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index 92cb0fdfe4..6362d44eea 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -8,7 +8,7 @@ Forward, Interval, IntervalGroup, IterationSpace, DataSpace, Guards, Properties, Scope, detect_accesses, detect_io, normalize_properties, normalize_syncs, - sdims_min, sdims_max) + minimum, maximum) from devito.mpi.halo_scheme import HaloScheme, HaloTouch from devito.symbolics import estimate_cost from devito.tools import as_tuple, flatten, frozendict, infer_dtype @@ -269,8 +269,8 @@ def dspace(self): continue intervals = [Interval(d, - min([sdims_min(i) for i in offs]), - max([sdims_max(i) for i in offs])) + min([minimum(i) for i in offs]), + max([maximum(i) for i in offs])) for d, offs in v.items()] intervals = IntervalGroup(intervals) @@ -395,14 +395,6 @@ def __new__(cls, clusters, ispace=None): obj._ispace = ispace return obj - def rebuild(self, ispace=None): - processed = [] - for c in self: - ispace = IterationSpace.union(c.ispace, ispace) - processed.append(c.rebuild(ispace=ispace)) - - return self.__class__(processed, ispace) - @classmethod def concatenate(cls, *cgroups): return list(chain(*cgroups)) diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py index b7b6c0822d..529258bada 100644 --- a/devito/ir/support/space.py +++ b/devito/ir/support/space.py @@ -6,7 +6,7 @@ from cached_property import cached_property from sympy import Expr -from devito.ir.support.utils import sdims_min, sdims_max +from devito.ir.support.utils import minimum, maximum from devito.ir.support.vector import Vector, vmin, vmax from devito.tools import (PartialOrderTuple, Stamp, as_list, as_tuple, filter_ordered, flatten, frozendict, is_integer, toposort) @@ -286,8 +286,9 @@ def promote(self, cond): return self def expand(self): - return Interval(self.dim, sdims_min(self.lower), sdims_max(self.upper), - self.stamp) + return Interval( + self.dim, minimum(self.lower), maximum(self.upper), self.stamp + ) class IntervalGroup(PartialOrderTuple): diff --git a/devito/ir/support/utils.py b/devito/ir/support/utils.py index 770748bcb9..b79c331fbb 100644 --- a/devito/ir/support/utils.py +++ b/devito/ir/support/utils.py @@ -9,7 +9,7 @@ StencilDimension) __all__ = ['AccessMode', 'Stencil', 'IMask', 'detect_accesses', 'detect_io', - 'pull_dims', 'sdims_free', 'sdims_min', 'sdims_max', 'minmax_index', + 'pull_dims', 'unbounded', 'minimum', 'maximum', 'minmax_index', 'extrema', 'erange'] @@ -276,40 +276,62 @@ def pull_dims(exprs, flag=True): # *** Utility functions for expressions that potentially contain StencilDimensions -def sdims_free(expr): +def unbounded(expr): """ - Retrieve all unbounded StencilDimensions in `expr`. + Retrieve all unbounded Dimensions in `expr`. """ + # At the moment we only have logic to retrieve unbounded StencilDimensions, + # but in the future this might change bound = set().union(*[i.dimensions for i in search(expr, IndexDerivative)]) sdims = search(expr, StencilDimension, mode='unique', deep=True) + return sdims - bound -def sdims_min(expr, sdims=None): +Extrema = namedtuple('Extrema', 'm M') + + +def _minmax(expr, callback, udims=None): """ - Replace StencilDimensions in `expr` with their minimum point. + Helper for `minimum`, `maximum`, and potential future utilities that share + a significant chunk of logic. """ - if not sdims: - sdims = sdims_free(expr) + if not udims: + udims = unbounded(expr) + + # Resolution rule 1: StencilDimensions + sdims = [d for d in udims if d.is_Stencil] if not sdims: return expr - mapper = {e: e._min for e in sdims} + mapper = {e: callback(e) for e in sdims} + return expr.subs(mapper) -def sdims_max(expr, sdims=None): +def minimum(expr, udims=None): """ - Replace StencilDimensions in `expr` with their maximum point. + Substitute the unbounded Dimensions in `expr` with their minimum point. + + Unbounded Dimensions whose possible minimum value is not known are ignored. """ - if not sdims: - sdims = sdims_free(expr) - if not sdims: - return expr - mapper = {e: e._max for e in sdims} - return expr.subs(mapper) + return _minmax(expr, lambda e: e._min, udims) -Extrema = namedtuple('Extrema', 'm M') +def maximum(expr, udims=None): + """ + Substitute the unbounded Dimensions in `expr` with their maximum point. + + Unbounded Dimensions whose possible maximum value is not known are ignored. + """ + return _minmax(expr, lambda e: e._max, udims) + + +def extrema(expr): + """ + The minimum and maximum extrema assumed by `expr` once the unbounded + Dimensions are resolved. + """ + return Extrema(minimum(expr), maximum(expr)) def minmax_index(expr, d): @@ -324,26 +346,21 @@ def minmax_index(expr, d): except KeyError: pass - return Extrema(min(sdims_min(i) for i in indices), - max(sdims_max(i) for i in indices)) - - -def extrema(expr): - """ - The minimum and maximum extrema assumed by `expr` once the unbounded - StencilDimensions are resolved. - """ - return Extrema(sdims_min(expr), sdims_max(expr)) + return Extrema(min(minimum(i) for i in indices), + max(maximum(i) for i in indices)) def erange(expr): """ - A tuple with all possible values that `expr` can assume along its - unbounded StencilDimensions. + All possible values that `expr` can assume once its unbounded Dimensions + are resolved. """ - sdims = sdims_free(expr) - if not sdims: + udims = unbounded(expr) + if not udims: return (expr,) + + sdims = [d for d in udims if d.is_Stencil] ranges = [i.range for i in sdims] mappers = [dict(zip(sdims, i)) for i in product(*ranges)] + return tuple(expr.subs(m) for m in mappers) diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index 93575f3d56..5dfb5a7245 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -10,7 +10,7 @@ from devito.ir import (SEQUENTIAL, PARALLEL_IF_PVT, ROUNDABLE, SEPARABLE, Forward, IterationSpace, Interval, Cluster, ExprGeometry, Queue, IntervalGroup, LabeledVector, normalize_properties, - relax_properties, sdims_free, sdims_min, sdims_max) + relax_properties, unbounded, minimum, maximum, extrema) from devito.symbolics import (Uxmapper, estimate_cost, search, reuse_if_untouched, uxreplace, sympy_dtype) from devito.tools import (Stamp, as_mapper, as_tuple, flatten, frozendict, generator, @@ -991,13 +991,13 @@ def __new__(cls, items, ispace=None): # Expand out the StencilDimensions, if any processed = [] for c in items: - sdims = sdims_free(c.expr) + sdims = [d for d in unbounded(c.expr) if d.is_Stencil] if not sdims: processed.append(c) continue - f0 = lambda e: sdims_min(e, sdims) - f1 = lambda e: sdims_max(e, sdims) + f0 = lambda e: minimum(e, sdims) + f1 = lambda e: maximum(e, sdims) for f in (f0, f1): expr = f(c.expr) @@ -1083,7 +1083,8 @@ def dimensions_translated(self): def naliases(self): na = len(self._items) - sdims = set().union(*[sdims_free(c.expr) for c in self._items]) + udims = set().union(*[unbounded(c.expr) for c in self._items]) + sdims = [d for d in udims if d.is_Stencil] implicit = int(np.prod([i._size for i in sdims])) - 1 return na + implicit @@ -1160,8 +1161,7 @@ def _pivot_legal_shifts(self): # Any `ofs`'s shift due to non-[0,0] iteration space lower, upper = self._ispace.intervals[l].offsets - ofs0 = sdims_min(ofs[l]) - ofs1 = sdims_max(ofs[l]) + ofs0, ofs1 = extrema(ofs[l]) try: # Assume `ofs[l]` is a number (typical case) From 5139358dbb3875659dbc0bbf93a482c80a155fe4 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 22 Aug 2023 08:54:24 +0000 Subject: [PATCH 34/79] compiler: Make IndexDerivatives homogeneous irrespective of matvec --- .../finite_differences/finite_difference.py | 9 ++++++-- devito/finite_differences/tools.py | 18 +++++++++++++++ devito/passes/clusters/derivatives.py | 9 +++++--- devito/types/dimension.py | 14 +++++++++-- tests/test_derivatives.py | 23 +++++++++++++++++++ tests/test_unexpansion.py | 22 ++++++++++++++++++ 6 files changed, 88 insertions(+), 7 deletions(-) diff --git a/devito/finite_differences/finite_difference.py b/devito/finite_differences/finite_difference.py index 57ba6e6382..9eb4261b2b 100644 --- a/devito/finite_differences/finite_difference.py +++ b/devito/finite_differences/finite_difference.py @@ -223,8 +223,7 @@ def make_derivative(expr, dim, fd_order, deriv_order, side, matvec, x0, symbolic weights = [sympify(w).evalf(_PRECISION) for w in weights] # Transpose the FD, if necessary - if matvec: - indices = indices.scale(matvec.val) + indices = indices.scale(matvec.val) # Shift index due to staggering, if any indices = indices.shift(-(expr.indices_ref[dim] - dim)) @@ -236,6 +235,12 @@ def make_derivative(expr, dim, fd_order, deriv_order, side, matvec, x0, symbolic if not expand and indices.expr is not None: weights = Weights(name='w', dimensions=indices.free_dim, initvalue=weights) + if matvec == transpose: + # For homogenity, always generate e.g. `x + i0` rather than `x - i0` + # for transpose and `x + i0` for direct + indices = indices.transpose() + weights = weights._subs(indices.free_dim, -indices.free_dim) + # Inject the StencilDimension # E.g. `x + i*h_x` into `f(x)` s.t. `f(x + i*h_x)` expr = expr._subs(dim, indices.expr) diff --git a/devito/finite_differences/tools.py b/devito/finite_differences/tools.py index ba112a3116..5ec3629403 100644 --- a/devito/finite_differences/tools.py +++ b/devito/finite_differences/tools.py @@ -197,6 +197,24 @@ def scale(self, v): return IndexSet(self.dim, indices, expr=expr, fd=self.free_dim) + def transpose(self): + """ + Transpose the IndexSet. + """ + indices = tuple(reversed(self)) + + free_dim = StencilDimension(self.free_dim.name, + -self.free_dim._max, + -self.free_dim._min, + backward=True) + + try: + expr = self.expr._subs(self.free_dim, -free_dim) + except AttributeError: + expr = None + + return IndexSet(self.dim, indices, expr=expr, fd=free_dim) + def shift(self, v): """ Construct a new IndexSet with all indices shifted by `v`. diff --git a/devito/passes/clusters/derivatives.py b/devito/passes/clusters/derivatives.py index 00e7526f88..fd0d8bdcad 100644 --- a/devito/passes/clusters/derivatives.py +++ b/devito/passes/clusters/derivatives.py @@ -1,5 +1,5 @@ from devito.finite_differences import IndexDerivative -from devito.ir import Interval, IterationSpace, Queue +from devito.ir import Backward, Forward, Interval, IterationSpace, Queue from devito.passes.clusters.misc import fuse from devito.symbolics import (retrieve_dimensions, reuse_if_untouched, q_leaf, uxreplace) @@ -87,7 +87,8 @@ def _core(expr, c, weights, mapper, sregistry): dims = tuple(d for d in dims if d not in c.ispace) intervals = [Interval(d) for d in dims] - ispace0 = IterationSpace(intervals) + directions = {d: Backward if d.backward else Forward for d in dims} + ispace0 = IterationSpace(intervals, directions=directions) extra = (c.ispace.itdimensions + dims,) ispace = IterationSpace.union(c.ispace, ispace0, relations=extra) @@ -100,7 +101,9 @@ def _core(expr, c, weights, mapper, sregistry): # Transform e.g. `w[i0] -> w[i0 + 2]` for alignment with the # StencilDimensions starting points - subs = {expr.weights: expr.weights.subs(d, d - d._min) for d in dims} + subs = {expr.weights: + expr.weights.subs(d, d - (d._max if d.backward else d._min)) + for d in dims} expr1 = Inc(s, uxreplace(expr.expr, subs)) processed.append(c.rebuild(exprs=expr1, ispace=ispace)) diff --git a/devito/types/dimension.py b/devito/types/dimension.py index d3f9a00285..0c341bb1a2 100644 --- a/devito/types/dimension.py +++ b/devito/types/dimension.py @@ -1386,8 +1386,10 @@ class StencilDimension(BasicDimension): is_Stencil = True __rargs__ = BasicDimension.__rargs__ + ('_min', '_max') + __rkwargs__ = BasicDimension.__rkwargs__ + ('backward',) - def __init_finalize__(self, name, _min, _max, spacing=None, **kwargs): + def __init_finalize__(self, name, _min, _max, spacing=None, backward=False, + **kwargs): self._spacing = sympy.sympify(spacing) or sympy.S.One if not is_integer(_min): @@ -1399,13 +1401,21 @@ def __init_finalize__(self, name, _min, _max, spacing=None, **kwargs): self._min = _min self._max = _max + self._size = _max - _min + 1 + self._backward = backward + if self._size < 1: raise ValueError("Expected size greater than 0 (got %s)" % self._size) def _hashable_content(self): - return super()._hashable_content() + (self._min, self._max, self._spacing) + return (super()._hashable_content() + + (self._min, self._max, self._spacing, self._backward)) + + @cached_property + def backward(self): + return self._backward @cached_property def symbolic_size(self): diff --git a/tests/test_derivatives.py b/tests/test_derivatives.py index 123975d63d..8031932df5 100644 --- a/tests/test_derivatives.py +++ b/tests/test_derivatives.py @@ -457,6 +457,16 @@ def test_all_shortcuts(self, so): for fd in g._fd: assert getattr(g, fd) + def test_transpose_simple(self): + grid = Grid(shape=(4, 4)) + + f = TimeFunction(name='f', grid=grid, space_order=4) + + assert str(f.dx.T.evaluate) == ("-0.0833333333*f(t, x - 2*h_x, y)/h_x " + "+ 0.666666667*f(t, x - h_x, y)/h_x " + "- 0.666666667*f(t, x + h_x, y)/h_x " + "+ 0.0833333333*f(t, x + 2*h_x, y)/h_x") + @pytest.mark.parametrize('so', [2, 4, 8, 12]) @pytest.mark.parametrize('ndim', [1, 2]) @pytest.mark.parametrize('derivative, adjoint_name', [ @@ -743,6 +753,19 @@ def test_dxdy_v2(self): term1 = f.dxdy._evaluate(expand=False) assert len(term1.find(IndexDerivative)) == 2 + def test_transpose(self): + grid = Grid(shape=(4, 4)) + x, _ = grid.dimensions + h_x = x.spacing + + f = TimeFunction(name='f', grid=grid, space_order=4) + + term = f.dx.T._evaluate(expand=False) + assert isinstance(term, IndexDerivative) + + i0, = term.dimensions + assert term.base == f.subs(x, x + i0*h_x) + def bypass_uneval(expr): unevals = expr.find(EvalDerivative) diff --git a/tests/test_unexpansion.py b/tests/test_unexpansion.py index 3efb8dc0ee..7d0391a85a 100644 --- a/tests/test_unexpansion.py +++ b/tests/test_unexpansion.py @@ -204,6 +204,28 @@ def test_v6(self): op.cfunction + def test_transpose(self): + shape = (10, 10, 10) + grid = Grid(shape=shape) + x, _, _ = grid.dimensions + + u = TimeFunction(name='u', grid=grid, space_order=4) + u1 = TimeFunction(name='u', grid=grid, space_order=4) + + # Chessboard-like init + u.data[:] = np.indices(shape).sum(axis=0) % 10 + 1 + u1.data[:] = np.indices(shape).sum(axis=0) % 10 + 1 + + eqn = Eq(u.forward, u.dx(x0=x+x.spacing/2).T + 1.) + + op0 = Operator(eqn) + op1 = Operator(eqn, opt=('advanced', {'expand': False})) + + op0.apply(time_M=10) + op1.apply(time_M=10, u=u1) + + assert np.allclose(u.data, u1.data, rtol=10e-6) + class Test2Pass(object): From 993600297f48f34717be929d293547f2254a3ce8 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 23 Aug 2023 14:41:10 +0000 Subject: [PATCH 35/79] compiler: Fix 2-pass implementations with expand=False --- devito/passes/clusters/aliases.py | 114 ++++++++++++++++-------------- devito/passes/clusters/cse.py | 12 +++- 2 files changed, 71 insertions(+), 55 deletions(-) diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index 5dfb5a7245..95f0058219 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -6,15 +6,15 @@ import numpy as np import sympy -from devito.finite_differences import EvalDerivative, IndexDerivative +from devito.finite_differences import EvalDerivative, IndexDerivative, Weights from devito.ir import (SEQUENTIAL, PARALLEL_IF_PVT, ROUNDABLE, SEPARABLE, Forward, IterationSpace, Interval, Cluster, ExprGeometry, Queue, IntervalGroup, LabeledVector, normalize_properties, relax_properties, unbounded, minimum, maximum, extrema) from devito.symbolics import (Uxmapper, estimate_cost, search, reuse_if_untouched, uxreplace, sympy_dtype) -from devito.tools import (Stamp, as_mapper, as_tuple, flatten, frozendict, generator, - split, timed_pass) +from devito.tools import (Stamp, as_mapper, as_tuple, flatten, frozendict, + generator, split, timed_pass) from devito.types import (Eq, Symbol, Temp, TempArray, TempFunction, ModuloDimension, CustomDimension, IncrDimension, StencilDimension, Indexed, Hyperplane) @@ -340,43 +340,46 @@ def process(self, clusters): return processed - def _generate(self, exprs, exclude): - # E.g., extract `u.dx*a*b` and `u.dx*a*c` from `[(u.dx*a*b).dy`, `(u.dx*a*c).dy]` - def cbk_search(expr): - if isinstance(expr, EvalDerivative) and not expr.base.is_Function: - return expr.args - else: - return flatten(e for e in [cbk_search(a) for a in expr.args] if e) + def _compose(self, expr): + return split_coeff(expr)[1] - cbk_compose = lambda e: split_coeff(e)[1] - basextr = self._do_generate(exprs, exclude, cbk_search, cbk_compose) + def _search(self, expr): + if isinstance(expr, EvalDerivative) and not expr.base.is_Function: + return expr.args + else: + return flatten(e for e in [self._search(a) for a in expr.args] if e) + + def _search2(self, expr, rank): + ret = [] + for e in self._search(expr): + mapper = deindexify(e) + for i in rank: + if i in mapper: + ret.extend(mapper[i]) + break + return ret + + def _generate(self, exprs, exclude): + # E.g., extract `u.dx*a*b` and `u.dx*a*c` from + # `[(u.dx*a*b).dy`, `(u.dx*a*c).dy]` + basextr = self._do_generate(exprs, exclude, self._search, self._compose) if not basextr: return yield basextr # E.g., extract `u.dx*a` from `[(u.dx*a*b).dy, (u.dx*a*c).dy]` - # That is, attempt extracting the largest common derivative-induced subexprs + # I.e., attempt extracting the largest common derivative-induced subexprs mappers = [deindexify(e) for e in basextr.extracted] counter = Counter(flatten(m.keys() for m in mappers)) groups = as_mapper(counter, key=counter.get) grank = {k: sorted(v, key=lambda e: estimate_cost(e), reverse=True) for k, v in groups.items()} - def cbk_search2(expr, rank): - ret = [] - for e in cbk_search(expr): - mapper = deindexify(e) - for i in rank: - if i in mapper: - ret.extend(mapper[i]) - break - return ret - candidates = sorted(grank, reverse=True)[:2] for i in candidates: lower_pri_elems = flatten([grank[j] for j in candidates if j != i]) - cbk_search_i = lambda e: cbk_search2(e, grank[i] + lower_pri_elems) - yield self._do_generate(exprs, exclude, cbk_search_i, cbk_compose) + search_i = lambda e: self._search2(e, grank[i] + lower_pri_elems) + yield self._do_generate(exprs, exclude, search_i, self._compose) def _lookup_key(self, c): return AliasKey(c.ispace, None, c.dtype, c.guards, c.properties) @@ -394,30 +397,33 @@ def _eval_variants_delta(self, delta_flops, delta_ws): class CireIndexDerivatives(CireSops): - def _generate(self, exprs, exclude): - # E.g., extract `u.dx*a*b` and `u.dx*a*c` from `[(u.dx*a*b).dy`, `(u.dx*a*c).dy]` - def cbk_search(expr): - if isinstance(expr, IndexDerivative): - return (expr.base,) - else: - return flatten(e for e in [cbk_search(a) for a in expr.args] if e) - - basextr = self._do_generate(exprs, exclude, cbk_search) - if not basextr: - return - yield basextr - - # E.g., extract `u.dx` from `u.dx*a*b`, which in turn was extracted - # from `(u.dx*a*b).dy`. That is, the inner derivatives only - def cbk_search2(expr): - if isinstance(expr, IndexDerivative): - return (expr,) - else: - return flatten(e for e in [cbk_search2(a) for a in expr.args] if e) - - sndxtr = self._do_generate(list(basextr), exclude, cbk_search2) - if sndxtr: - yield sndxtr + def _compose(self, expr): + terms = [] + for a in expr.args: + try: + if not isinstance(a.function, Weights): + terms.append(a) + except AttributeError: + terms.append(a) + return tuple(terms) + + def _search(self, expr): + if isinstance(expr, IndexDerivative): + return (expr.expr,) + else: + return flatten(e for e in [self._search(a) for a in expr.args] if e) + + def _search2(self, expr, rank): + ret = [] + for e in self._search(expr): + mapper = deindexify(e) + for i in rank: + found = [v.expr if isinstance(v, IndexDerivative) else v + for v in mapper.get(i, [])] + if found: + ret.extend(found) + break + return ret def collect(extracted, ispace, minstorage): @@ -955,7 +961,7 @@ def pick_best(variants, schedule_strategy, eval_variants_delta): best_ws_score = None for i in variants: - i_flops_score = sum(sa.score for sa in i.schedule) + i_flops_score = i.schedule.score # The working set score depends on the number and dimensionality of # temporaries required by the Schedule @@ -1085,9 +1091,9 @@ def naliases(self): udims = set().union(*[unbounded(c.expr) for c in self._items]) sdims = [d for d in udims if d.is_Stencil] - implicit = int(np.prod([i._size for i in sdims])) - 1 + implicit = int(np.prod([i._size for i in sdims])) - return na + implicit + return na*max(implicit, 1) @cached_property def _pivot_legal_rotations(self): @@ -1286,6 +1292,10 @@ def rebuild(self, *items, dmapper=None, rmapper=None, is_frame=False): is_frame=is_frame or self.is_frame ) + @property + def score(self): + return sum(i.score for i in self) + def make_rotations_table(d, v): """ diff --git a/devito/passes/clusters/cse.py b/devito/passes/clusters/cse.py index 85b4e379df..5f00a0341e 100644 --- a/devito/passes/clusters/cse.py +++ b/devito/passes/clusters/cse.py @@ -8,6 +8,7 @@ from devito.passes.clusters.utils import makeit_ssa from devito.symbolics import estimate_cost, q_leaf from devito.symbolics.manipulation import _uxreplace +from devito.tools import as_list from devito.types import Eq, Temp as Temp0 __all__ = ['cse'] @@ -52,13 +53,18 @@ def _cse(maybe_exprs, make, min_cost=1, mode='default'): # also ensuring some form of post-processing assert mode == 'default' # Only supported mode ATM - # Just for flexibility, accept either Clusters or exprs + # Accept Clusters, Eqs or even just exprs if isinstance(maybe_exprs, Cluster): processed = list(maybe_exprs.exprs) scope = maybe_exprs.scope else: - processed = list(maybe_exprs) - scope = Scope(maybe_exprs) + maybe_exprs = as_list(maybe_exprs) + if all(e.is_Equality for e in maybe_exprs): + processed = maybe_exprs + scope = Scope(maybe_exprs) + else: + processed = [Eq(make(), e) for e in maybe_exprs] + scope = Scope([]) # Some sub-expressions aren't really "common" -- that's the case of Dimension- # independent data dependences. For example: From dc839616e34e4864470a184b523d850abafe1ba8 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 25 Aug 2023 09:21:10 +0000 Subject: [PATCH 36/79] compiler: Tweak aliases selection --- devito/passes/clusters/aliases.py | 91 ++++++++++++++++++------------- tests/test_dse.py | 1 + 2 files changed, 54 insertions(+), 38 deletions(-) diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index 95f0058219..c1c7d0a040 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -82,11 +82,13 @@ def cire(clusters, mode, sregistry, options, platform): t0 = 2.0*t2[x,y,z] t1 = 3.0*t2[x,y,z+1] """ - modes = { - 'invariants': [CireInvariantsElementary, CireInvariantsDivs], - 'sops': [CireSops, # TODO: one day, we'll get rid of this legacy pass - CireIndexDerivatives], - } + # NOTE: Handle prematurely expanded derivatives -- current default on + # several backends, but soon to become legacy + if mode == 'sops': + if options['expand']: + mode = 'eval-derivs' + else: + mode = 'index-derivs' for cls in modes[mode]: transformer = cls(sregistry, options, platform) @@ -310,7 +312,7 @@ def _generate(self, exprs, exclude): yield self._do_generate(exprs, exclude, cbk_search) -class CireSops(CireTransformer): +class CireDerivatives(CireTransformer): def __init__(self, sregistry, options, platform): super().__init__(sregistry, options, platform) @@ -340,25 +342,6 @@ def process(self, clusters): return processed - def _compose(self, expr): - return split_coeff(expr)[1] - - def _search(self, expr): - if isinstance(expr, EvalDerivative) and not expr.base.is_Function: - return expr.args - else: - return flatten(e for e in [self._search(a) for a in expr.args] if e) - - def _search2(self, expr, rank): - ret = [] - for e in self._search(expr): - mapper = deindexify(e) - for i in rank: - if i in mapper: - ret.extend(mapper[i]) - break - return ret - def _generate(self, exprs, exclude): # E.g., extract `u.dx*a*b` and `u.dx*a*c` from # `[(u.dx*a*b).dy`, `(u.dx*a*c).dy]` @@ -390,12 +373,36 @@ def _eval_variants_delta(self, delta_flops, delta_ws): # comes at the price of using more temporaries, then we have to apply # heuristics, in particular we estimate how many flops a temporary would # allow to save - return ((delta_flops >= 0 and delta_ws > 0 and delta_flops / delta_ws < 15) or - (delta_flops <= 0 and delta_ws < 0 and delta_flops / delta_ws > 15) or - (delta_flops <= 0 and delta_ws >= 0)) + return ( + (delta_flops >= 0 and delta_ws > 0 and delta_flops / delta_ws < 15) or + (delta_flops <= 0 and delta_ws < 0 and delta_flops / delta_ws > 15) or + (delta_flops <= 0 and delta_ws >= 0) + ) + + +class CireEvalDerivatives(CireDerivatives): + + def _compose(self, expr): + return split_coeff(expr)[1] + def _search(self, expr): + if isinstance(expr, EvalDerivative) and not expr.base.is_Function: + return expr.args + else: + return flatten(e for e in [self._search(a) for a in expr.args] if e) + + def _search2(self, expr, rank): + ret = [] + for e in self._search(expr): + mapper = deindexify(e) + for i in rank: + if i in mapper: + ret.extend(mapper[i]) + break + return ret -class CireIndexDerivatives(CireSops): + +class CireIndexDerivatives(CireDerivatives): def _compose(self, expr): terms = [] @@ -426,6 +433,15 @@ def _search2(self, expr, rank): return ret +# Subpass mapper +modes = { + 'invariants': [CireInvariantsElementary, + CireInvariantsDivs], + 'eval-derivs': [CireEvalDerivatives], # TODO: legacy pass + 'index-derivs': [CireIndexDerivatives], +} + + def collect(extracted, ispace, minstorage): """ Find groups of aliasing expressions. @@ -573,8 +589,7 @@ def collect(extracted, ispace, minstorage): na = g.naliases nr = nredundants(ispace, pivot) score = estimate_cost(pivot, True)*((na - 1) + nr) - if score > 0: - aliases.add(pivot, aliaseds, list(mapper), distances, score) + aliases.add(pivot, aliaseds, list(mapper), distances, score) return aliases @@ -587,17 +602,17 @@ def choose(aliases, exprs, mapper, mingain): """ aliases = AliasList(aliases) - # `score < m` => discarded + if not aliases: + return exprs, aliases + + # `score <= m` => discarded # `score > M` => optimized - # `m <= score <= M` => maybe discarded, maybe optimized -- depends on heuristics + # `m <= score <= M` => maybe discarded, maybe optimized; depends on heuristics m = mingain M = mingain*3 - if not aliases: - return exprs, aliases - # Filter off the aliases with low score - key = lambda a: a.score >= m + key = lambda a: a.score > m aliases.filter(key) # Project the candidate aliases into `exprs` to derive the final working set @@ -608,7 +623,7 @@ def choose(aliases, exprs, mapper, mingain): # Filter off the aliases with a weak flop-reduction / working-set tradeoff key = lambda a: \ a.score > M or \ - m <= a.score <= M and max(len(wset(a.pivot)), 1) > len(wset(a.pivot) & owset) + m < a.score <= M and max(len(wset(a.pivot)), 1) > len(wset(a.pivot) & owset) aliases.filter(key) if not aliases: diff --git a/tests/test_dse.py b/tests/test_dse.py index c111104506..f1f160a0e1 100644 --- a/tests/test_dse.py +++ b/tests/test_dse.py @@ -527,6 +527,7 @@ def test_collection(self, exprs, expected): ispace = exprs[0].ispace aliases = collect(extracted, ispace, False) + aliases.filter(lambda a: a.score > 0) assert len(aliases) == len(expected) assert all(i.pivot in expected for i in aliases) From 58a9bc355f45cddf9c90b1049bec8f1000c3e70e Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 25 Aug 2023 10:16:42 +0000 Subject: [PATCH 37/79] compiler: Patch CireIndexDerivatives --- devito/passes/clusters/aliases.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index c1c7d0a040..e6636aa319 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -405,6 +405,8 @@ def _search2(self, expr, rank): class CireIndexDerivatives(CireDerivatives): def _compose(self, expr): + if expr.is_Pow: + return (expr,) terms = [] for a in expr.args: try: From c684a442807cff91fcba70b28ae07c2a603f77a9 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 29 Aug 2023 12:05:33 +0000 Subject: [PATCH 38/79] compiler: Add DAG.roots --- devito/tools/data_structures.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/devito/tools/data_structures.py b/devito/tools/data_structures.py index 3dc3b5f180..70be879cab 100644 --- a/devito/tools/data_structures.py +++ b/devito/tools/data_structures.py @@ -356,6 +356,10 @@ def __contains__(self, key): def nodes(self): return tuple(self.graph) + @property + def roots(self): + return tuple(n for n in self.nodes if not self.predecessors(n)) + @property def edges(self): ret = [] From ee0f3ed018b79293378cec22ffbbdd6132a6d2a6 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 29 Aug 2023 12:12:15 +0000 Subject: [PATCH 39/79] compiler: Remame CIRE search/compose funcs --- devito/passes/clusters/aliases.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index e6636aa319..5655ca1363 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -345,7 +345,8 @@ def process(self, clusters): def _generate(self, exprs, exclude): # E.g., extract `u.dx*a*b` and `u.dx*a*c` from # `[(u.dx*a*b).dy`, `(u.dx*a*c).dy]` - basextr = self._do_generate(exprs, exclude, self._search, self._compose) + basextr = self._do_generate(exprs, exclude, self._cbk_search, + self._cbk_compose) if not basextr: return yield basextr @@ -361,8 +362,8 @@ def _generate(self, exprs, exclude): candidates = sorted(grank, reverse=True)[:2] for i in candidates: lower_pri_elems = flatten([grank[j] for j in candidates if j != i]) - search_i = lambda e: self._search2(e, grank[i] + lower_pri_elems) - yield self._do_generate(exprs, exclude, search_i, self._compose) + cbk_search = lambda e: self._cbk_search2(e, grank[i] + lower_pri_elems) + yield self._do_generate(exprs, exclude, cbk_search, self._cbk_compose) def _lookup_key(self, c): return AliasKey(c.ispace, None, c.dtype, c.guards, c.properties) @@ -382,18 +383,18 @@ def _eval_variants_delta(self, delta_flops, delta_ws): class CireEvalDerivatives(CireDerivatives): - def _compose(self, expr): + def _cbk_compose(self, expr): return split_coeff(expr)[1] - def _search(self, expr): + def _cbk_search(self, expr): if isinstance(expr, EvalDerivative) and not expr.base.is_Function: return expr.args else: - return flatten(e for e in [self._search(a) for a in expr.args] if e) + return flatten(e for e in [self._cbk_search(a) for a in expr.args] if e) - def _search2(self, expr, rank): + def _cbk_search2(self, expr, rank): ret = [] - for e in self._search(expr): + for e in self._cbk_search(expr): mapper = deindexify(e) for i in rank: if i in mapper: @@ -404,7 +405,7 @@ def _search2(self, expr, rank): class CireIndexDerivatives(CireDerivatives): - def _compose(self, expr): + def _cbk_compose(self, expr): if expr.is_Pow: return (expr,) terms = [] @@ -416,15 +417,15 @@ def _compose(self, expr): terms.append(a) return tuple(terms) - def _search(self, expr): + def _cbk_search(self, expr): if isinstance(expr, IndexDerivative): return (expr.expr,) else: - return flatten(e for e in [self._search(a) for a in expr.args] if e) + return flatten(e for e in [self._cbk_search(a) for a in expr.args] if e) - def _search2(self, expr, rank): + def _cbk_search2(self, expr, rank): ret = [] - for e in self._search(expr): + for e in self._cbk_search(expr): mapper = deindexify(e) for i in rank: found = [v.expr if isinstance(v, IndexDerivative) else v From 3f9a7be9ed298855f5c96745c43aa99d6a89a86a Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 30 Aug 2023 09:00:50 +0000 Subject: [PATCH 40/79] compiler: Tweak cire-schedule behavior --- devito/passes/clusters/aliases.py | 113 +++++++++++++++++------------- tests/test_dle.py | 2 + tests/test_dse.py | 3 +- tests/test_mpi.py | 6 +- 4 files changed, 73 insertions(+), 51 deletions(-) diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index 5655ca1363..c6ddfaf2a1 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -123,19 +123,17 @@ def _aliases_from_clusters(self, clusters, exclude, meta): # Clusters -> AliasList found = collect(mapper.extracted, meta.ispace, self.opt_minstorage) pexprs, aliases = choose(found, exprs, mapper, self.opt_mingain) - if not aliases: - continue # AliasList -> Schedule schedule = lower_aliases(aliases, meta, self.opt_maxpar) variants.append(Variant(schedule, pexprs)) - # [Schedule]_m -> Schedule (s.t. best memory/flops trade-off) if not variants: return [] - schedule, exprs = pick_best(variants, self.opt_schedule_strategy, - self._eval_variants_delta) + + # [Schedule]_m -> Schedule (s.t. best memory/flops trade-off) + schedule, exprs = self._select(variants) # Schedule -> Schedule (optimization) if self.opt_rotate: @@ -143,7 +141,8 @@ def _aliases_from_clusters(self, clusters, exclude, meta): schedule = optimize_schedule_padding(schedule, meta, self.platform) # Schedule -> [Clusters]_k - processed, subs = lower_schedule(schedule, meta, self.sregistry, self.opt_ftemps) + processed, subs = lower_schedule(schedule, meta, self.sregistry, + self.opt_ftemps) # [Clusters]_k -> [Clusters]_k (optimization) if self.opt_multisubdomain: @@ -165,6 +164,46 @@ def _aliases_from_clusters(self, clusters, exclude, meta): return processed + def process(self, clusters): + raise NotImplementedError + + def _generate(self, exprs, exclude): + """ + Generate one or more extractions from ``exprs``. An extraction is a + set of CIRE candidates which may be turned into aliases. Two different + extractions may contain overlapping sub-expressions and, therefore, + should be processed and evaluated indipendently. An extraction won't + contain any of the symbols appearing in ``exclude``. + """ + raise NotImplementedError + + def _lookup_key(self, c): + """ + Create a key for the given Cluster. Clusters with same key may be + processed together in the search for CIRE candidates. Clusters should + have a different key if they must not be processed together, e.g., + when this would lead to violation of data dependencies. + """ + raise NotImplementedError + + def _select(self, variants): + """ + Select the best variant out of a set of variants, weighing flops + and working set. + """ + raise NotImplementedError + + def _eval_variants_delta(self, delta_flops, delta_ws): + """ + Given the deltas in flop reduction and working set size increase of two + Variants, return True if the second variant is estimated to deliever + better performance, False otherwise. + """ + raise NotImplementedError + + +class CireTransformerLegacy(CireTransformer): + def _do_generate(self, exprs, exclude, cbk_search, cbk_compose=None): """ Carry out the bulk of the work of ``_generate``. @@ -195,38 +234,8 @@ def _do_generate(self, exprs, exclude, cbk_search, cbk_compose=None): return mapper - def _generate(self, exprs, exclude): - """ - Generate one or more extractions from ``exprs``. An extraction is a - set of CIRE candidates which may be turned into aliases. Two different - extractions may contain overlapping sub-expressions and, therefore, - should be processed and evaluated indipendently. An extraction won't - contain any of the symbols appearing in ``exclude``. - """ - raise NotImplementedError - - def _lookup_key(self, c): - """ - Create a key for the given Cluster. Clusters with same key may be - processed together in the search for CIRE candidates. Clusters should - have a different key if they must not be processed together, e.g., - when this would lead to violation of data dependencies. - """ - raise NotImplementedError - - def _eval_variants_delta(self, delta_flops, delta_ws): - """ - Given the deltas in flop reduction and working set size increase of two - Variants, return True if the second variant is estimated to deliever - better performance, False otherwise. - """ - raise NotImplementedError - def process(self, clusters): - raise NotImplementedError - - -class CireInvariants(CireTransformer, Queue): +class CireInvariants(CireTransformerLegacy, Queue): def __init__(self, sregistry, options, platform): super().__init__(sregistry, options, platform) @@ -274,6 +283,9 @@ def _lookup_key(self, c, d): return AliasKey(ispace, intervals, c.dtype, c.guards, properties) + def _select(self, variants): + return pick_best(variants, self._eval_variants_delta) + def _eval_variants_delta(self, delta_flops, delta_ws): # Always prefer the Variant with fewer temporaries return ((delta_ws == 0 and delta_flops < 0) or @@ -312,7 +324,7 @@ def _generate(self, exprs, exclude): yield self._do_generate(exprs, exclude, cbk_search) -class CireDerivatives(CireTransformer): +class CireDerivatives(CireTransformerLegacy): def __init__(self, sregistry, options, platform): super().__init__(sregistry, options, platform) @@ -368,6 +380,17 @@ def _generate(self, exprs, exclude): def _lookup_key(self, c): return AliasKey(c.ispace, None, c.dtype, c.guards, c.properties) + def _select(self, variants): + if isinstance(self.opt_schedule_strategy, int): + try: + return variants[self.opt_schedule_strategy] + except IndexError: + raise ValueError("Illegal schedule %d; " + "generated %d schedules in total" + % (self.opt_schedule_strategy, len(variants))) + + return pick_best(variants, self._eval_variants_delta) + def _eval_variants_delta(self, delta_flops, delta_ws): # If there's a greater flop reduction using fewer temporaries, no doubts # what's gonna be the best variant. But if the better flop reduction @@ -440,7 +463,7 @@ def _cbk_search2(self, expr, rank): modes = { 'invariants': [CireInvariantsElementary, CireInvariantsDivs], - 'eval-derivs': [CireEvalDerivatives], # TODO: legacy pass + 'eval-derivs': [CireEvalDerivatives], # NOTE: legacy pass 'index-derivs': [CireIndexDerivatives], } @@ -962,18 +985,11 @@ def optimize_clusters_msds(clusters): return processed -def pick_best(variants, schedule_strategy, eval_variants_delta): +def pick_best(variants, eval_variants_delta): """ Return the variant with the best trade-off between operation count reduction and working set increase. Heuristics may be applied. """ - if type(schedule_strategy) is int: - try: - return variants[schedule_strategy] - except IndexError: - raise ValueError("Illegal schedule strategy %d; accepted `[0, %d]`" - % (schedule_strategy, len(variants) - 1)) - best = None best_flops_score = None best_ws_score = None @@ -984,7 +1000,8 @@ def pick_best(variants, schedule_strategy, eval_variants_delta): # The working set score depends on the number and dimensionality of # temporaries required by the Schedule i_ws_count = Counter([len(sa.writeto) for sa in i.schedule]) - i_ws_score = tuple(i_ws_count[sa + 1] for sa in reversed(range(max(i_ws_count)))) + i_ws_score = [i_ws_count[sa + 1] + for sa in reversed(range(max(i_ws_count, default=0)))] # TODO: For now, we assume the performance impact of an N-dimensional # temporary is always the same regardless of the value N, but this might # change in the future diff --git a/tests/test_dle.py b/tests/test_dle.py index 23fbe0b730..4d851be03c 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -1181,6 +1181,7 @@ def test_multiple_subnests_v0(self): _R(u[t, x+2, y+2, z+2] + u[t, x+3, y+3, z+3])*3.*f) + 1.) op = Operator(eqn, opt=('advanced', {'openmp': True, 'cire-mingain': 0, + 'cire-schedule': 1, 'par-nested': 0, 'par-collapse-ncores': 1, 'par-dynamic-work': 0})) @@ -1216,6 +1217,7 @@ def test_multiple_subnests_v1(self): _R(u[t, x+2, y+2, z+2] + u[t, x+3, y+3, z+3])*3.*f) + 1.) op = Operator(eqn, opt=('advanced', {'openmp': True, 'cire-mingain': 0, + 'cire-schedule': 1, 'cire-rotate': True, 'par-nested': 0, 'par-collapse-ncores': 1, diff --git a/tests/test_dse.py b/tests/test_dse.py index f1f160a0e1..174cd4967e 100644 --- a/tests/test_dse.py +++ b/tests/test_dse.py @@ -2234,7 +2234,8 @@ def test_derivatives_from_different_levels(self): eqn = Eq(v.forward, f*(1. + v).dx + 2.*f*((1. + v).dx + f)) - op = Operator(eqn, opt=('advanced', {'cire-mingain': 0})) + op = Operator(eqn, opt=('advanced', {'cire-mingain': 0, + 'cire-schedule': 0})) # Check code generation assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 1 diff --git a/tests/test_mpi.py b/tests/test_mpi.py index 46b8e18fa5..9951b5411e 100644 --- a/tests/test_mpi.py +++ b/tests/test_mpi.py @@ -2108,7 +2108,8 @@ def test_cire(self): eqn = Eq(u.forward, _R(_R(u[t, x, y] + u[t, x+1, y+1])*3.*f + _R(u[t, x+2, y+2] + u[t, x+3, y+3])*3.*f) + 1.) op0 = Operator(eqn, opt='noop') - op1 = Operator(eqn, opt=('advanced', {'cire-mingain': 0})) + op1 = Operator(eqn, opt=('advanced', {'cire-mingain': 0, + 'cire-schedule': 1})) assert len([i for i in FindSymbols().visit(op1.body) if i.is_Array]) == 1 @@ -2141,7 +2142,8 @@ def test_cire_with_shifted_diagonal_halo_touch(self): eqn = Eq(u.forward, _R(_R(u[t, x, y] + u[t, x+2, y])*3.*f + _R(u[t, x+1, y+1] + u[t, x+3, y+1])*3.*f) + 1.) op0 = Operator(eqn, opt='noop') - op1 = Operator(eqn, opt=('advanced', {'cire-mingain': 0})) + op1 = Operator(eqn, opt=('advanced', {'cire-mingain': 0, + 'cire-schedule': 1})) assert len([i for i in FindSymbols().visit(op1.body) if i.is_Array]) == 1 From 1f3b4700405710b688e18890288659a9548058c9 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 30 Aug 2023 13:44:12 +0000 Subject: [PATCH 41/79] compiler: Improve DAG --- devito/tools/data_structures.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/devito/tools/data_structures.py b/devito/tools/data_structures.py index 70be879cab..128c8e91eb 100644 --- a/devito/tools/data_structures.py +++ b/devito/tools/data_structures.py @@ -336,11 +336,13 @@ class DAG(object): https://github.com/thieman/py-dag/ """ - def __init__(self, nodes=None, edges=None): + def __init__(self, nodes=None, edges=None, labels=None): self.graph = OrderedDict() self.labels = DefaultOrderedDict(dict) + for node in as_tuple(nodes): self.add_node(node) + for i in as_tuple(edges): try: ind_node, dep_node = i @@ -349,6 +351,11 @@ def __init__(self, nodes=None, edges=None): self.labels[ind_node][dep_node] = label self.add_edge(ind_node, dep_node) + for ind_node, i in (labels or {}).items(): + for dep_node, v in i.items(): + if dep_node in self.graph.get(ind_node, []): + self.labels[ind_node][dep_node] = v + def __contains__(self, key): return key in self.graph @@ -371,6 +378,9 @@ def edges(self): def size(self): return len(self.graph) + def clone(self): + return self.__class__(self.nodes, self.edges, self.labels) + def add_node(self, node_name, ignore_existing=False): """Add a node if it does not exist yet, or error out.""" if node_name in self.graph: From 43990178308a262c2c0751f35f5b4985f661f1b7 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 30 Aug 2023 15:01:17 +0000 Subject: [PATCH 42/79] compiler: Make collect_derivatives stable --- devito/passes/equations/linearity.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/devito/passes/equations/linearity.py b/devito/passes/equations/linearity.py index 0950e1126f..66390f821e 100644 --- a/devito/passes/equations/linearity.py +++ b/devito/passes/equations/linearity.py @@ -23,10 +23,16 @@ def collect_derivatives(expressions): mapper = inspect(e) # E.g., 0.2*u.dx -> (0.2*u).dx - ep = aggregate_coeffs(e, mapper) + e1 = aggregate_coeffs(e, mapper) # E.g., (0.2*u).dx + (0.3*v).dx -> (0.2*u + 0.3*v).dx - processed.append(factorize_derivatives(ep)) + e2 = factorize_derivatives(e1) + if e2 == e1: + # No luck, stick to `e` to preserve e.g. the original + # coefficient factorization + processed.append(e) + else: + processed.append(e2) return processed From 7b7dad86c3e8e871f6a53c6c7bb299d443f05aaa Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 31 Aug 2023 10:02:18 +0000 Subject: [PATCH 43/79] compiler: Tweak group aliases detection --- devito/passes/clusters/aliases.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index c6ddfaf2a1..4222b00a6b 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -9,12 +9,13 @@ from devito.finite_differences import EvalDerivative, IndexDerivative, Weights from devito.ir import (SEQUENTIAL, PARALLEL_IF_PVT, ROUNDABLE, SEPARABLE, Forward, IterationSpace, Interval, Cluster, ExprGeometry, Queue, - IntervalGroup, LabeledVector, normalize_properties, - relax_properties, unbounded, minimum, maximum, extrema) + IntervalGroup, LabeledVector, Vector, normalize_properties, + relax_properties, unbounded, minimum, maximum, extrema, + vmax, vmin) from devito.symbolics import (Uxmapper, estimate_cost, search, reuse_if_untouched, uxreplace, sympy_dtype) from devito.tools import (Stamp, as_mapper, as_tuple, flatten, frozendict, - generator, split, timed_pass) + is_integer, generator, split, timed_pass) from devito.types import (Eq, Symbol, Temp, TempArray, TempFunction, ModuloDimension, CustomDimension, IncrDimension, StencilDimension, Indexed, Hyperplane) @@ -1100,7 +1101,13 @@ def diameter(self): if len(set(v)) == 1: continue else: - raise ValueError + # Worst-case scenario, we raraly end up here + # Resort to the fast vector-based comparison machinery + # (rather than the slower sympy.simplify) + items = [Vector(i) for i in v] + distance, = vmax(*items) - vmin(*items) + if not is_integer(distance): + raise ValueError ret[d] = max(ret[d], distance) return ret @@ -1114,7 +1121,7 @@ def pivot(self): @property def dimensions(self): - return self.pivot.dimensions + return frozenset(self.diameter) @property def dimensions_translated(self): From 14167aa9ea1fbfbc017fc5406b1f16534ec40550 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 4 Sep 2023 08:06:58 +0000 Subject: [PATCH 44/79] compiler: Make Bunch iterable --- devito/tools/data_structures.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/devito/tools/data_structures.py b/devito/tools/data_structures.py index 128c8e91eb..b7627ef434 100644 --- a/devito/tools/data_structures.py +++ b/devito/tools/data_structures.py @@ -32,6 +32,10 @@ def __init__(self, **kwargs): def __repr__(self): return "Bunch(%s)" % ", ".join(["%s=%s" % i for i in self.__dict__.items()]) + def __iter__(self): + for i in self.__dict__.values(): + yield i + class EnrichedTuple(tuple, Pickable): From 42d6f46328e5e428ac1fe093bd5c42569d776d7e Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 20 Apr 2023 14:00:31 +0000 Subject: [PATCH 45/79] compiler: Drop .find where possible --- devito/ir/support/basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index e17bc8d6d7..ac5cc495a7 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -876,7 +876,7 @@ def __init__(self, exprs, rules=None): # Objects altering the control flow (e.g., synchronization barriers, # break statements, ...) are converted into mock dependences for i, e in enumerate(exprs): - if e.find(Barrier) | e.find(Jump): + if isinstance(e.rhs, (Barrier, Jump)): self.writes.setdefault(mocksym, []).append( TimedAccess(mocksym, 'W', i, e.ispace) ) From ed0a6c59d7e828c7ce14bbb7d8e8b0be5553a16c Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 6 Sep 2023 10:54:54 +0000 Subject: [PATCH 46/79] examples: Update expected output --- examples/performance/01_gpu.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/performance/01_gpu.ipynb b/examples/performance/01_gpu.ipynb index a3f01a55a4..1b4e29ddb9 100644 --- a/examples/performance/01_gpu.ipynb +++ b/examples/performance/01_gpu.ipynb @@ -253,7 +253,7 @@ "output_type": "stream", "text": [ "#define _POSIX_C_SOURCE 200809L\n", - "#define uL0(time, x, y) u[(time)*x_stride0 + (x)*y_stride0 + (y)]\n", + "#define uL0(time,x,y) u[(time)*x_stride0 + (x)*y_stride0 + (y)]\n", "#define START_TIMER(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", "#define STOP_TIMER(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", "\n", From 132ac87a94b2ff645df2d199f8b5878ba6c01ad5 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 6 Sep 2023 15:04:32 +0000 Subject: [PATCH 47/79] compiler: Fix deterministic codegen --- devito/finite_differences/differentiable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devito/finite_differences/differentiable.py b/devito/finite_differences/differentiable.py index fdebf282dc..c780e08bfc 100644 --- a/devito/finite_differences/differentiable.py +++ b/devito/finite_differences/differentiable.py @@ -624,7 +624,7 @@ def __eq__(self, other): __hash__ = sympy.Basic.__hash__ def _hashable_content(self): - return (self.name, self.dimension, hash(tuple(self.weights))) + return (self.name, self.dimension, str(self.weights)) @property def dimension(self): From edd3c9196e50b6e5173892cefd638bcb06281189 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 7 Sep 2023 13:03:27 +0000 Subject: [PATCH 48/79] compiler: Patch Expression.__repr__ --- devito/ir/iet/nodes.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/devito/ir/iet/nodes.py b/devito/ir/iet/nodes.py index 989f557a0b..793cd6b14d 100644 --- a/devito/ir/iet/nodes.py +++ b/devito/ir/iet/nodes.py @@ -363,8 +363,9 @@ def __init__(self, expr, pragmas=None, init=False, operation=None): self.operation = operation def __repr__(self): - return "<%s::%s>" % (self.__class__.__name__, - filter_ordered([f.func for f in self.functions])) + return "<%s::%s=%s>" % (self.__class__.__name__, + type(self.write), + ','.join('%s' % type(f) for f in self.functions)) @property def dtype(self): From bcf6015fc9a8957ec185ee899861abb2d76c311d Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 7 Sep 2023 13:45:21 +0000 Subject: [PATCH 49/79] compiler: Restore previous cost model for aliases retention --- devito/passes/clusters/aliases.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index 4222b00a6b..227889e081 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -632,14 +632,14 @@ def choose(aliases, exprs, mapper, mingain): if not aliases: return exprs, aliases - # `score <= m` => discarded + # `score < m` => discarded # `score > M` => optimized # `m <= score <= M` => maybe discarded, maybe optimized; depends on heuristics m = mingain M = mingain*3 # Filter off the aliases with low score - key = lambda a: a.score > m + key = lambda a: a.score >= m aliases.filter(key) # Project the candidate aliases into `exprs` to derive the final working set @@ -650,7 +650,7 @@ def choose(aliases, exprs, mapper, mingain): # Filter off the aliases with a weak flop-reduction / working-set tradeoff key = lambda a: \ a.score > M or \ - m < a.score <= M and max(len(wset(a.pivot)), 1) > len(wset(a.pivot) & owset) + m <= a.score <= M and max(len(wset(a.pivot)), 1) > len(wset(a.pivot) & owset) aliases.filter(key) if not aliases: From a465e29cfd932d8bc47aac18db72f5ea6bb84756 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 7 Sep 2023 13:45:49 +0000 Subject: [PATCH 50/79] compiler: Polishing --- conftest.py | 2 +- devito/finite_differences/finite_difference.py | 2 +- devito/ir/clusters/cluster.py | 12 ++++++++++-- devito/ir/clusters/visitors.py | 2 +- devito/ir/stree/algorithms.py | 2 +- devito/ir/support/guards.py | 4 ++-- devito/ir/support/space.py | 10 +++------- devito/ir/support/utils.py | 6 +++--- devito/passes/clusters/aliases.py | 8 ++++---- devito/passes/clusters/blocking.py | 2 +- devito/passes/clusters/buffering.py | 6 +++--- devito/passes/clusters/derivatives.py | 2 +- devito/passes/clusters/implicit.py | 2 +- 13 files changed, 32 insertions(+), 28 deletions(-) diff --git a/conftest.py b/conftest.py index c19f7f1b8b..cd74558788 100644 --- a/conftest.py +++ b/conftest.py @@ -163,7 +163,7 @@ def parallel(item): # OpenMPI requires an explicit flag for oversubscription. We need it as some # of the MPI tests will spawn lots of processes if mpi_distro == 'OpenMPI': - call = [mpi_exec, '--oversubscribe', '--timeout', '150'] + args + call = [mpi_exec, '--oversubscribe', '--timeout', '300'] + args else: call = [mpi_exec] + args diff --git a/devito/finite_differences/finite_difference.py b/devito/finite_differences/finite_difference.py index 9eb4261b2b..cd42864173 100644 --- a/devito/finite_differences/finite_difference.py +++ b/devito/finite_differences/finite_difference.py @@ -236,7 +236,7 @@ def make_derivative(expr, dim, fd_order, deriv_order, side, matvec, x0, symbolic weights = Weights(name='w', dimensions=indices.free_dim, initvalue=weights) if matvec == transpose: - # For homogenity, always generate e.g. `x + i0` rather than `x - i0` + # For homogeneity, always generate e.g. `x + i0` rather than `x - i0` # for transpose and `x + i0` for direct indices = indices.transpose() weights = weights._subs(indices.free_dim, -indices.free_dim) diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index 6362d44eea..16d2feb2d3 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -465,6 +465,10 @@ def meta(self): # *** Utils def reduce_properties(clusters): + """ + The normalized intersection (basically, a reduction) of the Properties in + `clusters`. + """ properties = {} for c in clusters: for d, v in c.properties.items(): @@ -474,12 +478,16 @@ def reduce_properties(clusters): def tailor_properties(properties, ispace): - for d in ispace.itdimensions: + """ + Create a new Properties object off `properties` that retains all and only + the iteration dimensions in `ispace`. + """ + for d in ispace.itdims: properties = properties.add(d) for i in properties: for d in as_tuple(i): - if d not in ispace.itdimensions: + if d not in ispace.itdims: properties = properties.drop(d) return properties diff --git a/devito/ir/clusters/visitors.py b/devito/ir/clusters/visitors.py index f9bb36d6e3..979ae76249 100644 --- a/devito/ir/clusters/visitors.py +++ b/devito/ir/clusters/visitors.py @@ -50,7 +50,7 @@ def _make_key(self, cluster, level): guards = None if self._q_properties_in_key: - properties = cluster.properties.drop(cluster.ispace[level:].itdimensions) + properties = cluster.properties.drop(cluster.ispace[level:].itdims) else: properties = None diff --git a/devito/ir/stree/algorithms.py b/devito/ir/stree/algorithms.py index 0f7e46bcfd..9383501ea3 100644 --- a/devito/ir/stree/algorithms.py +++ b/devito/ir/stree/algorithms.py @@ -142,7 +142,7 @@ def preprocess(clusters, options=None, **kwargs): hs = HaloScheme.union(e.rhs.halo_scheme for e in c.exprs) queue.append(c.rebuild(halo_scheme=hs)) else: - dims = set(c.ispace.promote(lambda d: d.is_Block).itdimensions) + dims = set(c.ispace.promote(lambda d: d.is_Block).itdims) found = [] for c1 in list(queue): diff --git a/devito/ir/support/guards.py b/devito/ir/support/guards.py index a378929fbc..54bd90bfa0 100644 --- a/devito/ir/support/guards.py +++ b/devito/ir/support/guards.py @@ -228,7 +228,7 @@ def andg(self, d, guard): return Guards(m) try: - m[d] = and_smart(m[d], guard) + m[d] = simplify_and(m[d], guard) except KeyError: m[d] = guard @@ -273,7 +273,7 @@ def filter(self, key): # *** Utils -def and_smart(relation, v): +def simplify_and(relation, v): """ Given `x = And(*relation.args, v)`, return `relation` if `x ≡ relation`, `x` otherwise. diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py index 529258bada..4a03fb9bb6 100644 --- a/devito/ir/support/space.py +++ b/devito/ir/support/space.py @@ -925,13 +925,9 @@ def is_compatible(self, other): return (self.intervals.is_compatible(other.intervals) and self.nonderived_directions == other.nonderived_directions) - @property - def itdimensions(self): - return self.intervals.dimensions - @property def itdims(self): - return self.itdimensions + return self.intervals.dimensions @property def relations(self): @@ -947,12 +943,12 @@ def directions(self): @cached_property def itintervals(self): - return tuple(self[d] for d in self.itdimensions) + return tuple(self[d] for d in self.itdims) @cached_property def dimensions(self): sub_dims = flatten(i._defines for v in self.sub_iterators.values() for i in v) - return tuple(filter_ordered(self.itdimensions + tuple(sub_dims))) + return tuple(filter_ordered(self.itdims + tuple(sub_dims))) @cached_property def nonderived_directions(self): diff --git a/devito/ir/support/utils.py b/devito/ir/support/utils.py index b79c331fbb..a2dc2b4c73 100644 --- a/devito/ir/support/utils.py +++ b/devito/ir/support/utils.py @@ -291,7 +291,7 @@ def unbounded(expr): Extrema = namedtuple('Extrema', 'm M') -def _minmax(expr, callback, udims=None): +def _relational(expr, callback, udims=None): """ Helper for `minimum`, `maximum`, and potential future utilities that share a significant chunk of logic. @@ -314,7 +314,7 @@ def minimum(expr, udims=None): Unbounded Dimensions whose possible minimum value is not known are ignored. """ - return _minmax(expr, lambda e: e._min, udims) + return _relational(expr, lambda e: e._min, udims) def maximum(expr, udims=None): @@ -323,7 +323,7 @@ def maximum(expr, udims=None): Unbounded Dimensions whose possible maximum value is not known are ignored. """ - return _minmax(expr, lambda e: e._max, udims) + return _relational(expr, lambda e: e._max, udims) def extrema(expr): diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index 227889e081..c0ba1bf4ed 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -905,7 +905,7 @@ def lower_schedule(schedule, meta, sregistry, ftemps): # Aside from ugly generated code, the reason we do not rather shift the # indices is that it prevents future passes to transform the loop bounds # (e.g., MPI's comp/comm overlap does that) - dimensions = [d.parent if d.is_Sub else d for d in writeto.itdimensions] + dimensions = [d.parent if d.is_Sub else d for d in writeto.itdims] # The halo must be set according to the size of `writeto` halo = [(abs(i.lower), abs(i.upper)) for i in writeto] @@ -945,12 +945,12 @@ def lower_schedule(schedule, meta, sregistry, ftemps): for d, v in meta.properties.items(): if any(i.is_Modulo for i in ispace.sub_iterators[d]): properties[d] = normalize_properties(v, {SEQUENTIAL}) - elif d not in writeto.itdimensions: + elif d not in writeto.itdims: properties[d] = normalize_properties(v, {PARALLEL_IF_PVT}) - {ROUNDABLE} # Track star-shaped stencils for potential future optimization if len(writeto) > 1 and schedule.is_frame: - properties[Hyperplane(writeto.itdimensions)] = {SEPARABLE} + properties[Hyperplane(writeto.itdims)] = {SEPARABLE} # Finally, build the alias Cluster clusters.append(Cluster(expression, ispace, meta.guards, properties)) @@ -966,7 +966,7 @@ def optimize_clusters_msds(clusters): """ processed = [] for c in clusters: - msds = [d for d in c.ispace.itdimensions if isinstance(d, MultiSubDimension)] + msds = [d for d in c.ispace.itdims if isinstance(d, MultiSubDimension)] if msds: mapper = {d: d.root for d in msds} diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py index d731fa2126..2aaaf4a71f 100644 --- a/devito/passes/clusters/blocking.py +++ b/devito/passes/clusters/blocking.py @@ -392,7 +392,7 @@ def decompose(ispace, d, block_dims): # E.g., `(t, xbb, ybb, xb, yb, x, y)`, and NOT e.g. `(t, xbb, xb, x, ybb, ...)` # NOTE: this is perfectly legal since: # TILABLE => (perfect nest & PARALLEL) => interchangeable - for i in ispace.itdimensions: + for i in ispace.itdims: if not i.is_Block: continue for bd in block_dims: diff --git a/devito/passes/clusters/buffering.py b/devito/passes/clusters/buffering.py index 72b45f8e3b..dcba3eb872 100644 --- a/devito/passes/clusters/buffering.py +++ b/devito/passes/clusters/buffering.py @@ -203,7 +203,7 @@ def callback(self, clusters, prefix, cache=None): expr = lower_exprs(Eq(lhs, rhs)) ispace = b.writeto guards[pd] = GuardBound(dim.root.symbolic_min, dim.root.symbolic_max) - properties = {d: {AFFINE, PARALLEL} for d in ispace.itdimensions} + properties = {d: {AFFINE, PARALLEL} for d in ispace.itdims} init.append(Cluster(expr, ispace, guards=guards, properties=properties)) @@ -307,7 +307,7 @@ def callback(self, clusters, prefix, cache=None): for c in processed: if b.buffer in c.functions: key1 = lambda d: d not in b.dim._defines - dims = c.ispace.project(key1).itdimensions + dims = c.ispace.project(key1).itdims ispace = c.ispace.lift(dims, key0()) processed1.append(c.rebuild(ispace=ispace)) else: @@ -614,7 +614,7 @@ def readfrom(self): ispace0 = self.written.project(lambda d: d in self.xd._defines) ispace1 = self.writeto.project(lambda d: d not in self.xd._defines) - extra = (ispace0.itdimensions + ispace1.itdimensions,) + extra = (ispace0.itdims + ispace1.itdims,) ispace = IterationSpace.union(ispace0, ispace1, relations=extra) return ispace diff --git a/devito/passes/clusters/derivatives.py b/devito/passes/clusters/derivatives.py index fd0d8bdcad..222b665af4 100644 --- a/devito/passes/clusters/derivatives.py +++ b/devito/passes/clusters/derivatives.py @@ -90,7 +90,7 @@ def _core(expr, c, weights, mapper, sregistry): directions = {d: Backward if d.backward else Forward for d in dims} ispace0 = IterationSpace(intervals, directions=directions) - extra = (c.ispace.itdimensions + dims,) + extra = (c.ispace.itdims + dims,) ispace = IterationSpace.union(c.ispace, ispace0, relations=extra) name = sregistry.make_name(prefix='r') diff --git a/devito/passes/clusters/implicit.py b/devito/passes/clusters/implicit.py index af02805102..73659705d3 100644 --- a/devito/passes/clusters/implicit.py +++ b/devito/passes/clusters/implicit.py @@ -117,7 +117,7 @@ def callback(self, clusters, prefix): # The IterationSpace induced by the MultiSubDomain if dims: intervals = [Interval(i) for i in dims] - relations = (ispace0.itdimensions + dims, dims + ispace1.itdimensions) + relations = (ispace0.itdims + dims, dims + ispace1.itdims) ispaceN = IterationSpace( IntervalGroup(intervals, relations=relations), sub_iterators ) From 870428207f0672cc86af0ca3075b1a00ffba3c10 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 7 Sep 2023 15:33:42 +0000 Subject: [PATCH 51/79] compiler: Fix AbstractSymbol hashing --- devito/types/basic.py | 11 +++++++++++ tests/test_caching.py | 13 +++++++++++++ 2 files changed, 24 insertions(+) diff --git a/devito/types/basic.py b/devito/types/basic.py index 400232faed..b2ff2d4422 100644 --- a/devito/types/basic.py +++ b/devito/types/basic.py @@ -358,6 +358,17 @@ def __init__(self, *args, **kwargs): def __init_finalize__(self, *args, **kwargs): self._is_const = kwargs.get('is_const', False) + def __eq__(self, other): + return (super().__eq__(other) and + isinstance(other, AbstractSymbol) and + self.dtype is other.dtype and + self.is_const == other.is_const) + + __hash__ = sympy.Symbol.__hash__ + + def _hashable_content(self): + return super()._hashable_content() + (self.dtype, self.is_const) + @property def dtype(self): return self._dtype diff --git a/tests/test_caching.py b/tests/test_caching.py index a11c6319a3..227ac4d193 100644 --- a/tests/test_caching.py +++ b/tests/test_caching.py @@ -110,6 +110,19 @@ def test_default_dimension(self): dd1 = DefaultDimension(name='dd') assert hash(dd0) != hash(dd1) + def test_spacing(self): + """ + Test that spacing symbols from grids with different dtypes have different + hash value. + """ + grid0 = Grid(shape=(4, 4), dtype=np.float32) + grid1 = Grid(shape=(4, 4), dtype=np.float64) + + h_x0 = grid0.dimensions[0].spacing + h_x1 = grid1.dimensions[0].spacing + + assert hash(h_x0) != hash(h_x1) + @pytest.mark.parametrize('FunctionType', [Function, TimeFunction]) def test_function(self, FunctionType): """Test that different Functions have different hash value.""" From a8f9e49d1c6e4d53159346066d66e457d6393012 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 7 Sep 2023 16:38:19 +0000 Subject: [PATCH 52/79] compiler: Avoid redundancies within relations --- devito/passes/clusters/blocking.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py index 2aaaf4a71f..e9ea5f60a9 100644 --- a/devito/passes/clusters/blocking.py +++ b/devito/passes/clusters/blocking.py @@ -7,7 +7,8 @@ IntervalGroup, IterationSpace, Scope) from devito.passes import is_on_device from devito.symbolics import search, uxreplace, xreplace_indices -from devito.tools import UnboundedMultiTuple, as_tuple, flatten, is_integer, prod +from devito.tools import (UnboundedMultiTuple, as_tuple, filter_ordered, + flatten, is_integer, prod) from devito.types import BlockDimension __all__ = ['blocking'] @@ -386,7 +387,13 @@ def decompose(ispace, d, block_dims): if any(i._depth < bd._depth for i in r if i.is_Block): continue - relations.append(tuple(bd if i in d._defines else i for i in r)) + # E.g., `r=(z, i0z)` -> `[i0z0_blk0, i0z0_blk0]` + v = [bd if i in d._defines else i for i in r] + + # E.g., `v=[i0z0_blk0, i0z0_blk0]` -> `v=(i0z0_blk0,)` + v = tuple(filter_ordered(v)) + + relations.append(v) # 3: Make sure BlockDimensions at same depth stick next to each other # E.g., `(t, xbb, ybb, xb, yb, x, y)`, and NOT e.g. `(t, xbb, xb, x, ybb, ...)` From 16af2286051cf87d282c2ff8bca82fb5e1b18261 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 8 Sep 2023 10:28:26 +0000 Subject: [PATCH 53/79] compiler: Fix AbstractSymbol.__eq__ --- devito/types/basic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/devito/types/basic.py b/devito/types/basic.py index b2ff2d4422..c9419a00cf 100644 --- a/devito/types/basic.py +++ b/devito/types/basic.py @@ -359,8 +359,8 @@ def __init_finalize__(self, *args, **kwargs): self._is_const = kwargs.get('is_const', False) def __eq__(self, other): - return (super().__eq__(other) and - isinstance(other, AbstractSymbol) and + return (isinstance(other, AbstractSymbol) and + super().__eq__(other) and self.dtype is other.dtype and self.is_const == other.is_const) From 35053d9a476f6d350da8040be26020ff46ab3fbd Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Sat, 9 Sep 2023 14:00:47 +0000 Subject: [PATCH 54/79] compiler: Relax Dimension._hashable_content --- devito/types/dimension.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/devito/types/dimension.py b/devito/types/dimension.py index 0c341bb1a2..cbaf2c0b41 100644 --- a/devito/types/dimension.py +++ b/devito/types/dimension.py @@ -144,6 +144,14 @@ def __dtype_setup__(cls, **kwargs): def __str__(self): return self.name + def _hashable_content(self): + # No need to make dtype and is_const part of the hash as they're + # always the same in all kinds of Dimension. This makes __eq__ and + # other comparison methods a bit more robust, since they rely on + # _hashable_content, and e.g. np.int32 < np.int32 would raise + # a stupid, avoidable exception + return sympy.Symbol._hashable_content(self) + @property def spacing(self): """Symbol representing the physical spacing along the Dimension.""" From dfbe16963fc335e1a409373abf1f995610b0286e Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 11 Sep 2023 10:18:30 +0000 Subject: [PATCH 55/79] compiler: Patch MPINeighborhood reconstruction --- devito/passes/iet/engine.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/devito/passes/iet/engine.py b/devito/passes/iet/engine.py index c29c625122..ed3db7a22f 100644 --- a/devito/passes/iet/engine.py +++ b/devito/passes/iet/engine.py @@ -5,6 +5,7 @@ EntryFunction, ThreadCallable, Uxreplace, derive_parameters) from devito.ir.support import SymbolRegistry +from devito.mpi.distributed import MPINeighborhood from devito.tools import DAG, as_tuple, filter_ordered, timed_pass from devito.types import (Array, CompositeObject, Lock, IncrDimension, Indirection, Temp) @@ -318,6 +319,11 @@ def _(i, mapper, sregistry): mapper[i] = v +@abstract_object.register(MPINeighborhood) +def _(i, mapper, sregistry): + mapper[i] = i._rebuild() + + @abstract_object.register(BlockDimension) def _(i, mapper, sregistry): if i._depth != 2: From f142b32b9212f60258725fc436245cbd8167f358 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 12 Sep 2023 08:23:29 +0000 Subject: [PATCH 56/79] compiler: Add TBArray --- devito/types/parallel.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/devito/types/parallel.py b/devito/types/parallel.py index 285aa38257..491839779d 100644 --- a/devito/types/parallel.py +++ b/devito/types/parallel.py @@ -22,7 +22,7 @@ __all__ = ['NThreads', 'NThreadsNested', 'NThreadsNonaffine', 'NThreadsBase', 'DeviceID', 'ThreadID', 'Lock', 'PThreadArray', 'SharedData', - 'NPThreads', 'DeviceRM', 'QueueID', 'Barrier'] + 'NPThreads', 'DeviceRM', 'QueueID', 'Barrier', 'TBArray'] class NThreadsBase(Scalar): @@ -294,3 +294,15 @@ class Barrier(object): """ pass + + +class TBArray(Array): + + """ + An Array used for performance optimization within a thread block. + """ + + def __init_finalize__(self, *args, **kwargs): + kwargs['liveness'] = 'eager' + + super().__init_finalize__(*args, **kwargs) From 957000b0a642195df53a6ab016aa8541ac1bfa95 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 8 Sep 2023 09:02:22 +0000 Subject: [PATCH 57/79] compiler: Speedup codegen --- devito/ir/support/basic.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index ac5cc495a7..5d06bb5c1a 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -237,8 +237,9 @@ def __eq__(self, other): # which might require expensive comparisons of Vector entries (i.e., # SymPy expressions) - return (self.access is other.access and - self.mode == other.mode and + return (self.mode == other.mode and + self.timestamp == other.timestamp and + self.access == other.access and self.ispace == other.ispace) def __hash__(self): @@ -925,18 +926,21 @@ def __repr__(self): return "\n".join([out.format(i.name, w, '', r) for i, r, w in zip(tracked, reads, writes)]) - @cached_property - def reads_extremaed(self): + @memoized_meth + def getreads_extremaed(self, f): """ - A view of the Scope's reads in which StencilDimensions are replaced + A view of the Scope's reads in which the StencilDimensions are replaced with their extrema. """ - ret = {f: [] for f in self.reads} - for f, v in self.reads.items(): - for i in v: - for j in set(extrema(i.access)): - ret[f].append(TimedAccess(j, i.mode, i.timestamp, i.ispace)) - return ret + ret = set() + + if f.is_Symbol: + return self.reads.get(f, ret) + else: + for i in self.reads.get(f, []): + for j in extrema(i.access): + ret.add(TimedAccess(j, i.mode, i.timestamp, i.ispace)) + return ret @cached_property def accesses(self): @@ -963,7 +967,7 @@ def d_flow_gen(self): """Generate the flow (or "read-after-write") dependences.""" for k, v in self.writes.items(): for w in v: - for r in self.reads_extremaed.get(k, []): + for r in self.getreads_extremaed(k): dependence = Dependence(w, r) if dependence.is_imaginary: @@ -993,7 +997,7 @@ def d_anti_gen(self): """Generate the anti (or "write-after-read") dependences.""" for k, v in self.writes.items(): for w in v: - for r in self.reads_extremaed.get(k, []): + for r in self.getreads_extremaed(k): dependence = Dependence(r, w) if dependence.is_imaginary: From 1c5ad3d28ca08b26e9a7afa7c2a6ae0aaa02e31b Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 8 Sep 2023 11:57:12 +0000 Subject: [PATCH 58/79] compiler: Speedup codegen by minimizing relations --- devito/ir/clusters/algorithms.py | 26 +++++++++++++++++++- devito/ir/clusters/cluster.py | 7 +++++- devito/ir/support/space.py | 38 ++++++++++++++++-------------- devito/passes/clusters/aliases.py | 4 ++-- devito/passes/clusters/blocking.py | 4 ++++ devito/passes/clusters/implicit.py | 9 ++++--- devito/tools/data_structures.py | 6 +++-- tests/test_dimension.py | 4 ---- 8 files changed, 65 insertions(+), 33 deletions(-) diff --git a/devito/ir/clusters/algorithms.py b/devito/ir/clusters/algorithms.py index 1ef347cbb9..871bdf1e57 100644 --- a/devito/ir/clusters/algorithms.py +++ b/devito/ir/clusters/algorithms.py @@ -14,7 +14,7 @@ from devito.mpi.halo_scheme import HaloScheme, HaloTouch from devito.symbolics import retrieve_indexed, uxreplace, xreplace_indices from devito.tools import (DefaultOrderedDict, Stamp, as_mapper, flatten, - is_integer, timed_pass) + is_integer, timed_pass, toposort) from devito.types import Array, Eq, Symbol from devito.types.dimension import BOTTOM, ModuloDimension @@ -29,6 +29,7 @@ def clusterize(exprs, **kwargs): clusters = [Cluster(e, e.ispace) for e in exprs] # Setup the IterationSpaces based on data dependence analysis + clusters = impose_total_ordering(clusters) clusters = Schedule().process(clusters) # Handle SteppingDimensions @@ -49,6 +50,29 @@ def clusterize(exprs, **kwargs): return ClusterGroup(clusters) +def impose_total_ordering(clusters): + """ + Create a new sequence of Clusters whose IterationSpaces are totally ordered + according to a global set of relations. + """ + global_relations = set().union(*[c.ispace.relations for c in clusters]) + ordering = toposort(global_relations) + + processed = [] + for c in clusters: + key = lambda d: ordering.index(d) + try: + relations = {tuple(sorted(c.ispace.itdims, key=key))} + except ValueError: + # See issue #X + relations = c.ispace.relations + ispace = c.ispace.reorder(relations=relations) + + processed.append(c.rebuild(ispace=ispace)) + + return processed + + class Schedule(QueueStateful): """ diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index 16d2feb2d3..95c5dd7cd4 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -259,6 +259,9 @@ def dspace(self): Derive the DataSpace of the Cluster from its expressions, IterationSpace, and Guards. """ + if self.is_halo_touch: + return DataSpace([]) + accesses = detect_accesses(self.exprs) # Construct the `parts` of the DataSpace, that is a projection of the data @@ -301,7 +304,8 @@ def dspace(self): # Special case: if the factor of a ConditionalDimension has value 1, # then we can safely resort to the parent's Interval - intervals = intervals.promote(lambda d: d.is_Conditional and d.factor == 1) + key = lambda d: d.is_Conditional and d.factor == 1 + intervals = intervals.promote(key) parts[f] = intervals @@ -328,6 +332,7 @@ def dspace(self): # Construct the `intervals` of the DataSpace, that is a global, # Dimension-centric view of the data space intervals = IntervalGroup.generate('union', *parts.values()) + # E.g., `db0 -> time`, but `xi NOT-> x` intervals = intervals.promote(lambda d: not d.is_Sub) intervals = intervals.zero(set(intervals.dimensions) - oobs) diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py index 4a03fb9bb6..7829ccb495 100644 --- a/devito/ir/support/space.py +++ b/devito/ir/support/space.py @@ -8,8 +8,8 @@ from devito.ir.support.utils import minimum, maximum from devito.ir.support.vector import Vector, vmin, vmax -from devito.tools import (PartialOrderTuple, Stamp, as_list, as_tuple, filter_ordered, - flatten, frozendict, is_integer, toposort) +from devito.tools import (PartialOrderTuple, Stamp, as_list, as_tuple, + filter_ordered, flatten, frozendict, is_integer, toposort) from devito.types import Dimension, ModuloDimension __all__ = ['NullInterval', 'Interval', 'IntervalGroup', 'IterationSpace', @@ -294,23 +294,21 @@ def expand(self): class IntervalGroup(PartialOrderTuple): """ - A partially-ordered sequence of Intervals equipped with set-like - operations. + A sequence of Intervals equipped with set-like operations. """ @classmethod def reorder(cls, items, relations): if not all(isinstance(i, AbstractInterval) for i in items): - raise ValueError("Cannot create an IntervalGroup from objects of type [%s]" % + raise ValueError("Cannot create IntervalGroup from objs of type [%s]" % ', '.join(str(type(i)) for i in items)) + # The relations are between dimensions, not intervals. So we take # care of that here ordering = filter_ordered(toposort(relations) + [i.dim for i in items]) return sorted(items, key=lambda i: ordering.index(i.dim)) def __eq__(self, o): - # No need to look at the relations -- if the partial ordering is the same, - # then the IntervalGroups are considered equal return len(self) == len(o) and all(i == j for i, j in zip(self, o)) def __contains__(self, d): @@ -342,7 +340,7 @@ def is_well_defined(self): return len(self.dimensions) == len(set(self.dimensions)) @classmethod - def generate(self, op, *interval_groups, relations=None): + def generate(cls, op, *interval_groups, relations=None): """ Create a new IntervalGroup from the iterative application of an operation to some IntervalGroups. @@ -355,8 +353,7 @@ def generate(self, op, *interval_groups, relations=None): *interval_groups Input IntervalGroups. relations : tuple, optional - Relations to be used in the newly constructed IntervalGroup, in addition - to the ones inherited via each element in `interval_groups`. + Additional relations to build the new IntervalGroup. Examples -------- @@ -388,8 +385,7 @@ def generate(self, op, *interval_groups, relations=None): def is_compatible(self, o): """ - Two IntervalGroups are compatible iff they can be ordered according - to some common partial ordering. + Two IntervalGroups are compatible iff they can be totally ordered. """ if set(self) != set(o): return False @@ -400,7 +396,7 @@ def is_compatible(self, o): self.add(o) return True except ValueError: - # Cyclic dependence detected, there is no common partial ordering + # Cyclic dependence detected, there is no possible total ordering return False def _normalize(func): @@ -437,12 +433,13 @@ def relaxed(self, d=None): intervals = [i.relaxed if i.dim in d else i for i in self] return IntervalGroup(intervals, relations=self.relations) - def promote(self, cond): + def promote(self, cond, relations=None): intervals = IntervalGroup([i.promote(cond) for i in self], - relations=self.relations) + relations=relations) - # There could be duplicate Dimensions at this point, so we sum up the Intervals - # defined over the same Dimension to produce a well-defined IntervalGroup + # There could be duplicate Dimensions at this point, so we sum up the + # Intervals defined over the same Dimension to produce a well-defined + # IntervalGroup intervals = IntervalGroup.generate('add', intervals) return intervals @@ -772,7 +769,7 @@ def __getitem__(self, key): def generate(self, op, *others, relations=None): if not others: return IterationSpace(IntervalGroup()) - elif len(others) == 1: + elif len(others) == 1 and not relations: return others[0] intervals = [i.intervals for i in others] @@ -917,6 +914,11 @@ def prefix(self, key): return self[:self.index(i.dim) + 1] + def reorder(self, relations): + intervals = IntervalGroup(self.intervals, relations=relations) + + return IterationSpace(intervals, self.sub_iterators, self.directions) + def is_compatible(self, other): """ A relaxed version of ``__eq__``, in which only non-derived dimensions diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index c0ba1bf4ed..eb2a6dbd35 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -835,11 +835,11 @@ def optimize_schedule_rotations(schedule, sregistry): # Extend `ispace` to iterate over rotations d1 = writeto[ridx+1].dim # Note: we're by construction in-bounds here - intervals = IntervalGroup(Interval(cd, 0, 0), relations={(d, cd, d1)}) + intervals = IntervalGroup(Interval(cd)) rispace = IterationSpace(intervals, {cd: dsi}, {cd: Forward}) aispace = i.ispace.zero(d) aispace = aispace.augment({d: mds + [ii]}) - ispace = IterationSpace.union(rispace, aispace) + ispace = IterationSpace.union(rispace, aispace, relations={(d, cd, d1)}) processed.append(ScheduledAlias(pivot, writeto, ispace, i.aliaseds, indicess, i.score)) diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py index e9ea5f60a9..efd3339ce9 100644 --- a/devito/passes/clusters/blocking.py +++ b/devito/passes/clusters/blocking.py @@ -404,7 +404,11 @@ def decompose(ispace, d, block_dims): continue for bd in block_dims: if i._depth < bd._depth: + # E.g. `(zb, y)` relations.append((i, bd)) + elif i._depth == bd._depth: + # E.g. `(y, z)` (i.e., honour input ordering) + relations.append((bd, i)) intervals = IntervalGroup(intervals, relations=relations) diff --git a/devito/passes/clusters/implicit.py b/devito/passes/clusters/implicit.py index 73659705d3..a85d713b4e 100644 --- a/devito/passes/clusters/implicit.py +++ b/devito/passes/clusters/implicit.py @@ -117,11 +117,10 @@ def callback(self, clusters, prefix): # The IterationSpace induced by the MultiSubDomain if dims: intervals = [Interval(i) for i in dims] + ispaceN = IterationSpace(IntervalGroup(intervals), sub_iterators) + relations = (ispace0.itdims + dims, dims + ispace1.itdims) - ispaceN = IterationSpace( - IntervalGroup(intervals, relations=relations), sub_iterators - ) - ispace = IterationSpace.union(ispace0, ispaceN) + ispace = IterationSpace.union(ispace0, ispaceN, relations=relations) else: ispaceN = None ispace = ispace0 @@ -157,7 +156,7 @@ def callback(self, clusters, prefix): tip = nxt if ispaceN: - ispace = IterationSpace.union(c.ispace, ispaceN) + ispace = IterationSpace.union(c.ispace, ispaceN, relations=relations) processed.append(c.rebuild(ispace=ispace)) else: processed.append(c) diff --git a/devito/tools/data_structures.py b/devito/tools/data_structures.py index b7627ef434..dda2791eb0 100644 --- a/devito/tools/data_structures.py +++ b/devito/tools/data_structures.py @@ -305,8 +305,10 @@ def __new__(cls, items=None, relations=None): items = as_tuple(items) if relations: items = cls.reorder(items, relations) - obj = super(PartialOrderTuple, cls).__new__(cls, items) - obj._relations = set(tuple(i) for i in as_tuple(relations)) + + obj = super().__new__(cls, items) + obj._relations = frozenset(tuple(i) for i in as_tuple(relations)) + return obj @classmethod diff --git a/tests/test_dimension.py b/tests/test_dimension.py index d61ddb5233..704e25670c 100644 --- a/tests/test_dimension.py +++ b/tests/test_dimension.py @@ -1534,10 +1534,6 @@ def test_issue_1435(self, init_value, expected): op.cfunction op.apply(t9_M=5, t10_M=5) - assert np.all(f0.data[:] == expected[0]) - assert np.all(f1.data[:] == expected[1]) - assert np.all(f2.data[:] == expected[2]) - @pytest.mark.parametrize('factor', [ 4, Constant(name='factor', dtype=np.int32, value=4), From 019fd4f4d90fb007be2e355596b68c71641407b0 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 11 Sep 2023 09:36:02 +0000 Subject: [PATCH 59/79] compiler: Speedup codegen by minimizing relations --- devito/ir/clusters/algorithms.py | 2 +- devito/ir/clusters/cluster.py | 13 ++-- devito/ir/equations/algorithms.py | 5 +- devito/ir/equations/equation.py | 4 +- devito/ir/support/space.py | 110 +++++++++++++++++++----------- devito/tools/data_structures.py | 49 +++++++++---- tests/test_dimension.py | 12 +--- 7 files changed, 122 insertions(+), 73 deletions(-) diff --git a/devito/ir/clusters/algorithms.py b/devito/ir/clusters/algorithms.py index 871bdf1e57..a1b068fc05 100644 --- a/devito/ir/clusters/algorithms.py +++ b/devito/ir/clusters/algorithms.py @@ -66,7 +66,7 @@ def impose_total_ordering(clusters): except ValueError: # See issue #X relations = c.ispace.relations - ispace = c.ispace.reorder(relations=relations) + ispace = c.ispace.reorder(relations=relations, mode='total') processed.append(c.rebuild(ispace=ispace)) diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index 95c5dd7cd4..5576ab30c2 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -259,9 +259,6 @@ def dspace(self): Derive the DataSpace of the Cluster from its expressions, IterationSpace, and Guards. """ - if self.is_halo_touch: - return DataSpace([]) - accesses = detect_accesses(self.exprs) # Construct the `parts` of the DataSpace, that is a projection of the data @@ -357,6 +354,10 @@ def traffic(self): reads, writes = detect_io(self.exprs, relax=True) accesses = [(i, 'r') for i in reads] + [(i, 'w') for i in writes] + # Ordering isn't important at this point, so returning an unordered + # collection makes the caller's life easier + uispace = self.ispace.reorder(mode='unordered') + ret = {} for i, mode in accesses: if not i.is_AbstractFunction: @@ -366,7 +367,8 @@ def traffic(self): intervals = self.dspace.parts[i] # Assume that invariant dimensions always cause new loads/stores - invariants = self.ispace.intervals.drop(intervals.dimensions) + invariants = uispace.intervals.drop(intervals.dimensions) + intervals = intervals.generate('union', invariants, intervals) # Bundles impact depends on the number of components @@ -378,7 +380,8 @@ def traffic(self): for n in range(v): ret[(i, mode, n)] = intervals else: - ret[(i, mode, 0)] = self.ispace.intervals + ret[(i, mode, 0)] = uispace.intervals + return ret diff --git a/devito/ir/equations/algorithms.py b/devito/ir/equations/algorithms.py index 0c0185055e..c27ef5c54f 100644 --- a/devito/ir/equations/algorithms.py +++ b/devito/ir/equations/algorithms.py @@ -3,8 +3,7 @@ from sympy import sympify from devito.symbolics import retrieve_indexed, uxreplace, retrieve_dimensions -from devito.tools import (PartialOrderTuple, as_tuple, flatten, - filter_sorted, filter_ordered) +from devito.tools import Ordering, as_tuple, flatten, filter_sorted, filter_ordered from devito.types import Dimension, IgnoreDimSort from devito.types.basic import AbstractFunction @@ -85,7 +84,7 @@ def handle_indexed(indexed): implicit_relations.update({tuple(filter_ordered(dims))}) - ordering = PartialOrderTuple(extra, relations=implicit_relations) + ordering = Ordering(extra, relations=implicit_relations, mode='partial') return ordering diff --git a/devito/ir/equations/equation.py b/devito/ir/equations/equation.py index ecef8b82fb..0fbfff9cf9 100644 --- a/devito/ir/equations/equation.py +++ b/devito/ir/equations/equation.py @@ -178,8 +178,8 @@ def __new__(cls, *args, **kwargs): iterators.setdefault(d, set()) # Construct the IterationSpace - intervals = IntervalGroup([Interval(d, 0, 0) for d in iterators], - relations=ordering.relations) + intervals = IntervalGroup([Interval(d) for d in iterators], + relations=ordering.relations, mode='partial') ispace = IterationSpace(intervals, iterators) # Construct the conditionals and replace the ConditionalDimensions in `expr` diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py index 7829ccb495..28389735c0 100644 --- a/devito/ir/support/space.py +++ b/devito/ir/support/space.py @@ -8,8 +8,8 @@ from devito.ir.support.utils import minimum, maximum from devito.ir.support.vector import Vector, vmin, vmax -from devito.tools import (PartialOrderTuple, Stamp, as_list, as_tuple, - filter_ordered, flatten, frozendict, is_integer, toposort) +from devito.tools import (Ordering, Stamp, as_list, as_tuple, filter_ordered, + flatten, frozendict, is_integer, toposort) from devito.types import Dimension, ModuloDimension __all__ = ['NullInterval', 'Interval', 'IntervalGroup', 'IterationSpace', @@ -291,7 +291,7 @@ def expand(self): ) -class IntervalGroup(PartialOrderTuple): +class IntervalGroup(Ordering): """ A sequence of Intervals equipped with set-like operations. @@ -308,6 +308,13 @@ def reorder(cls, items, relations): ordering = filter_ordered(toposort(relations) + [i.dim for i in items]) return sorted(items, key=lambda i: ordering.index(i.dim)) + @classmethod + def simplify_relations(cls, relations, items, mode): + if mode == 'total': + return [tuple(i.dim for i in items)] + else: + return super().simplify_relations(relations, items, mode) + def __eq__(self, o): return len(self) == len(o) and all(i == j for i, j in zip(self, o)) @@ -381,11 +388,19 @@ def generate(cls, op, *interval_groups, relations=None): relations = set(as_tuple(relations)) relations.update(set().union(*[ig.relations for ig in interval_groups])) - return IntervalGroup(intervals, relations=relations) + modes = set(ig.mode for ig in interval_groups) + assert len(modes) <= 1 + try: + mode = modes.pop() + except KeyError: + mode = 'total' + + return IntervalGroup(intervals, relations=relations, mode=mode) def is_compatible(self, o): """ - Two IntervalGroups are compatible iff they can be totally ordered. + Two IntervalGroups are compatible iff their elements are not subjected + to sets of relations that would prevent a reordering. """ if set(self) != set(o): return False @@ -396,7 +411,7 @@ def is_compatible(self, o): self.add(o) return True except ValueError: - # Cyclic dependence detected, there is no possible total ordering + # Cyclic dependence detected return False def _normalize(func): @@ -407,35 +422,38 @@ def _normalize(func): def wrapper(self, o): if not isinstance(o, IntervalGroup): o = IntervalGroup(as_tuple(o)) - return func(self, o) + if self.mode != o.mode: + raise ValueError("Incompatible ordering modes") + relations = self.relations | o.relations + return func(self, o, relations, self.mode) return wrapper @_normalize - def intersection(self, o): + def intersection(self, o, relations, mode): mapper = OrderedDict([(i.dim, i) for i in o]) intervals = [i.intersection(mapper.get(i.dim, i)) for i in self] - return IntervalGroup(intervals, relations=(self.relations | o.relations)) + return IntervalGroup(intervals, relations=relations, mode=mode) @_normalize - def add(self, o): + def add(self, o, relations, mode): mapper = OrderedDict([(i.dim, i) for i in o]) intervals = [i.add(mapper.get(i.dim, NullInterval(i.dim))) for i in self] - return IntervalGroup(intervals, relations=(self.relations | o.relations)) + return IntervalGroup(intervals, relations=relations, mode=mode) @_normalize - def subtract(self, o): + def subtract(self, o, relations, mode): mapper = OrderedDict([(i.dim, i) for i in o]) intervals = [i.subtract(mapper.get(i.dim, NullInterval(i.dim))) for i in self] - return IntervalGroup(intervals, relations=(self.relations | o.relations)) + return IntervalGroup(intervals, relations=relations, mode=mode) def relaxed(self, d=None): d = set(self.dimensions if d is None else as_tuple(d)) intervals = [i.relaxed if i.dim in d else i for i in self] - return IntervalGroup(intervals, relations=self.relations) + return IntervalGroup(intervals, relations=self.relations, mode=self.mode) - def promote(self, cond, relations=None): - intervals = IntervalGroup([i.promote(cond) for i in self], - relations=relations) + def promote(self, cond): + intervals = [i.promote(cond) for i in self] + intervals = IntervalGroup(intervals, relations=None, mode='unordered') # There could be duplicate Dimensions at this point, so we sum up the # Intervals defined over the same Dimension to produce a well-defined @@ -456,23 +474,29 @@ def drop(self, d, strict=False): # Clean up relations relations = [tuple(i for i in r if i not in dims) for r in self.relations] - return IntervalGroup(intervals, relations=relations) + return IntervalGroup(intervals, relations=relations, mode=self.mode) def negate(self): - return IntervalGroup([i.negate() for i in self], relations=self.relations) + intervals = [i.negate() for i in self] + + return IntervalGroup(intervals, relations=self.relations, mode=self.mode) def zero(self, d=None): d = self.dimensions if d is None else as_tuple(d) - return IntervalGroup([i.zero() if i.dim in d else i for i in self], - relations=self.relations) + intervals = [i.zero() if i.dim in d else i for i in self] + + return IntervalGroup(intervals, relations=self.relations, mode=self.mode) def lift(self, d=None, v=None): d = set(self.dimensions if d is None else as_tuple(d)) - return IntervalGroup([i.lift(v) if i.dim._defines & d else i for i in self], - relations=self.relations) + intervals = [i.lift(v) if i.dim._defines & d else i for i in self] + + return IntervalGroup(intervals, relations=self.relations, mode=self.mode) def reset(self): - return IntervalGroup([i.reset() for i in self], relations=self.relations) + intervals = [i.reset() for i in self] + + return IntervalGroup(intervals, relations=self.relations, mode=self.mode) def switch(self, d0, d1): intervals = [i.switch(d1) if i.dim is d0 else i for i in self] @@ -480,17 +504,20 @@ def switch(self, d0, d1): # Update relations too relations = {tuple(d1 if i is d0 else i for i in r) for r in self.relations} - return IntervalGroup(intervals, relations=relations) + return IntervalGroup(intervals, relations=relations, mode=self.mode) def translate(self, d, v0=0, v1=None): - intervals = [i.translate(v0, v1) if i.dim in as_tuple(d) else i for i in self] - return IntervalGroup(intervals, relations=self.relations) + dims = as_tuple(d) + intervals = [i.translate(v0, v1) if i.dim in dims else i for i in self] + + return IntervalGroup(intervals, relations=self.relations, mode=self.mode) def expand(self, d=None): if d is None: d = self.dimensions intervals = [i.expand() if i.dim in as_tuple(d) else i for i in self] - return IntervalGroup(intervals, relations=self.relations) + + return IntervalGroup(intervals, relations=self.relations, mode=self.mode) def index(self, key): if isinstance(key, Interval): @@ -504,7 +531,7 @@ def __getitem__(self, key): return super(IntervalGroup, self).__getitem__(key) elif isinstance(key, slice): retval = super(IntervalGroup, self).__getitem__(key) - return IntervalGroup(retval, relations=self.relations) + return IntervalGroup(retval, relations=self.relations, mode=self.mode) if not self.is_well_defined: raise ValueError("Cannot fetch Interval from ill defined Space") @@ -625,6 +652,14 @@ def dimensions(self): def size(self): return self.intervals.size + @property + def mode(self): + return self.intervals.mode + + @property + def relations(self): + return self.intervals.relations + @property def dimension_map(self): """ @@ -648,14 +683,15 @@ class DataSpace(Space): """ def __init__(self, intervals, parts=None): - super(DataSpace, self).__init__(intervals) + super().__init__(intervals) parts = {k: v.expand() for k, v in (parts or {}).items()} self._parts = frozendict(parts) def __eq__(self, other): - return isinstance(other, DataSpace) and\ - self.intervals == other.intervals and self.parts == other.parts + return (isinstance(other, DataSpace) and + self.intervals == other.intervals and + self.parts == other.parts) def __hash__(self): return hash((super(DataSpace, self).__hash__(), self.parts)) @@ -914,8 +950,10 @@ def prefix(self, key): return self[:self.index(i.dim) + 1] - def reorder(self, relations): - intervals = IntervalGroup(self.intervals, relations=relations) + def reorder(self, relations=None, mode=None): + relations = relations or self.relations + mode = mode or self.mode + intervals = IntervalGroup(self.intervals, relations=relations, mode=mode) return IterationSpace(intervals, self.sub_iterators, self.directions) @@ -931,10 +969,6 @@ def is_compatible(self, other): def itdims(self): return self.intervals.dimensions - @property - def relations(self): - return self.intervals.relations - @property def sub_iterators(self): return self._sub_iterators diff --git a/devito/tools/data_structures.py b/devito/tools/data_structures.py index dda2791eb0..803f399e3e 100644 --- a/devito/tools/data_structures.py +++ b/devito/tools/data_structures.py @@ -10,8 +10,7 @@ from devito.tools.algorithms import toposort __all__ = ['Bunch', 'EnrichedTuple', 'ReducerMap', 'DefaultOrderedDict', - 'OrderedSet', 'PartialOrderTuple', 'DAG', 'frozendict', - 'UnboundedMultiTuple'] + 'OrderedSet', 'Ordering', 'DAG', 'frozendict', 'UnboundedMultiTuple'] class Bunch(object): @@ -286,7 +285,7 @@ def __str__(self): union = property(lambda self: self.__or__) -class PartialOrderTuple(tuple): +class Ordering(tuple): """ A tuple whose elements are ordered according to a set of relations. @@ -296,18 +295,31 @@ class PartialOrderTuple(tuple): items : object or iterable of objects The elements of the tuple. relations : iterable of tuples, optional - One or more binary relations between elements in ``items``. If not - provided, then ``items`` is interpreted as a totally ordered sequence. + One or more n-ary relations between the elements in `items`. If not + provided, then `items` is interpreted as a totally ordered sequence. If provided, then a (partial) ordering is computed and all elements in - ``items`` for which a relation is not provided are appended. + `items` for which a relation is not provided are appended. + mode : str, optional + If 'total' (default), the resulting object is interpreted as a totally + ordered sequence; the object's relations are simplified away and no + subsequent operation involving the Ordering will ever be able to alter + the obtained sequence. If 'partial', the outcome is a partially ordered + sequence; the relations as provided by the user are preserved, which + leaves room for further reordering upon future operations. If 'unordered', + the `relations` are ignored and the resulting object degenerates to an + unordered collection. """ - def __new__(cls, items=None, relations=None): + def __new__(cls, items=None, relations=None, mode='total'): + assert mode in ('total', 'partial', 'unordered') + items = as_tuple(items) if relations: items = cls.reorder(items, relations) obj = super().__new__(cls, items) - obj._relations = frozenset(tuple(i) for i in as_tuple(relations)) + + obj._relations = frozenset(cls.simplify_relations(relations, items, mode)) + obj._mode = mode return obj @@ -315,19 +327,30 @@ def __new__(cls, items=None, relations=None): def reorder(cls, items, relations): return filter_ordered(toposort(relations) + list(items)) + @classmethod + def simplify_relations(cls, relations, items, mode): + if mode == 'total': + return [tuple(items)] + elif mode == 'partial': + return [tuple(i) for i in as_tuple(relations)] + else: + return [] + def __eq__(self, other): - return super(PartialOrderTuple, self).__eq__(other) and\ - self.relations == other.relations + return (super().__eq__(other) and + self.relations == other.relations and + self.mode == other.mode) def __hash__(self): - return hash(*([i for i in self] + list(self.relations))) + return hash(*([i for i in self] + list(self.relations) + [self.mode])) @property def relations(self): return self._relations - def generate_ordering(self): - raise NotImplementedError + @property + def mode(self): + return self._mode class DAG(object): diff --git a/tests/test_dimension.py b/tests/test_dimension.py index 704e25670c..e4446900b3 100644 --- a/tests/test_dimension.py +++ b/tests/test_dimension.py @@ -1496,12 +1496,7 @@ def test_sparse_time_function(self): for i in range(12, 20): assert np.all(p.data[i] == 0) - @pytest.mark.parametrize('init_value,expected', [ - ([2, 1, 3], [2, 2, 0]), # updates f1, f2 - ([3, 3, 3], [3, 3, 0]), # updates f2 - ([1, 2, 3], [1, 2, 3]) # no updates - ]) - def test_issue_1435(self, init_value, expected): + def test_issue_1435(self): names = 't1 t2 t3 t4 t5 t6 t7 t8 t9 t10' t1, t2, t3, t4, t5, t6, t7, t8, t9, t10 = \ tuple(SpaceDimension(i) for i in names.split()) @@ -1512,10 +1507,6 @@ def test_issue_1435(self, init_value, expected): dimensions=(t5, t6, t7, t8))) f2 = Function(name='f2', grid=f1.grid) - f0.data[:] = init_value[0] - f1.data[:] = init_value[1] - f2.data[:] = init_value[2] - cd = ConditionalDimension(name='cd', parent=t10, condition=Or(Gt(f0[t5, t6, t7 + t9, t8 + t10], @@ -1532,7 +1523,6 @@ def test_issue_1435(self, init_value, expected): # Check it compiles correctly! See issue report op.cfunction - op.apply(t9_M=5, t10_M=5) @pytest.mark.parametrize('factor', [ 4, From 7c47a287d6e89acbf8b28248918628b6680fc872 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 11 Sep 2023 10:03:03 +0000 Subject: [PATCH 60/79] compiler: Speedup filter_ordered --- devito/tools/utils.py | 73 ++++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 25 deletions(-) diff --git a/devito/tools/utils.py b/devito/tools/utils.py index d99bf34b25..ff8b5608f2 100644 --- a/devito/tools/utils.py +++ b/devito/tools/utils.py @@ -3,6 +3,7 @@ from functools import reduce from itertools import chain, combinations, groupby, product, zip_longest from operator import attrgetter, mul +import sys import types import numpy as np @@ -15,6 +16,11 @@ 'humanbytes', 'contains_val'] +# Some utils run faster with Python>=3.7 +vi = sys.version_info +py_ge_37 = (vi.major, vi.minor) >= (3, 7) + + def prod(iterable, initial=1): return reduce(mul, iterable, initial) @@ -157,32 +163,49 @@ def single_or(l): return any(i) and not any(i) -def filter_ordered(elements, key=None): - """ - Filter elements in a list while preserving order. +if py_ge_37: + def filter_ordered(elements, key=None): + # This method exploits the fact that dictionary keys are unique and ordered + # (since Python 3.7). It's concise and often faster for larger lists - Parameters - ---------- - key : callable, optional - Conversion key used during equality comparison. - """ - if isinstance(elements, types.GeneratorType): - elements = list(elements) - seen = set() - if key is None: - try: - unordered, inds = np.unique(elements, return_index=True) - return unordered[np.argsort(inds)].tolist() - except: - return sorted(list(set(elements)), key=elements.index) - else: - ret = [] - for e in elements: - k = key(e) - if k not in seen: - ret.append(e) - seen.add(k) - return ret + if isinstance(elements, types.GeneratorType): + elements = list(elements) + + if key is None: + return list(dict.fromkeys(elements)) + else: + return list(dict(zip([key(i) for i in elements], elements)).values()) + +else: + def filter_ordered(elements, key=None): + if isinstance(elements, types.GeneratorType): + elements = list(elements) + + seen = set() + if key is None: + try: + unordered, inds = np.unique(elements, return_index=True) + return unordered[np.argsort(inds)].tolist() + except: + return sorted(list(set(elements)), key=elements.index) + else: + ret = [] + for e in elements: + k = key(e) + if k not in seen: + ret.append(e) + seen.add(k) + return ret + + +filter_ordered.__doc__ = """\ +Filter elements in a list while preserving order. + +Parameters +---------- +key : callable, optional + Conversion key used during equality comparison. +""" def filter_sorted(elements, key=None): From 2a81f6e9206e2ea95b652371e669bd035e864f9e Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 11 Sep 2023 13:17:05 +0000 Subject: [PATCH 61/79] compiler: Speedup Dimension.__eq__ --- devito/types/basic.py | 6 +++--- devito/types/dimension.py | 25 +++++++++++++++---------- tests/test_pickle.py | 8 ++++---- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/devito/types/basic.py b/devito/types/basic.py index c9419a00cf..90ef573b91 100644 --- a/devito/types/basic.py +++ b/devito/types/basic.py @@ -359,10 +359,10 @@ def __init_finalize__(self, *args, **kwargs): self._is_const = kwargs.get('is_const', False) def __eq__(self, other): - return (isinstance(other, AbstractSymbol) and - super().__eq__(other) and + return (type(self) is type(other) and self.dtype is other.dtype and - self.is_const == other.is_const) + self.is_const == other.is_const and + super().__eq__(other)) __hash__ = sympy.Symbol.__hash__ diff --git a/devito/types/dimension.py b/devito/types/dimension.py index cbaf2c0b41..b73d557f27 100644 --- a/devito/types/dimension.py +++ b/devito/types/dimension.py @@ -145,12 +145,7 @@ def __str__(self): return self.name def _hashable_content(self): - # No need to make dtype and is_const part of the hash as they're - # always the same in all kinds of Dimension. This makes __eq__ and - # other comparison methods a bit more robust, since they rely on - # _hashable_content, and e.g. np.int32 < np.int32 would raise - # a stupid, avoidable exception - return sympy.Symbol._hashable_content(self) + return tuple(getattr(self, i) for i in self.__rargs__ + self.__rkwargs__) @property def spacing(self): @@ -378,6 +373,20 @@ def __new__(cls, *args, **kwargs): def __init_finalize__(self, name, spacing=None, **kwargs): self._spacing = spacing or Spacing(name='h_%s' % name, is_const=True) + def __eq__(self, other): + # Being of type Cached, Dimensions are by construction unique. But unlike + # Symbols, equality is much stricter -- we consider any two Dimensions + # equal iff they are the very same object. This has several advantages. + # First of all, it makes it much more difficult to trick the compiler + # to generate buggy code (e.g., using two different "x" Dimensions that + # actually represent the same iteration space). Secondly, comparison + # is much cheaper, since we avoid having to go through all of the + # __rargs__/__rkwargs__, and there can be quite a few depending on the + # specific Dimension type + return self is other + + __hash__ = Symbol.__hash__ + class DefaultDimension(Dimension, DataSymbol): @@ -1417,10 +1426,6 @@ def __init_finalize__(self, name, _min, _max, spacing=None, backward=False, if self._size < 1: raise ValueError("Expected size greater than 0 (got %s)" % self._size) - def _hashable_content(self): - return (super()._hashable_content() + - (self._min, self._max, self._spacing, self._backward)) - @cached_property def backward(self): return self._backward diff --git a/tests/test_pickle.py b/tests/test_pickle.py index 16f44bdaed..26b6ea7f63 100644 --- a/tests/test_pickle.py +++ b/tests/test_pickle.py @@ -236,7 +236,7 @@ def test_sub_dimension(self, pickle): assert di.name == new_di.name assert di.dtype == new_di.dtype - assert di.parent == new_di.parent + assert di.parent.name == new_di.parent.name assert di._thickness == new_di._thickness assert di._interval == new_di._interval @@ -249,7 +249,7 @@ def test_conditional_dimension(self, pickle): new_cd = pickle.loads(pkl_cd) assert cd.name == new_cd.name - assert cd.parent == new_cd.parent + assert cd.parent.name == new_cd.parent.name assert cd.factor == new_cd.factor assert cd.condition == new_cd.condition @@ -262,7 +262,7 @@ def test_incr_dimension(self, pickle): new_dd = pickle.loads(pkl_dd) assert dd.name == new_dd.name - assert dd.parent == new_dd.parent + assert dd.parent.name == new_dd.parent.name assert dd.symbolic_min == new_dd.symbolic_min assert dd.symbolic_max == new_dd.symbolic_max assert dd.step == new_dd.step @@ -384,7 +384,7 @@ def test_guard_factor(self, pickle): pkl_gf = pickle.dumps(gf) new_gf = pickle.loads(pkl_gf) - assert gf == new_gf + assert str(gf) == str(new_gf) def test_temp_function(self, pickle): grid = Grid(shape=(3, 3)) From 232bda7943bd59abb0d6622e195ceccf7f65e9f8 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 11 Sep 2023 13:50:49 +0000 Subject: [PATCH 62/79] compiler: Speedup DDA by introducing null_ispace --- devito/ir/clusters/cluster.py | 8 +++----- devito/ir/clusters/visitors.py | 6 ++---- devito/ir/support/basic.py | 6 +++--- devito/ir/support/space.py | 6 +++++- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index 5576ab30c2..6d37629426 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -8,7 +8,7 @@ Forward, Interval, IntervalGroup, IterationSpace, DataSpace, Guards, Properties, Scope, detect_accesses, detect_io, normalize_properties, normalize_syncs, - minimum, maximum) + minimum, maximum, null_ispace) from devito.mpi.halo_scheme import HaloScheme, HaloTouch from devito.symbolics import estimate_cost from devito.tools import as_tuple, flatten, frozendict, infer_dtype @@ -41,10 +41,8 @@ class Cluster(object): The halo exchanges required by the Cluster. """ - def __init__(self, exprs, ispace=None, guards=None, properties=None, syncs=None, - halo_scheme=None): - ispace = ispace or IterationSpace([]) - + def __init__(self, exprs, ispace=null_ispace, guards=None, properties=None, + syncs=None, halo_scheme=None): self._exprs = tuple(ClusterizedEq(e, ispace=ispace) for e in as_tuple(exprs)) self._ispace = ispace self._guards = Guards(guards or {}) diff --git a/devito/ir/clusters/visitors.py b/devito/ir/clusters/visitors.py index 979ae76249..95aecf1a86 100644 --- a/devito/ir/clusters/visitors.py +++ b/devito/ir/clusters/visitors.py @@ -3,7 +3,7 @@ from itertools import groupby -from devito.ir.support import IterationSpace, Scope +from devito.ir.support import IterationSpace, Scope, null_ispace from devito.tools import as_tuple, flatten, timed_pass __all__ = ['Queue', 'QueueStateful', 'cluster_pass'] @@ -73,12 +73,10 @@ def _make_key(self, cluster, level): def _make_key_hook(self, cluster, level): return () - def _process_fdta(self, clusters, level, prefix=None, **kwargs): + def _process_fdta(self, clusters, level, prefix=null_ispace, **kwargs): """ fdta -> First Divide Then Apply """ - prefix = prefix or IterationSpace([]) - # Divide part processed = [] for k, g in groupby(clusters, key=lambda i: self._make_key(i, level)): diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index 5d06bb5c1a..6a52e429ff 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -3,7 +3,7 @@ from cached_property import cached_property from sympy import S -from devito.ir.support.space import Backward, IterationSpace +from devito.ir.support.space import Backward, IterationSpace, null_ispace from devito.ir.support.utils import AccessMode, extrema from devito.ir.support.vector import LabeledVector, Vector from devito.symbolics import (compare_ops, retrieve_indexed, retrieve_terminals, @@ -218,12 +218,12 @@ def __new__(cls, access, mode, timestamp, ispace=None): AccessMode.__init__(obj, mode=mode) return obj - def __init__(self, access, mode, timestamp, ispace=None): + def __init__(self, access, mode, timestamp, ispace=null_ispace): assert is_integer(timestamp) self.access = access self.timestamp = timestamp - self.ispace = ispace or IterationSpace([]) + self.ispace = ispace def __repr__(self): mode = '\033[1;37;31mW\033[0m' if self.is_write else '\033[1;37;32mR\033[0m' diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py index 28389735c0..7a291baa32 100644 --- a/devito/ir/support/space.py +++ b/devito/ir/support/space.py @@ -13,7 +13,8 @@ from devito.types import Dimension, ModuloDimension __all__ = ['NullInterval', 'Interval', 'IntervalGroup', 'IterationSpace', - 'IterationInterval', 'DataSpace', 'Forward', 'Backward', 'Any'] + 'IterationInterval', 'DataSpace', 'Forward', 'Backward', 'Any', + 'null_ispace'] # The default Stamp, used by all new Intervals @@ -989,3 +990,6 @@ def dimensions(self): @cached_property def nonderived_directions(self): return {k: v for k, v in self.directions.items() if not k.is_Derived} + + +null_ispace = IterationSpace([]) From f0692441511c39f6e32815297febed4c4b297f87 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 11 Sep 2023 14:28:30 +0000 Subject: [PATCH 63/79] compiler: Speedup IntervalGroup.reorder --- devito/ir/support/space.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py index 7a291baa32..9d1b731f9e 100644 --- a/devito/ir/support/space.py +++ b/devito/ir/support/space.py @@ -304,9 +304,13 @@ def reorder(cls, items, relations): raise ValueError("Cannot create IntervalGroup from objs of type [%s]" % ', '.join(str(type(i)) for i in items)) - # The relations are between dimensions, not intervals. So we take - # care of that here - ordering = filter_ordered(toposort(relations) + [i.dim for i in items]) + if len(relations) == 1: + # Special case: avoid expensive topological sorting if possible + relation, = relations + ordering = filter_ordered(list(relation) + [i.dim for i in items]) + else: + ordering = filter_ordered(toposort(relations) + [i.dim for i in items]) + return sorted(items, key=lambda i: ordering.index(i.dim)) @classmethod From e83fed335830b19671a149bf4a2a2ef0de31fadf Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 11 Sep 2023 15:06:37 +0000 Subject: [PATCH 64/79] compiler: Speedup DDA through lazy TimedAccess --- devito/ir/support/basic.py | 180 ++++++++++++++++++++++++------------- 1 file changed, 116 insertions(+), 64 deletions(-) diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index 6a52e429ff..54d2d0a52c 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -8,10 +8,10 @@ from devito.ir.support.vector import LabeledVector, Vector from devito.symbolics import (compare_ops, retrieve_indexed, retrieve_terminals, q_constant, q_affine, q_routine, search, uxreplace) -from devito.tools import (Tag, as_tuple, is_integer, filter_sorted, flatten, - memoized_meth, memoized_generator) +from devito.tools import (Tag, as_mapper, as_tuple, is_integer, filter_sorted, + flatten, memoized_meth, memoized_generator) from devito.types import (Barrier, ComponentAccess, Dimension, DimensionTuple, - Jump, Symbol) + Function, Jump, Symbol, Temp, TempArray, TBArray) __all__ = ['IterationInstance', 'TimedAccess', 'Scope', 'ExprGeometry'] @@ -820,26 +820,19 @@ def __init__(self, exprs, rules=None): A Scope enables data dependence analysis on a totally ordered sequence of expressions. """ - exprs = as_tuple(exprs) + self.exprs = as_tuple(exprs) - self.reads = {} - self.writes = {} - - self.initialized = set() - - for i, e in enumerate(exprs): - # Reads - terminals = retrieve_accesses(e.rhs, deep=True) - try: - terminals.update(retrieve_accesses(e.lhs.indices)) - except AttributeError: - pass - for j in terminals: - v = self.reads.setdefault(j.function, []) - mode = 'RR' if e.is_Reduction and j.function is e.lhs.function else 'R' - v.append(TimedAccess(j, mode, i, e.ispace)) + # A set of rules to drive the collection of dependencies + self.rules = as_tuple(rules) + assert all(callable(i) for i in self.rules) - # Write + @cached_property + def writes(self): + """ + Create a mapper from functions to write accesses. + """ + ret = {} + for i, e in enumerate(self.exprs): terminals = retrieve_accesses(e.lhs) if q_routine(e.rhs): try: @@ -848,47 +841,122 @@ def __init__(self, exprs, rules=None): # E.g., foreign routines, such as `cos` or `sin` pass for j in terminals: - v = self.writes.setdefault(j.function, []) - mode = 'WR' if e.is_Reduction else 'W' + v = ret.setdefault(j.function, []) + if e.is_Reduction: + mode = 'WR' + else: + mode = 'W' v.append(TimedAccess(j, mode, i, e.ispace)) + # Objects altering the control flow (e.g., synchronization barriers, + # break statements, ...) are converted into mock dependences + for i, e in enumerate(self.exprs): + if isinstance(e.rhs, (Barrier, Jump)): + ret.setdefault(mocksym, []).append( + TimedAccess(mocksym, 'W', i, e.ispace) + ) + + return ret + + @memoized_generator + def reads_explicit_gen(self): + """ + Generate all explicit reads. These are the read accesses to the + AbstractFunctions and Symbols appearing in the Scope's symbolic + expressions. + """ + for i, e in enumerate(self.exprs): + # Reads + terminals = retrieve_accesses(e.rhs, deep=True) + try: + terminals.update(retrieve_accesses(e.lhs.indices)) + except AttributeError: + pass + for j in terminals: + if j.function is e.lhs.function and e.is_Reduction: + mode = 'RR' + else: + mode = 'R' + yield TimedAccess(j, mode, i, e.ispace) + # If a reduction, we got one implicit read if e.is_Reduction: - v = self.reads.setdefault(e.lhs.function, []) - v.append(TimedAccess(e.lhs, 'RR', i, e.ispace)) - - # If writing to a scalar, we have an initialization - if not e.is_Reduction and e.is_scalar: - self.initialized.add(e.lhs.function) + yield TimedAccess(e.lhs, 'RR', i, e.ispace) # Look up ConditionalDimensions for v in e.conditionals.values(): for j in retrieve_accesses(v): - v = self.reads.setdefault(j.function, []) - v.append(TimedAccess(j, 'R', -1, e.ispace)) + yield TimedAccess(j, 'R', -1, e.ispace) - # The iteration symbols too - dimensions = set().union(*[e.dimensions for e in exprs]) + @memoized_generator + def reads_implicit_gen(self): + """ + Generate all implicit reads. These are for examples the reads accesses + to the iteration symbols bounded to the Dimensions used in the Scope's + symbolic expressions. + """ + # The iteration symbols + dimensions = set().union(*[e.dimensions for e in self.exprs]) for d in dimensions: for i in d.free_symbols | d.bound_symbols: - v = self.reads.setdefault(i.function, []) - v.append(TimedAccess(i, 'R', -1)) + yield TimedAccess(i, 'R', -1) # Objects altering the control flow (e.g., synchronization barriers, # break statements, ...) are converted into mock dependences - for i, e in enumerate(exprs): + for i, e in enumerate(self.exprs): if isinstance(e.rhs, (Barrier, Jump)): - self.writes.setdefault(mocksym, []).append( - TimedAccess(mocksym, 'W', i, e.ispace) - ) - self.reads.setdefault(mocksym, []).extend([ - TimedAccess(mocksym, 'R', max(i, 0), e.ispace), - TimedAccess(mocksym, 'R', i+1, e.ispace), - ]) + yield TimedAccess(mocksym, 'R', max(i, 0), e.ispace) + yield TimedAccess(mocksym, 'R', i+1, e.ispace) - # A set of rules to drive the collection of dependencies - self.rules = as_tuple(rules) - assert all(callable(i) for i in self.rules) + @memoized_generator + def reads_gen(self): + """ + Generate all read accesses. + """ + # NOTE: The reason to keep the explicit and implict reads separated + # is efficiency. Sometimes we wish to extract all reads to a given + # AbstractFunction, and we know that by construction these can't + # appear among the implicit reads + return chain(self.reads_explicit_gen(), self.reads_implicit_gen()) + + @memoized_generator + def reads_smart_gen(self, f): + """ + Generate all read access to a given function. + + StencilDimensions, if any, are replaced with their extrema. + + Notes + ----- + The implementation is smart, in the sense that, depending on the + given function type, it will not necessarily look everywhere inside + the scope to retrieve the corresponding read accesses. Instead, it + will only look in the places where the given type is expected to + be found. For example, a DiscreteFunction would never appear among + the iteration symbols. + """ + if isinstance(f, (Function, Temp, TempArray, TBArray)): + for i in self.reads_explicit_gen(): + if f is i.function: + for j in extrema(i.access): + yield TimedAccess(j, i.mode, i.timestamp, i.ispace) + + else: + for i in self.reads_gen(): + if f is i.function: + yield i + + @cached_property + def reads(self): + """ + Create a mapper from functions to read accesses. + """ + return as_mapper(self.reads_gen(), key=lambda i: i.function) + + @cached_property + def initialized(self): + return frozenset(e.lhs.function for e in self.exprs + if not e.is_Reduction and e.is_scalar) def getreads(self, function): return as_tuple(self.reads.get(function)) @@ -926,22 +994,6 @@ def __repr__(self): return "\n".join([out.format(i.name, w, '', r) for i, r, w in zip(tracked, reads, writes)]) - @memoized_meth - def getreads_extremaed(self, f): - """ - A view of the Scope's reads in which the StencilDimensions are replaced - with their extrema. - """ - ret = set() - - if f.is_Symbol: - return self.reads.get(f, ret) - else: - for i in self.reads.get(f, []): - for j in extrema(i.access): - ret.add(TimedAccess(j, i.mode, i.timestamp, i.ispace)) - return ret - @cached_property def accesses(self): groups = list(self.reads.values()) + list(self.writes.values()) @@ -967,7 +1019,7 @@ def d_flow_gen(self): """Generate the flow (or "read-after-write") dependences.""" for k, v in self.writes.items(): for w in v: - for r in self.getreads_extremaed(k): + for r in self.reads_smart_gen(k): dependence = Dependence(w, r) if dependence.is_imaginary: @@ -997,7 +1049,7 @@ def d_anti_gen(self): """Generate the anti (or "write-after-read") dependences.""" for k, v in self.writes.items(): for w in v: - for r in self.getreads_extremaed(k): + for r in self.reads_smart_gen(k): dependence = Dependence(r, w) if dependence.is_imaginary: From 4594ff6d59e8361e0003b5df713e2d7ffbc0345e Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 12 Sep 2023 09:17:09 +0000 Subject: [PATCH 65/79] compiler: Speedup DDA by postponing distance calculation --- devito/ir/support/basic.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index 54d2d0a52c..a6157a6136 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -1022,10 +1022,10 @@ def d_flow_gen(self): for r in self.reads_smart_gen(k): dependence = Dependence(w, r) - if dependence.is_imaginary: + if any(not rule(dependence) for rule in self.rules): continue - if any(not rule(dependence) for rule in self.rules): + if dependence.is_imaginary: continue distance = dependence.distance @@ -1052,10 +1052,10 @@ def d_anti_gen(self): for r in self.reads_smart_gen(k): dependence = Dependence(r, w) - if dependence.is_imaginary: + if any(not rule(dependence) for rule in self.rules): continue - if any(not rule(dependence) for rule in self.rules): + if dependence.is_imaginary: continue distance = dependence.distance @@ -1082,10 +1082,10 @@ def d_output_gen(self): for w2 in self.writes.get(k, []): dependence = Dependence(w2, w1) - if dependence.is_imaginary: + if any(not rule(dependence) for rule in self.rules): continue - if any(not rule(dependence) for rule in self.rules): + if dependence.is_imaginary: continue distance = dependence.distance From c96b26613b2d717d2d57af8a992731c26989c378 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 12 Sep 2023 10:09:20 +0000 Subject: [PATCH 66/79] compiler: Speedup IterationSpace.project --- devito/ir/support/space.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py index 9d1b731f9e..7abc68ce08 100644 --- a/devito/ir/support/space.py +++ b/devito/ir/support/space.py @@ -897,8 +897,12 @@ def project(self, cond, strict=True): dims = [i.dim for i in self if not func(i.dim)] intervals = self.intervals.drop(dims, strict=strict) - sub_iterators = {k: v for k, v in self.sub_iterators.items() if func(k)} - directions = {k: v for k, v in self.directions.items() if func(k)} + sub_iterators = {} + directions = {} + for i in intervals: + d = i.dim + sub_iterators[d] = self.sub_iterators[d] + directions[d] = self.directions[d] return IterationSpace(intervals, sub_iterators, directions) From 7a3bd906032e5665d6fabd127e334a0f2912c46f Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 12 Sep 2023 12:15:57 +0000 Subject: [PATCH 67/79] compiler: Speedup DDA avoiding redundant TimedAccesses --- devito/ir/support/basic.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index a6157a6136..f2328f3f2d 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -897,9 +897,11 @@ def reads_implicit_gen(self): """ # The iteration symbols dimensions = set().union(*[e.dimensions for e in self.exprs]) + symbols = set() for d in dimensions: - for i in d.free_symbols | d.bound_symbols: - yield TimedAccess(i, 'R', -1) + symbols.update(d.free_symbols | d.bound_symbols) + for i in symbols: + yield TimedAccess(i, 'R', -1) # Objects altering the control flow (e.g., synchronization barriers, # break statements, ...) are converted into mock dependences From b9e1056a702884046267bd61a5d72434a919de43 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 12 Sep 2023 12:40:20 +0000 Subject: [PATCH 68/79] compiler: Speedup Dependence creation --- devito/ir/support/basic.py | 18 +++++++++--------- devito/passes/clusters/misc.py | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index f2328f3f2d..8185078e1f 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -1022,11 +1022,11 @@ def d_flow_gen(self): for k, v in self.writes.items(): for w in v: for r in self.reads_smart_gen(k): - dependence = Dependence(w, r) - - if any(not rule(dependence) for rule in self.rules): + if any(not rule(w, r) for rule in self.rules): continue + dependence = Dependence(w, r) + if dependence.is_imaginary: continue @@ -1052,11 +1052,11 @@ def d_anti_gen(self): for k, v in self.writes.items(): for w in v: for r in self.reads_smart_gen(k): - dependence = Dependence(r, w) - - if any(not rule(dependence) for rule in self.rules): + if any(not rule(r, w) for rule in self.rules): continue + dependence = Dependence(r, w) + if dependence.is_imaginary: continue @@ -1082,11 +1082,11 @@ def d_output_gen(self): for k, v in self.writes.items(): for w1 in v: for w2 in self.writes.get(k, []): - dependence = Dependence(w2, w1) - - if any(not rule(dependence) for rule in self.rules): + if any(not rule(w2, w1) for rule in self.rules): continue + dependence = Dependence(w2, w1) + if dependence.is_imaginary: continue diff --git a/devito/passes/clusters/misc.py b/devito/passes/clusters/misc.py index c14c49958c..dd349a13d6 100644 --- a/devito/passes/clusters/misc.py +++ b/devito/passes/clusters/misc.py @@ -317,10 +317,10 @@ def _build_dag(self, cgroups, prefix): dag = DAG(nodes=cgroups) for n, cg0 in enumerate(cgroups): - def is_cross(dep): + def is_cross(source, sink): # True if a cross-ClusterGroup dependence, False otherwise - t0 = dep.source.timestamp - t1 = dep.sink.timestamp + t0 = source.timestamp + t1 = sink.timestamp v = len(cg0.exprs) return t0 < v <= t1 or t1 < v <= t0 From 9ef9200cd43730abc1ebb9e0b2408d1842acd655 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 12 Sep 2023 12:55:01 +0000 Subject: [PATCH 69/79] compiler: Speedup DDA by lazily generating writes --- devito/ir/support/basic.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index 8185078e1f..a7d2854d23 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -826,12 +826,11 @@ def __init__(self, exprs, rules=None): self.rules = as_tuple(rules) assert all(callable(i) for i in self.rules) - @cached_property - def writes(self): + @memoized_generator + def writes_gen(self): """ - Create a mapper from functions to write accesses. + Generate all write accesses. """ - ret = {} for i, e in enumerate(self.exprs): terminals = retrieve_accesses(e.lhs) if q_routine(e.rhs): @@ -841,22 +840,24 @@ def writes(self): # E.g., foreign routines, such as `cos` or `sin` pass for j in terminals: - v = ret.setdefault(j.function, []) if e.is_Reduction: mode = 'WR' else: mode = 'W' - v.append(TimedAccess(j, mode, i, e.ispace)) + yield TimedAccess(j, mode, i, e.ispace) # Objects altering the control flow (e.g., synchronization barriers, # break statements, ...) are converted into mock dependences for i, e in enumerate(self.exprs): if isinstance(e.rhs, (Barrier, Jump)): - ret.setdefault(mocksym, []).append( - TimedAccess(mocksym, 'W', i, e.ispace) - ) + yield TimedAccess(mocksym, 'W', i, e.ispace) - return ret + @cached_property + def writes(self): + """ + Create a mapper from functions to write accesses. + """ + return as_mapper(self.writes_gen(), key=lambda i: i.function) @memoized_generator def reads_explicit_gen(self): From 8b85d759c94fa61363dff1d4a9cbeb7feac0ec2f Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 13 Sep 2023 10:54:41 +0000 Subject: [PATCH 70/79] compiler: Refactor factorizer to enable external customization --- devito/passes/clusters/factorization.py | 205 +++++++++++++----------- 1 file changed, 110 insertions(+), 95 deletions(-) diff --git a/devito/passes/clusters/factorization.py b/devito/passes/clusters/factorization.py index 1353bf89b3..41c1ed2ec0 100644 --- a/devito/passes/clusters/factorization.py +++ b/devito/passes/clusters/factorization.py @@ -43,6 +43,71 @@ def factorize(cluster, *args): return cluster.rebuild(processed) +def collect_special(expr): + """ + Factorize elemental functions, pows, and other special symbolic objects, + prioritizing the most expensive entities. + """ + args, candidates = zip(*[_collect_nested(arg) for arg in expr.args]) + candidates = ReducerMap.fromdicts(*candidates) + + funcs = candidates.getall('funcs', []) + pows = candidates.getall('pows', []) + coeffs = candidates.getall('coeffs', []) + + # Functions/Pows are collected first, coefficients afterwards + terms = [] + w_funcs = [] + w_pows = [] + w_coeffs = [] + for i in args: + _args = i.args + if any(j in funcs for j in _args): + w_funcs.append(i) + elif any(j in pows for j in _args): + w_pows.append(i) + elif any(j in coeffs for j in _args): + w_coeffs.append(i) + else: + terms.append(i) + + # Collect common funcs + if len(w_funcs) > 1: + w_funcs = Add(*w_funcs, evaluate=False) + w_funcs = collect(w_funcs, funcs, evaluate=False) + try: + terms.extend([Mul(k, collect_const(v), evaluate=False) + for k, v in w_funcs.items()]) + except AttributeError: + assert w_funcs == 0 + else: + terms.extend(w_funcs) + + # Collect common pows + w_pows = Add(*w_pows, evaluate=False) + w_pows = collect(w_pows, pows, evaluate=False) + try: + terms.extend([Mul(k, collect_const(v), evaluate=False) + for k, v in w_pows.items()]) + except AttributeError: + assert w_pows == 0 + + # Collect common temporaries (r0, r1, ...) + w_coeffs = Add(*w_coeffs, evaluate=False) + symbols = retrieve_symbols(w_coeffs) + if symbols: + w_coeffs = collect(w_coeffs, symbols, evaluate=False) + try: + terms.extend([Mul(k, collect_const(v), evaluate=False) + for k, v in w_coeffs.items()]) + except AttributeError: + assert w_coeffs == 0 + else: + terms.append(w_coeffs) + + return Add(*terms) + + def collect_const(expr): """ *Much* faster alternative to sympy.collect_const. *Potentially* slightly less @@ -67,9 +132,10 @@ def collect_const(expr): if len(v) == 1 and not v[0].is_Add: # Special case: avoid e.g. (-2)*a mul = Mul(k, *v) - elif all(i.is_Mul and len(i.args) == 2 and i.args[0] == -1 for i in v): + elif all(i.is_Mul and i.args[0] == -1 for i in v): # Other special case: [-a, -b, -c ...] - add = Add(*[i.args[1] for i in v], evaluate=False) + operands = [Mul(*i.args[1:], evaluate=False) for i in v] + add = Add(*operands, evaluate=False) mul = Mul(-k, add, evaluate=False) elif k == 1: # 1 * (a + c) @@ -89,105 +155,54 @@ def collect_const(expr): return Add(*terms) -def collect_nested(expr): +def strategy0(expr): + rebuilt = collect_special(expr) + rebuilt = collect_const(rebuilt) + + return rebuilt + + +strategies = { + 'default': strategy0 +} + + +def _collect_nested(expr): + """ + Recursion helper for `collect_nested`. """ - Collect numeric coefficients, trascendental functions, and symbolic powers, - across all levels of the expression tree. + # Return semantic (rebuilt expression, factorization candidates) + + if expr.is_Number: + return expr, {'coeffs': expr} + elif expr.is_Function: + return expr, {'funcs': expr} + elif expr.is_Pow: + return expr, {'pows': expr} + elif expr.is_Symbol or expr.is_Indexed or isinstance(expr, BasicWrapperMixin): + return expr, {} + elif expr.is_Add: + return strategies['default'](expr), {} + elif expr.is_Mul: + args, candidates = zip(*[_collect_nested(arg) for arg in expr.args]) + return Mul(*args), ReducerMap.fromdicts(*candidates) + elif expr.is_Equality: + args, candidates = zip(*[_collect_nested(expr.lhs), + _collect_nested(expr.rhs)]) + return expr.func(*args, evaluate=False), ReducerMap.fromdicts(*candidates) + else: + args, candidates = zip(*[_collect_nested(arg) for arg in expr.args]) + return expr.func(*args), ReducerMap.fromdicts(*candidates) - The collection gives precedence to (in order of importance): - 1) Trascendental functions, - 2) Symbolic powers, - 3) Numeric coefficients. +def collect_nested(expr): + """ + Collect numeric coefficients, trascendental functions, pows, and other + symbolic objects across all levels of the expression tree. Parameters ---------- expr : expr-like The expression to be factorized. """ - - def run(expr): - # Return semantic (rebuilt expression, factorization candidates) - - if expr.is_Number: - return expr, {'coeffs': expr} - elif expr.is_Function: - return expr, {'funcs': expr} - elif expr.is_Pow: - return expr, {'pows': expr} - elif expr.is_Symbol or expr.is_Indexed or isinstance(expr, BasicWrapperMixin): - return expr, {} - elif expr.is_Add: - args, candidates = zip(*[run(arg) for arg in expr.args]) - candidates = ReducerMap.fromdicts(*candidates) - - funcs = candidates.getall('funcs', []) - pows = candidates.getall('pows', []) - coeffs = candidates.getall('coeffs', []) - - # Functions/Pows are collected first, coefficients afterwards - terms = [] - w_funcs = [] - w_pows = [] - w_coeffs = [] - for i in args: - _args = i.args - if any(j in funcs for j in _args): - w_funcs.append(i) - elif any(j in pows for j in _args): - w_pows.append(i) - elif any(j in coeffs for j in _args): - w_coeffs.append(i) - else: - terms.append(i) - - # Collect common funcs - if len(w_funcs) > 1: - w_funcs = Add(*w_funcs, evaluate=False) - w_funcs = collect(w_funcs, funcs, evaluate=False) - try: - terms.extend([Mul(k, collect_const(v), evaluate=False) - for k, v in w_funcs.items()]) - except AttributeError: - assert w_funcs == 0 - else: - terms.extend(w_funcs) - - # Collect common pows - w_pows = Add(*w_pows, evaluate=False) - w_pows = collect(w_pows, pows, evaluate=False) - try: - terms.extend([Mul(k, collect_const(v), evaluate=False) - for k, v in w_pows.items()]) - except AttributeError: - assert w_pows == 0 - - # Collect common temporaries (r0, r1, ...) - w_coeffs = Add(*w_coeffs, evaluate=False) - symbols = retrieve_symbols(w_coeffs) - if symbols: - w_coeffs = collect(w_coeffs, symbols, evaluate=False) - try: - terms.extend([Mul(k, collect_const(v), evaluate=False) - for k, v in w_coeffs.items()]) - except AttributeError: - assert w_coeffs == 0 - else: - terms.append(w_coeffs) - - # Collect common coefficients - rebuilt = Add(*terms) - rebuilt = collect_const(rebuilt) - - return rebuilt, {} - elif expr.is_Mul: - args, candidates = zip(*[run(arg) for arg in expr.args]) - return Mul(*args), ReducerMap.fromdicts(*candidates) - elif expr.is_Equality: - args, candidates = zip(*[run(expr.lhs), run(expr.rhs)]) - return expr.func(*args, evaluate=False), ReducerMap.fromdicts(*candidates) - else: - args, candidates = zip(*[run(arg) for arg in expr.args]) - return expr.func(*args), ReducerMap.fromdicts(*candidates) - - return run(expr)[0] + return _collect_nested(expr)[0] From 0c520a841be3559ebe62ec62b94d3a7f20192cbb Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 13 Sep 2023 15:58:21 +0000 Subject: [PATCH 71/79] compiler: Revamp and simplify CIRE's cost model --- devito/ir/support/basic.py | 2 +- devito/passes/clusters/__init__.py | 2 +- devito/passes/clusters/aliases.py | 126 ++++++++++++++--------------- devito/symbolics/inspection.py | 2 + tests/test_dle.py | 8 +- tests/test_dse.py | 45 ++++++----- tests/test_mpi.py | 6 +- tests/test_unexpansion.py | 10 +-- 8 files changed, 100 insertions(+), 101 deletions(-) diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index a7d2854d23..21b3527c06 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -3,7 +3,7 @@ from cached_property import cached_property from sympy import S -from devito.ir.support.space import Backward, IterationSpace, null_ispace +from devito.ir.support.space import Backward, null_ispace from devito.ir.support.utils import AccessMode, extrema from devito.ir.support.vector import LabeledVector, Vector from devito.symbolics import (compare_ops, retrieve_indexed, retrieve_terminals, diff --git a/devito/passes/clusters/__init__.py b/devito/passes/clusters/__init__.py index d4bc2e182f..c41a628e06 100644 --- a/devito/passes/clusters/__init__.py +++ b/devito/passes/clusters/__init__.py @@ -1,7 +1,7 @@ from .utils import * # noqa from .buffering import * # noqa -from .aliases import * # noqa from .cse import * # noqa +from .aliases import * # noqa from .factorization import * # noqa from .blocking import * # noqa from .asynchrony import * # noqa diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index eb2a6dbd35..5db4a787c2 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -12,6 +12,7 @@ IntervalGroup, LabeledVector, Vector, normalize_properties, relax_properties, unbounded, minimum, maximum, extrema, vmax, vmin) +from devito.passes.clusters.cse import _cse from devito.symbolics import (Uxmapper, estimate_cost, search, reuse_if_untouched, uxreplace, sympy_dtype) from devito.tools import (Stamp, as_mapper, as_tuple, flatten, frozendict, @@ -194,14 +195,6 @@ def _select(self, variants): """ raise NotImplementedError - def _eval_variants_delta(self, delta_flops, delta_ws): - """ - Given the deltas in flop reduction and working set size increase of two - Variants, return True if the second variant is estimated to deliever - better performance, False otherwise. - """ - raise NotImplementedError - class CireTransformerLegacy(CireTransformer): @@ -210,7 +203,7 @@ def _do_generate(self, exprs, exclude, cbk_search, cbk_compose=None): Carry out the bulk of the work of ``_generate``. """ counter = generator() - make = lambda: Symbol(name='dummy%d' % counter()) + make = lambda: Symbol(name='dummy%d' % counter(), dtype=np.float32) if cbk_compose is None: cbk_compose = lambda *args: None @@ -285,12 +278,7 @@ def _lookup_key(self, c, d): return AliasKey(ispace, intervals, c.dtype, c.guards, properties) def _select(self, variants): - return pick_best(variants, self._eval_variants_delta) - - def _eval_variants_delta(self, delta_flops, delta_ws): - # Always prefer the Variant with fewer temporaries - return ((delta_ws == 0 and delta_flops < 0) or - (delta_ws > 0)) + return pick_best(variants) class CireInvariantsElementary(CireInvariants): @@ -390,19 +378,7 @@ def _select(self, variants): "generated %d schedules in total" % (self.opt_schedule_strategy, len(variants))) - return pick_best(variants, self._eval_variants_delta) - - def _eval_variants_delta(self, delta_flops, delta_ws): - # If there's a greater flop reduction using fewer temporaries, no doubts - # what's gonna be the best variant. But if the better flop reduction - # comes at the price of using more temporaries, then we have to apply - # heuristics, in particular we estimate how many flops a temporary would - # allow to save - return ( - (delta_flops >= 0 and delta_ws > 0 and delta_flops / delta_ws < 15) or - (delta_flops <= 0 and delta_ws < 0 and delta_flops / delta_ws > 15) or - (delta_flops <= 0 and delta_ws >= 0) - ) + return pick_best(variants) class CireEvalDerivatives(CireDerivatives): @@ -757,8 +733,9 @@ def lower_aliases(aliases, meta, maxpar): meta.ispace.directions) ispace = ispace.augment(sub_iterators) - processed.append(ScheduledAlias(a.pivot, writeto, ispace, a.aliaseds, - indicess, a.score)) + processed.append( + ScheduledAlias(a.pivot, writeto, ispace, a.aliaseds, indicess) + ) # The [ScheduledAliases] must be ordered so as to reuse as many of the # `ispace`'s IterationIntervals as possible in order to honor the @@ -841,8 +818,9 @@ def optimize_schedule_rotations(schedule, sregistry): aispace = aispace.augment({d: mds + [ii]}) ispace = IterationSpace.union(rispace, aispace, relations={(d, cd, d1)}) - processed.append(ScheduledAlias(pivot, writeto, ispace, i.aliaseds, - indicess, i.score)) + processed.append(ScheduledAlias( + pivot, writeto, ispace, i.aliaseds, indicess, + )) # Update the rotations mapper rmapper[d].extend(list(mapper.values())) @@ -865,8 +843,9 @@ def optimize_schedule_padding(schedule, meta, platform): ispace = i.ispace.add(Interval(it.dim, 0, it.size % vl)) else: ispace = i.ispace - processed.append(ScheduledAlias(i.pivot, i.writeto, ispace, i.aliaseds, - i.indicess, i.score)) + processed.append(ScheduledAlias( + i.pivot, i.writeto, ispace, i.aliaseds, i.indicess, + )) except (TypeError, KeyError, IndexError): processed.append(i) @@ -885,12 +864,12 @@ def lower_schedule(schedule, meta, sregistry, ftemps): clusters = [] subs = {} - for pivot, writeto, ispace, aliaseds, indicess, _ in schedule: + for pivot, writeto, ispace, aliaseds, indicess in schedule: name = sregistry.make_name() # Infer the dtype for the pivot # This prevents cases such as `floor(a*b)` with `a` and `b` floats - # that would creat a temporary `int r = b` leading to erronous numerical results - # Such cases happen with the positions for sparse functions for example. + # that would creat a temporary `int r = b` leading to erronous + # numerical results dtype = sympy_dtype(pivot, meta.dtype) if writeto: @@ -986,36 +965,42 @@ def optimize_clusters_msds(clusters): return processed -def pick_best(variants, eval_variants_delta): +def pick_best(variants): """ - Return the variant with the best trade-off between operation count - reduction and working set increase. Heuristics may be applied. + Return the variant with the best theoretical performance. """ best = None - best_flops_score = None - best_ws_score = None - for i in variants: - i_flops_score = i.schedule.score - - # The working set score depends on the number and dimensionality of - # temporaries required by the Schedule - i_ws_count = Counter([len(sa.writeto) for sa in i.schedule]) - i_ws_score = [i_ws_count[sa + 1] - for sa in reversed(range(max(i_ws_count, default=0)))] - # TODO: For now, we assume the performance impact of an N-dimensional - # temporary is always the same regardless of the value N, but this might - # change in the future - i_ws_score = sum(i_ws_score) + # Flops in the two sweeps + flops0 = i.schedule.cost + flops1 = estimate_cost(i.exprs, True) + + flops = flops0 + flops1 + + # Data movement in the two sweeps + indexeds0 = search([sa.pivot for sa in i.schedule], Indexed) + indexeds1 = search(i.exprs, Indexed) + + ntemps = len(i.schedule) + nfunctions0 = len({i.function for i in indexeds0}) + nfunctions1 = len({i.function for i in indexeds1}) + + ws = ntemps*2 + nfunctions0 + nfunctions1 if best is None: - best, best_flops_score, best_ws_score = i, i_flops_score, i_ws_score + best, best_flops, best_ws = i, flops, ws continue - delta_flops = best_flops_score - i_flops_score - delta_ws = best_ws_score - i_ws_score - if eval_variants_delta(delta_flops, delta_ws): - best, best_flops_score, best_ws_score = i, i_flops_score, i_ws_score + delta_flops = flops - best_flops + delta_ws = ws - best_ws + + # Magic sauce + # The coefficients were obtained empirically studying the behaviour + # of different variants in several kernels + ws_curve = lambda x: (-1 / 70)*x + + if delta_ws <= ws_curve(delta_flops): + best, best_flops, best_ws = i, flops, ws return best @@ -1230,7 +1215,7 @@ def _pivot_legal_shifts(self): class Alias(object): - def __init__(self, pivot, aliaseds, intervals, distances, score=0): + def __init__(self, pivot, aliaseds, intervals, distances, score): self.pivot = pivot self.aliaseds = aliaseds self.intervals = intervals @@ -1290,7 +1275,7 @@ def __iter__(self): for i in self._list: yield i - def add(self, pivot, aliaseds, intervals, distances, score=0): + def add(self, pivot, aliaseds, intervals, distances, score): assert len(aliaseds) == len(distances) self._list.append(Alias(pivot, aliaseds, intervals, distances, score)) @@ -1314,7 +1299,8 @@ def is_frame(self): return all(i.is_frame for i in self._list) -ScheduledAlias = namedtuple('SchedAlias', 'pivot writeto ispace aliaseds indicess score') +ScheduledAlias = namedtuple('SchedAlias', + 'pivot writeto ispace aliaseds indicess') class Schedule(tuple): @@ -1334,9 +1320,19 @@ def rebuild(self, *items, dmapper=None, rmapper=None, is_frame=False): is_frame=is_frame or self.is_frame ) - @property - def score(self): - return sum(i.score for i in self) + @cached_property + def cost(self): + # Not just the sum for the individual items' cost! There might be + # redundancies, which we factor out here... + counter = generator() + make = lambda: Symbol(name='dummy%d' % counter(), dtype=np.float32) + + tot = 0 + for v in as_mapper(self, lambda i: i.ispace).values(): + exprs = [i.pivot for i in v] + tot += estimate_cost(_cse(exprs, make), True) + + return tot def make_rotations_table(d, v): diff --git a/devito/symbolics/inspection.py b/devito/symbolics/inspection.py index 94279db4ab..25ed8f84eb 100644 --- a/devito/symbolics/inspection.py +++ b/devito/symbolics/inspection.py @@ -111,6 +111,8 @@ def estimate_cost(exprs, estimate=False): 'pow': 50, 'div': 5, 'Abs': 5, + 'floor': 1, + 'ceil': 1 } diff --git a/tests/test_dle.py b/tests/test_dle.py index 4d851be03c..bfcb732bb4 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -390,10 +390,10 @@ def test_custom_rule1(self): 'blockinner': True, 'blockrelax': True})) - # Check generated code. By having specified "1" as rule, we expect the - # given par-tile to be applied to the kernel with id 1 - bns, _ = assert_blocking(op, {'z0_blk0', 'x1_blk0', 'x2_blk0', 'x3_blk0'}) - for i in ['x1_blk0', 'x2_blk0', 'x3_blk0']: + # Check generated code. By having specified "time" as rule, we expect the + # given par-tile to be applied to the kernel within the time loop + bns, _ = assert_blocking(op, {'x0_blk0', 'x1_blk0', 'x2_blk0'}) + for i in ['x0_blk0', 'x1_blk0', 'x2_blk0']: root = bns[i] iters = FindNodes(Iteration).visit(root) iters = [i for i in iters if i.dim.is_Block and i.dim._depth == 1] diff --git a/tests/test_dse.py b/tests/test_dse.py index 174cd4967e..009482ca43 100644 --- a/tests/test_dse.py +++ b/tests/test_dse.py @@ -1574,14 +1574,14 @@ def test_lazy_solve_produces_larger_temps(self): """ grid = Grid(shape=(10, 10)) - u = TimeFunction(name="u", grid=grid, space_order=4, time_order=2) + u = TimeFunction(name="u", grid=grid, space_order=8, time_order=2) pde = u.dt2 - (u.dx.dx + u.dy.dy) + u.dx.dy eq = Eq(u.forward, solve(pde, u.forward)) op = Operator(eq) assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 2 - assert op._profiler._sections['section0'].sops == 39 + assert op._profiler._sections['section0'].sops == 67 def test_hoisting_iso_ot4_akin(self): """ @@ -2063,8 +2063,11 @@ def test_tti_adjoint_akin_v2(self): eqn = Eq(p.backward, H0) - op0 = Operator(eqn, subs=grid.spacing_map, opt=('noop', {'openmp': True})) - op1 = Operator(eqn, subs=grid.spacing_map, opt=('advanced', {'openmp': True})) + op0 = Operator(eqn, subs=grid.spacing_map, + opt=('noop', {'openmp': True})) + op1 = Operator(eqn, subs=grid.spacing_map, + opt=('advanced', {'openmp': True, + 'cire-schedule': 0})) # Check code generation bns, pbs = assert_blocking(op1, {'x0_blk0'}) @@ -2155,9 +2158,9 @@ def test_nested_first_derivatives_unbalanced(self): ('v.dx.dx + p.dx.dx', (2, 2, (0, 2)), (61, 61, 25)), ('(v.dx + v.dy).dx - (v.dx + v.dy).dy + 2*f.dx.dx + f*f.dy.dy + f.dx.dx(x0=1)', - (3, 3, (0, 3)), (218, 202, 74)), + (3, 3, (0, 3)), (218, 202, 75)), ('(g*(1 + f)*v.dx).dx + (2*g*f*v.dx).dx', - (1, 2, (0, 1)), (52, 70, 20)), + (1, 1, (0, 1)), (52, 70, 20)), ('g*(f.dx.dx + g.dx.dx)', (1, 2, (0, 1)), (47, 62, 17)), ]) @@ -2731,11 +2734,10 @@ def test_fullopt(self): assert summary0[('section1', None)].ops == 44 assert np.isclose(summary0[('section0', None)].oi, 2.851, atol=0.001) - assert summary1[('section0', None)].ops == 9 - assert summary1[('section1', None)].ops == 31 - assert summary1[('section2', None)].ops == 88 - assert summary1[('section3', None)].ops == 22 - assert np.isclose(summary1[('section1', None)].oi, 1.767, atol=0.001) + assert summary1[('section0', None)].ops == 31 + assert summary1[('section1', None)].ops == 88 + assert summary1[('section2', None)].ops == 25 + assert np.isclose(summary1[('section0', None)].oi, 1.767, atol=0.001) assert np.allclose(u0.data, u1.data, atol=10e-5) assert np.allclose(rec0.data, rec1.data, atol=10e-5) @@ -2795,8 +2797,8 @@ def test_fullopt(self): assert np.allclose(self.tti_noopt[1].data, rec.data, atol=10e-1) # Check expected opcount/oi - assert summary[('section2', None)].ops == 92 - assert np.isclose(summary[('section2', None)].oi, 2.074, atol=0.001) + assert summary[('section1', None)].ops == 92 + assert np.isclose(summary[('section1', None)].oi, 2.074, atol=0.001) # With optimizations enabled, there should be exactly four BlockDimensions op = wavesolver.op_fwd() @@ -2814,7 +2816,7 @@ def test_fullopt(self): # 3 Arrays are defined globally for the sparse positions temporaries # and two additional bock-sized Arrays are defined locally arrays = [i for i in FindSymbols().visit(op) if i.is_Array] - extra_arrays = 2+3 + extra_arrays = 2 assert len(arrays) == 4 + extra_arrays assert all(i._mem_heap and not i._mem_external for i in arrays) bns, pbs = assert_blocking(op, {'x0_blk0'}) @@ -2850,18 +2852,19 @@ def test_fullopt_w_mpi(self): def test_opcounts(self, space_order, expected): op = self.tti_operator(opt='advanced', space_order=space_order) sections = list(op.op_fwd()._profiler._sections.values()) - assert sections[2].sops == expected + assert sections[1].sops == expected @switchconfig(profiling='advanced') - @pytest.mark.parametrize('space_order,expected', [ - (4, 121), + @pytest.mark.parametrize('space_order,exp_ops,exp_arrays', [ + (4, 122, 6), (8, 221, 7) ]) - def test_opcounts_adjoint(self, space_order, expected): - wavesolver = self.tti_operator(opt=('advanced', {'openmp': False})) + def test_opcounts_adjoint(self, space_order, exp_ops, exp_arrays): + wavesolver = self.tti_operator(space_order=space_order, + opt=('advanced', {'openmp': False})) op = wavesolver.op_adj() - assert op._profiler._sections['section2'].sops == expected - assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 7+3 + assert op._profiler._sections['section1'].sops == exp_ops + assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == exp_arrays class TestTTIv2(object): diff --git a/tests/test_mpi.py b/tests/test_mpi.py index 9951b5411e..9d827a3001 100644 --- a/tests/test_mpi.py +++ b/tests/test_mpi.py @@ -2501,10 +2501,8 @@ def test_adjoint_codegen(self, shape, kernel, space_order, save): op_adj = solver.op_adj() adj_calls = FindNodes(Call).visit(op_adj) - # one halo, ndim memalign and free (pos temp rec/src) - sf_calls = 2 * len(shape) - assert len(fwd_calls) == 1 + sf_calls - assert len(adj_calls) == 1 + sf_calls + assert len(fwd_calls) == 1 + assert len(adj_calls) == 1 def run_adjoint_F(self, nd): """ diff --git a/tests/test_unexpansion.py b/tests/test_unexpansion.py index 7d0391a85a..fa076096a3 100644 --- a/tests/test_unexpansion.py +++ b/tests/test_unexpansion.py @@ -232,10 +232,10 @@ class Test2Pass(object): def test_v0(self): grid = Grid(shape=(10, 10, 10)) - u = TimeFunction(name='u', grid=grid, space_order=4) - v = TimeFunction(name='v', grid=grid, space_order=4) - u1 = TimeFunction(name='u', grid=grid, space_order=4) - v1 = TimeFunction(name='v', grid=grid, space_order=4) + u = TimeFunction(name='u', grid=grid, space_order=8) + v = TimeFunction(name='v', grid=grid, space_order=8) + u1 = TimeFunction(name='u', grid=grid, space_order=8) + v1 = TimeFunction(name='v', grid=grid, space_order=8) eqns = [Eq(u.forward, (u.dx.dy + v*u + 1.)), Eq(v.forward, (v + u.dx.dy + 1.))] @@ -245,7 +245,7 @@ def test_v0(self): 'openmp': True})) # Check generated code - op1._profiler._sections['section0'].sops == 65 + assert op1._profiler._sections['section0'].sops == 41 assert_structure(op1, ['t', 't,x0_blk0,y0_blk0,x,y,z', 't,x0_blk0,y0_blk0,x,y,z,i0', From 503f2ec6a7324aeb0d7c6c7885a84c43f1f179e2 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 18 Sep 2023 10:01:37 +0000 Subject: [PATCH 72/79] compiler: Fix unexpansion of tensor objects --- devito/types/tensor.py | 8 +++++++- tests/test_derivatives.py | 10 ++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/devito/types/tensor.py b/devito/types/tensor.py index 6c94c5fc3a..62a555ef78 100644 --- a/devito/types/tensor.py +++ b/devito/types/tensor.py @@ -173,7 +173,13 @@ def is_diagonal(self): for i in range(self.rows) if i != j]) def _evaluate(self, **kwargs): - return self.applyfunc(lambda x: getattr(x, 'evaluate', x)) + def _do_evaluate(x): + try: + expand = kwargs.get('expand', True) + return x._evaluate(expand=expand) + except AttributeError: + return x + return self.applyfunc(_do_evaluate) def values(self): if self.is_diagonal: diff --git a/tests/test_derivatives.py b/tests/test_derivatives.py index 8031932df5..5fa11a6fa0 100644 --- a/tests/test_derivatives.py +++ b/tests/test_derivatives.py @@ -766,6 +766,16 @@ def test_transpose(self): i0, = term.dimensions assert term.base == f.subs(x, x + i0*h_x) + def test_tensor_algebra(self): + grid = Grid(shape=(4, 4)) + + f = Function(name='f', grid=grid, space_order=4) + + v = grad(f)._evaluate(expand=False) + + assert all(isinstance(i, IndexDerivative) for i in v) + assert all(zip([Add(*i.args) for i in grad(f).evaluate], v.evaluate)) + def bypass_uneval(expr): unevals = expr.find(EvalDerivative) From 5d843fb5a3d1a2baa9e7f2a668143ec9394e7fa4 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 18 Sep 2023 13:32:17 +0000 Subject: [PATCH 73/79] compiler: Speedup instantiation of DiscreteFunctions --- devito/types/basic.py | 1 - devito/types/dense.py | 10 +++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/devito/types/basic.py b/devito/types/basic.py index 90ef573b91..102bfe3a75 100644 --- a/devito/types/basic.py +++ b/devito/types/basic.py @@ -855,7 +855,6 @@ def __new__(cls, *args, **kwargs): # through the `function` field newobj.function = function or newobj - # Initialization newobj.__init_finalize__(*args, **kwargs) return newobj diff --git a/devito/types/dense.py b/devito/types/dense.py index 2a13fa91af..08146eaf83 100644 --- a/devito/types/dense.py +++ b/devito/types/dense.py @@ -998,7 +998,15 @@ def __init_finalize__(self, *args, **kwargs): else: raise TypeError("`space_order` must be int or 3-tuple of ints") - self._fd = self.__fd_setup__() + # Acquire derivative shortcuts + if self is self.function: + self._fd = self.__fd_setup__() + else: + # E.g., `self is f(x + i0, y)` and `self.function is f(x, y)` + # Dynamically genereating derivative shortcuts is expensive; we + # can clearly avoid that here though! + self._fd = self.function._fd + # Flag whether it is a parameter or a variable. # Used at operator evaluation to evaluate the Function at the # variable location (i.e. if the variable is staggered in x the From e72f8e9bc86fc40dde5bcb7f8fa803aa486dd795 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 18 Sep 2023 15:02:58 +0000 Subject: [PATCH 74/79] compiler: Speedup ordering of IndexDerivatives --- devito/finite_differences/differentiable.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/devito/finite_differences/differentiable.py b/devito/finite_differences/differentiable.py index c780e08bfc..aa29d9c1d7 100644 --- a/devito/finite_differences/differentiable.py +++ b/devito/finite_differences/differentiable.py @@ -7,6 +7,7 @@ import sympy from sympy.core.add import _addsort from sympy.core.mul import _keep_coeff, _mulsort +from sympy.core.core import ordering_of_classes from sympy.core.decorators import call_highest_priority from sympy.core.evalf import evalf_table @@ -668,6 +669,16 @@ def __new__(cls, expr, mapper, **kwargs): def _hashable_content(self): return super()._hashable_content() + (self.mapper,) + def compare(self, other): + if self is other: + return 0 + n1 = self.__class__ + n2 = other.__class__ + if n1.__name__ == n2.__name__: + return self.base.compare(other.base) + else: + return super().compare(other) + @cached_property def base(self): return self.expr.func(*[a for a in self.expr.args if a is not self.weights]) @@ -700,6 +711,10 @@ def _evaluate(self, **kwargs): return expr +ordering_of_classes.insert(ordering_of_classes.index('Derivative') + 1, + 'IndexDerivative') + + class EvalDerivative(DifferentiableOp, sympy.Add): is_commutative = True From 37028733202383e411543bfd73828a75cd134146 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 19 Sep 2023 08:08:52 +0000 Subject: [PATCH 75/79] compiler: Improve StencilDimension interface --- .../finite_differences/finite_difference.py | 5 ++++- devito/finite_differences/tools.py | 5 +---- devito/ir/clusters/algorithms.py | 2 +- devito/ir/clusters/cluster.py | 6 +++--- devito/passes/clusters/aliases.py | 3 ++- devito/types/dimension.py | 20 +++++++++++++------ 6 files changed, 25 insertions(+), 16 deletions(-) diff --git a/devito/finite_differences/finite_difference.py b/devito/finite_differences/finite_difference.py index cd42864173..44859b0903 100644 --- a/devito/finite_differences/finite_difference.py +++ b/devito/finite_differences/finite_difference.py @@ -239,7 +239,10 @@ def make_derivative(expr, dim, fd_order, deriv_order, side, matvec, x0, symbolic # For homogeneity, always generate e.g. `x + i0` rather than `x - i0` # for transpose and `x + i0` for direct indices = indices.transpose() - weights = weights._subs(indices.free_dim, -indices.free_dim) + + # Do the same for the Weights, though this is more than just a + # transposition, we also must switch to the transposed StencilDimension + weights = weights._subs(weights.dimension, -indices.free_dim) # Inject the StencilDimension # E.g. `x + i*h_x` into `f(x)` s.t. `f(x + i*h_x)` diff --git a/devito/finite_differences/tools.py b/devito/finite_differences/tools.py index 5ec3629403..e1f209c4d6 100644 --- a/devito/finite_differences/tools.py +++ b/devito/finite_differences/tools.py @@ -203,10 +203,7 @@ def transpose(self): """ indices = tuple(reversed(self)) - free_dim = StencilDimension(self.free_dim.name, - -self.free_dim._max, - -self.free_dim._min, - backward=True) + free_dim = self.free_dim.transpose() try: expr = self.expr._subs(self.free_dim, -free_dim) diff --git a/devito/ir/clusters/algorithms.py b/devito/ir/clusters/algorithms.py index a1b068fc05..91a145d966 100644 --- a/devito/ir/clusters/algorithms.py +++ b/devito/ir/clusters/algorithms.py @@ -64,7 +64,7 @@ def impose_total_ordering(clusters): try: relations = {tuple(sorted(c.ispace.itdims, key=key))} except ValueError: - # See issue #X + # See issue #2204 relations = c.ispace.relations ispace = c.ispace.reorder(relations=relations, mode='total') diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index 6d37629426..bd5a820f30 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -488,12 +488,12 @@ def tailor_properties(properties, ispace): Create a new Properties object off `properties` that retains all and only the iteration dimensions in `ispace`. """ - for d in ispace.itdims: - properties = properties.add(d) - for i in properties: for d in as_tuple(i): if d not in ispace.itdims: properties = properties.drop(d) + for d in ispace.itdims: + properties = properties.add(d) + return properties diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index 5db4a787c2..bfa1014748 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -996,7 +996,8 @@ def pick_best(variants): # Magic sauce # The coefficients were obtained empirically studying the behaviour - # of different variants in several kernels + # of different variants in several kernels and platforms + # Intuitively, it's like trading 70 operations for 1 temporary ws_curve = lambda x: (-1 / 70)*x if delta_ws <= ws_curve(delta_flops): diff --git a/devito/types/dimension.py b/devito/types/dimension.py index b73d557f27..61d82277da 100644 --- a/devito/types/dimension.py +++ b/devito/types/dimension.py @@ -1403,9 +1403,9 @@ class StencilDimension(BasicDimension): is_Stencil = True __rargs__ = BasicDimension.__rargs__ + ('_min', '_max') - __rkwargs__ = BasicDimension.__rkwargs__ + ('backward',) + __rkwargs__ = BasicDimension.__rkwargs__ + ('step',) - def __init_finalize__(self, name, _min, _max, spacing=None, backward=False, + def __init_finalize__(self, name, _min, _max, spacing=None, step=1, **kwargs): self._spacing = sympy.sympify(spacing) or sympy.S.One @@ -1415,20 +1415,25 @@ def __init_finalize__(self, name, _min, _max, spacing=None, backward=False, raise ValueError("Expected integer `max` (got %s)" % _max) if not is_integer(self._spacing): raise ValueError("Expected integer `spacing` (got %s)" % self._spacing) + if not is_integer(step): + raise ValueError("Expected integer `step` (got %s)" % step) self._min = _min self._max = _max + self._step = step self._size = _max - _min + 1 - self._backward = backward - if self._size < 1: raise ValueError("Expected size greater than 0 (got %s)" % self._size) - @cached_property + @property + def step(self): + return self._step + + @property def backward(self): - return self._backward + return self.step < 0 @cached_property def symbolic_size(self): @@ -1446,6 +1451,9 @@ def symbolic_max(self): def range(self): return range(self._min, self._max + 1) + def transpose(self): + return StencilDimension(self.name, -self._max, -self._min, step=-1) + @property def _arg_names(self): return () From bfa73759916f4041f44b9b2f73705f795245bfe8 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 6 Oct 2023 13:25:25 +0000 Subject: [PATCH 76/79] compiler: Remove post-rebase redundant meth --- devito/ir/support/properties.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/devito/ir/support/properties.py b/devito/ir/support/properties.py index a7113a4833..e9317ba1be 100644 --- a/devito/ir/support/properties.py +++ b/devito/ir/support/properties.py @@ -209,9 +209,6 @@ def is_parallel(self, dims): return any(len(self[d] & {PARALLEL, PARALLEL_INDEP}) > 0 for d in as_tuple(dims)) - def is_parallel_atomic(self, dims): - return any(len(self[d] & {PARALLEL_IF_ATOMIC}) > 0 for d in as_tuple(dims)) - def is_parallel_relaxed(self, dims): return any(len(self[d] & PARALLELS) > 0 for d in as_tuple(dims)) From 95807463420390803e32411b604658432260af86 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 9 Oct 2023 07:27:45 +0000 Subject: [PATCH 77/79] compiler: Fix IndexDerivative lowering --- devito/passes/clusters/derivatives.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/devito/passes/clusters/derivatives.py b/devito/passes/clusters/derivatives.py index 222b665af4..58990ca9e9 100644 --- a/devito/passes/clusters/derivatives.py +++ b/devito/passes/clusters/derivatives.py @@ -74,7 +74,7 @@ def _core(expr, c, weights, mapper, sregistry): try: w = weights[k] except KeyError: - w = weights[k] = w0._rebuild(name=name) + w = weights[k] = w0._rebuild(name=name, dtype=expr.dtype) expr = uxreplace(expr, {w0.indexed: w.indexed}) dims = retrieve_dimensions(expr, deep=True) @@ -94,7 +94,7 @@ def _core(expr, c, weights, mapper, sregistry): ispace = IterationSpace.union(c.ispace, ispace0, relations=extra) name = sregistry.make_name(prefix='r') - s = Symbol(name=name, dtype=c.dtype) + s = Symbol(name=name, dtype=w.dtype) expr0 = Eq(s, 0.) ispace1 = ispace.project(lambda d: d is not dims[-1]) processed.insert(0, c.rebuild(exprs=expr0, ispace=ispace1)) From b4264b1f46ac9b540b06853a8d684913345e6687 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 9 Oct 2023 08:40:39 +0000 Subject: [PATCH 78/79] compiler: Restrict fusion of FetchUpdates --- devito/passes/clusters/misc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/devito/passes/clusters/misc.py b/devito/passes/clusters/misc.py index dd349a13d6..1153342733 100644 --- a/devito/passes/clusters/misc.py +++ b/devito/passes/clusters/misc.py @@ -191,13 +191,13 @@ def _key(self, c): mapper = defaultdict(set) for k, v in i.items(): for s in v: - if isinstance(s, (FetchUpdate, PrefetchUpdate)) or \ + if isinstance(s, PrefetchUpdate) or \ (not self.fusetasks and isinstance(s, WaitLock)): # NOTE: A mix of Clusters w/ and w/o WaitLocks can safely # be fused, as in the worst case scenario the WaitLocks # get "hoisted" above the first Cluster in the sequence continue - elif (isinstance(s, (WaitLock, ReleaseLock)) or + elif (isinstance(s, (FetchUpdate, WaitLock, ReleaseLock)) or (self.fusetasks and isinstance(s, WithLock))): mapper[k].add(type(s)) else: From c1ebe2f0f3d40a6db1b77bff3da084b126f96460 Mon Sep 17 00:00:00 2001 From: Mathias Louboutin Date: Wed, 11 Oct 2023 13:59:13 -0400 Subject: [PATCH 79/79] api: cleanup fd transpose implementation --- .../finite_differences/finite_difference.py | 14 +++-------- devito/finite_differences/tools.py | 25 ++++++------------- devito/passes/clusters/derivatives.py | 2 +- 3 files changed, 12 insertions(+), 29 deletions(-) diff --git a/devito/finite_differences/finite_difference.py b/devito/finite_differences/finite_difference.py index 44859b0903..5749a82ae4 100644 --- a/devito/finite_differences/finite_difference.py +++ b/devito/finite_differences/finite_difference.py @@ -220,10 +220,11 @@ def make_derivative(expr, dim, fd_order, deriv_order, side, matvec, x0, symbolic weights = numeric_weights(deriv_order, indices, x0) # Enforce fixed precision FD coefficients to avoid variations in results - weights = [sympify(w).evalf(_PRECISION) for w in weights] + weights = [sympify(w).evalf(_PRECISION) for w in weights][::matvec.val] # Transpose the FD, if necessary - indices = indices.scale(matvec.val) + if matvec == transpose: + indices = indices.transpose() # Shift index due to staggering, if any indices = indices.shift(-(expr.indices_ref[dim] - dim)) @@ -235,15 +236,6 @@ def make_derivative(expr, dim, fd_order, deriv_order, side, matvec, x0, symbolic if not expand and indices.expr is not None: weights = Weights(name='w', dimensions=indices.free_dim, initvalue=weights) - if matvec == transpose: - # For homogeneity, always generate e.g. `x + i0` rather than `x - i0` - # for transpose and `x + i0` for direct - indices = indices.transpose() - - # Do the same for the Weights, though this is more than just a - # transposition, we also must switch to the transposed StencilDimension - weights = weights._subs(weights.dimension, -indices.free_dim) - # Inject the StencilDimension # E.g. `x + i*h_x` into `f(x)` s.t. `f(x + i*h_x)` expr = expr._subs(dim, indices.expr) diff --git a/devito/finite_differences/tools.py b/devito/finite_differences/tools.py index e1f209c4d6..7ab19e7664 100644 --- a/devito/finite_differences/tools.py +++ b/devito/finite_differences/tools.py @@ -175,14 +175,14 @@ def __repr__(self): def spacing(self): return self.dim.spacing - def scale(self, v): + def transpose(self): """ - Construct a new IndexSet with all indices scaled by `v`. + Transpose the IndexSet. """ - mapper = {self.spacing: v*self.spacing} + mapper = {self.spacing: -self.spacing} indices = [] - for i in self: + for i in reversed(self): try: iloc = i.xreplace(mapper) except AttributeError: @@ -191,22 +191,13 @@ def scale(self, v): indices.append(iloc) try: - expr = self.expr.xreplace(mapper) + free_dim = self.free_dim.transpose() + mapper.update({self.free_dim: -free_dim}) except AttributeError: - expr = None - - return IndexSet(self.dim, indices, expr=expr, fd=self.free_dim) - - def transpose(self): - """ - Transpose the IndexSet. - """ - indices = tuple(reversed(self)) - - free_dim = self.free_dim.transpose() + free_dim = self.free_dim try: - expr = self.expr._subs(self.free_dim, -free_dim) + expr = self.expr.xreplace(mapper) except AttributeError: expr = None diff --git a/devito/passes/clusters/derivatives.py b/devito/passes/clusters/derivatives.py index 58990ca9e9..01e5afea6a 100644 --- a/devito/passes/clusters/derivatives.py +++ b/devito/passes/clusters/derivatives.py @@ -102,7 +102,7 @@ def _core(expr, c, weights, mapper, sregistry): # Transform e.g. `w[i0] -> w[i0 + 2]` for alignment with the # StencilDimensions starting points subs = {expr.weights: - expr.weights.subs(d, d - (d._max if d.backward else d._min)) + expr.weights.subs(d, d - d._min) for d in dims} expr1 = Inc(s, uxreplace(expr.expr, subs)) processed.append(c.rebuild(exprs=expr1, ispace=ispace))