From f3e5b401670c3d43a19a8f99c2aed1b632a1b546 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 19 Oct 2023 14:02:44 +0000 Subject: [PATCH 01/12] compiler: Introduce Fence mixin --- devito/ir/support/basic.py | 8 ++++---- devito/types/misc.py | 36 ++++++++++++++++++++++++++++++++++-- devito/types/parallel.py | 6 +++--- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index 21b3527c06..139e72f77b 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -10,8 +10,8 @@ q_constant, q_affine, q_routine, search, uxreplace) from devito.tools import (Tag, as_mapper, as_tuple, is_integer, filter_sorted, flatten, memoized_meth, memoized_generator) -from devito.types import (Barrier, ComponentAccess, Dimension, DimensionTuple, - Function, Jump, Symbol, Temp, TempArray, TBArray) +from devito.types import (ComponentAccess, Dimension, DimensionTuple, Fence, + Function, Symbol, Temp, TempArray, TBArray) __all__ = ['IterationInstance', 'TimedAccess', 'Scope', 'ExprGeometry'] @@ -849,7 +849,7 @@ def writes_gen(self): # Objects altering the control flow (e.g., synchronization barriers, # break statements, ...) are converted into mock dependences for i, e in enumerate(self.exprs): - if isinstance(e.rhs, (Barrier, Jump)): + if isinstance(e.rhs, Fence): yield TimedAccess(mocksym, 'W', i, e.ispace) @cached_property @@ -907,7 +907,7 @@ def reads_implicit_gen(self): # Objects altering the control flow (e.g., synchronization barriers, # break statements, ...) 
are converted into mock dependences for i, e in enumerate(self.exprs): - if isinstance(e.rhs, (Barrier, Jump)): + if isinstance(e.rhs, Fence): yield TimedAccess(mocksym, 'R', max(i, 0), e.ispace) yield TimedAccess(mocksym, 'R', i+1, e.ispace) diff --git a/devito/types/misc.py b/devito/types/misc.py index ccbd9b7fdc..6755b3ef16 100644 --- a/devito/types/misc.py +++ b/devito/types/misc.py @@ -7,7 +7,7 @@ from devito.types.basic import IndexedData from devito.tools import Pickable, as_tuple -__all__ = ['Timer', 'Pointer', 'VolatileInt', 'FIndexed', 'Wildcard', +__all__ = ['Timer', 'Pointer', 'VolatileInt', 'FIndexed', 'Wildcard', 'Fence', 'Global', 'Hyperplane', 'Indirection', 'Temp', 'TempArray', 'Jump'] @@ -194,7 +194,39 @@ class TempArray(Array): pass -class Jump(object): +class Fence(object): + + """ + Mixin class for generic "fence" objects. + + A Fence is an object that enforces an ordering constraint on the + surrounding operations: the operations issued before the Fence are + guaranteed to be scheduled before operations issued after the Fence. + + The meaning of "operation" and its relationship with the concept of + termination depends on the actual Fence subclass. + + For example, operations could be Eq's. A Fence will definitely impair + topological sorting such that, e.g. + + Eq(A) + Fence + Eq(B) + + *cannot* get transformed into + + Eq(A) + Eq(B) + Fence + + However, a simple Fence won't dictate whether or not Eq(A) should also + terminate before Eq(B). 
+ """ + + pass + + +class Jump(Fence): """ Mixin class for symbolic objects representing jumps in the control flow, diff --git a/devito/types/parallel.py b/devito/types/parallel.py index 491839779d..472ee9e781 100644 --- a/devito/types/parallel.py +++ b/devito/types/parallel.py @@ -18,7 +18,7 @@ from devito.types.array import Array, ArrayObject from devito.types.basic import Scalar, Symbol from devito.types.dimension import CustomDimension -from devito.types.misc import VolatileInt +from devito.types.misc import Fence, VolatileInt __all__ = ['NThreads', 'NThreadsNested', 'NThreadsNonaffine', 'NThreadsBase', 'DeviceID', 'ThreadID', 'Lock', 'PThreadArray', 'SharedData', @@ -287,10 +287,10 @@ def __new__(cls, *args, **kwargs): return super().__new__(cls, *args, **kwargs) -class Barrier(object): +class Barrier(Fence): """ - Mixin class for symbolic objects representing synchronization barriers. + A generic synchronization barrier for threads or processes. """ pass From eeb28518b66c1bdfbb2e83136985f98e3440011b Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 19 Oct 2023 14:08:13 +0000 Subject: [PATCH 02/12] compiler: Introduce Nop for no-op eqns --- devito/types/misc.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/devito/types/misc.py b/devito/types/misc.py index 6755b3ef16..3a35145fe8 100644 --- a/devito/types/misc.py +++ b/devito/types/misc.py @@ -1,6 +1,7 @@ from ctypes import c_double, c_void_p import numpy as np +import sympy from sympy.core.core import ordering_of_classes from devito.types import Array, CompositeObject, Indexed, Symbol @@ -8,7 +9,8 @@ from devito.tools import Pickable, as_tuple __all__ = ['Timer', 'Pointer', 'VolatileInt', 'FIndexed', 'Wildcard', 'Fence', - 'Global', 'Hyperplane', 'Indirection', 'Temp', 'TempArray', 'Jump'] + 'Global', 'Hyperplane', 'Indirection', 'Temp', 'TempArray', 'Jump', + 'Nop'] class Timer(CompositeObject): @@ -234,3 +236,10 @@ class Jump(Fence): """ pass + + +Nop = 
sympy.Function('NOP') +""" +A wildcard for use in the RHS of Eqs that encode some kind of semantics +(e.g., a synchronization operation) but no computation. +""" From 28d45a034bb9a01b4b7f83224dba9d2c11954048 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 19 Oct 2023 14:14:13 +0000 Subject: [PATCH 03/12] compiler: Add WeakFence --- devito/types/misc.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/devito/types/misc.py b/devito/types/misc.py index 3a35145fe8..4c7ec687a2 100644 --- a/devito/types/misc.py +++ b/devito/types/misc.py @@ -10,7 +10,7 @@ __all__ = ['Timer', 'Pointer', 'VolatileInt', 'FIndexed', 'Wildcard', 'Fence', 'Global', 'Hyperplane', 'Indirection', 'Temp', 'TempArray', 'Jump', - 'Nop'] + 'Nop', 'WeakFence'] class Timer(CompositeObject): @@ -238,6 +238,17 @@ class Jump(Fence): pass +class WeakFence(sympy.Function, Fence): + + """ + A Fence impairing topological sorting while not imposing constraints + on the termination of the potentially asynchronous operations initiated + before or after the fence. 
+ """ + + pass + + Nop = sympy.Function('NOP') """ A wildcard for use in the RHS of Eqs that encode some kind of semantics From feba247c76b563026eab0a0e334fc840a19ef478 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 19 Oct 2023 14:44:49 +0000 Subject: [PATCH 04/12] compiler: Tweak Nop and weak_fence --- devito/types/misc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/devito/types/misc.py b/devito/types/misc.py index 4c7ec687a2..3a6ca01efb 100644 --- a/devito/types/misc.py +++ b/devito/types/misc.py @@ -10,7 +10,7 @@ __all__ = ['Timer', 'Pointer', 'VolatileInt', 'FIndexed', 'Wildcard', 'Fence', 'Global', 'Hyperplane', 'Indirection', 'Temp', 'TempArray', 'Jump', - 'Nop', 'WeakFence'] + 'nop', 'WeakFence'] class Timer(CompositeObject): @@ -249,7 +249,7 @@ class WeakFence(sympy.Function, Fence): pass -Nop = sympy.Function('NOP') +nop = sympy.Function('NOP') """ A wildcard for use in the RHS of Eqs that encode some kind of semantics (e.g., a synchronization operation) but no computation. From 685689cab9e88aa25f28d594f5fdf87480d2592a Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 19 Oct 2023 15:06:55 +0000 Subject: [PATCH 05/12] compiler: Patch DDA over Fences --- devito/ir/support/basic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index 139e72f77b..2151d9c96d 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -908,8 +908,8 @@ def reads_implicit_gen(self): # break statements, ...) 
are converted into mock dependences for i, e in enumerate(self.exprs): if isinstance(e.rhs, Fence): - yield TimedAccess(mocksym, 'R', max(i, 0), e.ispace) - yield TimedAccess(mocksym, 'R', i+1, e.ispace) + for j in range(len(self.exprs)): + yield TimedAccess(mocksym, 'R', j, e.ispace) @memoized_generator def reads_gen(self): From 21b418e816000a92bd9612efefe1a42c55af8ac8 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 19 Oct 2023 15:38:12 +0000 Subject: [PATCH 06/12] compiler: Refine lowering of Wild Clusters --- devito/ir/clusters/cluster.py | 23 +++++++++++++++++++---- devito/ir/clusters/visitors.py | 4 ++-- devito/ir/stree/algorithms.py | 11 +++++++++-- devito/passes/clusters/misc.py | 2 +- 4 files changed, 31 insertions(+), 9 deletions(-) diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index 83ee5a45d4..0ef9cff01c 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -12,6 +12,7 @@ from devito.mpi.halo_scheme import HaloScheme, HaloTouch from devito.symbolics import estimate_cost from devito.tools import as_tuple, flatten, frozendict, infer_dtype +from devito.types import Fence __all__ = ["Cluster", "ClusterGroup"] @@ -191,12 +192,14 @@ def grid(self): @cached_property def is_dense(self): """ - A Cluster is dense if at least one of the following conditions is True: + True if at least one of the following conditions are True: * It is defined over a unique Grid and all of the Grid Dimensions are PARALLEL. * Only DiscreteFunctions are written and only affine index functions are used (e.g., `a[x+1, y-2]` is OK, while `a[b[x], y-2]` is not) + + False in all other cases. """ # Hopefully it's got a unique Grid and all Dimensions are PARALLEL (or # at most PARALLEL_IF_PVT). 
This is a quick and easy check so we try it first @@ -212,22 +215,34 @@ def is_dense(self): # Fallback to legacy is_dense checks return (not any(e.conditionals for e in self.exprs) and not any(f.is_SparseFunction for f in self.functions) and - not self.is_halo_touch and + not self.is_wild and all(a.is_regular for a in self.scope.accesses)) @cached_property def is_sparse(self): """ - A Cluster is sparse if it represents a sparse operation, i.e iff - There's at least one irregular access. + True if it represents a sparse operation, i.e iff there's at least + one irregular access, False otherwise. """ return any(a.is_irregular for a in self.scope.accesses) + @property + def is_wild(self): + """ + True if encoding a non-mathematical operation, False otherwise. + """ + return self.is_halo_touch or self.is_fence + @property def is_halo_touch(self): return (len(self.exprs) > 0 and all(isinstance(e.rhs, HaloTouch) for e in self.exprs)) + @property + def is_fence(self): + return (len(self.exprs) > 0 and + all(isinstance(e.rhs, Fence) for e in self.exprs)) + @cached_property def dtype(self): """ diff --git a/devito/ir/clusters/visitors.py b/devito/ir/clusters/visitors.py index 95aecf1a86..c080dabab6 100644 --- a/devito/ir/clusters/visitors.py +++ b/devito/ir/clusters/visitors.py @@ -198,9 +198,9 @@ def __init__(self, func, mode='dense'): self.func = func if mode == 'dense': - self.cond = lambda c: c.is_dense or not c.is_sparse + self.cond = lambda c: (c.is_dense or not c.is_sparse) and not c.is_wild elif mode == 'sparse': - self.cond = lambda c: c.is_sparse + self.cond = lambda c: c.is_sparse and not c.is_wild else: self.cond = lambda c: True diff --git a/devito/ir/stree/algorithms.py b/devito/ir/stree/algorithms.py index 9383501ea3..24bba112c1 100644 --- a/devito/ir/stree/algorithms.py +++ b/devito/ir/stree/algorithms.py @@ -132,8 +132,13 @@ def stree_build(clusters, profiler=None, **kwargs): def preprocess(clusters, options=None, **kwargs): """ - Remove the HaloTouch's 
from `clusters` and create a mapping associating - each removed HaloTouch to the first Cluster necessitating it. + Lower the so-called "wild" Clusters, that is objects not representing a set + of mathematical operations. This boils down to: + + * Moving the HaloTouch's from `clusters` into a mapper `M: {HT -> C}`. + `c = M(ht)` is the first Cluster of the sequence requiring the halo + exchange `ht` to have terminated before the execution can proceed. + * Remove the WeakFences, as they have served their purpose at this point. """ queue = [] processed = [] @@ -141,6 +146,8 @@ def preprocess(clusters, options=None, **kwargs): if c.is_halo_touch: hs = HaloScheme.union(e.rhs.halo_scheme for e in c.exprs) queue.append(c.rebuild(halo_scheme=hs)) + elif c.is_wild: + continue else: dims = set(c.ispace.promote(lambda d: d.is_Block).itdims) diff --git a/devito/passes/clusters/misc.py b/devito/passes/clusters/misc.py index 1153342733..1d431830c4 100644 --- a/devito/passes/clusters/misc.py +++ b/devito/passes/clusters/misc.py @@ -262,7 +262,7 @@ def dump(): groups, processed = processed, [] for group in groups: - for flag, minigroup in groupby(group, key=lambda c: c.is_halo_touch): + for flag, minigroup in groupby(group, key=lambda c: c.is_wild): if flag: processed.extend([(c,) for c in minigroup]) else: From d3fc517419f0df9afdea38357e1aa1b58c9e98ae Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 20 Oct 2023 12:24:48 +0000 Subject: [PATCH 07/12] compiler: Add Cluster.is_async --- devito/ir/clusters/cluster.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index 0ef9cff01c..05bde51ea7 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -6,9 +6,10 @@ from devito.ir.equations import ClusterizedEq from devito.ir.support import (PARALLEL, PARALLEL_IF_PVT, BaseGuardBoundNext, Forward, Interval, IntervalGroup, IterationSpace, - DataSpace, 
Guards, Properties, Scope, detect_accesses, - detect_io, normalize_properties, normalize_syncs, - minimum, maximum, null_ispace) + DataSpace, Guards, Properties, Scope, WithLock, + PrefetchUpdate, detect_accesses, detect_io, + normalize_properties, normalize_syncs, minimum, + maximum, null_ispace) from devito.mpi.halo_scheme import HaloScheme, HaloTouch from devito.symbolics import estimate_cost from devito.tools import as_tuple, flatten, frozendict, infer_dtype @@ -177,10 +178,6 @@ def functions(self): def has_increments(self): return any(e.is_Increment for e in self.exprs) - @cached_property - def is_scalar(self): - return not any(f.is_Function for f in self.scope.writes) - @cached_property def grid(self): grids = set(f.grid for f in self.functions if f.is_DiscreteFunction) - {None} @@ -189,6 +186,10 @@ def grid(self): else: raise ValueError("Cluster has no unique Grid") + @cached_property + def is_scalar(self): + return not any(f.is_Function for f in self.scope.writes) + @cached_property def is_dense(self): """ @@ -243,6 +244,14 @@ def is_fence(self): return (len(self.exprs) > 0 and all(isinstance(e.rhs, Fence) for e in self.exprs)) + @property + def is_async(self): + """ + True if an asynchronous Cluster, False otherwise. 
+ """ + return any(isinstance(s, (WithLock, PrefetchUpdate)) + for s in flatten(self.syncs.values())) + @cached_property def dtype(self): """ From a042071681283785594948c61155b244333dfb03 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 20 Oct 2023 12:24:59 +0000 Subject: [PATCH 08/12] compiler: Introduce CriticalRegion --- devito/ir/clusters/cluster.py | 16 +++++--- devito/ir/stree/algorithms.py | 7 ++++ devito/ir/support/basic.py | 55 ++++++++++++++++++++++++---- devito/passes/clusters/asynchrony.py | 8 +++- devito/passes/clusters/misc.py | 6 ++- devito/passes/clusters/utils.py | 51 +++++++++++++++++++++++++- devito/types/misc.py | 51 ++++++++++++++++++++++++-- tests/test_ir.py | 4 +- 8 files changed, 175 insertions(+), 23 deletions(-) diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index 05bde51ea7..87044cbb88 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -13,7 +13,7 @@ from devito.mpi.halo_scheme import HaloScheme, HaloTouch from devito.symbolics import estimate_cost from devito.tools import as_tuple, flatten, frozendict, infer_dtype -from devito.types import Fence +from devito.types import Fence, WeakFence, CriticalRegion __all__ = ["Cluster", "ClusterGroup"] @@ -236,13 +236,19 @@ def is_wild(self): @property def is_halo_touch(self): - return (len(self.exprs) > 0 and - all(isinstance(e.rhs, HaloTouch) for e in self.exprs)) + return self.exprs and all(isinstance(e.rhs, HaloTouch) for e in self.exprs) @property def is_fence(self): - return (len(self.exprs) > 0 and - all(isinstance(e.rhs, Fence) for e in self.exprs)) + return self.is_weak_fence or self.is_critical_region + + @property + def is_weak_fence(self): + return self.exprs and all(isinstance(e.rhs, WeakFence) for e in self.exprs) + + @property + def is_critical_region(self): + return self.exprs and all(isinstance(e.rhs, CriticalRegion) for e in self.exprs) @property def is_async(self): diff --git a/devito/ir/stree/algorithms.py 
b/devito/ir/stree/algorithms.py index 24bba112c1..6b4149cced 100644 --- a/devito/ir/stree/algorithms.py +++ b/devito/ir/stree/algorithms.py @@ -138,6 +138,11 @@ def preprocess(clusters, options=None, **kwargs): * Moving the HaloTouch's from `clusters` into a mapper `M: {HT -> C}`. `c = M(ht)` is the first Cluster of the sequence requiring the halo exchange `ht` to have terminated before the execution can proceed. + * Lower the CriticalRegions: + * If they encode an asynchronous operation (e.g., a WaitLock), attach + it to a Nop Cluster for future lowering; + * Otherwise, simply remove them, as they have served their purpose + at this point. * Remove the WeakFences, as they have served their purpose at this point. """ queue = [] @@ -146,6 +151,8 @@ def preprocess(clusters, options=None, **kwargs): if c.is_halo_touch: hs = HaloScheme.union(e.rhs.halo_scheme for e in c.exprs) queue.append(c.rebuild(halo_scheme=hs)) + elif c.is_critical_region and c.syncs: + processed.append(c.rebuild(exprs=None, syncs=c.syncs)) elif c.is_wild: continue else: diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index 2151d9c96d..16e6f66064 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -11,7 +11,8 @@ from devito.tools import (Tag, as_mapper, as_tuple, is_integer, filter_sorted, flatten, memoized_meth, memoized_generator) from devito.types import (ComponentAccess, Dimension, DimensionTuple, Fence, - Function, Symbol, Temp, TempArray, TBArray) + CriticalRegion, Function, Symbol, Temp, TempArray, + TBArray) __all__ = ['IterationInstance', 'TimedAccess', 'Scope', 'ExprGeometry'] @@ -23,10 +24,9 @@ class IndexMode(Tag): REGULAR = IndexMode('regular') IRREGULAR = IndexMode('irregular') -mocksym = Symbol(name='⋈') -""" -A Symbol to create mock data depdendencies. 
-""" +# Symbols to create mock data depdendencies +mocksym0 = Symbol(name='__⋈_0__') +mocksym1 = Symbol(name='__⋈_1__') class IterationInstance(LabeledVector): @@ -848,9 +848,21 @@ def writes_gen(self): # Objects altering the control flow (e.g., synchronization barriers, # break statements, ...) are converted into mock dependences + + # Fences (any sort) cannot float around upon topological sorting for i, e in enumerate(self.exprs): if isinstance(e.rhs, Fence): - yield TimedAccess(mocksym, 'W', i, e.ispace) + yield TimedAccess(mocksym0, 'W', i, e.ispace) + + # CriticalRegions are stronger than plain Fences. + # We must also ensure that none of the Eqs within an opening-closing + # CriticalRegion pair floats outside upon topological sorting + for i, e in enumerate(self.exprs): + if isinstance(e.rhs, CriticalRegion) and e.rhs.opening: + for j, e1 in enumerate(self.exprs[i+1:], 1): + if isinstance(e1.rhs, CriticalRegion) and e.rhs.closing: + break + yield TimedAccess(mocksym1, 'W', i+j, e1.ispace) @cached_property def writes(self): @@ -906,10 +918,39 @@ def reads_implicit_gen(self): # Objects altering the control flow (e.g., synchronization barriers, # break statements, ...) are converted into mock dependences + + # Fences (any sort) cannot float around upon topological sorting for i, e in enumerate(self.exprs): if isinstance(e.rhs, Fence): for j in range(len(self.exprs)): - yield TimedAccess(mocksym, 'R', j, e.ispace) + yield TimedAccess(mocksym0, 'R', j, e.ispace) + break + + # CriticalRegions are stronger than plain Fences. 
+ # We must also ensure that none of the Eqs within an opening-closing + # CriticalRegion pair floats outside upon topological sorting + for i, e in enumerate(self.exprs): + # Prevent floating before the opening + if isinstance(e.rhs, CriticalRegion) and e.rhs.opening: + for j, e1 in enumerate(reversed(self.exprs[:i]), 1): + if isinstance(e1.rhs, CriticalRegion) and e.rhs.closing: + break + yield TimedAccess(mocksym1, 'R', i-j, e1.ispace) + + # Ensure that "weak uses" of Scope (e.g., when the caller looks + # just at the written/read objects rather than the actual data + # dependencies) are shielded too + for e1 in self.exprs[i+1:]: + if isinstance(e1.rhs, CriticalRegion) and e.rhs.closing: + break + yield TimedAccess(e1.lhs, 'R', i, e.ispace) + + # Prevent floating after the closing + if isinstance(e.rhs, CriticalRegion) and e.rhs.closing: + for j, e1 in enumerate(self.exprs[i+1:], 1): + if isinstance(e1.rhs, CriticalRegion) and e.rhs.opening: + break + yield TimedAccess(mocksym1, 'R', i+j, e1.ispace) @memoized_generator def reads_gen(self): diff --git a/devito/passes/clusters/asynchrony.py b/devito/passes/clusters/asynchrony.py index f839363f4d..8508588090 100644 --- a/devito/passes/clusters/asynchrony.py +++ b/devito/passes/clusters/asynchrony.py @@ -4,7 +4,7 @@ from devito.ir import (Forward, GuardBoundNext, Queue, Vector, WaitLock, WithLock, FetchUpdate, PrefetchUpdate, ReleaseLock, normalize_syncs) -from devito.passes.clusters.utils import is_memcpy +from devito.passes.clusters.utils import bind_critical_regions, is_memcpy from devito.symbolics import IntDiv, uxreplace from devito.tools import OrderedSet, is_integer, timed_pass from devito.types import CustomDimension, Lock @@ -139,6 +139,12 @@ def callback(self, clusters, prefix): tasks[c0].append(ReleaseLock(lock[i], target)) tasks[c0].append(WithLock(lock[i], target, i, function, findex, d)) + # CriticalRegions preempt WaitLocks, by definition + mapper = bind_critical_regions(clusters) + for c in clusters: 
+ for c1 in mapper.get(c, []): + waits[c].update(waits.pop(c1, [])) + processed = [] for c in clusters: if waits[c] or tasks[c]: diff --git a/devito/passes/clusters/misc.py b/devito/passes/clusters/misc.py index 1d431830c4..d8017af3c6 100644 --- a/devito/passes/clusters/misc.py +++ b/devito/passes/clusters/misc.py @@ -5,6 +5,7 @@ from devito.ir.clusters import Cluster, ClusterGroup, Queue, cluster_pass from devito.ir.support import (SEQUENTIAL, SEPARABLE, Scope, ReleaseLock, WaitLock, WithLock, FetchUpdate, PrefetchUpdate) +from devito.passes.clusters.utils import in_critical_region from devito.symbolics import pow_to_mul from devito.tools import DAG, Stamp, as_tuple, flatten, frozendict, timed_pass from devito.types import Hyperplane @@ -44,8 +45,9 @@ def callback(self, clusters, prefix): processed.append(c) continue - # Synchronization operations prevent lifting - if c.syncs.get(dim): + # Synchronization prevents lifting + if c.syncs.get(dim) or \ + in_critical_region(c, clusters): processed.append(c) continue diff --git a/devito/passes/clusters/utils.py b/devito/passes/clusters/utils.py index bb27859471..e3244976e9 100644 --- a/devito/passes/clusters/utils.py +++ b/devito/passes/clusters/utils.py @@ -1,7 +1,13 @@ +from collections import defaultdict +from itertools import groupby + +from devito.ir import Cluster from devito.symbolics import uxreplace -from devito.types import Symbol, Wildcard +from devito.tools import as_tuple, flatten +from devito.types import CriticalRegion, Eq, Symbol, Wildcard -__all__ = ['makeit_ssa', 'is_memcpy'] +__all__ = ['makeit_ssa', 'is_memcpy', 'make_critical_sequence', + 'bind_critical_regions', 'in_critical_region'] def makeit_ssa(exprs): @@ -48,3 +54,44 @@ def is_memcpy(expr): return False return a.function.is_Array or b.function.is_Array + + +def make_critical_sequence(ispace, sequence): + sequence = as_tuple(sequence) + assert len(sequence) >= 1 + + processed = [] + + # Opening + expr = Eq(Symbol(name='⋈'), 
CriticalRegion(True)) + processed.append(Cluster(exprs=expr, ispace=ispace)) + + processed.extend(sequence) + + # Closing + expr = Eq(Symbol(name='⋈'), CriticalRegion(False)) + processed.append(Cluster(exprs=expr, ispace=ispace)) + + return processed + + +def bind_critical_regions(clusters): + """ + A mapper from CriticalRegions to the critical sequences they open. + """ + critical_region = False + mapper = defaultdict(list) + for c in clusters: + if c.is_critical_region: + critical_region = not critical_region and c + elif critical_region: + mapper[critical_region].append(c) + return mapper + + +def in_critical_region(cluster, clusters): + """ + True if `cluster` is part of a critical sequence, False otherwise. + """ + mapper = bind_critical_regions(clusters) + return cluster in flatten(mapper.values()) diff --git a/devito/types/misc.py b/devito/types/misc.py index 3a6ca01efb..88d5dcae13 100644 --- a/devito/types/misc.py +++ b/devito/types/misc.py @@ -10,7 +10,7 @@ __all__ = ['Timer', 'Pointer', 'VolatileInt', 'FIndexed', 'Wildcard', 'Fence', 'Global', 'Hyperplane', 'Indirection', 'Temp', 'TempArray', 'Jump', - 'nop', 'WeakFence'] + 'nop', 'WeakFence', 'CriticalRegion'] class Timer(CompositeObject): @@ -241,14 +241,57 @@ class Jump(Fence): class WeakFence(sympy.Function, Fence): """ - A Fence impairing topological sorting while not imposing constraints - on the termination of the potentially asynchronous operations initiated - before or after the fence. + The weakest of all possible fences. + + Equations cannot be moved across a WeakFence. + However an operation initiated before a WeakFence can terminate at any + point in time. """ pass +class CriticalRegion(sympy.Function, Fence): + + """ + A fence that either opens or closes a "critical sequence of Equations". 
+ There always are two CriticalRegions for each critical sequence of Equations: + * `CriticalRegion(init)`: opens the critical sequence + * `CriticalRegion(end)`: closes the critical sequence + `CriticalRegion(end)` must follow `CriticalRegion(init)`. + A CriticalRegion implements a strong form of fencing: + * Equations within a critical sequence cannot be moved outside of + the opening and closing CriticalRegions. + * However, internal rearrangements are possible + * An asynchronous operation initiated within the critical sequence must + terminate before re-entering the opening CriticalRegion. + """ + def __init__(self, opening, **kwargs): + opening = bool(opening) + sympy.Function.__init__(opening) + self.opening = opening + def __repr__(self): + return "%s(%s)" % (self.__class__.__name__, + 'OPEN' if self.opening else 'CLOSE') + __str__ = __repr__ + def _sympystr(self, printer): + return str(self) + @property + def closing(self): + return not self.opening + nop = sympy.Function('NOP') """ A wildcard for use in the RHS of Eqs that encode some kind of semantics diff --git a/tests/test_ir.py b/tests/test_ir.py index ac2977d15f..6445cd53d8 100644 --- a/tests/test_ir.py +++ b/tests/test_ir.py @@ -9,7 +9,7 @@ from devito.ir.equations.algorithms import dimension_sort from devito.ir.iet import Iteration, FindNodes from devito.ir.support.basic import (IterationInstance, TimedAccess, Scope, - Vector, AFFINE, REGULAR, IRREGULAR, mocksym) + Vector, AFFINE, REGULAR, IRREGULAR, mocksym0) from devito.ir.support.space import (NullInterval, Interval, Forward, Backward, IterationSpace) from devito.ir.support.guards import GuardOverflow @@ -746,7 +746,7 @@ class Foo(DefFunction, Jump): assert len(scope.d_flow) == 3 assert len(scope.d_anti) == 0 assert any(v.function is f for v in scope.d_flow) - assert any(v.function is mocksym for v in scope.d_flow) + assert any(v.function is mocksym0 for v in scope.d_flow) def test_indirect_access(self): grid = 
Grid(shape=(4, 4)) From 4d39624659f7bb20b3c2770a54c1d11e66df007f Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 23 Oct 2023 14:52:35 +0000 Subject: [PATCH 09/12] compiler: Simplify DDA across CriticalRegions --- devito/ir/support/basic.py | 47 ++++++++++++---------------- tests/test_ir.py | 64 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 82 insertions(+), 29 deletions(-) diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index 16e6f66064..e218482f4e 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -860,7 +860,7 @@ def writes_gen(self): for i, e in enumerate(self.exprs): if isinstance(e.rhs, CriticalRegion) and e.rhs.opening: for j, e1 in enumerate(self.exprs[i+1:], 1): - if isinstance(e1.rhs, CriticalRegion) and e.rhs.closing: + if isinstance(e1.rhs, CriticalRegion) and e1.rhs.closing: break yield TimedAccess(mocksym1, 'W', i+j, e1.ispace) @@ -916,41 +916,32 @@ def reads_implicit_gen(self): for i in symbols: yield TimedAccess(i, 'R', -1) + @memoized_generator + def reads_synchro_gen(self): + """ + Generate all reads due to syncronization operations. These may be explicit + or implicit. + """ # Objects altering the control flow (e.g., synchronization barriers, # break statements, ...) are converted into mock dependences # Fences (any sort) cannot float around upon topological sorting for i, e in enumerate(self.exprs): if isinstance(e.rhs, Fence): - for j in range(len(self.exprs)): - yield TimedAccess(mocksym0, 'R', j, e.ispace) - break + if i > 0: + yield TimedAccess(mocksym0, 'R', i-1, e.ispace) + if i < len(self.exprs)-1: + yield TimedAccess(mocksym0, 'R', i+1, e.ispace) # CriticalRegions are stronger than plain Fences. 
# We must also ensure that none of the Eqs within an opening-closing # CriticalRegion pair floats outside upon topological sorting for i, e in enumerate(self.exprs): - # Prevent floating before the opening - if isinstance(e.rhs, CriticalRegion) and e.rhs.opening: - for j, e1 in enumerate(reversed(self.exprs[:i]), 1): - if isinstance(e1.rhs, CriticalRegion) and e.rhs.closing: - break - yield TimedAccess(mocksym1, 'R', i-j, e1.ispace) - - # Ensure that "weak uses" of Scope (e.g., when the caller looks - # just at the written/read objects rather than the actual data - # dependencies) are shielded too - for e1 in self.exprs[i+1:]: - if isinstance(e1.rhs, CriticalRegion) and e.rhs.closing: - break - yield TimedAccess(e1.lhs, 'R', i, e.ispace) - - # Prevent floating after the closing - if isinstance(e.rhs, CriticalRegion) and e.rhs.closing: - for j, e1 in enumerate(self.exprs[i+1:], 1): - if isinstance(e1.rhs, CriticalRegion) and e.rhs.opening: - break - yield TimedAccess(mocksym1, 'R', i+j, e1.ispace) + if isinstance(e.rhs, CriticalRegion): + if e.rhs.opening and i > 0: + yield TimedAccess(mocksym1, 'R', i-1, self.exprs[i-1].ispace) + elif e.rhs.closing and i < len(self.exprs)-1: + yield TimedAccess(mocksym1, 'R', i+1, self.exprs[i+1].ispace) @memoized_generator def reads_gen(self): @@ -961,7 +952,9 @@ def reads_gen(self): # is efficiency. Sometimes we wish to extract all reads to a given # AbstractFunction, and we know that by construction these can't # appear among the implicit reads - return chain(self.reads_explicit_gen(), self.reads_implicit_gen()) + return chain(self.reads_explicit_gen(), + self.reads_synchro_gen(), + self.reads_implicit_gen()) @memoized_generator def reads_smart_gen(self, f): @@ -980,7 +973,7 @@ def reads_smart_gen(self, f): the iteration symbols. 
""" if isinstance(f, (Function, Temp, TempArray, TBArray)): - for i in self.reads_explicit_gen(): + for i in chain(self.reads_explicit_gen(), self.reads_synchro_gen()): if f is i.function: for j in extrema(i.access): yield TimedAccess(j, i.mode, i.timestamp, i.ispace) diff --git a/tests/test_ir.py b/tests/test_ir.py index 6445cd53d8..f64ac08ae8 100644 --- a/tests/test_ir.py +++ b/tests/test_ir.py @@ -9,13 +9,14 @@ from devito.ir.equations.algorithms import dimension_sort from devito.ir.iet import Iteration, FindNodes from devito.ir.support.basic import (IterationInstance, TimedAccess, Scope, - Vector, AFFINE, REGULAR, IRREGULAR, mocksym0) + Vector, AFFINE, REGULAR, IRREGULAR, mocksym0, + mocksym1) from devito.ir.support.space import (NullInterval, Interval, Forward, Backward, IterationSpace) from devito.ir.support.guards import GuardOverflow from devito.symbolics import DefFunction, FieldFromPointer, ccode from devito.tools import prod -from devito.types import Array, Jump, Scalar, Symbol +from devito.types import Array, CriticalRegion, Jump, Scalar, Symbol class TestVectorHierarchy(object): @@ -826,6 +827,65 @@ def test_dep_nasty(self, eqns): scope = Scope(eqns) assert len(scope.d_all) == 1 + def test_critical_region_v0(self): + grid = Grid(shape=(4, 4)) + + f = Function(name='f', grid=grid) + + s0 = Symbol(name='s0') + s1 = Symbol(name='s1') + + exprs = [Eq(s0, CriticalRegion(True)), + Eq(f.indexify(), 1), + Eq(s1, CriticalRegion(False))] + exprs = [LoweredEq(i) for i in exprs] + + scope = Scope(exprs) + + # Mock depedencies so that the fences (CriticalRegions) don't float around + assert len(scope.writes[mocksym0]) == 2 + assert len(scope.reads[mocksym0]) == 2 + assert len(scope.d_all) == 3 + + # No other mock depedencies because there's no other place the Eq + # within the critical sequence can float to + assert len(scope.writes[mocksym1]) == 1 + assert mocksym1 not in scope.reads + + def test_critical_region_v1(self): + grid = Grid(shape=(4, 4)) + + f = 
Function(name='f', grid=grid) + g = Function(name='g', grid=grid) + h = Function(name='h', grid=grid) + u = Function(name='u', grid=grid) + + s0 = Symbol(name='s0') + s1 = Symbol(name='s1') + + exprs = [Eq(g.indexify(), 2), + Eq(h.indexify(), 2), + Eq(s0, CriticalRegion(True)), + Eq(f.indexify(), 1), + Eq(s1, CriticalRegion(False)), + Eq(u.indexify(), 3)] + exprs = [LoweredEq(i) for i in exprs] + + scope = Scope(exprs) + + # Mock dependencies so that the fences (CriticalRegions) don't float around + assert len(scope.writes[mocksym0]) == 2 + assert len(scope.reads[mocksym0]) == 4 + assert len([i for i in scope.d_all + if i.source.access is mocksym0 + or i.sink.access is mocksym0]) == 7 + + # More mock dependencies because Eq must not float outside of the critical + # sequence + assert len(scope.writes[mocksym1]) == 1 + assert len(scope.reads[mocksym1]) == 2 + assert len(scope.d_all) == 9 + class TestParallelismAnalysis(object): From 7cb01cf6333a3819a5e4804982b13556cf861e8e Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 23 Oct 2023 16:37:10 +0000 Subject: [PATCH 10/12] compiler: Enhance make_critical_sequence --- devito/ir/stree/algorithms.py | 2 +- devito/passes/clusters/utils.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/devito/ir/stree/algorithms.py b/devito/ir/stree/algorithms.py index 6b4149cced..f40288946f 100644 --- a/devito/ir/stree/algorithms.py +++ b/devito/ir/stree/algorithms.py @@ -152,7 +152,7 @@ def preprocess(clusters, options=None, **kwargs): hs = HaloScheme.union(e.rhs.halo_scheme for e in c.exprs) queue.append(c.rebuild(halo_scheme=hs)) elif c.is_critical_region and c.syncs: - processed.append(c.rebuild(exprs=None, syncs=c.syncs)) + processed.append(c.rebuild(exprs=None, guards=c.guards, syncs=c.syncs)) elif c.is_wild: continue else: diff --git a/devito/passes/clusters/utils.py b/devito/passes/clusters/utils.py index e3244976e9..ef5a91b559 100644 --- a/devito/passes/clusters/utils.py +++ 
b/devito/passes/clusters/utils.py @@ -56,7 +56,7 @@ def is_memcpy(expr): return a.function.is_Array or b.function.is_Array -def make_critical_sequence(ispace, sequence): +def make_critical_sequence(ispace, sequence, **kwargs): sequence = as_tuple(sequence) assert len(sequence) >= 1 @@ -64,13 +64,13 @@ def make_critical_sequence(ispace, sequence): # Opening expr = Eq(Symbol(name='⋈'), CriticalRegion(True)) - processed.append(Cluster(exprs=expr, ispace=ispace)) + processed.append(Cluster(exprs=expr, ispace=ispace, **kwargs)) processed.extend(sequence) # Closing expr = Eq(Symbol(name='⋈'), CriticalRegion(False)) - processed.append(Cluster(exprs=expr, ispace=ispace)) + processed.append(Cluster(exprs=expr, ispace=ispace, **kwargs)) return processed From a0084f0e8fa1f99cafa05792ece510a2f0ee0ef3 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 24 Oct 2023 14:41:52 +0000 Subject: [PATCH 11/12] compiler: pep8 happiness --- devito/ir/clusters/cluster.py | 2 +- devito/ir/support/basic.py | 2 +- devito/passes/clusters/utils.py | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index 87044cbb88..887793e8e7 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -13,7 +13,7 @@ from devito.mpi.halo_scheme import HaloScheme, HaloTouch from devito.symbolics import estimate_cost from devito.tools import as_tuple, flatten, frozendict, infer_dtype -from devito.types import Fence, WeakFence, CriticalRegion +from devito.types import WeakFence, CriticalRegion __all__ = ["Cluster", "ClusterGroup"] diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index e218482f4e..6c13a3f006 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -24,7 +24,7 @@ class IndexMode(Tag): REGULAR = IndexMode('regular') IRREGULAR = IndexMode('irregular') -# Symbols to create mock data depdendencies +# Symbols to create mock data dependencies mocksym0 = 
Symbol(name='__⋈_0__') mocksym1 = Symbol(name='__⋈_1__') diff --git a/devito/passes/clusters/utils.py b/devito/passes/clusters/utils.py index ef5a91b559..abb4cec264 100644 --- a/devito/passes/clusters/utils.py +++ b/devito/passes/clusters/utils.py @@ -1,5 +1,4 @@ from collections import defaultdict -from itertools import groupby from devito.ir import Cluster from devito.symbolics import uxreplace From a000c49d21c601183c6b15235ac13b1b2e79140b Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 24 Oct 2023 16:09:49 +0000 Subject: [PATCH 12/12] tests: Fix after changing DDA for Fences --- tests/test_ir.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_ir.py b/tests/test_ir.py index f64ac08ae8..5bef5fb8bd 100644 --- a/tests/test_ir.py +++ b/tests/test_ir.py @@ -744,8 +744,8 @@ class Foo(DefFunction, Jump): scope = Scope(exprs) assert len(scope.d_all) == 3 - assert len(scope.d_flow) == 3 - assert len(scope.d_anti) == 0 + assert len(scope.d_flow) == 2 + assert len(scope.d_anti) == 1 assert any(v.function is f for v in scope.d_flow) assert any(v.function is mocksym0 for v in scope.d_flow)