From 3f63463aaa6e691babe786102988ee916f89f171 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 20 Jun 2024 10:58:59 +0000 Subject: [PATCH 01/18] compiler: Ensure Function.alias is just a bool --- devito/passes/__init__.py | 10 +--------- devito/passes/iet/engine.py | 2 +- devito/passes/iet/languages/utils.py | 5 +++++ 3 files changed, 7 insertions(+), 10 deletions(-) create mode 100644 devito/passes/iet/languages/utils.py diff --git a/devito/passes/__init__.py b/devito/passes/__init__.py index 8db9072812..f4ac2783c3 100644 --- a/devito/passes/__init__.py +++ b/devito/passes/__init__.py @@ -82,15 +82,7 @@ def is_gpu_create(obj, gpu_create): except AttributeError: functions = as_tuple(obj) - for i in functions: - try: - f = i.alias or i - except AttributeError: - f = i - if f not in gpu_create: - return False - - return True + return all(f in gpu_create for f in functions) # Import all compiler passes diff --git a/devito/passes/iet/engine.py b/devito/passes/iet/engine.py index 2e9aebc055..6add6674f7 100644 --- a/devito/passes/iet/engine.py +++ b/devito/passes/iet/engine.py @@ -433,7 +433,7 @@ def abstract_object(i, mapper, sregistry): def _(i, mapper, sregistry): name = sregistry.make_name(prefix='f') - v = i._rebuild(name=name, initializer=None, alias=i) + v = i._rebuild(name=name, initializer=None, alias=True) mapper.update({ i: v, diff --git a/devito/passes/iet/languages/utils.py b/devito/passes/iet/languages/utils.py new file mode 100644 index 0000000000..e4399b451c --- /dev/null +++ b/devito/passes/iet/languages/utils.py @@ -0,0 +1,5 @@ +__all__ = ['joins'] + + +def joins(symbols): + return ",".join(sorted([i.name for i in symbols])) From 875af5152a41ef7348cde690d24ebe847e4f8c2a Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 20 Jun 2024 10:59:37 +0000 Subject: [PATCH 02/18] compiler: Fix Bundle abstraction --- devito/passes/iet/engine.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/devito/passes/iet/engine.py b/devito/passes/iet/engine.py index 6add6674f7..4520c9f22a 100644 --- a/devito/passes/iet/engine.py +++ b/devito/passes/iet/engine.py @@ -398,18 +398,21 @@ def abstract_efunc(efunc): return efunc -def abstract_objects(objects, sregistry=None): +def abstract_objects(objects0, sregistry=None): """ Proxy for `abstract_object`. 
""" + # Expose hidden objects for complete reconstruction + objects = [] + for i in objects0: + if i.is_Bundle: + objects.extend(i.components) + objects.append(i) + # Precedence rules make it possible to reconstruct objects that depend on # higher priority objects - priority = { - Array: 1, - DiscreteFunction: 2, - AbstractIncrDimension: 3, - BlockDimension: 4, - } + keys = [Bundle, Array, DiscreteFunction, AbstractIncrDimension, BlockDimension] + priority = {k: i for i, k in enumerate(keys, start=1)} objects = sorted_priority(objects, priority) # Build abstraction mappings @@ -444,7 +447,6 @@ def _(i, mapper, sregistry): @abstract_object.register(Array) -@abstract_object.register(Bundle) def _(i, mapper, sregistry): if isinstance(i, Lock): name = sregistry.make_name(prefix='lock') @@ -462,6 +464,22 @@ def _(i, mapper, sregistry): mapper[i.dmap] = v.dmap +@abstract_object.register(Bundle) +def _(i, mapper, sregistry): + name = sregistry.make_name(prefix='a') + components = [mapper[f] for f in i.components] + + v = i._rebuild(name=name, components=components, alias=True) + + mapper.update({ + i: v, + i.indexed: v.indexed, + i._C_symbol: v._C_symbol, + }) + if i.dmap is not None: + mapper[i.dmap] = v.dmap + + @abstract_object.register(CompositeObject) def _(i, mapper, sregistry): name = sregistry.make_name(prefix='o') From 887bbf0188dc4b3fd6e2fb4c5e498e1977aae733 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 20 Jun 2024 11:00:26 +0000 Subject: [PATCH 03/18] compiler: Improve iet engine's update_args --- devito/passes/iet/engine.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/devito/passes/iet/engine.py b/devito/passes/iet/engine.py index 4520c9f22a..7e95a99220 100644 --- a/devito/passes/iet/engine.py +++ b/devito/passes/iet/engine.py @@ -618,6 +618,11 @@ def update_args(root, efuncs, dag): drop_params.extend(a for a in root.parameters if (a.is_Symbol or a.is_LocalObject) and a not in symbols) + # 4) removed a function that was previously necessary + functions = FindSymbols('symbolics').visit(root.body) + drop_params.extend(a for a in root.parameters + if a.is_AbstractFunction and a not in functions) + # Must record the index, not the param itself, since a param may be # bound to whatever arg, possibly a generic SymPy expr drop_params = [root.parameters.index(a) for a in drop_params] From 4ab1f1cf715a2450239d55f21dde8586514417cb Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 20 Jun 2024 11:01:06 +0000 Subject: [PATCH 04/18] compiler: Systematically use Pragma, not cgen.Pragma This is necessary to ensure reconstruction works as expected --- devito/ir/iet/nodes.py | 16 ++-- devito/ir/iet/utils.py | 3 +- devito/ir/iet/visitors.py | 15 ++-- devito/passes/iet/languages/openacc.py | 85 +++++++++++--------- devito/passes/iet/languages/openmp.py | 66 +++++++++------ devito/passes/iet/languages/utils.py | 2 +- devito/passes/iet/parpragma.py | 43 +++++----- tests/test_dle.py | 107 +++++++++++++------------ tests/test_gpu_openacc.py | 28 +++---- tests/test_gpu_openmp.py | 44 +++++----- tests/test_mpi.py | 4 +- 11 files changed, 223 insertions(+), 190 deletions(-) diff --git a/devito/ir/iet/nodes.py b/devito/ir/iet/nodes.py index 5b778ad7b9..9bcc3460f6 100644 --- a/devito/ir/iet/nodes.py +++ b/devito/ir/iet/nodes.py @@ -1217,21 +1217,25 @@ def __repr__(self): class Pragma(Node): """ - One or more pragmas floating in the IET constructed through a callback. + One or more pragmas floating in the IET. 
""" - def __init__(self, callback, arguments=None): + def __init__(self, pragma, arguments=None): super().__init__() - self.callback = callback + if not isinstance(pragma, str): + raise TypeError("Pragma name must be a string, not %s" % type(pragma)) + + self.pragma = pragma self.arguments = as_tuple(arguments) def __repr__(self): - return '' + return '' @cached_property - def pragmas(self): - return as_tuple(self.callback(*self.arguments)) + def _generate(self): + # Subclasses may override this property to customize the pragma generation + return self.pragma % self.arguments class Transfer: diff --git a/devito/ir/iet/utils.py b/devito/ir/iet/utils.py index e2a0cde052..2bac05733c 100644 --- a/devito/ir/iet/utils.py +++ b/devito/ir/iet/utils.py @@ -106,7 +106,8 @@ def derive_parameters(iet, drop_locals=False, ordering='default'): basics = FindSymbols('basics').visit(iet) candidates.extend(i.function for i in basics) - # Filter off duplicates (e.g., `x_size` is extracted by both calls to FindSymbols) + # Filter off duplicates (e.g., `x_size` is extracted by both calls to + # FindSymbols) candidates = filter_ordered(candidates) # Filter off symbols which are defined somewhere within `iet` diff --git a/devito/ir/iet/visitors.py b/devito/ir/iet/visitors.py index e0bf98017f..54e9188e1a 100644 --- a/devito/ir/iet/visitors.py +++ b/devito/ir/iet/visitors.py @@ -480,7 +480,7 @@ def visit_Expression(self, o): code = c.Assign(lhs, rhs) if o.pragmas: - code = c.Module(list(o.pragmas) + [code]) + code = c.Module(self._visit(o.pragmas) + (code,)) return code @@ -489,7 +489,7 @@ def visit_AugmentedExpression(self, o): c_rhs = ccode(o.expr.rhs, dtype=o.dtype, compiler=self._compiler) code = c.Statement("%s %s= %s" % (c_lhs, o.op, c_rhs)) if o.pragmas: - code = c.Module(list(o.pragmas) + [code]) + code = c.Module(self._visit(o.pragmas) + (code,)) return code def visit_Call(self, o, nested_call=False): @@ -555,15 +555,13 @@ def visit_Iteration(self, o): # Attach pragmas, if any if o.pragmas: - handle = c.Module(o.pragmas + (handle,)) + pragmas = tuple(self._visit(i) for i in o.pragmas) + handle = c.Module(pragmas + (handle,)) return handle def visit_Pragma(self, o): - if len(o.pragmas) == 1: - return o.pragmas[0] - else: - return c.Collection(o.pragmas) + return c.Pragma(o._generate) def visit_While(self, o): condition = ccode(o.condition) @@ -1230,9 +1228,10 @@ def visit_Iteration(self, o): nodes = self._visit(o.nodes) dimension = uxreplace(o.dim, self.mapper) limits = [uxreplace(i, self.mapper) for i in o.limits] + pragmas = self._visit(o.pragmas) uindices = [uxreplace(i, self.mapper) for i in o.uindices] return o._rebuild(nodes=nodes, dimension=dimension, limits=limits, - uindices=uindices) + pragmas=pragmas, uindices=uindices) def visit_Definition(self, o): try: diff --git a/devito/passes/iet/languages/openacc.py b/devito/passes/iet/languages/openacc.py index 186a106211..bcd2c8d006 100644 --- a/devito/passes/iet/languages/openacc.py +++ b/devito/passes/iet/languages/openacc.py @@ -1,4 +1,3 @@ -import cgen as c import numpy as np from devito.arch import AMDGPUX, NVIDIAX @@ -74,38 +73,43 @@ class AccBB(PragmaLangBB): 'set-device': lambda args: Call('acc_set_device_num', args), # Pragmas - 'atomic': c.Pragma('acc atomic update'), - 'map-enter-to': lambda i, j: - c.Pragma('acc enter data copyin(%s%s)' % (i, j)), - 'map-enter-to-wait': lambda i, j, k: - (c.Pragma('acc enter data copyin(%s%s) async(%s)' % (i, j, k)), - c.Pragma('acc wait(%s)' % k)), - 'map-enter-alloc': lambda i, j: - c.Pragma('acc enter data 
create(%s%s)' % (i, j)), - 'map-present': lambda i, j: - c.Pragma('acc data present(%s%s)' % (i, j)), + 'atomic': + Pragma('acc atomic update'), + 'map-enter-to': lambda f, imask: + PragmaTransfer('acc enter data copyin(%s%s)', f, imask=imask), + 'map-enter-to-async': lambda f, imask, a: + PragmaTransfer('acc enter data copyin(%s%s) async(%s)', + f, imask=imask, arguments=a), + 'map-enter-alloc': lambda f, imask: + PragmaTransfer('acc enter data create(%s%s)', f, imask=imask), + 'map-present': lambda f, imask: + PragmaTransfer('acc data present(%s%s)', f, imask=imask), 'map-serial-present': lambda i, j: - c.Pragma('acc serial present(%s) copyout(%s)' % (i, j)), + Pragma('acc serial present(%s) copyout(%s)', arguments=(i, j)), 'map-wait': lambda i: - c.Pragma('acc wait(%s)' % i), - 'map-update': lambda i, j: - c.Pragma('acc exit data copyout(%s%s)' % (i, j)), - 'map-update-host': lambda i, j: - c.Pragma('acc update self(%s%s)' % (i, j)), - 'map-update-host-async': lambda i, j, k: - c.Pragma('acc update self(%s%s) async(%s)' % (i, j, k)), - 'map-update-device': lambda i, j: - c.Pragma('acc update device(%s%s)' % (i, j)), - 'map-update-device-async': lambda i, j, k: - c.Pragma('acc update device(%s%s) async(%s)' % (i, j, k)), - 'map-release': lambda i, j: - c.Pragma('acc exit data delete(%s%s)' % (i, j)), - 'map-release-if': lambda i, j, k: - c.Pragma('acc exit data delete(%s%s) if(%s)' % (i, j, k)), - 'map-exit-delete': lambda i, j: - c.Pragma('acc exit data delete(%s%s)' % (i, j)), - 'map-exit-delete-if': lambda i, j, k: - c.Pragma('acc exit data delete(%s%s) if(%s)' % (i, j, k)), + Pragma('acc wait(%s)', arguments=i), + 'map-update': lambda f, imask: + PragmaTransfer('acc exit data copyout(%s%s)', f, imask=imask), + 'map-update-host': lambda f, imask: + PragmaTransfer('acc update self(%s%s)', f, imask=imask), + 'map-update-host-async': lambda f, imask, a: + PragmaTransfer('acc update self(%s%s) async(%s)', + f, imask=imask, arguments=a), + 'map-update-device': lambda f, imask: + PragmaTransfer('acc update device(%s%s)', f, imask=imask), + 'map-update-device-async': lambda f, imask, a: + PragmaTransfer('acc update device(%s%s) async(%s)', + f, imask=imask, arguments=a), + 'map-release': lambda f, imask: + PragmaTransfer('acc exit data delete(%s%s)', f, imask=imask), + 'map-release-if': lambda f, imask, a: + PragmaTransfer('acc exit data delete(%s%s) if(%s)', + f, imask=imask, arguments=a), + 'map-exit-delete': lambda f, imask: + PragmaTransfer('acc exit data delete(%s%s)', f, imask=imask), + 'map-exit-delete-if': lambda f, imask, a: + PragmaTransfer('acc exit data delete(%s%s) if(%s)', + f, imask=imask, arguments=a), 'memcpy-to-device': lambda i, j, k: Call('acc_memcpy_to_device', [i, j, k]), 'memcpy-to-device-wait': lambda i, j, k, l: @@ -126,30 +130,33 @@ class AccBB(PragmaLangBB): @classmethod def _map_to_wait(cls, f, imask=None, qid=None): - return PragmaTransfer(cls.mapper['map-enter-to-wait'], f, imask, qid) + return List(body=[ + cls.mapper['map-enter-to-async'](f, imask, qid), + cls.mapper['map-wait'](qid) + ]) @classmethod def _map_present(cls, f, imask=None): - return PragmaTransfer(cls.mapper['map-present'], f, imask) + return cls.mapper['map-present'](f, imask) @classmethod def _map_wait(cls, qid=None): - return Pragma(cls.mapper['map-wait'], qid) + return cls.mapper['map-wait'](qid) @classmethod def _map_delete(cls, f, imask=None, devicerm=None): if devicerm: - return PragmaTransfer(cls.mapper['map-exit-delete-if'], f, imask, devicerm) + return 
cls.mapper['map-exit-delete-if'](f, imask, devicerm) else: - return PragmaTransfer(cls.mapper['map-exit-delete'], f, imask) + return cls.mapper['map-exit-delete'](f, imask) @classmethod def _map_update_host_async(cls, f, imask=None, qid=None): - return PragmaTransfer(cls.mapper['map-update-host-async'], f, imask, qid) + return cls.mapper['map-update-host-async'](f, imask, qid) @classmethod def _map_update_device_async(cls, f, imask=None, qid=None): - return PragmaTransfer(cls.mapper['map-update-device-async'], f, imask, qid) + return cls.mapper['map-update-device-async'](f, imask, qid) class DeviceAccizer(PragmaDeviceAwareTransformer): @@ -227,7 +234,7 @@ def place_devptr(self, iet, **kwargs): init = DummyExpr(tdp, 0, init=True) dpf = List(body=[ - Pragma(self.lang.mapper['map-serial-present'], (hp, tdp)), + self.lang.mapper['map-serial-present'](hp, tdp), Block(body=DummyExpr(tdp, cast_mapper[tdp.dtype](hp))) ]) diff --git a/devito/passes/iet/languages/openmp.py b/devito/passes/iet/languages/openmp.py index 60b0eb0c41..ad12879b25 100644 --- a/devito/passes/iet/languages/openmp.py +++ b/devito/passes/iet/languages/openmp.py @@ -1,3 +1,4 @@ +from functools import cached_property from packaging.version import Version import cgen as c @@ -5,7 +6,7 @@ from devito.arch import AMDGPUX, NVIDIAX, INTELGPUX, PVC from devito.arch.compiler import GNUCompiler -from devito.ir import (Call, Conditional, DeviceCall, List, Prodder, +from devito.ir import (Call, Conditional, DeviceCall, List, Pragma, Prodder, ParallelBlock, PointerCast, While, FindSymbols) from devito.passes.iet.definitions import DataManager, DeviceAwareDataManager from devito.passes.iet.langbase import LangBB @@ -13,6 +14,7 @@ from devito.passes.iet.parpragma import (PragmaSimdTransformer, PragmaShmTransformer, PragmaDeviceAwareTransformer, PragmaLangBB, PragmaIteration, PragmaTransfer) +from devito.passes.iet.languages.utils import joins from devito.passes.iet.languages.C import CBB from devito.symbolics import CondEq, DefFunction from devito.tools import filter_ordered @@ -99,6 +101,16 @@ def __init__(self, prodder, arguments=None): Prodder.__init__(self, prodder.name, arguments, periodic=prodder.periodic) +class SimdForAligned(Pragma): + + @cached_property + def _generate(self): + assert len(self.arguments) > 1 + n = self.arguments[0] + items = self.arguments[1:] + return self.pragma % (joins(*items), n) + + class OmpBB(LangBB): mapper = { @@ -115,9 +127,12 @@ class OmpBB(LangBB): 'thread-num': lambda retobj=None: Call('omp_get_thread_num', retobj=retobj), # Pragmas - 'simd-for': c.Pragma('omp simd'), - 'simd-for-aligned': lambda i, j: c.Pragma('omp simd aligned(%s:%d)' % (i, j)), - 'atomic': c.Pragma('omp atomic update') + 'simd-for': + Pragma('omp simd'), + 'simd-for-aligned': lambda n, *a: + SimdForAligned('omp simd aligned(%s:%d)', arguments=(n, *a)), + 'atomic': + Pragma('omp atomic update') } mapper.update(CBB.mapper) @@ -139,24 +154,29 @@ class DeviceOmpBB(OmpBB, PragmaLangBB): 'set-device': lambda args: Call('omp_set_default_device', args), # Pragmas - 'map-enter-to': lambda i, j: - c.Pragma('omp target enter data map(to: %s%s)' % (i, j)), - 'map-enter-alloc': lambda i, j: - c.Pragma('omp target enter data map(alloc: %s%s)' % (i, j)), - 'map-update': lambda i, j: - c.Pragma('omp target update from(%s%s)' % (i, j)), - 'map-update-host': lambda i, j: - c.Pragma('omp target update from(%s%s)' % (i, j)), - 'map-update-device': lambda i, j: - c.Pragma('omp target update to(%s%s)' % (i, j)), - 'map-release': lambda i, j: - 
c.Pragma('omp target exit data map(release: %s%s)' % (i, j)), - 'map-release-if': lambda i, j, k: - c.Pragma('omp target exit data map(release: %s%s) if(%s)' % (i, j, k)), - 'map-exit-delete': lambda i, j: - c.Pragma('omp target exit data map(delete: %s%s)' % (i, j)), - 'map-exit-delete-if': lambda i, j, k: - c.Pragma('omp target exit data map(delete: %s%s) if(%s)' % (i, j, k)), + 'map-enter-to': lambda f, imask: + PragmaTransfer('omp target enter data map(to: %s%s)', f, imask=imask), + 'map-enter-alloc': lambda f, imask: + PragmaTransfer('omp target enter data map(alloc: %s%s)', + f, imask=imask), + 'map-update': lambda f, imask: + PragmaTransfer('omp target update from(%s%s)', f, imask=imask), + 'map-update-host': lambda f, imask: + PragmaTransfer('omp target update from(%s%s)', f, imask=imask), + 'map-update-device': lambda f, imask: + PragmaTransfer('omp target update to(%s%s)', f, imask=imask), + 'map-release': lambda f, imask: + PragmaTransfer('omp target exit data map(release: %s%s)', + f, imask=imask), + 'map-release-if': lambda f, imask, a: + PragmaTransfer('omp target exit data map(release: %s%s) if(%s)', + f, imask=imask, arguments=a), + 'map-exit-delete': lambda f, imask: + PragmaTransfer('omp target exit data map(delete: %s%s)', + f, imask=imask), + 'map-exit-delete-if': lambda f, imask, a: + PragmaTransfer('omp target exit data map(delete: %s%s) if(%s)', + f, imask=imask, arguments=a), 'memcpy-to-device': lambda i, j, k: Call('omp_target_memcpy', [i, j, k, 0, 0, DefFunction('omp_get_device_num'), @@ -186,7 +206,7 @@ def _map_delete(cls, f, imask=None, devicerm=None): if devicerm is not None: items.append(devicerm) argument = And(*items) - return PragmaTransfer(cls.mapper['map-exit-delete-if'], f, imask, argument) + return cls.mapper['map-exit-delete-if'](f, imask, argument) class SimdOmpizer(PragmaSimdTransformer): diff --git a/devito/passes/iet/languages/utils.py b/devito/passes/iet/languages/utils.py index e4399b451c..dbf466a8dd 100644 --- a/devito/passes/iet/languages/utils.py +++ b/devito/passes/iet/languages/utils.py @@ -1,5 +1,5 @@ __all__ = ['joins'] -def joins(symbols): +def joins(*symbols): return ",".join(sorted([i.name for i in symbols])) diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index 35f1ec15c8..c3ed016a94 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -48,10 +48,10 @@ def simd_reg_nbytes(self): def _make_simd_pragma(self, iet): indexeds = FindSymbols('indexeds').visit(iet) - aligned = {i.name for i in indexeds if i.function.is_DiscreteFunction} + aligned = {i.base for i in indexeds if i.function.is_DiscreteFunction} if aligned: simd = self.lang['simd-for-aligned'] - simd = as_tuple(simd(','.join(sorted(aligned)), self.simd_reg_nbytes)) + simd = as_tuple(simd(self.simd_reg_nbytes, *aligned)) else: simd = as_tuple(self.lang['simd-for']) @@ -146,8 +146,7 @@ def __init__(self, *args, parallel=None, schedule=None, chunk_size=None, reduction=reduction, schedule=schedule, tile=tile, gpu_fit=gpu_fit, **kwargs ) - pragma = c.Pragma(' '.join([construct] + clauses)) - kwargs['pragmas'] = pragma + kwargs['pragmas'] = Pragma(' '.join([construct] + clauses)) super().__init__(*args, **kwargs) @@ -427,11 +426,12 @@ def make_parallel(self, graph): class PragmaTransfer(Pragma, Transfer): """ - A data transfer between host and device expressed by means of one or more pragmas. + A data transfer between host and device expressed by means of one or + more pragmas. 
""" - def __init__(self, callback, function, imask=None, arguments=None): - super().__init__(callback, arguments) + def __init__(self, pragma, function, imask=None, arguments=None): + super().__init__(pragma, arguments) self._function = function self._imask = imask @@ -450,13 +450,6 @@ def imask(self): def sections(self): return make_sections_from_imask(self.function, self.imask) - @cached_property - def pragmas(self): - # Stringify sections - sections = ''.join(['[%s:%s]' % (ccode(i), ccode(j)) for i, j in self.sections]) - arguments = [ccode(i) for i in self.arguments] - return as_tuple(self.callback(self.function.name, sections, *arguments)) - @property def functions(self): return (self.function,) @@ -471,6 +464,14 @@ def expr_symbols(self): pass return tuple(retval) + @cached_property + def _generate(self): + # Stringify sections + sections = ''.join(['[%s:%s]' % (ccode(i), ccode(j)) + for i, j in self.sections]) + arguments = [ccode(i) for i in self.arguments] + return self.pragma % (self.function.name, sections, *arguments) + class PragmaDeviceAwareTransformer(DeviceAwareMixin, PragmaShmTransformer): @@ -588,13 +589,13 @@ def _get_num_devices(cls, platform): @classmethod def _map_to(cls, f, imask=None, qid=None): - return PragmaTransfer(cls.mapper['map-enter-to'], f, imask) + return cls.mapper['map-enter-to'](f, imask) _map_to_wait = _map_to @classmethod def _map_alloc(cls, f, imask=None): - return PragmaTransfer(cls.mapper['map-enter-alloc'], f, imask) + return cls.mapper['map-enter-alloc'](f, imask) @classmethod def _map_present(cls, f, imask=None): @@ -605,26 +606,26 @@ def _map_present(cls, f, imask=None): @classmethod def _map_update(cls, f, imask=None): - return PragmaTransfer(cls.mapper['map-update'], f, imask) + return cls.mapper['map-update'](f, imask) @classmethod def _map_update_host(cls, f, imask=None, qid=None): - return PragmaTransfer(cls.mapper['map-update-host'], f, imask) + return cls.mapper['map-update-host'](f, imask) _map_update_host_async = _map_update_host @classmethod def _map_update_device(cls, f, imask=None, qid=None): - return PragmaTransfer(cls.mapper['map-update-device'], f, imask) + return cls.mapper['map-update-device'](f, imask) _map_update_device_async = _map_update_device @classmethod def _map_release(cls, f, imask=None, devicerm=None): if devicerm: - return PragmaTransfer(cls.mapper['map-release-if'], f, imask, devicerm) + return cls.mapper['map-release-if'](f, imask, devicerm) else: - return PragmaTransfer(cls.mapper['map-release'], f, imask) + return cls.mapper['map-release'](f, imask) # Utils diff --git a/tests/test_dle.py b/tests/test_dle.py index 13f6841c09..3db2a41c9b 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -102,7 +102,7 @@ def test_cache_blocking_structure(blockinner, openmp, expected): if openmp: trees = retrieve_iteration_tree(op) assert len(trees[0][1].pragmas) == 1 - assert 'omp for' in trees[0][1].pragmas[0].value + assert 'omp for' in trees[0][1].pragmas[0].ccode.value def test_cache_blocking_structure_subdims(): @@ -709,10 +709,10 @@ def test_iterations_ompized(self, exprs, expected): if j is True: assert len(pragmas) == 1 pragma = pragmas[0] - assert 'omp for' in pragma.value + assert 'omp for' in pragma.ccode.value else: for k in pragmas: - assert 'omp for' not in k.value + assert 'omp for' not in k.ccode.value def test_dynamic_nthreads(self): grid = Grid(shape=(16, 16, 16)) @@ -774,10 +774,10 @@ def test_collapsing(self, eqns, expected, blocking): if j > 0: assert len(i.pragmas) == 1 pragma = i.pragmas[0] - assert 
'omp for collapse(%d)' % j in pragma.value + assert 'omp for collapse(%d)' % j in pragma.ccode.value else: for k in i.pragmas: - assert 'omp for collapse' not in k.value + assert 'omp for collapse' not in k.ccode.value def test_collapsing_v2(self): """ @@ -830,9 +830,9 @@ def test_scheduling(self): assert len(iterations) == 6 assert iterations[1].is_Affine - assert 'schedule(dynamic,1)' in iterations[1].pragmas[0].value + assert 'schedule(dynamic,1)' in iterations[1].pragmas[0].ccode.value assert not iterations[3].is_Affine - assert 'schedule(dynamic,chunk_size)' in iterations[3].pragmas[0].value + assert 'schedule(dynamic,chunk_size)' in iterations[3].pragmas[0].ccode.value @skipif('cpu64-icc') @pytest.mark.parametrize('so', [0, 1, 2]) @@ -856,10 +856,11 @@ def test_array_sum_reduction(self, so, dim): # With the `f[z] += u[t0][x + 1][y + 1][z + 1] + 1` expr, the innermost # `z` Iteration gets parallelized, nothing is collapsed, hence no # reduction is required - assert "reduction" not in parallelized.pragmas[0].value + assert "reduction" not in parallelized.pragmas[0].ccode.value elif Ompizer._support_array_reduction(configuration['compiler']): - if "collapse" in parallelized.pragmas[0].value: - assert "reduction(+:f[0:f_vec->size[0]])" in parallelized.pragmas[0].value + if "collapse" in parallelized.pragmas[0].ccode.value: + assert ("reduction(+:f[0:f_vec->size[0]])" + in parallelized.pragmas[0].ccode.value) else: # E.g. old GCC's assert "atomic update" in str(iterations[-1]) @@ -891,7 +892,7 @@ def test_reduction_local(self): if configuration['language'] == 'C': pass elif Ompizer._support_array_reduction(configuration['compiler']): - assert "reduction(+:n[0])" in iterations[0].pragmas[0].value + assert "reduction(+:n[0])" in iterations[0].pragmas[0].ccode.value else: # E.g. 
old GCC's assert "atomic update" in str(iterations[-1]) @@ -942,7 +943,7 @@ def test_array_max_reduction(self): op = Operator(eqn, opt=('advanced', {'openmp': True})) iterations = FindNodes(Iteration).visit(op) - assert "reduction(max:n[0])" in iterations[0].pragmas[0].value + assert "reduction(max:n[0])" in iterations[0].pragmas[0].ccode.value op() assert n.data[0] == 26 @@ -979,7 +980,7 @@ def test_array_minmax_reduction(self): if configuration['language'] == 'openmp': iterations = FindNodes(Iteration).visit(op) expected = "reduction(max:r0) reduction(min:r1)" - assert expected in iterations[0].pragmas[0].value + assert expected in iterations[0].pragmas[0].ccode.value op() assert n.data[0] == 26 @@ -1012,7 +1013,8 @@ def test_incs_no_atomic(self): # Now only `x` is parallelized op1 = Operator([Eq(v[t, x, 0, 0], v[t, x, 0, 0] + 1), Inc(uf, 1)], - opt=('advanced', {'openmp': True, 'par-collapse-ncores': 1})) + opt=('advanced', {'openmp': True, + 'par-collapse-ncores': 1})) assert 'omp for' in str(op1) assert 'collapse' not in str(op1) @@ -1055,7 +1057,8 @@ def test_incr_perfect_sparse_outer(self): assert len(iters) == 5 assert iters[0].is_Sequential assert all(i.is_ParallelAtomic for i in iters[1:]) - assert iters[1].pragmas[0].value == 'omp for schedule(dynamic,chunk_size)' + assert iters[1].pragmas[0].ccode.value ==\ + 'omp for schedule(dynamic,chunk_size)' assert all(not i.pragmas for i in iters[2:]) @pytest.mark.parametrize('exprs,simd_level,expected', [ @@ -1124,16 +1127,16 @@ def test_edge_cases(self, exprs, simd_level, expected): iterations = FindNodes(Iteration).visit(op) parallel = [i for i in iterations if i.is_Parallel] try: - assert 'omp for' in iterations[0].pragmas[0].value + assert 'omp for' in iterations[0].pragmas[0].ccode.value if len(parallel) > 1 and simd_level is not None and simd_level > 1: - assert 'collapse' in iterations[0].pragmas[0].value + assert 'collapse' in iterations[0].pragmas[0].ccode.value if simd_level: - assert 'omp simd' in iterations[simd_level].pragmas[0].value + assert 'omp simd' in iterations[simd_level].pragmas[0].ccode.value except: # E.g. 
gcc-5 doesn't support array reductions, so the compiler will # generate different legal code assert not Ompizer._support_array_reduction(configuration['compiler']) - assert any('omp for' in i.pragmas[0].value + assert any('omp for' in i.pragmas[0].ccode.value for i in iterations if i.pragmas) op.apply() @@ -1153,9 +1156,9 @@ def test_simd_space_invariant(self): op = Operator(eq, opt=('advanced', {'openmp': True})) iterations = FindNodes(Iteration).visit(op) - assert 'omp for schedule(static,1)' in iterations[0].pragmas[0].value - assert 'omp simd' in iterations[1].pragmas[0].value - assert 'omp simd' in iterations[3].pragmas[0].value + assert 'omp for schedule(static,1)' in iterations[0].pragmas[0].ccode.value + assert 'omp simd' in iterations[1].pragmas[0].ccode.value + assert 'omp simd' in iterations[3].pragmas[0].ccode.value op.apply() assert np.isclose(np.linalg.norm(f.data), 37.1458, rtol=1e-5) @@ -1182,8 +1185,8 @@ def test_parallel_prec_inject(self): iterations = FindNodes(Iteration).visit(op0) assert not iterations[0].pragmas - assert 'omp for' in iterations[1].pragmas[0].value - assert 'collapse' not in iterations[1].pragmas[0].value + assert 'omp for' in iterations[1].pragmas[0].ccode.value + assert 'collapse' not in iterations[1].pragmas[0].ccode.value op0 = Operator(eqns, opt=('advanced', {'openmp': True, 'par-collapse-ncores': 1, @@ -1191,7 +1194,7 @@ def test_parallel_prec_inject(self): iterations = FindNodes(Iteration).visit(op0) assert not iterations[0].pragmas - assert 'omp for collapse' in iterations[1].pragmas[0].value + assert 'omp for collapse' in iterations[1].pragmas[0].ccode.value class TestNestedParallelism: @@ -1223,10 +1226,9 @@ def test_basic(self): bns, _ = assert_blocking(op, {'x0_blk0'}) iterations = FindNodes(Iteration).visit(bns['x0_blk0']) - assert iterations[0].pragmas[0].value == 'omp for schedule(dynamic,1)' - assert iterations[2].pragmas[0].value == ('omp parallel for ' - 'schedule(dynamic,1) ' - 'num_threads(nthreads_nested)') + assert iterations[0].pragmas[0].ccode.value == 'omp for schedule(dynamic,1)' + assert iterations[2].pragmas[0].ccode.value ==\ + 'omp parallel for schedule(dynamic,1) num_threads(nthreads_nested)' def test_collapsing(self): grid = Grid(shape=(3, 3, 3)) @@ -1251,10 +1253,11 @@ def test_collapsing(self): bns, _ = assert_blocking(op, {'x0_blk0'}) iterations = FindNodes(Iteration).visit(bns['x0_blk0']) - assert iterations[0].pragmas[0].value == 'omp for collapse(2) schedule(dynamic,1)' - assert iterations[2].pragmas[0].value == ('omp parallel for collapse(2) ' - 'schedule(dynamic,1) ' - 'num_threads(nthreads_nested)') + assert iterations[0].pragmas[0].ccode.value ==\ + 'omp for collapse(2) schedule(dynamic,1)' + assert iterations[2].pragmas[0].ccode.value ==\ + ('omp parallel for collapse(2) schedule(dynamic,1) ' + 'num_threads(nthreads_nested)') def test_multiple_subnests_v0(self): grid = Grid(shape=(3, 3, 3)) @@ -1279,14 +1282,14 @@ def test_multiple_subnests_v0(self): assert len(trees) == 2 assert trees[0][0] is trees[1][0] - assert trees[0][0].pragmas[0].value ==\ + assert trees[0][0].pragmas[0].ccode.value ==\ 'omp for collapse(2) schedule(dynamic,1)' - assert trees[0][2].pragmas[0].value == ('omp parallel for collapse(2) ' - 'schedule(dynamic,1) ' - 'num_threads(nthreads_nested)') - assert trees[1][2].pragmas[0].value == ('omp parallel for collapse(2) ' - 'schedule(dynamic,1) ' - 'num_threads(nthreads_nested)') + assert trees[0][2].pragmas[0].ccode.value ==\ + ('omp parallel for collapse(2) schedule(dynamic,1) ' + 
'num_threads(nthreads_nested)') + assert trees[1][2].pragmas[0].ccode.value ==\ + ('omp parallel for collapse(2) schedule(dynamic,1) ' + 'num_threads(nthreads_nested)') def test_multiple_subnests_v1(self): """ @@ -1316,17 +1319,15 @@ def test_multiple_subnests_v1(self): assert len(trees) == 2 assert trees[0][0] is trees[1][0] - assert trees[0][0].pragmas[0].value ==\ + assert trees[0][0].pragmas[0].ccode.value ==\ 'omp for collapse(2) schedule(dynamic,1)' assert not trees[0][2].pragmas assert not trees[0][3].pragmas - assert trees[0][4].pragmas[0].value == ('omp parallel for ' - 'schedule(dynamic,1) ' - 'num_threads(nthreads_nested)') + assert trees[0][4].pragmas[0].ccode.value ==\ + 'omp parallel for schedule(dynamic,1) num_threads(nthreads_nested)' assert not trees[1][2].pragmas - assert trees[1][3].pragmas[0].value == ('omp parallel for ' - 'schedule(dynamic,1) ' - 'num_threads(nthreads_nested)') + assert trees[1][3].pragmas[0].ccode.value ==\ + 'omp parallel for schedule(dynamic,1) num_threads(nthreads_nested)' @pytest.mark.parametrize('blocklevels', [1, 2]) def test_nested_cache_blocking_structure_subdims(self, blocklevels): @@ -1381,11 +1382,11 @@ def test_nested_cache_blocking_structure_subdims(self, blocklevels): tree[6].dim.symbolic_min is zi.symbolic_min and\ tree[6].dim.symbolic_max is zi.symbolic_max and tree[6].dim.parent is z - assert trees[0][0].pragmas[0].value ==\ + assert trees[0][0].pragmas[0].ccode.value ==\ 'omp for collapse(2) schedule(dynamic,1)' - assert trees[0][2].pragmas[0].value == ('omp parallel for collapse(2) ' - 'schedule(dynamic,1) ' - 'num_threads(nthreads_nested)') + assert trees[0][2].pragmas[0].ccode.value ==\ + ('omp parallel for collapse(2) schedule(dynamic,1) ' + 'num_threads(nthreads_nested)') @pytest.mark.parametrize('exprs,collapsed,scheduling', [ (['Eq(u.forward, u.dx)'], '2', 'static'), @@ -1416,5 +1417,5 @@ def test_collapsing_w_wo_halo(self, exprs, collapsed, scheduling): ompfor_string = "".join(['omp for collapse(', collapsed, ')']) scheduling_string = "".join([' schedule(', scheduling, ',1)']) - assert iterations[1].pragmas[0].value == "".join([ompfor_string, - scheduling_string]) + assert iterations[1].pragmas[0].ccode.value ==\ + "".join([ompfor_string, scheduling_string]) diff --git a/tests/test_gpu_openacc.py b/tests/test_gpu_openacc.py index a6e04e8dbc..bdf732a12d 100644 --- a/tests/test_gpu_openacc.py +++ b/tests/test_gpu_openacc.py @@ -24,15 +24,15 @@ def test_basic(self): trees = retrieve_iteration_tree(op) assert len(trees) == 1 - assert trees[0][1].pragmas[0].value ==\ + assert trees[0][1].pragmas[0].ccode.value ==\ 'acc parallel loop collapse(3) present(u)' - assert op.body.maps[0].pragmas[0].value ==\ + assert op.body.maps[0].ccode.value ==\ ('acc enter data copyin(u[0:u_vec->size[0]]' '[0:u_vec->size[1]][0:u_vec->size[2]][0:u_vec->size[3]])') - assert op.body.unmaps[0].pragmas[0].value ==\ + assert op.body.unmaps[0].ccode.value ==\ ('acc exit data copyout(u[0:u_vec->size[0]]' '[0:u_vec->size[1]][0:u_vec->size[2]][0:u_vec->size[3]])') - assert op.body.unmaps[1].pragmas[0].value ==\ + assert op.body.unmaps[1].ccode.value ==\ ('acc exit data delete(u[0:u_vec->size[0]]' '[0:u_vec->size[1]][0:u_vec->size[2]][0:u_vec->size[3]]) if(devicerm)') @@ -52,7 +52,7 @@ def test_basic_customop(self): trees = retrieve_iteration_tree(op) assert len(trees) == 1 - assert trees[0][1].pragmas[0].value ==\ + assert trees[0][1].pragmas[0].ccode.value ==\ 'acc parallel loop collapse(3) present(u)' try: @@ -82,7 +82,7 @@ def test_blocking(self, opt): 
assert op.parameters[7] is tree[2].step assert op.parameters[10] is tree[3].step - assert tree[1].pragmas[0].value ==\ + assert tree[1].pragmas[0].ccode.value ==\ 'acc parallel loop collapse(3) present(u)' @pytest.mark.parametrize('par_tile', [True, (32, 4), (32, 4, 4), (32, 4, 4, 8)]) @@ -105,12 +105,12 @@ def test_tile_insteadof_collapse(self, par_tile): stile = (32, 4, 4, 4) if par_tile != (32, 4, 4, 8) else (32, 4, 4, 8) assert len(trees) == 4 - assert trees[0][1].pragmas[0].value ==\ + assert trees[0][1].pragmas[0].ccode.value ==\ 'acc parallel loop tile(32,4,4) present(u)' - assert trees[1][1].pragmas[0].value ==\ + assert trees[1][1].pragmas[0].ccode.value ==\ 'acc parallel loop tile(32,4) present(u)' strtile = ','.join([str(i) for i in stile]) - assert trees[3][1].pragmas[0].value ==\ + assert trees[3][1].pragmas[0].ccode.value ==\ 'acc parallel loop tile(%s) present(src,src_coords,u)' % strtile @pytest.mark.parametrize('par_tile', [((32, 4, 4), (8, 8)), ((32, 4), (8, 8)), @@ -134,12 +134,12 @@ def test_multiple_tile_sizes(self, par_tile): trees = retrieve_iteration_tree(op) assert len(trees) == 4 - assert trees[0][1].pragmas[0].value ==\ + assert trees[0][1].pragmas[0].ccode.value ==\ 'acc parallel loop tile(32,4,4) present(u)' - assert trees[1][1].pragmas[0].value ==\ + assert trees[1][1].pragmas[0].ccode.value ==\ 'acc parallel loop tile(8,8) present(u)' sclause = 'collapse(4)' if par_tile[-1] is None else 'tile(8,8,8,8)' - assert trees[3][1].pragmas[0].value ==\ + assert trees[3][1].pragmas[0].ccode.value ==\ 'acc parallel loop %s present(src,src_coords,u)' % sclause def test_multi_tile_blocking_structure(self): @@ -161,9 +161,9 @@ def test_multi_tile_blocking_structure(self): bns, _ = assert_blocking(op, {'x0_blk0', 'x1_blk0'}) assert len(bns) == len(expected) - assert bns['x0_blk0'].pragmas[0].value ==\ + assert bns['x0_blk0'].pragmas[0].ccode.value ==\ 'acc parallel loop tile(32,4,4) present(u)' - assert bns['x1_blk0'].pragmas[0].value ==\ + assert bns['x1_blk0'].pragmas[0].ccode.value ==\ 'acc parallel loop tile(16,4,4) present(u,v)' for root, v in zip(bns.values(), expected): iters = FindNodes(Iteration).visit(root) diff --git a/tests/test_gpu_openmp.py b/tests/test_gpu_openmp.py index 7c0877c279..7150d66eb2 100644 --- a/tests/test_gpu_openmp.py +++ b/tests/test_gpu_openmp.py @@ -50,15 +50,15 @@ def test_basic(self): trees = retrieve_iteration_tree(op) assert len(trees) == 1 - assert trees[0][1].pragmas[0].value ==\ + assert trees[0][1].pragmas[0].ccode.value ==\ 'omp target teams distribute parallel for collapse(3)' - assert op.body.maps[0].pragmas[0].value ==\ + assert op.body.maps[0].ccode.value ==\ ('omp target enter data map(to: u[0:u_vec->size[0]*' 'u_vec->size[1]*u_vec->size[2]*u_vec->size[3]])') - assert op.body.unmaps[0].pragmas[0].value ==\ + assert op.body.unmaps[0].ccode.value ==\ ('omp target update from(u[0:u_vec->size[0]*' 'u_vec->size[1]*u_vec->size[2]*u_vec->size[3]])') - assert op.body.unmaps[1].pragmas[0].value ==\ + assert op.body.unmaps[1].ccode.value ==\ ('omp target exit data map(release: u[0:u_vec->size[0]*' 'u_vec->size[1]*u_vec->size[2]*u_vec->size[3]]) if(devicerm)') @@ -76,7 +76,7 @@ def test_basic_customop(self): trees = retrieve_iteration_tree(op) assert len(trees) == 1 - assert trees[0][1].pragmas[0].value ==\ + assert trees[0][1].pragmas[0].ccode.value ==\ 'omp target teams distribute parallel for collapse(3)' try: @@ -105,7 +105,7 @@ def test_blocking(self, opt): assert op.parameters[7] is tree[2].step assert op.parameters[10] is 
tree[3].step - assert tree[1].pragmas[0].value ==\ + assert tree[1].pragmas[0].ccode.value ==\ 'omp target teams distribute parallel for collapse(3)' def test_multiple_eqns(self): @@ -120,18 +120,18 @@ def test_multiple_eqns(self): trees = retrieve_iteration_tree(op) assert len(trees) == 1 - assert trees[0][1].pragmas[0].value ==\ + assert trees[0][1].pragmas[0].ccode.value ==\ 'omp target teams distribute parallel for collapse(3)' for i, f in enumerate([u, v]): - assert op.body.maps[i].pragmas[0].value ==\ + assert op.body.maps[i].ccode.value ==\ ('omp target enter data map(to: %(n)s[0:%(n)s_vec->size[0]*' '%(n)s_vec->size[1]*%(n)s_vec->size[2]*%(n)s_vec->size[3]])' % {'n': f.name}) - assert op.body.unmaps[2*i + 0].pragmas[0].value ==\ + assert op.body.unmaps[2*i + 0].ccode.value ==\ ('omp target update from(%(n)s[0:%(n)s_vec->size[0]*' '%(n)s_vec->size[1]*%(n)s_vec->size[2]*%(n)s_vec->size[3]])' % {'n': f.name}) - assert op.body.unmaps[2*i + 1].pragmas[0].value ==\ + assert op.body.unmaps[2*i + 1].ccode.value ==\ ('omp target exit data map(release: %(n)s[0:%(n)s_vec->size[0]*' '%(n)s_vec->size[1]*%(n)s_vec->size[2]*%(n)s_vec->size[3]]) ' 'if(devicerm)' % {'n': f.name}) @@ -154,45 +154,45 @@ def test_multiple_loops(self): assert len(trees) == 3 # All loop nests must have been parallelized - assert trees[0][0].pragmas[0].value ==\ + assert trees[0][0].pragmas[0].ccode.value ==\ 'omp target teams distribute parallel for collapse(3)' - assert trees[1][1].pragmas[0].value ==\ + assert trees[1][1].pragmas[0].ccode.value ==\ 'omp target teams distribute parallel for collapse(3)' - assert trees[2][1].pragmas[0].value ==\ + assert trees[2][1].pragmas[0].ccode.value ==\ 'omp target teams distribute parallel for collapse(3)' # Check `u` and `v` for i, f in enumerate([u, v], 1): - assert op.body.maps[i].pragmas[0].value ==\ + assert op.body.maps[i].ccode.value ==\ ('omp target enter data map(to: %(n)s[0:%(n)s_vec->size[0]]' '[0:%(n)s_vec->size[1]][0:%(n)s_vec->size[2]][0:%(n)s_vec->size[3]])' % {'n': f.name}) - assert op.body.unmaps[2*i + 0].pragmas[0].value ==\ + assert op.body.unmaps[2*i + 0].ccode.value ==\ ('omp target update from(%(n)s[0:%(n)s_vec->size[0]]' '[0:%(n)s_vec->size[1]][0:%(n)s_vec->size[2]][0:%(n)s_vec->size[3]])' % {'n': f.name}) - assert op.body.unmaps[2*i + 1].pragmas[0].value ==\ + assert op.body.unmaps[2*i + 1].ccode.value ==\ ('omp target exit data map(release: %(n)s[0:%(n)s_vec->size[0]]' '[0:%(n)s_vec->size[1]][0:%(n)s_vec->size[2]][0:%(n)s_vec->size[3]]) ' 'if(devicerm)' % {'n': f.name}) # Check `f` - assert op.body.maps[0].pragmas[0].value ==\ + assert op.body.maps[0].ccode.value ==\ ('omp target enter data map(to: f[0:f_vec->size[0]]' '[0:f_vec->size[1]][0:f_vec->size[2]])') - assert op.body.unmaps[0].pragmas[0].value ==\ + assert op.body.unmaps[0].ccode.value ==\ ('omp target update from(f[0:f_vec->size[0]]' '[0:f_vec->size[1]][0:f_vec->size[2]])') - assert op.body.unmaps[1].pragmas[0].value ==\ + assert op.body.unmaps[1].ccode.value ==\ ('omp target exit data map(release: f[0:f_vec->size[0]]' '[0:f_vec->size[1]][0:f_vec->size[2]]) if(devicerm)') # Check `g` -- note that unlike `f`, this one should be `delete` upon # exit, not `from` - assert op.body.maps[3].pragmas[0].value ==\ + assert op.body.maps[3].ccode.value ==\ ('omp target enter data map(to: g[0:g_vec->size[0]]' '[0:g_vec->size[1]][0:g_vec->size[2]])') - assert op.body.unmaps[6].pragmas[0].value ==\ + assert op.body.unmaps[6].ccode.value ==\ ('omp target exit data map(delete: g[0:g_vec->size[0]]' 
'[0:g_vec->size[1]][0:g_vec->size[2]])' ' if(devicerm && g_vec->size[0] != 0 && g_vec->size[1] != 0' @@ -240,7 +240,7 @@ def test_timeparallel_reduction(self): # The time loop is not in OpenMP canonical form, so it won't be parallelized assert not tree.root.pragmas assert len(tree[1].pragmas) == 1 - assert tree[1].pragmas[0].value ==\ + assert tree[1].pragmas[0].ccode.value ==\ ('omp target teams distribute parallel for collapse(3)' ' reduction(+:f[0])') diff --git a/tests/test_mpi.py b/tests/test_mpi.py index 04a7bd1fb6..c0836fb0c2 100644 --- a/tests/test_mpi.py +++ b/tests/test_mpi.py @@ -1481,7 +1481,7 @@ def test_poke_progress(self, mode): # W/ OpenMP, we prod until all comms have completed assert call.then_body[0].body[0].is_While # W/ OpenMP, we expect dynamic thread scheduling - assert 'dynamic,1' in tree.root.pragmas[0].value + assert 'dynamic,1' in tree.root.pragmas[0].ccode.value else: # W/o OpenMP, it's a different story assert call._single_thread @@ -1505,7 +1505,7 @@ def test_poke_progress(self, mode): # W/ OpenMP, we prod until all comms have completed assert call.then_body[0].body[0].is_While # W/ OpenMP, we expect dynamic thread scheduling - assert 'dynamic,1' in tree.root.pragmas[0].value + assert 'dynamic,1' in tree.root.pragmas[0].ccode.value else: # W/o OpenMP, it's a different story assert call._single_thread From b65810392bef9abf8f32440c493a8e86c1ee51ad Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 21 Jun 2024 09:50:22 +0000 Subject: [PATCH 05/18] compiler: Cleanup blocking --- devito/passes/clusters/blocking.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py index b7ec5a2408..105d3eeaf6 100644 --- a/devito/passes/clusters/blocking.py +++ b/devito/passes/clusters/blocking.py @@ -2,8 +2,8 @@ from devito.finite_differences.differentiable import IndexSum from devito.ir.clusters import Queue -from devito.ir.support import (AFFINE, PARALLEL, PARALLEL_IF_ATOMIC, PARALLEL_IF_PVT, - SEQUENTIAL, SKEWABLE, TILABLES, Interval, +from devito.ir.support import (AFFINE, PARALLEL, PARALLEL_IF_ATOMIC, + PARALLEL_IF_PVT, SKEWABLE, TILABLES, Interval, IntervalGroup, IterationSpace, Scope) from devito.passes import is_on_device from devito.symbolics import search, uxreplace, xreplace_indices @@ -247,7 +247,7 @@ def callback(self, clusters, prefix): return clusters # Heuristic: TILABLE not worth it if not within a SEQUENTIAL Dimension - if not any(SEQUENTIAL in c.properties[i.dim] for i in prefix[:-1]): + if not any(c.properties.is_sequential(i.dim) for i in prefix[:-1]): return clusters processed.append(c.rebuild(properties=c.properties.block(d))) @@ -569,7 +569,8 @@ def next(self, prefix, d, clusters): class SynthesizeSkewing(Queue): """ - Construct a new sequence of clusters with skewed expressions and iteration spaces. + Construct a new sequence of clusters with skewed expressions and + iteration spaces. 
Notes ----- @@ -615,12 +616,14 @@ def callback(self, clusters, prefix): if SKEWABLE not in c.properties[d]: return clusters - skew_dims = {i.dim for i in c.ispace if SEQUENTIAL in c.properties[i.dim]} + skew_dims = {i.dim for i in c.ispace + if c.properties.is_sequential(i.dim)} if len(skew_dims) > 1: return clusters skew_dim = skew_dims.pop() - # Since we are here, prefix is skewable and nested under a SEQUENTIAL loop + # Since we are here, prefix is skewable and nested under a + # SEQUENTIAL loop intervals = [] for i in c.ispace: if i.dim is d and (not d.is_Block or d._depth == 1): From dad8dc85e6e552fcfe450f2360ed3973d4b2a594 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 21 Jun 2024 13:27:34 +0000 Subject: [PATCH 06/18] compiler: Fix rcompile --- devito/core/gpu.py | 6 +++--- devito/operator/operator.py | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/devito/core/gpu.py b/devito/core/gpu.py index c0eae9732f..05e4e6451d 100644 --- a/devito/core/gpu.py +++ b/devito/core/gpu.py @@ -120,17 +120,17 @@ def _rcompile_wrapper(cls, **kwargs0): options0 = kwargs0.pop('options') def wrapper(expressions, mode='default', options=None, **kwargs1): - options = {**options0, **(options or {})} kwargs = {**kwargs0, **kwargs1} if mode == 'host': - par_disabled = options['par-disabled'] + options = options or {} target = { 'platform': 'cpu64', - 'language': 'C' if par_disabled else 'openmp', + 'language': 'C' if options0['par-disabled'] else 'openmp', 'compiler': 'custom' } else: + options = {**options0, **(options or {})} target = None return rcompile(expressions, kwargs, options, target=target) diff --git a/devito/operator/operator.py b/devito/operator/operator.py index 297ac544ee..363c2507e3 100644 --- a/devito/operator/operator.py +++ b/devito/operator/operator.py @@ -1098,13 +1098,12 @@ def rcompile(expressions, kwargs, options, target=None): if target is None: cls = operator_selector(**kwargs) + kwargs['options'] = options else: kwargs = parse_kwargs(**target) cls = operator_selector(**kwargs) kwargs = cls._normalize_kwargs(**kwargs) - - # Use the customized opt options - kwargs['options'] = options + kwargs['options'].update(options) # Recursive profiling not supported -- would be a complete mess kwargs.pop('profiler', None) From cba2ea986a805201b0506cfd4d24acc285c8bfac Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 21 Jun 2024 14:23:13 +0000 Subject: [PATCH 07/18] compiler: Patch pow-to-mul --- devito/symbolics/manipulation.py | 8 ++++---- tests/test_dse.py | 3 +++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/devito/symbolics/manipulation.py b/devito/symbolics/manipulation.py index 220e4d27b8..7f44befb74 100644 --- a/devito/symbolics/manipulation.py +++ b/devito/symbolics/manipulation.py @@ -286,12 +286,12 @@ def pow_to_mul(expr): except TypeError: # E.g., a Symbol, or possibly a generic expression return expr - if exp > 10 or exp < -10 or int(exp) != exp or exp == 0: + if exp > 10 or exp < -10 or exp == 0: # Large and non-integer powers remain untouched return expr - elif exp == -1: - # Reciprocals also remain untouched, but we traverse the base - # looking for other Pows + elif exp == -1 or int(exp) != exp: + # Reciprocals and fractional powers also remain untouched, + # but at least we traverse the base looking for other Pows return expr.func(pow_to_mul(base), exp, evaluate=False) elif exp > 0: return Mul(*[base]*int(exp), evaluate=False) diff --git a/tests/test_dse.py b/tests/test_dse.py index 1d7071cabc..f2e57dceea 100644 --- 
a/tests/test_dse.py +++ b/tests/test_dse.py @@ -210,6 +210,9 @@ def test_cse_w_conditionals(): ('Mul(SizeOf("char"), ' '-IndexedPointer(FieldFromPointer("size", fa._C_symbol), x), evaluate=False)', 'sizeof(char)*(-fa_vec->size[x])'), + ('sqrt(fa[x]**4)', 'sqrt(fa[x]*fa[x]*fa[x]*fa[x])'), + ('sqrt(fa[x])**2', 'fa[x]'), + ('fa[x]**-2', '1/(fa[x]*fa[x])'), ]) def test_pow_to_mul(expr, expected): grid = Grid((4, 5)) From 60f5d43824107de3035e36505ce3db4d8c6bc930 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 26 Jun 2024 12:32:46 +0000 Subject: [PATCH 08/18] compiler: Switch from local to global autopadding --- devito/passes/iet/linearization.py | 8 ++---- devito/types/array.py | 12 ++++---- devito/types/basic.py | 45 +++++++++++++++++++----------- tests/test_linearize.py | 2 +- 4 files changed, 38 insertions(+), 29 deletions(-) diff --git a/devito/passes/iet/linearization.py b/devito/passes/iet/linearization.py index 109d19ef29..11da313be2 100644 --- a/devito/passes/iet/linearization.py +++ b/devito/passes/iet/linearization.py @@ -71,12 +71,8 @@ def key1(f, d): """ if f.is_regular: # For paddable objects the following holds: - # `same dim + same halo + same dtype => same (auto-)padding` - # Bundle need the actual function dtype - if f.is_Bundle: - return (d, f._size_halo[d], f.is_autopaddable, f.c0.dtype) - else: - return (d, f._size_halo[d], f.is_autopaddable, f.dtype) + # `same dim + same halo + same padding_dtype => same (auto-)padding` + return (d, f._size_halo[d], f.__padding_dtype__) else: return False diff --git a/devito/types/array.py b/devito/types/array.py index 73eccefdc5..e45c3d78c9 100644 --- a/devito/types/array.py +++ b/devito/types/array.py @@ -367,6 +367,8 @@ def __args_setup__(cls, *args, **kwargs): raise ValueError("Components must be of same type") if not issubclass(klss.pop(), AbstractFunction): raise ValueError("Component type must be subclass of AbstractFunction") + if len({i.__padding_dtype__ for i in components}) != 1: + raise ValueError("Components must have the same padding dtype") return args, kwargs @@ -438,14 +440,14 @@ def ncomp(self): def initvalue(self): return None - # CodeSymbol overrides defaulting to self.c0's behaviour + # Overrides defaulting to self.c0's behaviour for i in ['_mem_internal_eager', '_mem_internal_lazy', '_mem_local', '_mem_mapped', '_mem_host', '_mem_stack', '_mem_constant', - '_mem_shared', '_size_domain', '_size_halo', '_size_owned', - '_size_padding', '_size_nopad', '_size_nodomain', '_offset_domain', - '_offset_halo', '_offset_owned', '_dist_dimensions', '_C_get_field', - 'grid', 'symbolic_shape']: + '_mem_shared', '__padding_dtype__', '_size_domain', '_size_halo', + '_size_owned', '_size_padding', '_size_nopad', '_size_nodomain', + '_offset_domain', '_offset_halo', '_offset_owned', '_dist_dimensions', + '_C_get_field', 'grid', 'symbolic_shape']: locals()[i] = property(lambda self, v=i: getattr(self.c0, v)) @property diff --git a/devito/types/basic.py b/devito/types/basic.py index e0fc5c084b..6ce688f209 100644 --- a/devito/types/basic.py +++ b/devito/types/basic.py @@ -999,30 +999,41 @@ def __padding_setup__(self, **kwargs): padding = tuple(kwargs.get('padding', ((0, 0),)*self.ndim)) return DimensionTuple(*padding, getters=self.dimensions) + @cached_property + def __padding_dtype__(self): + v = configuration['autopadding'] + if not self.is_autopaddable or not v: + return None + try: + if issubclass(v, np.number): + return v + except TypeError: + return np.float32 + def __padding_setup_smart__(self, **kwargs): nopadding = ((0, 
0),)*self.ndim - if kwargs.get('autopadding', configuration['autopadding']): - # The padded Dimension - candidates = self.space_dimensions - if not candidates: - return nopadding - d = candidates[-1] - - mmts = configuration['platform'].max_mem_trans_size(self.dtype) - remainder = self._size_nopad[d] % mmts - if remainder == 0: - # Already a multiple of `mmts`, no need to pad - return nopadding + if not self.__padding_dtype__: + return nopadding - dpadding = (0, (mmts - remainder)) - padding = [(0, 0)]*self.ndim - padding[self.dimensions.index(d)] = dpadding + # The padded Dimension + candidates = self.space_dimensions + if not candidates: + return nopadding + d = candidates[-1] - return tuple(padding) - else: + mmts = configuration['platform'].max_mem_trans_size(self.__padding_dtype__) + remainder = self._size_nopad[d] % mmts + if remainder == 0: + # Already a multiple of `mmts`, no need to pad return nopadding + dpadding = (0, (mmts - remainder)) + padding = [(0, 0)]*self.ndim + padding[self.dimensions.index(d)] = dpadding + + return tuple(padding) + def __ghost_setup__(self, **kwargs): return (0, 0) diff --git a/tests/test_linearize.py b/tests/test_linearize.py index a92ddff484..7d1abad73c 100644 --- a/tests/test_linearize.py +++ b/tests/test_linearize.py @@ -611,4 +611,4 @@ def test_different_dtype(): # Check generated code has different strides for different dtypes assert "bL0(x,y) b[(x)*y_stride0 + (y)]" in str(op1) - assert "L0(x,y) f[(x)*y_stride1 + (y)]" in str(op1) + assert "L0(x,y) f[(x)*y_stride0 + (y)]" in str(op1) From 1566fec38688638bc0bc4f3a6fb338c6000db84b Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 26 Jun 2024 14:40:01 +0000 Subject: [PATCH 09/18] compiler: Avoid ModuloDimension when size==1 --- devito/ir/clusters/algorithms.py | 21 ++++++++++++++------- devito/ir/clusters/cluster.py | 4 ---- devito/passes/clusters/misc.py | 5 +++-- tests/test_dimension.py | 15 +++++++++++++++ 4 files changed, 32 insertions(+), 13 deletions(-) diff --git a/devito/ir/clusters/algorithms.py b/devito/ir/clusters/algorithms.py index 1d2770a657..fb144e5770 100644 --- a/devito/ir/clusters/algorithms.py +++ b/devito/ir/clusters/algorithms.py @@ -343,6 +343,11 @@ def rule(size, e): # Reconstruct the Clusters processed = [] for c in clusters: + exprs = c.exprs + + sub_iterators = dict(c.ispace.sub_iterators) + sub_iterators[d] = [i for i in sub_iterators[d] if i not in subiters] + # Apply substitutions to expressions # Note: In an expression, there could be `u[t+1, ...]` and `v[t+1, # ...]`, where `u` and `v` are TimeFunction with circular time @@ -350,17 +355,19 @@ def rule(size, e): # indices above are therefore conceptually different, so they will # be replaced with the proper ModuloDimension through two different # calls to `xreplace_indices` - exprs = c.exprs groups = as_mapper(mds, lambda d: d.modulo) for size, v in groups.items(): - subs = {md.origin: md for md in v} - func = partial(xreplace_indices, mapper=subs, key=partial(rule, size)) + key = partial(rule, size) + if size == 1: + # Optimization -- avoid useless "% 1" ModuloDimensions + subs = {md.origin: 0 for md in v} + else: + subs = {md.origin: md for md in v} + sub_iterators[d].extend(v) + + func = partial(xreplace_indices, mapper=subs, key=key) exprs = [e.apply(func) for e in exprs] - # Augment IterationSpace - sub_iterators = dict(c.ispace.sub_iterators) - sub_iterators[d] = tuple(i for i in sub_iterators[d] + tuple(mds) - if i not in subiters) ispace = IterationSpace(c.ispace.intervals, sub_iterators, 
c.ispace.directions) diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index 7a13fa5222..a4a26dadbb 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -187,10 +187,6 @@ def scope(self): def functions(self): return self.scope.functions - @cached_property - def has_increments(self): - return any(e.is_Increment for e in self.exprs) - @cached_property def grid(self): grids = set(f.grid for f in self.functions if f.is_AbstractFunction) diff --git a/devito/passes/clusters/misc.py b/devito/passes/clusters/misc.py index 7ce2bd3422..a8ccb500c1 100644 --- a/devito/passes/clusters/misc.py +++ b/devito/passes/clusters/misc.py @@ -40,8 +40,9 @@ def callback(self, clusters, prefix): lifted = [] processed = [] for n, c in enumerate(clusters): - # Increments prevent lifting - if c.has_increments: + # Storage-related dependences, such as those induced by reduction + # increments, prevent lifting + if any(dep.is_storage_related(dim) for dep in c.scope.d_all_gen()): processed.append(c) continue diff --git a/tests/test_dimension.py b/tests/test_dimension.py index 8fc01837ce..ed5f0a777a 100644 --- a/tests/test_dimension.py +++ b/tests/test_dimension.py @@ -210,6 +210,21 @@ def test_modulo_dims_generation_v2(self): assert np.all(f.data[3] == 2) assert np.all(f.data[4] == 4) + def test_degenerate_to_zero(self): + grid = Grid(shape=(10, 10)) + + u = TimeFunction(name='u', grid=grid, save=Buffer(1)) + + eq = Eq(u.forward, u + 1) + + op = Operator(eq) + + assert len([i for i in FindSymbols('dimensions').visit(op) if i.is_Modulo]) == 0 + + op.apply(time_M=9) + + assert np.all(u.data == 10) + class TestSubDimension: From ad63a445c0758e5cdb3f0bb0e17829238259516c Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 27 Jun 2024 12:50:49 +0000 Subject: [PATCH 10/18] compiler: Simplify par-tile selection --- devito/passes/clusters/blocking.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py index 105d3eeaf6..b48f3af2b0 100644 --- a/devito/passes/clusters/blocking.py +++ b/devito/passes/clusters/blocking.py @@ -487,13 +487,7 @@ def __init__(self, par_tile): # Special case 1: a smaller par-tile to avoid under-utilizing # computational resources when the iteration spaces are too small - if (len(par_tile) == 1 and - (len(par_tile[0]) < len(par_tile.default) or - prod(par_tile[0]) < prod(par_tile.default))): - # Ignore if, e.g., user supplies a lower dimensional block shape - self.umt_small = self.umt - else: - self.umt_small = UnboundedMultiTuple(par_tile.default) + self.umt_small = UnboundedMultiTuple(par_tile.default) # Special case 2: par-tiles for iteration spaces that must be fully # blocked for correctness From a5955a1aac236aca69c0b3dc5f710537f7676138 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 28 Jun 2024 13:32:47 +0000 Subject: [PATCH 11/18] compiler: Refactor TimedAccess.distance --- devito/ir/support/basic.py | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index c435800872..4b86bf4985 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -345,21 +345,19 @@ def distance(self, other): if not (sit == oit and sai.root is oai.root): # E.g., `self=R` and `other=W` # E.g., `self=R`, `other=W`, - # `self.itintervals=(x<0>,)` and `other.itintervals=(x<1>,)` - ret.append(S.Infinity) - break + # `self.itintervals=(x<0>,)`, 
`other.itintervals=(x<1>,)` + return vinf(ret) except AttributeError: # E.g., `self=R` and `self.itintervals=(y,)` => `sai=None` pass if self.function._mem_shared: # Special case: the distance between two regular, thread-shared - # objects fallbacks to zero, as any other value would be nonsensical. + # objects fallbacks to zero, as any other value would be nonsensical ret.append(S.Zero) elif sai and oai and sai._defines & sit.dim._defines: - # E.g., `self=R`, `self.itintervals=(time, x)` - # and `ai=t` + # E.g., `self=R`, `self.itintervals=(time, x)`, `ai=t` if sit.direction is Backward: ret.append(other[n] - self[n]) else: @@ -373,8 +371,8 @@ def distance(self, other): break elif sai in self.ispace and oai in other.ispace: - # E.g., `self=R`, `sai=time`, self.itintervals=(time, x, y) - # with `n=0` + # E.g., `self=R`, `sai=time`, + # `self.itintervals=(time, x, y)`, `n=0` continue elif any(d and d._defines & sit.dim._defines for d in (sai, oai)): @@ -402,16 +400,11 @@ def distance(self, other): return Vector(S.ImaginaryUnit) # Fallback - ret.append(S.Infinity) - break - - elif self.findices[n] in sit.dim._defines: - # E.g., `self=R` and `fi=p_src` (`n=1`) - ret.append(S.Infinity) - break + return vinf(ret) - if S.Infinity in ret: - return Vector(*ret) + else: + # E.g., `self=R`, `fi=p_src`, `n=1` + return vinf(ret) n = len(ret) @@ -1330,6 +1323,10 @@ def is_regular(self): # *** Utils +def vinf(entries): + return Vector(*(entries + [S.Infinity])) + + def retrieve_accesses(exprs, **kwargs): """ Like retrieve_terminals, but ensure that if a ComponentAccess is found, From 660a438a2dee6acf447d07b905b7b59240026c1e Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 28 Jun 2024 15:44:12 +0000 Subject: [PATCH 12/18] compiler: Add rudimentary disjoint_test for DDA --- devito/ir/support/basic.py | 108 +++++++++++++++++++++++++++---------- devito/types/dimension.py | 6 --- tests/test_dimension.py | 28 ++++++++++ 3 files changed, 109 insertions(+), 33 deletions(-) diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index 4b86bf4985..3d6bb3ba60 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -2,6 +2,7 @@ from functools import cached_property from sympy import S +import sympy from devito.ir.support.space import Backward, null_ispace from devito.ir.support.utils import AccessMode, extrema @@ -351,6 +352,35 @@ def distance(self, other): # E.g., `self=R` and `self.itintervals=(y,)` => `sai=None` pass + # In some cases, the distance degenerates because `self` and + # `other` never intersect, which essentially means there's no + # dependence between them. In this case, we set the distance to a + # dummy value (the imaginary unit). Hence, we call these "imaginary + # dependences". This occurs in just a small set of special cases, + # which we attempt to handle here + if any(d and d._defines & sit.dim._defines for d in (sai, oai)): + # Case 1: `sit` is an IterationInterval with statically known + # trip count. E.g. 
it ranges from 0 to 3; `other` performs a + # constant access at 4 + for v in (self[n], other[n]): + try: + if bool(v < sit.symbolic_min or v > sit.symbolic_max): + return Vector(S.ImaginaryUnit) + except TypeError: + pass + + # Case 2: `sit` is an IterationInterval over a local SubDimension + # and `other` performs a constant access + for d0, d1 in ((sai, oai), (oai, sai)): + if d0 is None and d1.is_Sub and d1.local: + return Vector(S.ImaginaryUnit) + + # Case 3: `self` and `other` have some special form such that + # it's provable that they never intersect + if sai and sit == oit: + if disjoint_test(self[n], other[n], sai, sit): + return Vector(S.ImaginaryUnit) + if self.function._mem_shared: # Special case: the distance between two regular, thread-shared # objects fallbacks to zero, as any other value would be nonsensical @@ -375,33 +405,6 @@ def distance(self, other): # `self.itintervals=(time, x, y)`, `n=0` continue - elif any(d and d._defines & sit.dim._defines for d in (sai, oai)): - # In some cases, the distance degenerates because `self` and - # `other` never intersect, which essentially means there's no - # dependence between them. In this case, we set the distance to - # a dummy value (the imaginary unit). Hence, we call these - # "imaginary dependences". This occurs in just a small set of - # special cases, which we handle here - - # Case 1: `sit` is an IterationInterval with statically known - # trip count. E.g. it ranges from 0 to 3; `other` performs a - # constant access at 4 - for v in (self[n], other[n]): - try: - if bool(v < sit.symbolic_min or v > sit.symbolic_max): - return Vector(S.ImaginaryUnit) - except TypeError: - pass - - # Case 2: `sit` is an IterationInterval over a local SubDimension - # and `other` performs a constant access - for d0, d1 in ((sai, oai), (oai, sai)): - if d0 is None and d1.is_Sub and d1.local: - return Vector(S.ImaginaryUnit) - - # Fallback - return vinf(ret) - else: # E.g., `self=R`, `fi=p_src`, `n=1` return vinf(ret) @@ -1342,3 +1345,54 @@ def retrieve_accesses(exprs, **kwargs): exprs1 = uxreplace(exprs, subs) return compaccs | retrieve_terminals(exprs1, **kwargs) - set(subs.values()) + + +def disjoint_test(e0, e1, d, it): + """ + A rudimentary test to check if two accesses `e0` and `e1` along `d` within + the IterationInterval `it` are independent. + + This is inspired by the Banerjee test, but it's way more simplistic. + + The test is conservative, meaning that if it returns False, then the accesses + might be independent, but it's not guaranteed. If it returns True, then the + accesses are definitely independent. + + Our implementation focuses on tiny yet relevant cases, such as when the + iteration space's bounds are numeric constants, while the index accesses + functions reduce to numbers once the iteration variable is substituted with + one of the possible values in the iteration space. + + Examples + -------- + * e0 = 12 - zl, e1 = zl + 4, d = zl, it = zl[0,0] + where zl is a left SubDimension with thickness, say, 4 + The test will return True, as the two index access functions never + overlap. 
+ """ + if e0 == e1: + return False + + if d.is_Custom: + subs = {} + elif d.is_Sub and d.is_left: + subs = {d.root.symbolic_min: 0, **dict([d.thickness.left])} + else: + return False + + m = it.symbolic_min.subs(subs) + M = it.symbolic_max.subs(subs) + + p00 = e0._subs(d, m) + p01 = e0._subs(d, M) + + p10 = e1._subs(d, m) + p11 = e1._subs(d, M) + + if any(not i.is_Number for i in [p00, p01, p10, p11]): + return False + + i0 = sympy.Interval(min(p00, p01), max(p00, p01)) + i1 = sympy.Interval(min(p10, p11), max(p10, p11)) + + return not bool(i0.intersect(i1)) diff --git a/devito/types/dimension.py b/devito/types/dimension.py index b3d51284fe..179089866c 100644 --- a/devito/types/dimension.py +++ b/devito/types/dimension.py @@ -732,12 +732,6 @@ def _offset_right(self): symbolic_thickness ) - def overlap(self, other): - return (isinstance(other, SubDimension) and - self.root is other.root and - self._offset_left.extreme is other._offset_left.extreme and - self._offset_right.extreme is other._offset_right.extreme) - @property def _arg_names(self): return tuple(k.name for k, _ in self.thickness) + self.parent._arg_names diff --git a/tests/test_dimension.py b/tests/test_dimension.py index ed5f0a777a..f4a87a20f9 100644 --- a/tests/test_dimension.py +++ b/tests/test_dimension.py @@ -555,6 +555,34 @@ def test_subdimleft_parallel(self): assert np.all(u.data[1, 0:thickness, thickness:-thickness] == 1) assert np.all(u.data[1, thickness+1:, :] == 0) + @pytest.mark.parametrize('thickness,flag', [ + (4, True), + (8, False) + ]) + def test_subdim_local_parallel(self, thickness, flag): + """ + A variation of `test_subdimleft_parallel` where the thickness, whose + value is statically known, explicitly appears in the equations. + """ + grid = Grid(shape=(30, 30, 30)) + x, y, z = grid.dimensions + t = grid.stepping_dim + + u = TimeFunction(name='u', grid=grid, space_order=4) + v = TimeFunction(name='v', grid=grid, space_order=4) + + zl = SubDimension.left(name='zl', parent=z, thickness=thickness) + + eqns = [Eq(u[t, x, y, zl], u[t, x, y, 8 - zl]), + Eq(v[t, x, y, zl], v[t, x, y, 8 - zl])] + + op = Operator(eqns) + + if flag: + assert_structure(op, ['t,x,y,z'], 't,x,y,z') + else: + assert_structure(op, ['t,x,y,z', 't,x,y,z'], 't,x,y,z,z') + def test_subdimmiddle_notparallel(self): """ Tests application of an Operator consisting of a subdimension From bd7c628fdac2b724c04ae0c44beb5a950fc68208 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 1 Jul 2024 15:37:56 +0000 Subject: [PATCH 13/18] compiler: Improve par-tile scheduling --- devito/passes/clusters/blocking.py | 29 ++++++++++++++++++++--------- devito/types/dimension.py | 12 ++++++++---- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py index b48f3af2b0..c91da4fe08 100644 --- a/devito/passes/clusters/blocking.py +++ b/devito/passes/clusters/blocking.py @@ -8,7 +8,7 @@ from devito.passes import is_on_device from devito.symbolics import search, uxreplace, xreplace_indices from devito.tools import (UnboundedMultiTuple, UnboundTuple, as_mapper, as_tuple, - filter_ordered, flatten, is_integer, prod) + filter_ordered, flatten, is_integer) from devito.types import BlockDimension __all__ = ['blocking'] @@ -160,11 +160,15 @@ def __init__(self, options): def _make_key_hook(self, cluster, level): return (is_on_device(cluster.functions, self.gpu_fit),) - def _has_other_blockable_dim(self, cluster, d): - return any(cluster.properties.is_parallel_relaxed(i) and - not 
self._has_short_trip_count(i) + def _has_atomic_blockable_dim(self, cluster, d): + return any(cluster.properties.is_parallel_atomic(i) for i in set(cluster.ispace.itdims) - {d}) + def _has_enough_large_blockable_dims(self, cluster, d): + return len([i for i in set(cluster.ispace.itdims) - {d} + if (cluster.properties.is_parallel_relaxed(i) and + not self._has_short_trip_count(i))]) >= 3 + def callback(self, clusters, prefix): if not prefix: return clusters @@ -178,16 +182,25 @@ def callback(self, clusters, prefix): if is_on_device(c.functions, self.gpu_fit): if self._has_short_trip_count(d): - if self._has_other_blockable_dim(c, d): + if self._has_atomic_blockable_dim(c, d): + # Optimization: minimize number of parallel reductions + # if we think there's already enough parallelism around + return clusters + elif self._has_enough_large_blockable_dims(c, d): + # Optimization: pointless, from a performance standpoint, + # to have more than three large blockable Dimensions return clusters - else: - properties = c.properties.block(d, 'small') + + if any(self._has_short_trip_count(i) for i in c.ispace.itdims): + properties = c.properties.block(d, 'small') elif self._has_data_reuse(c): properties = c.properties.block(d) else: properties = c.properties.block(d, 'small') + elif self._has_data_reuse(c): properties = c.properties.block(d) + else: return clusters @@ -317,8 +330,6 @@ def process(self, clusters): def _derive_block_dims(self, clusters, prefix, d, blk_size_gen): if blk_size_gen is not None: - # By passing a suitable key to `next` we ensure that we pull the - # next par-tile entry iff we're now blocking an unseen TILABLE nest step = sympify(blk_size_gen.next(prefix, d, clusters)) else: # This will result in a parametric step, e.g. `x0_blk0_size` diff --git a/devito/types/dimension.py b/devito/types/dimension.py index 179089866c..ec476e558b 100644 --- a/devito/types/dimension.py +++ b/devito/types/dimension.py @@ -1348,7 +1348,7 @@ def _defines(self): ret |= self.parent._defines return ret - @property + @cached_property def symbolic_min(self): try: return sympy.Number(self._symbolic_min) @@ -1359,7 +1359,7 @@ def symbolic_min(self): else: return self._symbolic_min - @property + @cached_property def symbolic_max(self): try: return sympy.Number(self._symbolic_max) @@ -1370,14 +1370,18 @@ def symbolic_max(self): else: return self._symbolic_max - @property + @cached_property def symbolic_size(self): try: return sympy.Number(self._symbolic_size) except (TypeError, ValueError): pass if self._symbolic_size is None: - return super().symbolic_size + v = self.symbolic_max - self.symbolic_min + 1 + if v.is_Number: + return v + else: + return super().symbolic_size else: return self._symbolic_size From c988fa9a8177ff49f22b6aace25d3e68d6427155 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 2 Jul 2024 09:28:43 +0000 Subject: [PATCH 14/18] compiler: Add AbstractFunction.c0 --- devito/types/basic.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/devito/types/basic.py b/devito/types/basic.py index 6ce688f209..962c19bfad 100644 --- a/devito/types/basic.py +++ b/devito/types/basic.py @@ -1084,6 +1084,13 @@ def space_dimensions(self): def base(self): return self.indexed + @property + def c0(self): + """ + `self`'s first component if `self` is a tensor, otherwise just `self`. 
+ """ + return self + @property def _eval_deriv(self): return self From 0851d5c0eaf6196bf639a7ca9a86be0b1cd2d8a2 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 3 Jul 2024 08:20:32 +0000 Subject: [PATCH 15/18] tests: Strengthen test_different_dtype --- devito/__init__.py | 2 +- tests/test_linearize.py | 32 +++++++++++++++++++------------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/devito/__init__.py b/devito/__init__.py index da9044ed4f..121aa657e1 100644 --- a/devito/__init__.py +++ b/devito/__init__.py @@ -102,7 +102,7 @@ def reinit_compiler(val): configuration.add('safe-math', 0, [0, 1], preprocessor=bool, callback=reinit_compiler) # Enable/disable automatic padding for allocated data -configuration.add('autopadding', False, [False, True]) +configuration.add('autopadding', False, [False, True, np.float32, np.float64]) # Select target device configuration.add('deviceid', -1, preprocessor=int, impacts_jit=False) diff --git a/tests/test_linearize.py b/tests/test_linearize.py index 7d1abad73c..a98de5c635 100644 --- a/tests/test_linearize.py +++ b/tests/test_linearize.py @@ -3,7 +3,7 @@ import scipy.sparse from devito import (Grid, Function, TimeFunction, SparseTimeFunction, Operator, Eq, - Inc, MatrixSparseTimeFunction, sin) + Inc, MatrixSparseTimeFunction, sin, switchconfig) from devito.ir import Call, Callable, DummyExpr, Expression, FindNodes, SymbolRegistry from devito.passes import Graph, linearize, generate_macros from devito.types import Array, Bundle, DefaultDimension @@ -594,21 +594,27 @@ def test_inc_w_default_dims(): assert np.all(g.data[4:] == 0) -def test_different_dtype(): - space_order = 4 +@pytest.mark.parametrize('autopadding', [False, True, np.float64]) +def test_different_dtype(autopadding): - grid = Grid(shape=(4, 4)) + @switchconfig(autopadding=autopadding) + def _test_different_dtype(): + space_order = 4 - f = Function(name='f', grid=grid, space_order=space_order) - b = Function(name='b', grid=grid, space_order=space_order, dtype=np.float64) + grid = Grid(shape=(4, 4)) - f.data[:] = 2.1 - b.data[:] = 1.3 + f = Function(name='f', grid=grid, space_order=space_order) + b = Function(name='b', grid=grid, space_order=space_order, dtype=np.float64) - eq = Eq(f, b.dx + f.dy) + f.data[:] = 2.1 + b.data[:] = 1.3 - op1 = Operator(eq, opt=('advanced', {'linearize': True})) + eq = Eq(f, b.dx + f.dy) + + op1 = Operator(eq, opt=('advanced', {'linearize': True})) + + # Check generated code has different strides for different dtypes + assert "bL0(x,y) b[(x)*y_stride0 + (y)]" in str(op1) + assert "L0(x,y) f[(x)*y_stride0 + (y)]" in str(op1) - # Check generated code has different strides for different dtypes - assert "bL0(x,y) b[(x)*y_stride0 + (y)]" in str(op1) - assert "L0(x,y) f[(x)*y_stride0 + (y)]" in str(op1) + _test_different_dtype() From afe0719d1ef53615dc3468088365db843b5b4e46 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 3 Jul 2024 08:57:27 +0000 Subject: [PATCH 16/18] compiler: Enforce int64 on linearized SparseTimeFunctions --- devito/passes/iet/linearization.py | 15 +++++++++++++-- tests/test_linearize.py | 21 +++++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/devito/passes/iet/linearization.py b/devito/passes/iet/linearization.py index 11da313be2..ddb0122794 100644 --- a/devito/passes/iet/linearization.py +++ b/devito/passes/iet/linearization.py @@ -125,14 +125,25 @@ def __init__(self, mode, dtype, sregistry): self.strides_dynamic = {} # Strides varying regularly across iterations self.dists = {} + 
def _select_indexing_dtype(self, f): + # Some objects may contain an extremely large number of elements, so we + # conservatively use int64 to avoid potential overflows regardless of + # what the user requested via `index-mode` + if f.is_SparseTimeFunction: + return np.int64 + else: + return self.dtype + def add(self, f): + dtype = self._select_indexing_dtype(f) + # Update unique sizes table for d in f.dimensions[1:]: k = key1(f, d) if not k or k in self.sizes: continue name = self.sregistry.make_name(prefix='%s_fsz' % d.name) - self.sizes[k] = Size(name=name, dtype=self.dtype, is_const=True) + self.sizes[k] = Size(name=name, dtype=dtype, is_const=True) # Update unique strides table for n, d in enumerate(f.dimensions[1:], 1): @@ -143,7 +154,7 @@ def add(self, f): if k in self.strides: continue name = self.sregistry.make_name(prefix='%s_stride' % d.name) - self.strides[k] = Stride(name=name, dtype=self.dtype, is_const=True) + self.strides[k] = Stride(name=name, dtype=dtype, is_const=True) def update(self, functions): for f in functions: diff --git a/tests/test_linearize.py b/tests/test_linearize.py index a98de5c635..d4e23ecc03 100644 --- a/tests/test_linearize.py +++ b/tests/test_linearize.py @@ -126,6 +126,27 @@ def test_interpolation(): assert np.all(u.data == u1.data) +def test_interpolation_enforcing_int64_indexing(): + grid = Grid(shape=(4, 4)) + + src = SparseTimeFunction(name='src', grid=grid, npoint=1, nt=10) + rec = SparseTimeFunction(name='rec', grid=grid, npoint=1, nt=10) + u = TimeFunction(name="u", grid=grid, time_order=2) + + eqns = ([Eq(u.forward, u + 1)] + + src.inject(field=u.forward, expr=src) + + rec.interpolate(expr=u.forward)) + + op = Operator(eqns, opt=('advanced', {'linearize': True, + 'index-mode': 'int32'})) + + # Check generated code + assert 'uL0' in str(op) + assert 'int x_stride0' in str(op) # for `u` + assert 'long p_rec_stride0' in str(op) # for `rec` + assert 'long p_src_stride0' in str(op) # for `src` + + def test_interpolation_msf(): grid = Grid(shape=(4, 4)) From efa27df271a2486930101ae3dea75440b63b2f03 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 3 Jul 2024 12:39:55 +0000 Subject: [PATCH 17/18] compiler: Polish code --- devito/symbolics/manipulation.py | 2 +- devito/types/basic.py | 5 ++--- tests/test_dimension.py | 4 ++++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/devito/symbolics/manipulation.py b/devito/symbolics/manipulation.py index 7f44befb74..776415a3f6 100644 --- a/devito/symbolics/manipulation.py +++ b/devito/symbolics/manipulation.py @@ -287,7 +287,7 @@ def pow_to_mul(expr): # E.g., a Symbol, or possibly a generic expression return expr if exp > 10 or exp < -10 or exp == 0: - # Large and non-integer powers remain untouched + # Large powers remain untouched return expr elif exp == -1 or int(exp) != exp: # Reciprocals and fractional powers also remain untouched, diff --git a/devito/types/basic.py b/devito/types/basic.py index 962c19bfad..e21bae6453 100644 --- a/devito/types/basic.py +++ b/devito/types/basic.py @@ -1017,10 +1017,9 @@ def __padding_setup_smart__(self, **kwargs): return nopadding # The padded Dimension - candidates = self.space_dimensions - if not candidates: + if not self.space_dimensions: return nopadding - d = candidates[-1] + d = self.space_dimensions[-1] mmts = configuration['platform'].max_mem_trans_size(self.__padding_dtype__) remainder = self._size_nopad[d] % mmts diff --git a/tests/test_dimension.py b/tests/test_dimension.py index f4a87a20f9..18482ca65d 100644 --- a/tests/test_dimension.py +++ 
b/tests/test_dimension.py @@ -211,6 +211,10 @@ def test_modulo_dims_generation_v2(self): assert np.all(f.data[4] == 4) def test_degenerate_to_zero(self): + """ + Check that if `save=Buffer(1)` is used, then the TimeFunction doesn't + need any ModuloDimension for indexing. + """ grid = Grid(shape=(10, 10)) u = TimeFunction(name='u', grid=grid, save=Buffer(1)) From 2d0f60aff643d01a57a1e10bc2d7b0f1f566a6a8 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 3 Jul 2024 13:17:15 +0000 Subject: [PATCH 18/18] api: Enhance autopadding setup --- devito/__init__.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/devito/__init__.py b/devito/__init__.py index 121aa657e1..b0a981dcfa 100644 --- a/devito/__init__.py +++ b/devito/__init__.py @@ -101,8 +101,21 @@ def reinit_compiler(val): # optimisations. configuration.add('safe-math', 0, [0, 1], preprocessor=bool, callback=reinit_compiler) + # Enable/disable automatic padding for allocated data -configuration.add('autopadding', False, [False, True, np.float32, np.float64]) +def _preprocess_autopadding(v): + return { + '0': False, + '1': np.float32, + True: np.float32, + 'fp16': np.float16, + 'fp32': np.float32, + 'fp64': np.float64 + }.get(v, v) + +configuration.add('autopadding', False, # noqa: E305 + [False, True, 0, 1, np.float16, np.float32, np.float64], + preprocessor=_preprocess_autopadding) # Select target device configuration.add('deviceid', -1, preprocessor=int, impacts_jit=False)
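
Note on the final patch: `autopadding` now accepts dtype-like values in addition to the old booleans, and `_preprocess_autopadding` normalises the user-facing spellings before they reach `configuration`. The snippet below is a minimal standalone sketch of that normalisation, kept outside Devito so it can be run in isolation; the name `normalise_autopadding` is ours, only the mapping itself is taken from the patch.

    import numpy as np

    def normalise_autopadding(v):
        # Same mapping as _preprocess_autopadding in devito/__init__.py:
        # legacy boolean-ish spellings and the 'fp16'/'fp32'/'fp64' strings
        # are turned into either False or a NumPy dtype; anything else
        # (e.g. an actual dtype such as np.float64) passes through unchanged.
        return {
            '0': False,
            '1': np.float32,
            True: np.float32,
            'fp16': np.float16,
            'fp32': np.float32,
            'fp64': np.float64,
        }.get(v, v)

    assert normalise_autopadding('0') is False
    assert normalise_autopadding(True) is np.float32        # old boolean switch
    assert normalise_autopadding('fp64') is np.float64
    assert normalise_autopadding(np.float16) is np.float16  # dtypes pass through

Mapping `True` to `np.float32` presumably keeps the pre-existing boolean switch working while letting the padding granularity follow an explicit dtype, which is what `__padding_setup_smart__` consumes via `__padding_dtype__`.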