devitocodes · FabioLuporini · Apr 18, 2023 · Apr 19, 2023 · Apr 19, 2023 · Apr 19, 2023
diff --git a/devito/arch/archinfo.py b/devito/arch/archinfo.py
@@ -16,12 +16,17 @@
 
 __all__ = ['platform_registry', 'get_cpu_info', 'get_gpu_info', 'get_nvidia_cc',
            'get_cuda_path', 'get_hip_path', 'check_cuda_runtime', 'get_m1_llvm_path',
-           'Platform', 'Cpu64', 'Intel64', 'Amd', 'Arm', 'Power', 'Device',
-           'NvidiaDevice', 'AmdDevice', 'IntelDevice',
-           'INTEL64', 'SNB', 'IVB', 'HSW', 'BDW', 'SKX', 'KNL', 'KNL7210',  # Intel
-           'AMD', 'ARM', 'M1', 'GRAVITON',  # ARM
-           'POWER8', 'POWER9',  # Other loosely supported CPU architectures
-           'AMDGPUX', 'NVIDIAX', 'INTELGPUX']  # GPUs
+           'Platform', 'Cpu64', 'Intel64', 'IntelSkylake', 'Amd', 'Arm', 'Power',
+           'Device', 'NvidiaDevice', 'AmdDevice', 'IntelDevice',
+           # Intel
+           'INTEL64', 'SNB', 'IVB', 'HSW', 'BDW', 'KNL', 'KNL7210',
+           'SKX', 'KLX', 'CLX', 'CLK',
+           # AMD and ARM
+           'AMD', 'ARM', 'M1', 'GRAVITON',
+           # Other loosely supported CPU architectures
+           'POWER8', 'POWER9',
+           # GPUs
+           'AMDGPUX', 'NVIDIAX', 'INTELGPUX']
 
 
 @memoized_func
@@ -494,7 +499,7 @@ def get_platform():
             if 'phi' in brand:
                 # Intel Xeon Phi?
                 return platform_registry['knl']
-            # Unknown Xeon ? May happen on some virtualizes systems...
+            # Unknown Xeon ? May happen on some virtualized systems...
             return platform_registry['intel64']
         elif 'intel' in brand:
             # Most likely a desktop i3/i5/i7
@@ -607,6 +612,14 @@ class Intel64(Cpu64):
     known_isas = ('cpp', 'sse', 'avx', 'avx2', 'avx512')
 
 
+class IntelSkylake(Intel64):
+    """
+    Intel64 subtype used to tag the `skx`, `klx`, `clx` and `clk` platforms.
+
+    It adds no behaviour of its own; it exists so that other components can
+    recognise these platforms via `isinstance` (e.g., the compilers use it
+    to request 512-bit vectors).
+    """
+
+
+class IntelGoldenCode(Intel64):
+    """
+    Intel64 subtype adding no behaviour of its own.
+
+    NOTE(review): nothing in the visible code instantiates or exports this
+    class, and the name looks like a typo for "Golden Cove" -- confirm
+    before relying on it.
+    """
+
+
 class Arm(Cpu64):
 
     known_isas = ('fp', 'asimd', 'asimdrdm')
@@ -721,11 +734,12 @@ def march(cls):
 IVB = Intel64('ivb')
 HSW = Intel64('hsw')
 BDW = Intel64('bdw', isa='avx2')
-SKX = Intel64('skx')
-KLX = Intel64('klx')
-CLX = Intel64('clx')
 KNL = Intel64('knl')
 KNL7210 = Intel64('knl', cores_logical=256, cores_physical=64, isa='avx512')
+SKX = IntelSkylake('skx')
+KLX = IntelSkylake('klx')
+CLX = IntelSkylake('clx')
+CLK = IntelSkylake('clk')
 
 ARM = Arm('arm')
 GRAVITON = Arm('graviton')
@@ -752,6 +766,7 @@ def march(cls):
     'skx': SKX,  # Skylake
     'klx': KLX,  # Kaby Lake
     'clx': CLX,  # Coffee Lake
+    'clk': CLK,  # Cascade Lake
     'knl': KNL,
     'knl7210': KNL7210,
     'arm': ARM,  # Generic ARM CPU

diff --git a/devito/arch/compiler.py b/devito/arch/compiler.py
@@ -12,8 +12,9 @@
 from codepy.jit import compile_from_string
 from codepy.toolchain import GCCToolchain
 
-from devito.arch import (AMDGPUX, Cpu64, M1, NVIDIAX, SKX, POWER8, POWER9, GRAVITON,
-                         get_nvidia_cc, check_cuda_runtime, get_m1_llvm_path)
+from devito.arch import (AMDGPUX, Cpu64, M1, NVIDIAX, POWER8, POWER9, GRAVITON,
+                         IntelSkylake, get_nvidia_cc, check_cuda_runtime,
+                         get_m1_llvm_path)
 from devito.exceptions import CompilationError
 from devito.logger import debug, warning, error
 from devito.parameters import configuration
@@ -375,13 +376,22 @@ class GNUCompiler(Compiler):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
-        self.cflags += ['-march=native', '-Wno-unused-result', '-Wno-unused-variable',
-                        '-Wno-unused-but-set-variable']
+        platform = kwargs.pop('platform', configuration['platform'])
+
+        self.cflags += ['-march=native', '-Wno-unused-result',
+                        '-Wno-unused-variable', '-Wno-unused-but-set-variable']
+
         if configuration['safe-math']:
             self.cflags.append('-fno-unsafe-math-optimizations')
         else:
             self.cflags.append('-ffast-math')
 
+        if isinstance(platform, IntelSkylake):
+            # The default is `=256` because avx512 slows down the CPU frequency;
+            # however, we empirically found that stencils generally benefit
+            # from `=512`
+            self.cflags.append('-mprefer-vector-width=512')
+
         language = kwargs.pop('language', configuration['language'])
         try:
             if self.version >= Version("4.9.0"):
@@ -683,7 +693,7 @@ def __init__(self, *args, **kwargs):
         else:
             self.cflags.append('-fast')
 
-        if platform is SKX:
+        if isinstance(platform, IntelSkylake):
             # Systematically use 512-bit vectors on skylake
             self.cflags.append("-qopt-zmm-usage=high")
 

diff --git a/devito/core/cpu.py b/devito/core/cpu.py
@@ -60,6 +60,10 @@ def _normalize_kwargs(cls, **kwargs):
         o['par-dynamic-work'] = oo.pop('par-dynamic-work', cls.PAR_DYNAMIC_WORK)
         o['par-nested'] = oo.pop('par-nested', cls.PAR_NESTED)
 
+        # Fission
+        o['fiss-press-ratio'] = oo.pop('fiss-press-ratio', cls.FISS_PRESS_RATIO)
+        o['fiss-press-size'] = oo.pop('fiss-press-size', cls.FISS_PRESS_SIZE)
+
         # Misc
         o['expand'] = oo.pop('expand', cls.EXPAND)
         o['optcomms'] = oo.pop('optcomms', True)
@@ -234,7 +238,7 @@ def callback(f):
             'buffering': lambda i: buffering(i, callback, sregistry, options),
             'blocking': lambda i: blocking(i, sregistry, options),
             'factorize': factorize,
-            'fission': fission,
+            'fission': lambda i: fission(i, 'pressure', **kwargs),
             'fuse': lambda i: fuse(i, options=options),
             'lift': lambda i: Lift().process(cire(i, 'invariants', sregistry,
                                                   options, platform)),

diff --git a/devito/core/gpu.py b/devito/core/gpu.py
@@ -81,6 +81,10 @@ def _normalize_kwargs(cls, **kwargs):
         o['mapify-reduce'] = oo.pop('mapify-reduce', cls.MAPIFY_REDUCE)
         o['index-mode'] = oo.pop('index-mode', cls.INDEX_MODE)
 
+        # Recognised but unused by the GPU backend
+        oo.pop('fiss-press-ratio', None)
+        oo.pop('fiss-press-size', None)
+
         if oo:
             raise InvalidOperator("Unsupported optimization options: [%s]"
                                   % ", ".join(list(oo)))
@@ -158,7 +162,7 @@ def _specialize_clusters(cls, clusters, **kwargs):
         clusters = fuse(clusters, toposort=True, options=options)
 
         # Fission to increase parallelism
-        clusters = fission(clusters)
+        clusters = fission(clusters, kind='parallelism', **kwargs)
 
         # Hoist and optimize Dimension-invariant sub-expressions
         clusters = cire(clusters, 'invariants', sregistry, options, platform)
@@ -251,7 +255,7 @@ def _make_clusters_passes_mapper(cls, **kwargs):
             'tasking': Tasker(runs_on_host, sregistry).process,
             'streaming': Streaming(reads_if_on_host, sregistry).process,
             'factorize': factorize,
-            'fission': fission,
+            'fission': lambda i: fission(i, kind='parallelism', **kwargs),
             'fuse': lambda i: fuse(i, options=options),
             'lift': lambda i: Lift().process(cire(i, 'invariants', sregistry,
                                                   options, platform)),

diff --git a/devito/core/operator.py b/devito/core/operator.py
@@ -83,6 +83,19 @@ class BasicOperator(Operator):
     than this threshold.
     """
 
+    FISS_PRESS_RATIO = 2
+    """
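+    A threshold that must be reached to trigger loop fission. For each of the
+    two groups of expressions that a candidate split would create, the ratio
+    compares the number of arrays accessed by the group to the number of arrays
+    shared by both groups.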
+    """
+
+    FISS_PRESS_SIZE = 80
+    """
+    A threshold that must be reached to trigger loop fission. The size is the
+    minimum number of unique symbols that each of the two groups of expressions
+    created by a candidate split must contain.
+    """
+
     MAPIFY_REDUCE = False
     """
     Vector-expand all scalar reductions to turn them into explicit map-reductions,

diff --git a/devito/passes/clusters/__init__.py b/devito/passes/clusters/__init__.py
@@ -7,4 +7,5 @@
 from .asynchrony import *  # noqa
 from .implicit import *  # noqa
 from .misc import *  # noqa
+from .fission import *  # noqa
 from .derivatives import *  # noqa
diff --git a/devito/passes/clusters/fission.py b/devito/passes/clusters/fission.py
@@ -0,0 +1,165 @@
+from itertools import groupby
+
+from devito.ir import Queue, Scope
+from devito.symbolics import retrieve_terminals
+from devito.tools import Stamp, flatten, frozendict, timed_pass
+
+__all__ = ['fission']
+
+
+class FissionForParallelism(Queue):
+
+    """
+    Fission groups of Clusters that share an outer Dimension but differ in
+    what is nested inside it (the "parallelism" case described in the
+    `fission` docstring).
+
+    NOTE(review): the traversal is driven by `Queue`, which is defined
+    elsewhere; presumably it calls `callback` once per iteration-space
+    prefix -- confirm against `devito.ir.Queue`.
+    """
+
+    def callback(self, clusters, prefix):
+        """
+        Attempt to fission `clusters` along `d`, the Dimension of the last
+        entry in `prefix`.
+
+        The input `clusters` are returned untouched if `prefix` is empty, if
+        there is only one Cluster, if any Cluster is sequential along `d`, if
+        no Cluster has an iteration space longer than `prefix`, or if a
+        dependence is caused by `d` or is a reduction along `d`. Otherwise a
+        new list is returned in which the Clusters of each eligible group have
+        been rebuilt with `d` lifted.
+        """
+        if not prefix or len(clusters) == 1:
+            return clusters
+
+        d = prefix[-1].dim
+
+        # Do not waste time if definitely illegal
+        if any(c.properties.is_sequential(d) for c in clusters):
+            return clusters
+
+        # Do not waste time if definitely nothing to do
+        if all(len(prefix) == len(c.ispace) for c in clusters):
+            return clusters
+
+        # Analyze and abort if fissioning would break a dependence
+        scope = Scope(flatten(c.exprs for c in clusters))
+        if any(d._defines & dep.cause or dep.is_reduce(d) for dep in scope.d_all_gen()):
+            return clusters
+
+        processed = []
+        # `groupby` only merges *consecutive* Clusters having the same key, so
+        # the relative order of the Clusters is preserved
+        for (it, guards), g in groupby(clusters, key=lambda c: self._key(c, prefix)):
+            group = list(g)
+
+            try:
+                test0 = any(c.properties.is_sequential(it.dim) for c in group)
+            except AttributeError:
+                # `it` is None because `c`'s IterationSpace has no entry right
+                # after `prefix` (see `_key`), hence
+                # `key = (it, guards) = (None, guards)`
+                test0 = True
+
+            if test0 or guards:
+                # Heuristic: no gain from fissioning if unable to ultimately
+                # increase the number of collapsable iteration spaces, hence give up
+                processed.extend(group)
+            else:
+                # One Stamp per group: all Clusters in the group are lifted
+                # with the same Stamp
+                stamp = Stamp()
+                for c in group:
+                    ispace = c.ispace.lift(d, stamp)
+                    processed.append(c.rebuild(ispace=ispace))
+
+        return processed
+
+    def _key(self, c, prefix):
+        """
+        The grouping key of the Cluster `c`: a pair `(it, guards)` where `it`
+        is the entry of `c.ispace` right after `prefix` and `guards` are the
+        guards of `c` along the `prefix` Dimensions. If `c.ispace` has no such
+        entry, the key is `(None, c.guards)`.
+        """
+        try:
+            index = len(prefix)
+            dims = tuple(i.dim for i in prefix)
+
+            it = c.ispace[index]
+            guards = frozendict({d: v for d, v in c.guards.items() if d in dims})
+
+            return (it, guards)
+        except IndexError:
+            return (None, c.guards)
+
+
+def fission_for_pressure(clusters, options):
+    """
+    Fission Clusters along their innermost Dimension to trade off data
+    locality for register pressure.
+
+    The expressions of a Cluster are split into a head and a tail at the
+    first position such that:
+
+        * both the head and the tail contain at least
+          `options['fiss-press-size']` unique terminals;
+        * for both the head and the tail, the number of functions accessed
+          through Indexeds, divided by the number of such functions shared
+          by head and tail, is at least `options['fiss-press-ratio']`;
+        * none of the dependences in `c.scope.d_flow.independent()` goes
+          from the head to the tail.
+
+    At most one split is performed per Cluster. Clusters with an empty
+    iteration space, or for which no suitable position exists, are passed
+    through unchanged.
+    """
+    min_ratio = options['fiss-press-ratio']
+    min_size = options['fiss-press-size']
+
+    processed = []
+    for c in clusters:
+        if not c.ispace:
+            processed.append(c)
+            continue
+
+        # Fission, if anything, occurs along the innermost Dimension
+        d = c.ispace[-1].dim
+
+        # Each `timestamp` is a candidate split point
+        for timestamp in range(1, len(c.exprs)):
+            head, tail = c.exprs[:timestamp], c.exprs[timestamp:]
+
+            # The legality check might be expensive, so first establish
+            # whether splitting here would be worth it at all
+            terminals_head = retrieve_terminals(head, mode='unique')
+            if len(terminals_head) < min_size:
+                continue
+            terminals_tail = retrieve_terminals(tail, mode='unique')
+            if len(terminals_tail) < min_size:
+                continue
+
+            functions_head = {i.function for i in terminals_head if i.is_Indexed}
+            functions_tail = {i.function for i in terminals_tail if i.is_Indexed}
+
+            # A tiny positive stand-in avoids dividing by zero when nothing
+            # is shared
+            nshared = len(functions_head & functions_tail) or .001
+
+            if len(functions_head) / nshared < min_ratio or \
+               len(functions_tail) / nshared < min_ratio:
+                continue
+
+            # At this point we know we want to fission, but a dependence
+            # crossing the split point would be violated
+            if any(dep.source.timestamp < timestamp <= dep.sink.timestamp
+                   for dep in c.scope.d_flow.independent()):
+                continue
+
+            # All good -- emit the head as is and the tail with `d` lifted
+            processed.append(c.rebuild(exprs=head))
+            processed.append(c.rebuild(exprs=tail, ispace=c.ispace.lift(d)))
+            break
+        else:
+            processed.append(c)
+
+    return processed
+
+
+@timed_pass()
+def fission(clusters, kind='parallelism', options=None, **kwargs):
+    """
+    Clusters fission.
+
+    Currently performed in the following cases:
+
+        * Trade off data locality for parallelism, e.g.
+
+          .. code-block::
+
+            for x              for x
+              for y1             for y1
+                ..                 ..
+              for y2     -->   for x
+                ..               for y2
+                                   ..
+
+        * Trade off data locality for register pressure, e.g.
+
+          .. code-block::
+
+            for x                         for x
+              for y                         for y1
+                a = f(x) + g(x)                 a = f(x) + g(x)
+                b = h(x) + w(x)     -->     for y2
+                                                b = h(x) + w(x)
+
+          NOTE: this only applies to innermost Dimensions.
+
+    Parameters
+    ----------
+    clusters
+        The Clusters to be processed.
+    kind : str, optional
+        Either 'parallelism' (default), 'pressure', or 'all'. With 'all',
+        fission for parallelism is applied first, then fission for pressure.
+    options : dict-like, optional
+        Must provide the 'fiss-press-ratio' and 'fiss-press-size' entries
+        when `kind` is 'pressure' or 'all'. Unused otherwise.
+    **kwargs
+        Accepted and ignored, so that callers may forward their whole set of
+        keyword arguments.
+
+    Returns
+    -------
+    The processed Clusters.
+
+    Raises
+    ------
+    ValueError
+        If `kind` is not one of the supported values, or if `options` is
+        not provided while `kind` requires it.
+    """
+    # Explicit validation: an `assert` would vanish under `python -O`, in which
+    # case an unknown `kind` would silently turn this pass into a no-op
+    if kind not in ('parallelism', 'pressure', 'all'):
+        raise ValueError("Unknown fission kind `%s`; expected one of "
+                         "'parallelism', 'pressure', 'all'" % kind)
+
+    # Fail upfront, before any work is done, rather than with an opaque
+    # `TypeError` when the thresholds are looked up in `None`
+    if kind in ('pressure', 'all') and options is None:
+        raise ValueError("`options` must be provided when kind='%s'" % kind)
+
+    if kind in ('parallelism', 'all'):
+        clusters = FissionForParallelism().process(clusters)
+
+    if kind in ('pressure', 'all'):
+        clusters = fission_for_pressure(clusters, options)
+
+    return clusters