Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

compiler: Add fission-for-pressure subpass #2114

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 25 additions & 10 deletions devito/arch/archinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,17 @@

__all__ = ['platform_registry', 'get_cpu_info', 'get_gpu_info', 'get_nvidia_cc',
'get_cuda_path', 'get_hip_path', 'check_cuda_runtime', 'get_m1_llvm_path',
'Platform', 'Cpu64', 'Intel64', 'Amd', 'Arm', 'Power', 'Device',
'NvidiaDevice', 'AmdDevice', 'IntelDevice',
'INTEL64', 'SNB', 'IVB', 'HSW', 'BDW', 'SKX', 'KNL', 'KNL7210', # Intel
'AMD', 'ARM', 'M1', 'GRAVITON', # ARM
'POWER8', 'POWER9', # Other loosely supported CPU architectures
'AMDGPUX', 'NVIDIAX', 'INTELGPUX'] # GPUs
'Platform', 'Cpu64', 'Intel64', 'IntelSkylake', 'Amd', 'Arm', 'Power',
'Device', 'NvidiaDevice', 'AmdDevice', 'IntelDevice',
# Intel
'INTEL64', 'SNB', 'IVB', 'HSW', 'BDW', 'KNL', 'KNL7210',
'SKX', 'KLX', 'CLX', 'CLK',
# ARM
'AMD', 'ARM', 'M1', 'GRAVITON',
# Other loosely supported CPU architectures
'POWER8', 'POWER9',
# GPUs
'AMDGPUX', 'NVIDIAX', 'INTELGPUX']


@memoized_func
Expand Down Expand Up @@ -494,7 +499,7 @@ def get_platform():
if 'phi' in brand:
# Intel Xeon Phi?
return platform_registry['knl']
# Unknown Xeon ? May happen on some virtualizes systems...
# Unknown Xeon ? May happen on some virtualized systems...
return platform_registry['intel64']
elif 'intel' in brand:
# Most likely a desktop i3/i5/i7
Expand Down Expand Up @@ -607,6 +612,14 @@ class Intel64(Cpu64):
known_isas = ('cpp', 'sse', 'avx', 'avx2', 'avx512')


class IntelSkylake(Intel64):
pass


class IntelGoldenCode(Intel64):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GoldenCove?

pass


class Arm(Cpu64):

known_isas = ('fp', 'asimd', 'asimdrdm')
Expand Down Expand Up @@ -721,11 +734,12 @@ def march(cls):
IVB = Intel64('ivb')
HSW = Intel64('hsw')
BDW = Intel64('bdw', isa='avx2')
SKX = Intel64('skx')
KLX = Intel64('klx')
CLX = Intel64('clx')
KNL = Intel64('knl')
KNL7210 = Intel64('knl', cores_logical=256, cores_physical=64, isa='avx512')
SKX = IntelSkylake('skx')
KLX = IntelSkylake('klx')
CLX = IntelSkylake('clx')
CLK = IntelSkylake('clk')

ARM = Arm('arm')
GRAVITON = Arm('graviton')
Expand All @@ -752,6 +766,7 @@ def march(cls):
'skx': SKX, # Skylake
'klx': KLX, # Kaby Lake
'clx': CLX, # Coffee Lake
'clk': CLK, # Cascade Lake
'knl': KNL,
'knl7210': KNL7210,
'arm': ARM, # Generic ARM CPU
Expand Down
20 changes: 15 additions & 5 deletions devito/arch/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
from codepy.jit import compile_from_string
from codepy.toolchain import GCCToolchain

from devito.arch import (AMDGPUX, Cpu64, M1, NVIDIAX, SKX, POWER8, POWER9, GRAVITON,
get_nvidia_cc, check_cuda_runtime, get_m1_llvm_path)
from devito.arch import (AMDGPUX, Cpu64, M1, NVIDIAX, POWER8, POWER9, GRAVITON,
IntelSkylake, get_nvidia_cc, check_cuda_runtime,
get_m1_llvm_path)
from devito.exceptions import CompilationError
from devito.logger import debug, warning, error
from devito.parameters import configuration
Expand Down Expand Up @@ -375,13 +376,22 @@ class GNUCompiler(Compiler):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

self.cflags += ['-march=native', '-Wno-unused-result', '-Wno-unused-variable',
'-Wno-unused-but-set-variable']
platform = kwargs.pop('platform', configuration['platform'])

self.cflags += ['-march=native', '-Wno-unused-result',
'-Wno-unused-variable', '-Wno-unused-but-set-variable']

if configuration['safe-math']:
self.cflags.append('-fno-unsafe-math-optimizations')
else:
self.cflags.append('-ffast-math')

if isinstance(platform, IntelSkylake):
# The default is `=256` because avx512 slows down the CPU frequency;
# however, we empirically found that stencils generally benefit
# from `=512`
self.cflags.append('-mprefer-vector-width=512')

language = kwargs.pop('language', configuration['language'])
try:
if self.version >= Version("4.9.0"):
Expand Down Expand Up @@ -683,7 +693,7 @@ def __init__(self, *args, **kwargs):
else:
self.cflags.append('-fast')

if platform is SKX:
if isinstance(platform, IntelSkylake):
# Systematically use 512-bit vectors on skylake
self.cflags.append("-qopt-zmm-usage=high")

Expand Down
6 changes: 5 additions & 1 deletion devito/core/cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ def _normalize_kwargs(cls, **kwargs):
o['par-dynamic-work'] = oo.pop('par-dynamic-work', cls.PAR_DYNAMIC_WORK)
o['par-nested'] = oo.pop('par-nested', cls.PAR_NESTED)

# Fission
o['fiss-press-ratio'] = oo.pop('fiss-press-ratio', cls.FISS_PRESS_RATIO)
o['fiss-press-size'] = oo.pop('fiss-press-size', cls.FISS_PRESS_SIZE)

# Misc
o['expand'] = oo.pop('expand', cls.EXPAND)
o['optcomms'] = oo.pop('optcomms', True)
Expand Down Expand Up @@ -234,7 +238,7 @@ def callback(f):
'buffering': lambda i: buffering(i, callback, sregistry, options),
'blocking': lambda i: blocking(i, sregistry, options),
'factorize': factorize,
'fission': fission,
'fission': lambda i: fission(i, 'pressure', **kwargs),
'fuse': lambda i: fuse(i, options=options),
'lift': lambda i: Lift().process(cire(i, 'invariants', sregistry,
options, platform)),
Expand Down
8 changes: 6 additions & 2 deletions devito/core/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@ def _normalize_kwargs(cls, **kwargs):
o['mapify-reduce'] = oo.pop('mapify-reduce', cls.MAPIFY_REDUCE)
o['index-mode'] = oo.pop('index-mode', cls.INDEX_MODE)

# Recognised but unused by the GPU backend
oo.pop('fiss-press-ratio', None)
oo.pop('fiss-press-size', None)

if oo:
raise InvalidOperator("Unsupported optimization options: [%s]"
% ", ".join(list(oo)))
Expand Down Expand Up @@ -158,7 +162,7 @@ def _specialize_clusters(cls, clusters, **kwargs):
clusters = fuse(clusters, toposort=True, options=options)

# Fission to increase parallelism
clusters = fission(clusters)
clusters = fission(clusters, kind='parallelism', **kwargs)

# Hoist and optimize Dimension-invariant sub-expressions
clusters = cire(clusters, 'invariants', sregistry, options, platform)
Expand Down Expand Up @@ -251,7 +255,7 @@ def _make_clusters_passes_mapper(cls, **kwargs):
'tasking': Tasker(runs_on_host, sregistry).process,
'streaming': Streaming(reads_if_on_host, sregistry).process,
'factorize': factorize,
'fission': fission,
'fission': lambda i: fission(i, kind='parallelism', **kwargs),
'fuse': lambda i: fuse(i, options=options),
'lift': lambda i: Lift().process(cire(i, 'invariants', sregistry,
options, platform)),
Expand Down
13 changes: 13 additions & 0 deletions devito/core/operator.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,19 @@ class BasicOperator(Operator):
than this threshold.
"""

FISS_PRESS_RATIO = 2
"""
A threshold that must be crossed to trigger loop fission. The ratio refers
to the number of arrays accessed by a group of expressions over the number
of arrays that it shares with the other group of expressions.
"""

FISS_PRESS_SIZE = 80
"""
A threshold that must be crossed to trigger loop fission. The size represents
the number of unique symbols in a group of expressions that is a candidate
for fission.
"""

MAPIFY_REDUCE = False
"""
Vector-expand all scalar reductions to turn them into explicit map-reductions,
Expand Down
1 change: 1 addition & 0 deletions devito/passes/clusters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@
from .asynchrony import * # noqa
from .implicit import * # noqa
from .misc import * # noqa
from .fission import * # noqa
from .derivatives import * # noqa
165 changes: 165 additions & 0 deletions devito/passes/clusters/fission.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
from itertools import groupby

from devito.ir import Queue, Scope
from devito.symbolics import retrieve_terminals
from devito.tools import Stamp, flatten, frozendict, timed_pass

__all__ = ['fission']


class FissionForParallelism(Queue):

    """
    Fission Clusters sharing an outer loop so that more iteration spaces
    may later be collapsed, trading data locality for parallelism.
    """

    def callback(self, clusters, prefix):
        """
        Return `clusters` with the fissionable groups lifted along the
        innermost Dimension of `prefix`, or `clusters` itself, untouched,
        whenever fission is illegal or pointless.
        """
        # Nothing to fission at the top level or with a single Cluster
        if not prefix or len(clusters) == 1:
            return clusters

        dim = prefix[-1].dim

        # Cheap exit: fission is definitely illegal along a sequential `dim`
        for c in clusters:
            if c.properties.is_sequential(dim):
                return clusters

        # Cheap exit: no Cluster iterates deeper than `prefix`, so there is
        # definitely nothing to do
        depth = len(prefix)
        if not any(len(c.ispace) != depth for c in clusters):
            return clusters

        # Expensive check: abort if fissioning would break a dependence
        scope = Scope(flatten(c.exprs for c in clusters))
        for dep in scope.d_all_gen():
            if dim._defines & dep.cause or dep.is_reduce(dim):
                return clusters

        fissioned = []
        for (it, guards), members in groupby(clusters,
                                             key=lambda c: self._key(c, prefix)):
            members = list(members)

            try:
                sequential = any(c.properties.is_sequential(it.dim) for c in members)
            except AttributeError:
                # `it` is None, that is `_key` found no entry in the Cluster's
                # IterationSpace right below `prefix`
                sequential = True

            if not sequential and not guards:
                # All Clusters in the group are lifted with the same Stamp
                # (presumably so that they keep sharing the lifted iteration
                # space -- TODO confirm)
                stamp = Stamp()
                fissioned.extend(c.rebuild(ispace=c.ispace.lift(dim, stamp))
                                 for c in members)
            else:
                # Heuristic: fissioning would not ultimately increase the
                # number of collapsable iteration spaces, so leave as is
                fissioned.extend(members)

        return fissioned

    def _key(self, c, prefix):
        """
        Grouping key for `c`: the entry of `c.ispace` at position
        `len(prefix)` paired with the guards of `c` along the `prefix`
        Dimensions, or `(None, c.guards)` if an IndexError is raised while
        computing it.
        """
        try:
            depth = len(prefix)
            prefix_dims = tuple(i.dim for i in prefix)

            nxt = c.ispace[depth]
            relevant = {k: v for k, v in c.guards.items() if k in prefix_dims}

            return (nxt, frozendict(relevant))
        except IndexError:
            return (None, c.guards)


def fission_for_pressure(clusters, options):
# Split Clusters in two along their innermost Dimension to trade data
# locality for register pressure (see the `fission` docstring).
#
# A Cluster `c` is split at the first candidate point `timestamp` such that:
#   (i)   each of the two groups of expressions has at least
#         `options['fiss-press-size']` unique terminals;
#   (ii)  for each group, the number of Functions accessed through Indexed
#         terminals is at least `options['fiss-press-ratio']` times the number
#         of Functions shared by the two groups;
#   (iii) no dependence in `c.scope.d_flow.independent()` has its source before
#         the split point and its sink at or after it.
#
# At most one split is performed per Cluster; Clusters that are not split are
# forwarded unchanged. Returns the resulting list of Clusters.
fiss_press_ratio = options['fiss-press-ratio']
fiss_press_size = options['fiss-press-size']

processed = []
for c in clusters:
if not c.ispace:
processed.append(c)
continue

# Fission, if anything, occurs along the innermost Dimension
d = c.ispace[-1].dim

# Let `timestamp` be our candidate split point
for timestamp in range(1, len(c.exprs)):
# Checking whether it's legal or not might be expensive, so let's
# first find out whether it'd be worth it
g0 = c.exprs[:timestamp]
g1 = c.exprs[timestamp:]

terminals0 = retrieve_terminals(g0, mode='unique')
if len(terminals0) < fiss_press_size:
continue
terminals1 = retrieve_terminals(g1, mode='unique')
if len(terminals1) < fiss_press_size:
continue

functions0 = {i.function for i in terminals0 if i.is_Indexed}
functions1 = {i.function for i in terminals1 if i.is_Indexed}
functions_shared = functions0.intersection(functions1)

n0 = len(functions0)
n1 = len(functions1)
ns = len(functions_shared)

# No shared Functions: use a small positive stand-in to avoid a division
# by zero in the ratio test below. The ratios then become `1000*n0` and
# `1000*n1`, so the test only compares the per-group Function counts
# against `fiss_press_ratio / 1000`
if not ns:
ns = .001
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess safe to assume we won't have over 1k function

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe add some comment for this, even though not really necessary? Just to justify this option


if not (n0 / ns >= fiss_press_ratio and n1 / ns >= fiss_press_ratio):
continue

# At this point we know we want to fission. But can we?
for dep in c.scope.d_flow.independent():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not check this first and avoid all the retrieve/sets/...

if dep.source.timestamp < timestamp <= dep.sink.timestamp:
# Nope, we would unfortunately violate a data dependence
break
# `else` of the `for dep` loop: reached only if no dependence triggered the
# `break` above
else:
# Yes -- all good
processed.append(c.rebuild(exprs=g0))

ispace = c.ispace.lift(d)
processed.append(c.rebuild(exprs=g1, ispace=ispace))

# One split per Cluster: stop scanning the candidate split points
break
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So at most one fission? Could check the remainder g can be split too not sure of what would want

processed.append(c.rebuild(exprs=g0))
processed.append(fission_for_pressure(c.rebuild(exprs=g1, ispace=ispace)))

# `else` of the `for timestamp` loop: no split point was accepted, so the
# Cluster is kept as is
else:
processed.append(c)

return processed


@timed_pass()
def fission(clusters, kind='parallelism', options=None, **kwargs):
    """
    Fission a sequence of Clusters.

    Two flavours are available, selected through `kind`:

        * 'parallelism': trade off data locality for parallelism, e.g.

          .. code-block::

            for x              for x
              for y1             for y1
                ..                 ..
              for y2     -->   for x
                ..               for y2
                                   ..

        * 'pressure': trade off data locality for register pressure, e.g.

          .. code-block::

            for x                      for x
              for y                      for y1
                a = f(x) + g(x)            a = f(x) + g(x)
                b = h(x) + w(x)  -->     for y2
                                           b = h(x) + w(x)

          NOTE: this only applies to innermost Dimensions.

        * 'all': apply 'parallelism' first, then 'pressure'.

    Parameters
    ----------
    clusters : list of Cluster
        The input Clusters.
    kind : str, optional
        One of 'parallelism' (default), 'pressure', 'all'.
    options : dict-like, optional
        Only used by the 'pressure' flavour, which reads the
        'fiss-press-ratio' and 'fiss-press-size' entries; it must therefore
        be supplied when `kind` is 'pressure' or 'all'.
    **kwargs
        Accepted for call-site convenience and otherwise ignored.

    Returns
    -------
    The processed Clusters.
    """
    assert kind in ('parallelism', 'pressure', 'all')

    for_parallelism = kind in ('parallelism', 'all')
    for_pressure = kind in ('pressure', 'all')

    if for_parallelism:
        clusters = FissionForParallelism().process(clusters)

    if for_pressure:
        clusters = fission_for_pressure(clusters, options)

    return clusters
Loading