Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pm-gpu #835

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion compass/job/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,12 @@ def write_job_script(config, machine, target_cores, min_cores, work_dir,
job_name = 'compass'
else:
job_name = f'compass_{suite}'

if config.has_option('parallel', 'gpus_per_node'):
gpus_per_node = config.get('parallel', 'gpus_per_node')
else:
gpus_per_node = ''

wall_time = config.get('job', 'wall_time')

template = Template(resources.read_text(
Expand All @@ -96,7 +102,8 @@ def write_job_script(config, machine, target_cores, min_cores, work_dir,
text = template.render(job_name=job_name, account=account,
nodes=f'{nodes}', wall_time=wall_time, qos=qos,
partition=partition, constraint=constraint,
suite=suite, pre_run_commands=pre_run_commands,
gpus_per_node=gpus_per_node, suite=suite,
pre_run_commands=pre_run_commands,
post_run_commands=post_run_commands)
text = _clean_up_whitespace(text)
if suite == '':
Expand Down
3 changes: 3 additions & 0 deletions compass/job/job_script.template
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
{% if constraint != '' -%}
#SBATCH --constraint={{ constraint }}
{%- endif %}
{% if gpus_per_node != '' -%}
#SBATCH --gpus-per-node={{ gpus_per_node }}
{%- endif %}

source load_compass_env.sh
{{ pre_run_commands }}
Expand Down
44 changes: 44 additions & 0 deletions compass/machines/pm-gpu.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@

# The paths section describes paths that are used within the ocean core test
# cases.
[paths]

# A shared root directory where MPAS standalone data can be found
database_root = /global/cfs/cdirs/e3sm/mpas_standalonedata

# the path to the base conda environment where compass environments have
# been created
compass_envs = /global/common/software/e3sm/compass/pm-gpu/base


# Options related to deploying a compass conda environment on supported
# machines
[deploy]

# the compiler set to use for system libraries and MPAS builds
compiler = gnugpu

# the system MPI library to use for the gnugpu compiler
mpi_gnugpu = mpich

# the system MPI library to use for the nvidiagpu compiler
mpi_nvidiagpu = mpich

# the base path for spack environments used by compass
spack = /global/cfs/cdirs/e3sm/software/compass/pm-gpu/spack

# whether to use the same modules for hdf5, netcdf-c, netcdf-fortran and
# pnetcdf as E3SM (spack modules are used otherwise)
use_e3sm_hdf5_netcdf = True

# The parallel section describes options related to running jobs in parallel.
# Most options in this section come from mache so here we just add or override
# some defaults
[parallel]

# cores per node on the machine
cores_per_node = 64

# threads per core (set to 1 because hyperthreading seems to cause hangs on
# Perlmutter)
threads_per_core = 1
5 changes: 5 additions & 0 deletions compass/parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,11 @@ def get_available_parallel_resources(config):
cores_per_node=cores_per_node,
mpi_allowed=mpi_allowed
)

if config.has_option('parallel', 'gpus_per_node'):
available_resources['gpus_per_node'] = \
config.getint('parallel', 'gpus_per_node')

return available_resources


Expand Down
2 changes: 1 addition & 1 deletion compass/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '1.4.0-alpha.7'
__version__ = '1.5.0-alpha.1'
1 change: 1 addition & 0 deletions conda/albany_supported.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@
chicoma-cpu, gnu, mpich
chrysalis, gnu, openmpi
pm-cpu, gnu, mpich
pm-gpu, gnugpu, mpich
morpheus, gnu, openmpi
18 changes: 14 additions & 4 deletions conda/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,9 @@ def build_spack_env(config, update_spack, machine, compiler, mpi, # noqa: C901
scorpio = config.get('deploy', 'scorpio')
parallelio = config.get('deploy', 'parallelio')

# for now, we'll assume CUDA is needed any time GPUs are present
with_cuda = config.has_option('parallel', 'gpus_per_node')

if config.has_option('deploy', 'spack_mirror'):
spack_mirror = config.get('deploy', 'spack_mirror')
else:
Expand Down Expand Up @@ -536,8 +539,14 @@ def build_spack_env(config, update_spack, machine, compiler, mpi, # noqa: C901
f'@{parallelio}+pnetcdf~timing"')

if albany != 'None':
specs.append(f'"trilinos-for-albany@{albany}"')
specs.append(f'"albany@{albany}+mpas~py+unit_tests"')
if with_cuda:
albany_cuda = '+cuda+uvm+sfad sfadsize=12'
trilinos_cuda = '+cuda+uvm'
else:
albany_cuda = ''
trilinos_cuda = ''
specs.append(f'"trilinos-for-albany@{albany}{trilinos_cuda}"')
specs.append(f'"albany@{albany}+mpas~py+unit_tests{albany_cuda}"')

yaml_template = f'{spack_template_path}/{machine}_{compiler}_{mpi}.yaml'
if not os.path.exists(yaml_template):
Expand Down Expand Up @@ -1082,8 +1091,9 @@ def main(): # noqa: C901
print('Install local mache\n')
commands = f'source {conda_base}/etc/profile.d/conda.sh && ' \
f'conda activate {conda_env_name} && ' \
'cd ../build_mache/mache && ' \
'python -m pip install --no-deps .'
f'cd ../build_mache/mache && ' \
f'conda install -y --file spec-file.txt && ' \
f'python -m pip install --no-deps .'
check_call(commands, logger=logger)

previous_conda_env = conda_env_name
Expand Down
2 changes: 1 addition & 1 deletion conda/compass_env/spec-file.template
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ ipython
jupyter
lxml
{% if include_mache %}
mache=1.23.0
mache=1.24.0
{% endif %}
matplotlib-base >=3.9.1
metis
Expand Down
3 changes: 2 additions & 1 deletion conda/configure_compass_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def main():
if local_mache:
mache = ''
else:
mache = '"mache=1.23.0"'
mache = '"mache=1.24.0"'

setup_install_env(env_name, activate_base, args.use_local, logger,
args.recreate, conda_base, mache)
Expand All @@ -114,6 +114,7 @@ def main():
f'git clone -b {args.mache_branch} ' \
f'git@github.com:{args.mache_fork}.git mache && ' \
f'cd mache && ' \
f'conda install -y --file spec-file.txt && ' \
f'python -m pip install --no-deps .'

check_call(commands, logger=logger)
Expand Down
3 changes: 2 additions & 1 deletion conda/unsupported.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ compy, pgi, mvapich2
pm-cpu, nvidia, mpich
pm-cpu, aocc, mpich
pm-cpu, amdclang, mpich

pm-gpu, gnu, mpich
pm-gpu, nvidia, mpich

# compiles but tests unreliable (errors or hanging),
# see https://github.com/MPAS-Dev/compass/issues/336
Expand Down
Loading