From e19999f5923b4a2cae0f12ac3e1eb72b6eefe0ab Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Thu, 25 Jul 2024 06:41:20 -0700 Subject: [PATCH 1/7] Update Compass to v1.5.0-alpha.1 --- compass/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compass/version.py b/compass/version.py index 91e460c18..1b2e6af6b 100644 --- a/compass/version.py +++ b/compass/version.py @@ -1 +1 @@ -__version__ = '1.4.0-alpha.7' +__version__ = '1.5.0-alpha.1' From a6582e5f16cdd6fbdd0151dca846e10616bfc308 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Thu, 25 Jul 2024 06:42:56 -0700 Subject: [PATCH 2/7] Update mache to v1.24.0 --- conda/compass_env/spec-file.template | 2 +- conda/configure_compass_env.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/compass_env/spec-file.template b/conda/compass_env/spec-file.template index cac8fe415..d508c6461 100644 --- a/conda/compass_env/spec-file.template +++ b/conda/compass_env/spec-file.template @@ -16,7 +16,7 @@ ipython jupyter lxml {% if include_mache %} -mache=1.23.0 +mache=1.24.0 {% endif %} matplotlib-base >=3.9.1 metis diff --git a/conda/configure_compass_env.py b/conda/configure_compass_env.py index 6ad866196..54866a87c 100755 --- a/conda/configure_compass_env.py +++ b/conda/configure_compass_env.py @@ -100,7 +100,7 @@ def main(): if local_mache: mache = '' else: - mache = '"mache=1.23.0"' + mache = '"mache=1.24.0"' setup_install_env(env_name, activate_base, args.use_local, logger, args.recreate, conda_base, mache) From 1ba10913e77bbec88ba771cbd0a36ecb79aed840 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Mon, 1 Jul 2024 04:21:45 -0700 Subject: [PATCH 3/7] Add pm-gpu --- compass/machines/pm-gpu.cfg | 44 +++++++++++++++++++++++++++++++++++++ conda/albany_supported.txt | 1 + conda/unsupported.txt | 3 ++- 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 compass/machines/pm-gpu.cfg diff --git a/compass/machines/pm-gpu.cfg b/compass/machines/pm-gpu.cfg new file mode 100644 index 000000000..b82faf518 --- /dev/null +++ b/compass/machines/pm-gpu.cfg @@ -0,0 +1,44 @@ + +# The paths section describes paths that are used within the ocean core test +# cases. +[paths] + +# A shared root directory where MPAS standalone data can be found +database_root = /global/cfs/cdirs/e3sm/mpas_standalonedata + +# the path to the base conda environment where compass environments have +# been created +compass_envs = /global/common/software/e3sm/compass/pm-gpu/base + + +# Options related to deploying a compass conda environment on supported +# machines +[deploy] + +# the compiler set to use for system libraries and MPAS builds +compiler = gnugpu + +# the system MPI library to use for gnugpu compiler +mpi_gnugpu = mpich + +# the system MPI library to use for nvidiagpu compiler +mpi_nvidiagpu = mpich + +# the base path for spack environments used by compass +spack = /global/cfs/cdirs/e3sm/software/compass/pm-gpu/spack + +# whether to use the same modules for hdf5, netcdf-c, netcdf-fortran and +# pnetcdf as E3SM (spack modules are used otherwise) +use_e3sm_hdf5_netcdf = True + +# The parallel section describes options related to running jobs in parallel. +# Most options in this section come from mache so here we just add or override +# some defaults +[parallel] + +# cores per node on the machine +cores_per_node = 64 + +# threads per core (set to 1 because trying to hyperthread seems to be causing +# hanging on perlmutter) +threads_per_core = 1 diff --git a/conda/albany_supported.txt b/conda/albany_supported.txt index 12ce2817a..d969e7a20 100644 --- a/conda/albany_supported.txt +++ b/conda/albany_supported.txt @@ -3,4 +3,5 @@ chicoma-cpu, gnu, mpich chrysalis, gnu, openmpi pm-cpu, gnu, mpich +pm-gpu, gnugpu, mpich morpheus, gnu, openmpi diff --git a/conda/unsupported.txt b/conda/unsupported.txt index 5014d526e..cfce62076 100644 --- a/conda/unsupported.txt +++ b/conda/unsupported.txt @@ -15,7 +15,8 @@ compy, pgi, mvapich2 pm-cpu, nvidia, mpich pm-cpu, aocc, mpich pm-cpu, amdclang, mpich - +pm-gpu, gnu, mpich +pm-gpu, nvidia, mpich # compiles but tests unreliable (errors or hanging), # see https://github.com/MPAS-Dev/compass/issues/336 From 560b620902166bc97199fe46e2bb5725d060b2c6 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Thu, 25 Jul 2024 06:52:24 -0700 Subject: [PATCH 4/7] Add `gpus-per-node` to job scripts and resources --- compass/job/__init__.py | 9 ++++++++- compass/job/job_script.template | 3 +++ compass/parallel.py | 5 +++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/compass/job/__init__.py b/compass/job/__init__.py index 54a042123..410d9571b 100644 --- a/compass/job/__init__.py +++ b/compass/job/__init__.py @@ -88,6 +88,12 @@ def write_job_script(config, machine, target_cores, min_cores, work_dir, job_name = 'compass' else: job_name = f'compass_{suite}' + + if config.has_option('parallel', 'gpus_per_node'): + gpus_per_node = config.get('parallel', 'gpus_per_node') + else: + gpus_per_node = '' + wall_time = config.get('job', 'wall_time') template = Template(resources.read_text( @@ -96,7 +102,8 @@ def write_job_script(config, machine, target_cores, min_cores, work_dir, text = template.render(job_name=job_name, account=account, nodes=f'{nodes}', wall_time=wall_time, qos=qos, partition=partition, constraint=constraint, - suite=suite, pre_run_commands=pre_run_commands, + gpus_per_node=gpus_per_node, suite=suite, + pre_run_commands=pre_run_commands, post_run_commands=post_run_commands) text = _clean_up_whitespace(text) if suite == '': diff --git a/compass/job/job_script.template b/compass/job/job_script.template index 09030e2a6..bb9065b01 100644 --- a/compass/job/job_script.template +++ b/compass/job/job_script.template @@ -16,6 +16,9 @@ {% if constraint != '' -%} #SBATCH --constraint={{ constraint }} {%- endif %} +{% if gpus_per_node != '' -%} +#SBATCH --gpus-per-node={{ gpus_per_node }} +{%- endif %} source load_compass_env.sh {{ pre_run_commands }} diff --git a/compass/parallel.py b/compass/parallel.py index cf49c930a..08d0e7d80 100644 --- a/compass/parallel.py +++ b/compass/parallel.py @@ -66,6 +66,11 @@ def get_available_parallel_resources(config): cores_per_node=cores_per_node, mpi_allowed=mpi_allowed ) + + if config.has_option('parallel', 'gpus_per_node'): + available_resources['gpus_per_node'] = \ + config.getint('parallel', 'gpus_per_node') + return available_resources From 08f33dbf56b3eb8e0730aa193a620ff7dd11da3c Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Mon, 5 Aug 2024 06:24:54 -0700 Subject: [PATCH 5/7] Fix local mache install --- conda/bootstrap.py | 5 +++-- conda/configure_compass_env.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/conda/bootstrap.py b/conda/bootstrap.py index c113fb0e5..33228645d 100755 --- a/conda/bootstrap.py +++ b/conda/bootstrap.py @@ -1082,8 +1082,9 @@ def main(): # noqa: C901 print('Install local mache\n') commands = f'source {conda_base}/etc/profile.d/conda.sh && ' \ f'conda activate {conda_env_name} && ' \ - 'cd ../build_mache/mache && ' \ - 'python -m pip install --no-deps .' + f'cd ../build_mache/mache && ' \ + f'conda install -y --file spec-file.txt && ' \ + f'python -m pip install --no-deps .' check_call(commands, logger=logger) previous_conda_env = conda_env_name diff --git a/conda/configure_compass_env.py b/conda/configure_compass_env.py index 54866a87c..e4f6fe4ee 100755 --- a/conda/configure_compass_env.py +++ b/conda/configure_compass_env.py @@ -114,6 +114,7 @@ def main(): f'git clone -b {args.mache_branch} ' \ f'git@github.com:{args.mache_fork}.git mache && ' \ f'cd mache && ' \ + f'conda install -y --file spec-file.txt && ' \ f'python -m pip install --no-deps .' check_call(commands, logger=logger) From 15a87f4ac9d7933a44b5f0c7fb323b846b5aaebb Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Mon, 5 Aug 2024 06:25:23 -0700 Subject: [PATCH 6/7] Add cuda to Albany and Trilinos builds for GPU machines --- conda/bootstrap.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/conda/bootstrap.py b/conda/bootstrap.py index 33228645d..f746194ac 100755 --- a/conda/bootstrap.py +++ b/conda/bootstrap.py @@ -476,6 +476,9 @@ def build_spack_env(config, update_spack, machine, compiler, mpi, # noqa: C901 scorpio = config.get('deploy', 'scorpio') parallelio = config.get('deploy', 'parallelio') + # for now, we'll assume Cuda is needed anytime GPUs are present + with_cuda = config.has_option('parallel', 'gpus_per_node') + if config.has_option('deploy', 'spack_mirror'): spack_mirror = config.get('deploy', 'spack_mirror') else: @@ -536,8 +539,12 @@ def build_spack_env(config, update_spack, machine, compiler, mpi, # noqa: C901 f'@{parallelio}+pnetcdf~timing"') if albany != 'None': - specs.append(f'"trilinos-for-albany@{albany}"') - specs.append(f'"albany@{albany}+mpas~py+unit_tests"') + if with_cuda: + cuda = '+cuda+uvm' + else: + cuda = '' + specs.append(f'"trilinos-for-albany@{albany}{cuda}"') + specs.append(f'"albany@{albany}+mpas~py+unit_tests{cuda}"') yaml_template = f'{spack_template_path}/{machine}_{compiler}_{mpi}.yaml' if not os.path.exists(yaml_template): From 6dfb627f5785c932849665c0a2fa33a0088bb4d9 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Tue, 13 Aug 2024 08:27:16 -0700 Subject: [PATCH 7/7] Add +sfad variant to albany spack build with cuda --- conda/bootstrap.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/conda/bootstrap.py b/conda/bootstrap.py index f746194ac..ff2f74209 100755 --- a/conda/bootstrap.py +++ b/conda/bootstrap.py @@ -540,11 +540,13 @@ def build_spack_env(config, update_spack, machine, compiler, mpi, # noqa: C901 if albany != 'None': if with_cuda: - cuda = '+cuda+uvm' + albany_cuda = '+cuda+uvm+sfad sfadsize=12' + trilinos_cuda = '+cuda+uvm' else: - cuda = '' - specs.append(f'"trilinos-for-albany@{albany}{cuda}"') - specs.append(f'"albany@{albany}+mpas~py+unit_tests{cuda}"') + albany_cuda = '' + trilinos_cuda = '' + specs.append(f'"trilinos-for-albany@{albany}{trilinos_cuda}"') + specs.append(f'"albany@{albany}+mpas~py+unit_tests{albany_cuda}"') yaml_template = f'{spack_template_path}/{machine}_{compiler}_{mpi}.yaml' if not os.path.exists(yaml_template):