From 3ec3a7b4115583a9a3cf2f666e41513649f52709 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Thu, 25 Jul 2024 06:41:20 -0700 Subject: [PATCH 1/8] Update Compass to v1.5.0-alpha.1 --- compass/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compass/version.py b/compass/version.py index 91e460c18..1b2e6af6b 100644 --- a/compass/version.py +++ b/compass/version.py @@ -1 +1 @@ -__version__ = '1.4.0-alpha.7' +__version__ = '1.5.0-alpha.1' From ec3e55d6e3a2a7693c1504fede0ad5ffc220bb71 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Thu, 25 Jul 2024 06:42:56 -0700 Subject: [PATCH 2/8] Update mache to v1.25.0 --- conda/compass_env/spec-file.template | 2 +- conda/configure_compass_env.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/compass_env/spec-file.template b/conda/compass_env/spec-file.template index e7ae69b96..05142ac1f 100644 --- a/conda/compass_env/spec-file.template +++ b/conda/compass_env/spec-file.template @@ -16,7 +16,7 @@ ipython jupyter lxml {% if include_mache %} -mache=1.23.0 +mache=1.25.0 {% endif %} matplotlib-base >=3.9.1 metis diff --git a/conda/configure_compass_env.py b/conda/configure_compass_env.py index 6ad866196..9db9c2c51 100755 --- a/conda/configure_compass_env.py +++ b/conda/configure_compass_env.py @@ -100,7 +100,7 @@ def main(): if local_mache: mache = '' else: - mache = '"mache=1.23.0"' + mache = '"mache=1.25.0"' setup_install_env(env_name, activate_base, args.use_local, logger, args.recreate, conda_base, mache) From 25d74f0595ffdb97a5c41a5b9651213ab28fe35d Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Mon, 1 Jul 2024 04:21:45 -0700 Subject: [PATCH 3/8] Add pm-gpu --- compass/machines/pm-gpu.cfg | 44 +++++++++++++++++++++++++++++++++++++ conda/albany_supported.txt | 1 + conda/unsupported.txt | 3 ++- 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 compass/machines/pm-gpu.cfg diff --git a/compass/machines/pm-gpu.cfg b/compass/machines/pm-gpu.cfg new file mode 100644 index 000000000..b82faf518 --- /dev/null +++ b/compass/machines/pm-gpu.cfg @@ -0,0 +1,44 @@ + +# The paths section describes paths that are used within the ocean core test +# cases. +[paths] + +# A shared root directory where MPAS standalone data can be found +database_root = /global/cfs/cdirs/e3sm/mpas_standalonedata + +# the path to the base conda environment where compass environments have +# been created +compass_envs = /global/common/software/e3sm/compass/pm-gpu/base + + +# Options related to deploying a compass conda environment on supported +# machines +[deploy] + +# the compiler set to use for system libraries and MPAS builds +compiler = gnugpu + +# the system MPI library to use for gnugpu compiler +mpi_gnugpu = mpich + +# the system MPI library to use for nvidiagpu compiler +mpi_nvidiagpu = mpich + +# the base path for spack environments used by compass +spack = /global/cfs/cdirs/e3sm/software/compass/pm-gpu/spack + +# whether to use the same modules for hdf5, netcdf-c, netcdf-fortran and +# pnetcdf as E3SM (spack modules are used otherwise) +use_e3sm_hdf5_netcdf = True + +# The parallel section describes options related to running jobs in parallel. +# Most options in this section come from mache so here we just add or override +# some defaults +[parallel] + +# cores per node on the machine +cores_per_node = 64 + +# threads per core (set to 1 because trying to hyperthread seems to be causing +# hanging on perlmutter) +threads_per_core = 1 diff --git a/conda/albany_supported.txt b/conda/albany_supported.txt index 12ce2817a..d969e7a20 100644 --- a/conda/albany_supported.txt +++ b/conda/albany_supported.txt @@ -3,4 +3,5 @@ chicoma-cpu, gnu, mpich chrysalis, gnu, openmpi pm-cpu, gnu, mpich +pm-gpu, gnugpu, mpich morpheus, gnu, openmpi diff --git a/conda/unsupported.txt b/conda/unsupported.txt index 5014d526e..cfce62076 100644 --- a/conda/unsupported.txt +++ b/conda/unsupported.txt @@ -15,7 +15,8 @@ compy, pgi, mvapich2 pm-cpu, nvidia, mpich pm-cpu, aocc, mpich pm-cpu, amdclang, mpich - +pm-gpu, gnu, mpich +pm-gpu, nvidia, mpich # compiles but tests unreliable (errors or hanging), # see https://github.com/MPAS-Dev/compass/issues/336 From ebd5953c13a094c8bc6853807c1a128616a6ad38 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Thu, 25 Jul 2024 06:52:24 -0700 Subject: [PATCH 4/8] Add `gpus-per-node` to job scripts and resources --- compass/job/__init__.py | 9 ++++++++- compass/job/job_script.template | 3 +++ compass/parallel.py | 5 +++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/compass/job/__init__.py b/compass/job/__init__.py index 21ec57f58..151d951cb 100644 --- a/compass/job/__init__.py +++ b/compass/job/__init__.py @@ -93,6 +93,12 @@ def write_job_script(config, machine, target_cores, min_cores, work_dir, job_name = 'compass' else: job_name = f'compass_{suite}' + + if config.has_option('parallel', 'gpus_per_node'): + gpus_per_node = config.get('parallel', 'gpus_per_node') + else: + gpus_per_node = '' + wall_time = config.get('job', 'wall_time') template = Template(resources.read_text( @@ -101,7 +107,8 @@ def write_job_script(config, machine, target_cores, min_cores, work_dir, text = template.render(job_name=job_name, account=account, nodes=f'{nodes}', wall_time=wall_time, qos=qos, partition=partition, constraint=constraint, - reservation=reservation, suite=suite, + reservation=reservation, + gpus_per_node=gpus_per_node, suite=suite, pre_run_commands=pre_run_commands, post_run_commands=post_run_commands) text = _clean_up_whitespace(text) diff --git a/compass/job/job_script.template b/compass/job/job_script.template index 37a384a28..fec4d599d 100644 --- a/compass/job/job_script.template +++ b/compass/job/job_script.template @@ -19,6 +19,9 @@ {% if constraint != '' -%} #SBATCH --constraint={{ constraint }} {%- endif %} +{% if gpus_per_node != '' -%} +#SBATCH --gpus-per-node={{ gpus_per_node }} +{%- endif %} source load_compass_env.sh {{ pre_run_commands }} diff --git a/compass/parallel.py b/compass/parallel.py index cf49c930a..08d0e7d80 100644 --- a/compass/parallel.py +++ b/compass/parallel.py @@ -66,6 +66,11 @@ def get_available_parallel_resources(config): cores_per_node=cores_per_node, mpi_allowed=mpi_allowed ) + + if config.has_option('parallel', 'gpus_per_node'): + available_resources['gpus_per_node'] = \ + config.getint('parallel', 'gpus_per_node') + return available_resources From b04ee856d37a8b263da468d44cb778ef390fc438 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Mon, 5 Aug 2024 06:24:54 -0700 Subject: [PATCH 5/8] Fix local mache install --- conda/bootstrap.py | 5 +++-- conda/configure_compass_env.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/conda/bootstrap.py b/conda/bootstrap.py index c113fb0e5..33228645d 100755 --- a/conda/bootstrap.py +++ b/conda/bootstrap.py @@ -1082,8 +1082,9 @@ def main(): # noqa: C901 print('Install local mache\n') commands = f'source {conda_base}/etc/profile.d/conda.sh && ' \ f'conda activate {conda_env_name} && ' \ - 'cd ../build_mache/mache && ' \ - 'python -m pip install --no-deps .' + f'cd ../build_mache/mache && ' \ + f'conda install -y --file spec-file.txt && ' \ + f'python -m pip install --no-deps .' check_call(commands, logger=logger) previous_conda_env = conda_env_name diff --git a/conda/configure_compass_env.py b/conda/configure_compass_env.py index 9db9c2c51..bb2eb51f8 100755 --- a/conda/configure_compass_env.py +++ b/conda/configure_compass_env.py @@ -114,6 +114,7 @@ def main(): f'git clone -b {args.mache_branch} ' \ f'git@github.com:{args.mache_fork}.git mache && ' \ f'cd mache && ' \ + f'conda install -y --file spec-file.txt && ' \ f'python -m pip install --no-deps .' check_call(commands, logger=logger) From 077b42bcf670113f5c399fced3bdb772e4c4a54f Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Mon, 5 Aug 2024 06:25:23 -0700 Subject: [PATCH 6/8] Add cuda to Albany and Trilinos builds for GPU machines --- conda/bootstrap.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/conda/bootstrap.py b/conda/bootstrap.py index 33228645d..f746194ac 100755 --- a/conda/bootstrap.py +++ b/conda/bootstrap.py @@ -476,6 +476,9 @@ def build_spack_env(config, update_spack, machine, compiler, mpi, # noqa: C901 scorpio = config.get('deploy', 'scorpio') parallelio = config.get('deploy', 'parallelio') + # for now, we'll assume Cuda is needed anytime GPUs are present + with_cuda = config.has_option('parallel', 'gpus_per_node') + if config.has_option('deploy', 'spack_mirror'): spack_mirror = config.get('deploy', 'spack_mirror') else: @@ -536,8 +539,12 @@ def build_spack_env(config, update_spack, machine, compiler, mpi, # noqa: C901 f'@{parallelio}+pnetcdf~timing"') if albany != 'None': - specs.append(f'"trilinos-for-albany@{albany}"') - specs.append(f'"albany@{albany}+mpas~py+unit_tests"') + if with_cuda: + cuda = '+cuda+uvm' + else: + cuda = '' + specs.append(f'"trilinos-for-albany@{albany}{cuda}"') + specs.append(f'"albany@{albany}+mpas~py+unit_tests{cuda}"') yaml_template = f'{spack_template_path}/{machine}_{compiler}_{mpi}.yaml' if not os.path.exists(yaml_template): From 27b9975075677458486d52da79bb7ffb3e4fe98c Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Tue, 13 Aug 2024 08:27:16 -0700 Subject: [PATCH 7/8] Add +sfad variant to albany spack build with cuda --- conda/bootstrap.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/conda/bootstrap.py b/conda/bootstrap.py index f746194ac..ff2f74209 100755 --- a/conda/bootstrap.py +++ b/conda/bootstrap.py @@ -540,11 +540,13 @@ def build_spack_env(config, update_spack, machine, compiler, mpi, # noqa: C901 if albany != 'None': if with_cuda: - cuda = '+cuda+uvm' + albany_cuda = '+cuda+uvm+sfad sfadsize=12' + trilinos_cuda = '+cuda+uvm' else: - cuda = '' - specs.append(f'"trilinos-for-albany@{albany}{cuda}"') - specs.append(f'"albany@{albany}+mpas~py+unit_tests{cuda}"') + albany_cuda = '' + trilinos_cuda = '' + specs.append(f'"trilinos-for-albany@{albany}{trilinos_cuda}"') + specs.append(f'"albany@{albany}+mpas~py+unit_tests{albany_cuda}"') yaml_template = f'{spack_template_path}/{machine}_{compiler}_{mpi}.yaml' if not os.path.exists(yaml_template): From 34aa147047eb483e6ce68c4216830b2b1b3ede3f Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Tue, 10 Sep 2024 02:08:26 -0700 Subject: [PATCH 8/8] Update conda and spack deps --- conda/compass_env/spec-file.template | 4 ++-- conda/default.cfg | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conda/compass_env/spec-file.template b/conda/compass_env/spec-file.template index 05142ac1f..ee6e0e7b1 100644 --- a/conda/compass_env/spec-file.template +++ b/conda/compass_env/spec-file.template @@ -49,8 +49,8 @@ cmake cxx-compiler fortran-compiler libnetcdf=4.9.2={{ mpi_prefix }}_* -libpnetcdf=1.12.3={{ mpi_prefix }}_* -parallelio=2.6.2={{ mpi_prefix }}_* +libpnetcdf=1.13.0={{ mpi_prefix }}_* +parallelio=2.6.3={{ mpi_prefix }}_* m4 make {{ mpi }} diff --git a/conda/default.cfg b/conda/default.cfg index 83e13ee11..6cae25589 100644 --- a/conda/default.cfg +++ b/conda/default.cfg @@ -29,9 +29,9 @@ lapack = 3.9.1 metis = 5.1.0 moab = 5.5.1 netcdf_c = 4.9.2 -netcdf_fortran = 4.6.0 +netcdf_fortran = 4.6.1 petsc = 3.19.1 -pnetcdf = 1.12.3 -scorpio = 1.6.3 -# parallelio = 2.6.2 +pnetcdf = 1.13.0 +scorpio = 1.6.5 +# parallelio = 2.6.3 parallelio = None