diff --git a/docs/developers_guide/api.md b/docs/developers_guide/api.md
index ea8412971..cc6722d6a 100644
--- a/docs/developers_guide/api.md
+++ b/docs/developers_guide/api.md
@@ -231,6 +231,7 @@ seaice/api
 
    write_job_script
    get_slurm_options
+   clean_up_whitespace
 ```
 
 ### logging
diff --git a/polaris/job/__init__.py b/polaris/job/__init__.py
index b1ae9710d..33eab5ec9 100644
--- a/polaris/job/__init__.py
+++ b/polaris/job/__init__.py
@@ -42,7 +42,7 @@ def write_job_script(config, machine, target_cores, min_cores, work_dir,
         cores = np.sqrt(target_cores * min_cores)
     nodes = int(np.ceil(cores / cores_per_node))
 
-    partition, qos, constraint, wall_time = get_slurm_options(
+    partition, qos, constraint, gpus_per_node, wall_time = get_slurm_options(
         config, machine, nodes)
 
     job_name = config.get('job', 'job_name')
@@ -58,8 +58,8 @@ def write_job_script(config, machine, target_cores, min_cores, work_dir,
     text = template.render(job_name=job_name, account=account,
                            nodes=f'{nodes}', wall_time=wall_time, qos=qos,
                            partition=partition, constraint=constraint,
-                           suite=suite)
-    text = _clean_up_whitespace(text)
+                           gpus_per_node=gpus_per_node, suite=suite)
+    text = clean_up_whitespace(text)
     if suite == '':
         script_filename = 'job_script.sh'
     else:
@@ -95,6 +95,9 @@ def get_slurm_options(config, machine, nodes):
     constraint : str
         Slurm constraint
 
+    gpus_per_node : str
+        The number of GPUs per node (if any)
+
     wall_time : str
         Slurm wall time
     """
@@ -131,12 +134,30 @@ def get_slurm_options(config, machine, nodes):
     else:
         constraint = ''
 
+    if config.has_option('parallel', 'gpus_per_node'):
+        gpus_per_node = config.get('parallel', 'gpus_per_node')
+    else:
+        gpus_per_node = ''
+
     wall_time = config.get('job', 'wall_time')
 
-    return partition, qos, constraint, wall_time
+    return partition, qos, constraint, gpus_per_node, wall_time
 
 
-def _clean_up_whitespace(text):
+def clean_up_whitespace(text):
+    """
+    Clean up whitespace after jinja templating
+
+    Parameters
+    ----------
+    text : str
+        Text to clean up
+
+    Returns
+    -------
+    text : str
+        Text with extra blank lines removed
+    """
     prev_line = None
     lines = text.split('\n')
     trimmed = list()
diff --git a/polaris/job/job_script.template b/polaris/job/job_script.template
index 06884170a..fd554622d 100644
--- a/polaris/job/job_script.template
+++ b/polaris/job/job_script.template
@@ -16,6 +16,9 @@
 {% if constraint != '' -%}
 #SBATCH --constraint={{ constraint }}
 {%- endif %}
+{% if gpus_per_node != '' -%}
+#SBATCH --gpus-per-node={{ gpus_per_node }}
+{%- endif %}
 
 source load_polaris_env.sh
 polaris serial {{suite}}
diff --git a/polaris/machines/frontier.cfg b/polaris/machines/frontier.cfg
index 0bf3d54a0..46992c22b 100644
--- a/polaris/machines/frontier.cfg
+++ b/polaris/machines/frontier.cfg
@@ -42,7 +42,7 @@ use_e3sm_hdf5_netcdf = True
 # some defaults
 [parallel]
 
-# cores per node on the machine
+# allocatable cores per node on the machine
 cores_per_node = 56
 
 # threads per core (set to 1 because hyperthreading requires extra sbatch
diff --git a/polaris/machines/pm-gpu.cfg b/polaris/machines/pm-gpu.cfg
index 6cb1a2c91..c228aee52 100644
--- a/polaris/machines/pm-gpu.cfg
+++ b/polaris/machines/pm-gpu.cfg
@@ -42,8 +42,8 @@ use_e3sm_hdf5_netcdf = True
 # some defaults
 [parallel]
 
-# cores per node on the machine
-cores_per_node = 128
+# cores per node on the machine (without hyperthreading)
+cores_per_node = 64
 
 # threads per core (set to 1 because trying to hyperthread seems to be causing
 # hanging on perlmutter)
diff --git a/polaris/parallel.py b/polaris/parallel.py
index 29f46c928..0f8861e39 100644
--- a/polaris/parallel.py
+++ b/polaris/parallel.py
@@ -77,6 +77,11 @@ def get_available_parallel_resources(config):
         cores_per_node=cores_per_node,
         mpi_allowed=mpi_allowed
     )
+
+    if config.has_option('parallel', 'gpus_per_node'):
+        available_resources['gpus_per_node'] = \
+            config.getint('parallel', 'gpus_per_node')
+
     return available_resources
diff --git a/utils/omega/ctest/job_script.template b/utils/omega/ctest/job_script.template
index 785b1d57f..e25d57ec4 100644
--- a/utils/omega/ctest/job_script.template
+++ b/utils/omega/ctest/job_script.template
@@ -16,6 +16,9 @@
 {% if constraint != '' -%}
 #SBATCH --constraint={{ constraint }}
 {%- endif %}
+{% if gpus_per_node != '' -%}
+#SBATCH --gpus-per-node={{ gpus_per_node }}
+{%- endif %}
 
 cd {{ build_dir }}
 ./omega_ctest.sh
diff --git a/utils/omega/ctest/omega_ctest.py b/utils/omega/ctest/omega_ctest.py
index ac2011f37..f94a6b0b8 100755
--- a/utils/omega/ctest/omega_ctest.py
+++ b/utils/omega/ctest/omega_ctest.py
@@ -8,7 +8,7 @@
 from polaris.config import PolarisConfigParser
 from polaris.io import download, update_permissions
 
-from polaris.job import _clean_up_whitespace, get_slurm_options
+from polaris.job import clean_up_whitespace, get_slurm_options
 
 
 def make_build_script(machine, compiler, branch, build_only, mesh_filename,
@@ -61,7 +61,7 @@ def make_build_script(machine, compiler, branch, build_only, mesh_filename,
                              clean=clean,
                              cmake_flags=cmake_flags)
 
-    script = _clean_up_whitespace(script)
+    script = clean_up_whitespace(script)
 
     build_omega_dir = os.path.abspath('build_omega')
     os.makedirs(build_omega_dir, exist_ok=True)
@@ -120,7 +120,7 @@ def write_job_script(config, machine, compiler, submit):
 
     nodes = 1
 
-    partition, qos, constraint, _ = get_slurm_options(
+    partition, qos, constraint, gpus_per_node, _ = get_slurm_options(
         config, machine, nodes)
 
     wall_time = '0:15:00'
@@ -156,8 +156,8 @@ def write_job_script(config, machine, compiler, submit):
     script = template.render(job_name=job_name, account=account,
                              nodes=f'{nodes}', wall_time=wall_time, qos=qos,
                              partition=partition, constraint=constraint,
-                             build_dir=build_dir)
-    script = _clean_up_whitespace(script)
+                             gpus_per_node=gpus_per_node, build_dir=build_dir)
+    script = clean_up_whitespace(script)
 
     build_omega_dir = os.path.abspath('build_omega')
     script_filename = f'job_build_and_ctest_omega_{machine}_{compiler}.sh'
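
For reviewers, here is a minimal sketch of how the new `gpus_per_node` option travels from a machine config file into the rendered job script. It follows the logic added to `get_slurm_options()` and `job_script.template` above, but the inline template string and the plain `ConfigParser` are illustrative stand-ins for the real `job_script.template` file and `PolarisConfigParser`:

```python
# Illustrative sketch only (not part of the polaris API): traces the new
# gpus_per_node option from a config file to the rendered job script.
from configparser import ConfigParser

from jinja2 import Template

config = ConfigParser()
config.read_string("""
[parallel]
cores_per_node = 64
gpus_per_node = 4
""")

# Mirrors the logic added to get_slurm_options(): fall back to an empty
# string when the option is absent so the template omits the sbatch line.
if config.has_option('parallel', 'gpus_per_node'):
    gpus_per_node = config.get('parallel', 'gpus_per_node')
else:
    gpus_per_node = ''

# Inline stand-in for the conditional added to job_script.template
template = Template(
    "{% if gpus_per_node != '' -%}\n"
    "#SBATCH --gpus-per-node={{ gpus_per_node }}\n"
    "{%- endif %}"
)
print(template.render(gpus_per_node=gpus_per_node))
# prints: #SBATCH --gpus-per-node=4
# (renders as an empty string when gpus_per_node == '')
```

Note that the two readers of the option differ deliberately: `get_slurm_options()` keeps the value as a string, with `''` meaning the sbatch line is omitted, while `get_available_parallel_resources()` stores it with `config.getint()` because downstream resource accounting needs a number.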
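The diff truncates the body of `clean_up_whitespace()` after its first three lines, so the following is only a guess at the documented behavior ("extra blank lines removed"), not the actual implementation from `polaris/job/__init__.py`:

```python
def clean_up_whitespace(text):
    """
    Hypothetical sketch of the documented behavior: collapse the runs of
    blank lines that the Jinja conditionals leave behind.  The real
    function in polaris/job/__init__.py may differ in detail.
    """
    prev_line = None
    trimmed = list()
    for line in text.split('\n'):
        # drop a blank line that directly follows another blank line
        if line.strip() == '' and prev_line is not None \
                and prev_line.strip() == '':
            continue
        trimmed.append(line)
        prev_line = line
    return '\n'.join(trimmed)


# e.g. the gap left when a template conditional renders nothing:
print(clean_up_whitespace('#SBATCH --nodes=1\n\n\n\nsource load_polaris_env.sh'))
# #SBATCH --nodes=1
#
# source load_polaris_env.sh
```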