Merge pull request #207 from xylar/add-gpus-flag-to-job-scripts
Add gpus-per-node to job scripts and resources
altheaden authored Jul 22, 2024
2 parents e327086 + 5fb5ac1 commit 937c6eb
Showing 8 changed files with 46 additions and 13 deletions.
1 change: 1 addition & 0 deletions docs/developers_guide/api.md
@@ -231,6 +231,7 @@ seaice/api
write_job_script
get_slurm_options
clean_up_whitespace
```

### logging
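The new API entry corresponds to `_clean_up_whitespace` being renamed to the public `clean_up_whitespace` in `polaris/job/__init__.py` (next file). A minimal usage sketch, assuming only what the new docstring states (extra blank lines are removed); the input string here is hypothetical:

```python
from polaris.job import clean_up_whitespace

# Hypothetical rendered job script in which skipped conditional #SBATCH lines
# left extra blank lines behind; clean_up_whitespace() is documented to
# remove them.
rendered = '#!/bin/bash\n#SBATCH --nodes=1\n\n\n\nsource load_polaris_env.sh\n'
cleaned = clean_up_whitespace(rendered)
```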
31 changes: 26 additions & 5 deletions polaris/job/__init__.py
@@ -42,7 +42,7 @@ def write_job_script(config, machine, target_cores, min_cores, work_dir,
cores = np.sqrt(target_cores * min_cores)
nodes = int(np.ceil(cores / cores_per_node))

partition, qos, constraint, wall_time = get_slurm_options(
partition, qos, constraint, gpus_per_node, wall_time = get_slurm_options(
config, machine, nodes)

job_name = config.get('job', 'job_name')
@@ -58,8 +58,8 @@ def write_job_script(config, machine, target_cores, min_cores, work_dir,
text = template.render(job_name=job_name, account=account,
nodes=f'{nodes}', wall_time=wall_time, qos=qos,
partition=partition, constraint=constraint,
suite=suite)
text = _clean_up_whitespace(text)
gpus_per_node=gpus_per_node, suite=suite)
text = clean_up_whitespace(text)
if suite == '':
script_filename = 'job_script.sh'
else:
@@ -95,6 +95,9 @@ def get_slurm_options(config, machine, nodes):
constraint : str
Slurm constraint
gpus_per_node : str
The number of GPUs per node (if any)
wall_time : str
Slurm wall time
"""
@@ -131,12 +134,30 @@
else:
constraint = ''

if config.has_option('parallel', 'gpus_per_node'):
gpus_per_node = config.get('parallel', 'gpus_per_node')
else:
gpus_per_node = ''

wall_time = config.get('job', 'wall_time')

return partition, qos, constraint, wall_time
return partition, qos, constraint, gpus_per_node, wall_time


def _clean_up_whitespace(text):
def clean_up_whitespace(text):
"""
Clean up whitespace after jinja templating
Parameters
----------
text : str
Text to clean up
Returns
-------
text : str
Text with extra blank lines removed
"""
prev_line = None
lines = text.split('\n')
trimmed = list()
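Call sites now unpack a five-element tuple from `get_slurm_options` and pass `gpus_per_node` through to the Jinja template. A sketch of the resulting pattern, mirroring the diff above (`config`, `machine`, `nodes`, `template`, `job_name`, `account`, and `suite` are assumed to be set up as in `write_job_script`):

```python
from polaris.job import clean_up_whitespace, get_slurm_options

# gpus_per_node comes back as '' when the machine config has no
# [parallel] gpus_per_node option; the template then skips the
# --gpus-per-node line and clean_up_whitespace() tidies the result.
partition, qos, constraint, gpus_per_node, wall_time = get_slurm_options(
    config, machine, nodes)

text = template.render(job_name=job_name, account=account,
                       nodes=f'{nodes}', wall_time=wall_time, qos=qos,
                       partition=partition, constraint=constraint,
                       gpus_per_node=gpus_per_node, suite=suite)
text = clean_up_whitespace(text)
```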
3 changes: 3 additions & 0 deletions polaris/job/job_script.template
@@ -16,6 +16,9 @@
{% if constraint != '' -%}
#SBATCH --constraint={{ constraint }}
{%- endif %}
{% if gpus_per_node != '' -%}
#SBATCH --gpus-per-node={{ gpus_per_node }}
{%- endif %}

source load_polaris_env.sh
polaris serial {{suite}}
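The `--gpus-per-node` line is guarded the same way as `--constraint` above. A self-contained sketch (a fragment standing in for the real template, not the template itself) of how the conditional renders:

```python
from jinja2 import Template

# Fragment mirroring the template logic: the --gpus-per-node line only
# appears when gpus_per_node is non-empty.
fragment = Template(
    "#SBATCH --nodes={{ nodes }}\n"
    "{% if gpus_per_node != '' -%}\n"
    "#SBATCH --gpus-per-node={{ gpus_per_node }}\n"
    "{%- endif %}\n"
)

print(fragment.render(nodes=1, gpus_per_node='4'))
# #SBATCH --nodes=1
# #SBATCH --gpus-per-node=4

print(fragment.render(nodes=1, gpus_per_node=''))
# #SBATCH --nodes=1
# (plus a stray blank line, which clean_up_whitespace() later removes)
```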
2 changes: 1 addition & 1 deletion polaris/machines/frontier.cfg
@@ -42,7 +42,7 @@ use_e3sm_hdf5_netcdf = True
# some defaults
[parallel]

# cores per node on the machine
# allocatable cores per node on the machine
cores_per_node = 56

# threads per core (set to 1 because hyperthreading requires extra sbatch
4 changes: 2 additions & 2 deletions polaris/machines/pm-gpu.cfg
@@ -42,8 +42,8 @@ use_e3sm_hdf5_netcdf = True
# some defaults
[parallel]

# cores per node on the machine
cores_per_node = 128
# cores per node on the machine (without hyperthreading)
cores_per_node = 64

# threads per core (set to 1 because trying to hyperthread seems to be causing
# hanging on perlmutter)
5 changes: 5 additions & 0 deletions polaris/parallel.py
@@ -77,6 +77,11 @@ def get_available_parallel_resources(config):
cores_per_node=cores_per_node,
mpi_allowed=mpi_allowed
)

if config.has_option('parallel', 'gpus_per_node'):
available_resources['gpus_per_node'] = \
config.getint('parallel', 'gpus_per_node')

return available_resources


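The resources dictionary only gains a `gpus_per_node` entry when the machine config defines one, and `parallel.py` reads it with `getint` (whereas `get_slurm_options` keeps it as a string via `get`). A self-contained stand-in using the stdlib `configparser` rather than the Polaris config class, with illustrative values:

```python
import configparser

def available_resources_sketch(config):
    # Mirrors the shape of get_available_parallel_resources() after this
    # change: 'gpus_per_node' appears only when [parallel] defines it.
    resources = {'cores_per_node': config.getint('parallel', 'cores_per_node')}
    if config.has_option('parallel', 'gpus_per_node'):
        resources['gpus_per_node'] = config.getint('parallel', 'gpus_per_node')
    return resources

config = configparser.ConfigParser()
config.read_string('[parallel]\ncores_per_node = 64\ngpus_per_node = 4\n')

resources = available_resources_sketch(config)
gpus = resources.get('gpus_per_node', 0)  # fall back to 0 on CPU-only machines
print(resources, gpus)
# {'cores_per_node': 64, 'gpus_per_node': 4} 4
```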
3 changes: 3 additions & 0 deletions utils/omega/ctest/job_script.template
@@ -16,6 +16,9 @@
{% if constraint != '' -%}
#SBATCH --constraint={{ constraint }}
{%- endif %}
{% if gpus_per_node != '' -%}
#SBATCH --gpus-per-node={{ gpus_per_node }}
{%- endif %}

cd {{ build_dir }}
./omega_ctest.sh
10 changes: 5 additions & 5 deletions utils/omega/ctest/omega_ctest.py
@@ -8,7 +8,7 @@

from polaris.config import PolarisConfigParser
from polaris.io import download, update_permissions
from polaris.job import _clean_up_whitespace, get_slurm_options
from polaris.job import clean_up_whitespace, get_slurm_options


def make_build_script(machine, compiler, branch, build_only, mesh_filename,
@@ -61,7 +61,7 @@ def make_build_script(machine, compiler, branch, build_only, mesh_filename,
clean=clean,
cmake_flags=cmake_flags)

script = _clean_up_whitespace(script)
script = clean_up_whitespace(script)

build_omega_dir = os.path.abspath('build_omega')
os.makedirs(build_omega_dir, exist_ok=True)
@@ -120,7 +120,7 @@ def write_job_script(config, machine, compiler, submit):

nodes = 1

partition, qos, constraint, _ = get_slurm_options(
partition, qos, constraint, gpus_per_node, _ = get_slurm_options(
config, machine, nodes)

wall_time = '0:15:00'
@@ -156,8 +156,8 @@ def write_job_script(config, machine, compiler, submit):
script = template.render(job_name=job_name, account=account,
nodes=f'{nodes}', wall_time=wall_time, qos=qos,
partition=partition, constraint=constraint,
build_dir=build_dir)
script = _clean_up_whitespace(script)
gpus_per_node=gpus_per_node, build_dir=build_dir)
script = clean_up_whitespace(script)

build_omega_dir = os.path.abspath('build_omega')
script_filename = f'job_build_and_ctest_omega_{machine}_{compiler}.sh'
