diff --git a/cmake/configure_ursa.intel.cmake b/cmake/configure_ursa.intel.cmake
new file mode 100644
index 0000000000..92b8ecb75e
--- /dev/null
+++ b/cmake/configure_ursa.intel.cmake
@@ -0,0 +1 @@
+set(PARALLEL_NETCDF ON CACHE BOOL "Enable parallel NetCDF" FORCE)
diff --git a/modulefiles/ufs_common.lua b/modulefiles/ufs_common.lua
index 062fa38449..38eea18a93 100644
--- a/modulefiles/ufs_common.lua
+++ b/modulefiles/ufs_common.lua
@@ -4,23 +4,23 @@ help([[Load UFS Model common libraries]])
 
 local ufs_modules = {
   {["jasper"] = "2.0.32"},
-  {["zlib"] = "1.2.13"},
+  {["zlib-ng"] = "2.1.6"},
   {["libpng"] = "1.6.37"},
-  {["hdf5"] = "1.14.0"},
+  {["hdf5"] = "1.14.3"},
   {["netcdf-c"] = "4.9.2"},
   {["netcdf-fortran"] = "4.6.1"},
-  {["parallelio"] = "2.5.10"},
-  {["esmf"] = "8.6.0"},
-  {["fms"] = "2024.01"},
+  {["parallelio"] = "2.6.2"},
+  {["esmf"] = "8.6.1"},
+  {["fms"] = "2024.02"},
   {["bacio"] = "2.4.1"},
-  {["crtm"] = "2.4.0"},
+  {["crtm"] = "2.4.0.1"},
   {["g2"] = "3.5.1"},
   {["g2tmpl"] = "1.13.0"},
-  {["ip"] = "4.3.0"},
+  {["ip"] = "5.0.0"},
   {["sp"] = "2.5.0"},
   {["w3emc"] = "2.10.0"},
-  {["gftl-shared"] = "1.6.1"},
-  {["mapl"] = "2.40.3-esmf-8.6.0"},
+  {["gftl-shared"] = "1.9.0"},
+  {["mapl"] = "2.46.3-esmf-8.6.1"},
   {["scotch"] = "7.0.4"},
 }
 
diff --git a/modulefiles/ufs_ursa.intel.lua b/modulefiles/ufs_ursa.intel.lua
new file mode 100644
index 0000000000..5d8cd7a076
--- /dev/null
+++ b/modulefiles/ufs_ursa.intel.lua
@@ -0,0 +1,31 @@
+help([[
+loads UFS Model prerequisites for Ursa/Intel
+]])
+
+prepend_path("MODULEPATH", "/contrib/spack-stack/envs/1.8.0/ue-oneapi-ifort-2024.2.1/install/modulefiles/Core")
+prepend_path("MODULEPATH", "/contrib/spack-stack/envs/1.8.0/ue-oneapi-ifort-2024.2.1/install/modulefiles/intel-oneapi-mpi/2021.13-eaajhcw/oneapi/2024.2.1")
+
+stack_intel_ver=os.getenv("stack_intel_ver") or "2024.2.1"
+load(pathJoin("stack-oneapi", stack_intel_ver))
+
+stack_impi_ver=os.getenv("stack_impi_ver") or "2021.13"
+load(pathJoin("stack-intel-oneapi-mpi", stack_impi_ver))
+
+cmake_ver=os.getenv("cmake_ver") or "3.27.9"
+load(pathJoin("cmake", cmake_ver))
+
+load("ufs_common")
+
+nccmp_ver=os.getenv("nccmp_ver") or "1.9.1.0"
+load(pathJoin("nccmp", nccmp_ver))
+
+setenv("CC", "mpiicc")
+setenv("CXX", "mpiicpc")
+setenv("FC", "mpiifort")
+setenv("I_MPI_CC", "icx")
+setenv("I_MPI_CXX", "icpx")
+setenv("I_MPI_F90", "ifort")
+
+setenv("CMAKE_Platform", "ursa.intel")
+
+whatis("Description: UFS build environment")
diff --git a/tests/default_vars.sh b/tests/default_vars.sh
index fded44ea40..d5fcacc3ba 100644
--- a/tests/default_vars.sh
+++ b/tests/default_vars.sh
@@ -205,6 +205,28 @@ elif [[ ${MACHINE_ID} = hera ]]; then
   export WPG_cpl_atmw_gdas=24
   export WAV_tasks_atmw_gdas=248
 
+elif [[ ${MACHINE_ID} = ursa ]]; then
+
+  export TPN=192
+
+  export INPES_dflt=3
+  export JNPES_dflt=8
+  export INPES_thrd=3
+  export JNPES_thrd=4
+  export INPES_c384=6
+  export JNPES_c384=8
+  export THRD_c384=2
+  export INPES_c768=8
+  export JNPES_c768=16
+  export THRD_c768=4
+
+  export THRD_cpl_atmw_gdas=2
+  export INPES_cpl_atmw_gdas=6
+  export JNPES_cpl_atmw_gdas=8
+  export WPG_cpl_atmw_gdas=24
+  export WAV_tasks_atmw_gdas=248
+
+
 elif [[ ${MACHINE_ID} = linux ]]; then
 
   export TPN=40
diff --git a/tests/detect_machine.sh b/tests/detect_machine.sh
index 99419f3555..b0ce8cecbc 100755
--- a/tests/detect_machine.sh
+++ b/tests/detect_machine.sh
@@ -29,6 +29,8 @@ case $(hostname -f) in
   hfe0[1-9]) MACHINE_ID=hera ;; ### hera01-09
   hfe1[0-2]) MACHINE_ID=hera ;; ### hera10-12
   hecflow01) MACHINE_ID=hera ;; ### heraecflow01
+
+  nfe91) MACHINE_ID=ursa ;; ### ursa
 
   s4-submit.ssec.wisc.edu) MACHINE_ID=s4 ;; ### s4
 
@@ -87,6 +89,9 @@ elif [[ -d /mnt/lfs1 ]]; then
 elif [[ -d /scratch1 ]]; then
   # We are on NOAA Hera
   MACHINE_ID=hera
+elif [[ -d /collab1 ]]; then
+  # We are on NOAA Ursa
+  MACHINE_ID=ursa
 elif [[ -d /work ]]; then
   # We are on MSU Orion or Hercules
   mount=$(findmnt -n -o SOURCE /home)
diff --git a/tests/fv3_conf/compile_slurm.IN_ursa b/tests/fv3_conf/compile_slurm.IN_ursa
new file mode 100644
index 0000000000..733f694e83
--- /dev/null
+++ b/tests/fv3_conf/compile_slurm.IN_ursa
@@ -0,0 +1,23 @@
+#!/bin/bash
+#SBATCH -e err
+#SBATCH -o out
+#SBATCH --account=@[ACCNR]
+#SBATCH --qos=@[QUEUE]
+#SBATCH --partition=to39-compute
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=8
+#SBATCH --time=30
+#SBATCH --job-name="@[JBNME]"
+
+set -eux
+date_s_start=$(date +%s)
+date_start=$(date)
+echo -n "${date_s_start}," > job_timestamp.txt
+echo "Compile started: ${date_start}"
+
+"@[PATHRT]/compile.sh" "@[MACHINE_ID]" "@[MAKE_OPT]" "@[COMPILE_ID]" "@[RT_COMPILER]"
+
+date_end=$(date)
+echo "Compile ended: ${date_end}"
+date_s_end=$(date +%s)
+echo -n "${date_s_end}," >> job_timestamp.txt
diff --git a/tests/fv3_conf/fv3_slurm.IN_ursa b/tests/fv3_conf/fv3_slurm.IN_ursa
new file mode 100644
index 0000000000..dd19c3868c
--- /dev/null
+++ b/tests/fv3_conf/fv3_slurm.IN_ursa
@@ -0,0 +1,54 @@
+#!/bin/bash
+#SBATCH -e err
+#SBATCH -o out
+#SBATCH --account=@[ACCNR]
+#SBATCH --qos=@[QUEUE]
+### #SBATCH --ntasks=@[TASKS]
+#SBATCH --nodes=@[NODES]
+#SBATCH --partition=to39-compute
+#SBATCH --ntasks-per-node=@[TPN]
+#SBATCH --time=@[WLCLK]
+#SBATCH --job-name="@[JBNME]"
+### #SBATCH --exclusive
+
+set -eux
+date_s_start=$(date +%s)
+echo -n "${date_s_start}," > job_timestamp.txt
+
+set +x
+export MACHINE_ID=ursa
+source ./module-setup.sh
+module use "${PWD}/modulefiles"
+module load modules.fv3
+module list
+set -x
+
+date_start=$(date)
+echo "Model started: ${date_start}"
+
+export MPI_TYPE_DEPTH=20
+export OMP_STACKSIZE=512M
+# shellcheck disable=SC2125
+export OMP_NUM_THREADS=@[THRD]
+export ESMF_RUNTIME_COMPLIANCECHECK=OFF:depth=4
+export ESMF_RUNTIME_PROFILE=ON
+export ESMF_RUNTIME_PROFILE_OUTPUT="SUMMARY"
+export PSM_RANKS_PER_CONTEXT=4
+export PSM_SHAREDCONTEXTS=1
+
+# Avoid job errors because of filesystem synchronization delays
+sync && sleep 1
+
+# This "if" block is part of the rt.sh self-tests in error-test.conf. It emulates the model failing to run.
+if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then
+    echo "The job should abort now, with exit status 1." 1>&2
+    echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
+    false
+fi
+
+srun --label -n @[TASKS] ./fv3.exe
+
+date_end=$(date)
+echo "Model ended: ${date_end}"
+date_s_end=$(date +%s)
+echo -n "${date_s_end}," >> job_timestamp.txt
diff --git a/tests/module-setup.sh b/tests/module-setup.sh
index d39bbeb95c..e4568d799a 100755
--- a/tests/module-setup.sh
+++ b/tests/module-setup.sh
@@ -15,6 +15,13 @@ elif [[ ${MACHINE_ID} = hera ]] ; then
     fi
     module purge
 
+elif [[ ${MACHINE_ID} = ursa ]] ; then
+    # We are on NOAA Ursa
+    if ( ! eval module help > /dev/null 2>&1 ) ; then
+        source /apps/lmod/lmod/init/bash
+    fi
+    module purge
+
 elif [[ ${MACHINE_ID} = orion ]] ; then
     # We are on Orion
     if ( ! eval module help > /dev/null 2>&1 ) ; then
diff --git a/tests/rt.sh b/tests/rt.sh
index 65c97132b9..a53e52dd47 100755
--- a/tests/rt.sh
+++ b/tests/rt.sh
@@ -792,6 +792,30 @@ case ${MACHINE_ID} in
     PTMP="${dprefix}/stmp2"
 
     SCHEDULER=slurm
+    ;;
+  ursa)
+    echo "rt.sh: Setting up ursa..."
+    if [[ "${ROCOTO:-false}" == true ]] ; then
+      module load rocoto
+      ROCOTO_SCHEDULER=slurm
+    fi
+
+    # ecflow not yet available on ursa
+    #if [[ "${ECFLOW:-false}" == true ]] ; then
+    # module load ecflow/5.11.4
+    #fi
+
+    QUEUE="batch"
+    COMPILE_QUEUE="batch"
+
+    PARTITION=
+    dprefix="/collab1/data/${USER}"
+    DISKNM="/collab1/data/Ratko.Vasic/UFS-WM_RT"
+    STMP="${STMP:-${dprefix}/RT_BASELINE}"
+    PTMP="${PTMP:-${dprefix}/RT_RUNDIRS}"
+
+    SCHEDULER=slurm
+
     ;;
   orion)
     echo "rt.sh: Setting up orion..."
diff --git a/tests/rt_utils.sh b/tests/rt_utils.sh
index fa3dcfeebe..40118a6c40 100755
--- a/tests/rt_utils.sh
+++ b/tests/rt_utils.sh
@@ -419,6 +419,9 @@ rocoto_create_compile_task() {
   if [[ ${MACHINE_ID} == hera ]]; then
     BUILD_WALLTIME="01:00:00"
   fi
+  if [[ ${MACHINE_ID} == ursa ]]; then
+    BUILD_WALLTIME="01:00:00"
+  fi
   if [[ ${MACHINE_ID} == orion ]]; then
     BUILD_WALLTIME="01:00:00"
   fi
diff --git a/tests/run_test.sh b/tests/run_test.sh
index cd46804607..7325be8e0d 100755
--- a/tests/run_test.sh
+++ b/tests/run_test.sh
@@ -483,7 +483,8 @@ if [[ ${skip_check_results} == false ]]; then
       else
 
         if [[ ${i##*.} == nc* ]] ; then
-          if [[ " orion hercules hera wcoss2 acorn derecho gaeac5 gaeac6 jet s4 noaacloud " =~ ${MACHINE_ID} ]]; then
+          # Whole-word membership test: a bare "=~ ${MACHINE_ID}" treats the ID as a regex and also accepts substrings or an empty ID.
+          if [[ " orion hercules hera ursa wcoss2 acorn derecho gaeac5 gaeac6 jet s4 noaacloud " == *" ${MACHINE_ID} "* ]]; then
             printf "USING NCCMP.." >> "${RT_LOG}"
             printf "USING NCCMP.."
             if [[ ${CMP_DATAONLY} == false ]]; then