Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add aurora machine to e3sm #6117

Merged
merged 7 commits into from
Dec 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions cime_config/machines/cmake_macros/oneapi-ifx.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ string(APPEND CMAKE_CXX_FLAGS_RELEASE " -O2")
string(APPEND CMAKE_Fortran_FLAGS_DEBUG " -O0 -g -check uninit -check bounds -check pointers -fpe0 -check noarg_temp_created")
string(APPEND CMAKE_C_FLAGS_DEBUG " -O0 -g")
string(APPEND CMAKE_CXX_FLAGS_DEBUG " -O0 -g")
string(APPEND CMAKE_C_FLAGS " -traceback -fp-model precise -std=gnu99")
string(APPEND CMAKE_CXX_FLAGS " -traceback -fp-model precise")
string(APPEND CMAKE_C_FLAGS " -fp-model precise -std=gnu99")
string(APPEND CMAKE_CXX_FLAGS " -fp-model precise")
string(APPEND CMAKE_Fortran_FLAGS " -traceback -convert big_endian -assume byterecl -assume realloc_lhs -fp-model precise")
string(APPEND CPPDEFS " -DFORTRANUNDERSCORE -DNO_R16 -DCPRINTEL -DHAVE_SLASHPROC -DHIDE_MPI")
string(APPEND CMAKE_Fortran_FORMAT_FIXED_FLAG " -fixed -132")
Expand All @@ -23,3 +23,4 @@ set(MPICXX "mpicxx")
set(SCC "icx")
set(SCXX "icpx")
set(SFC "ifx")
set(E3SM_LINK_WITH_FORTRAN "TRUE")
5 changes: 5 additions & 0 deletions cime_config/machines/cmake_macros/oneapi-ifx_aurora.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@

# Aurora-specific additions for the oneapi-ifx compiler macros.
# Append the MKL libraries named below (mkl_intel_lp64, mkl_sequential, mkl_core)
# to the executable link line.
string(APPEND CMAKE_EXE_LINKER_FLAGS " -lmkl_intel_lp64 -lmkl_sequential -lmkl_core")
# Only when compile_threaded is true: also append the OpenMP flags to the link line.
# NOTE(review): -fopenmp-targets=spir64 looks like an offload target flag; confirm it
# is intended for this (non-GPU) compiler configuration.
if (compile_threaded)
string(APPEND CMAKE_EXE_LINKER_FLAGS " -fiopenmp -fopenmp-targets=spir64")
endif()
6 changes: 2 additions & 4 deletions cime_config/machines/cmake_macros/oneapi-ifxgpu.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ string(APPEND CMAKE_CXX_FLAGS_RELEASE " -O2")
string(APPEND CMAKE_Fortran_FLAGS_DEBUG " -O0 -g -check uninit -check bounds -check pointers -fpe0 -check noarg_temp_created")
string(APPEND CMAKE_C_FLAGS_DEBUG " -O0 -g")
string(APPEND CMAKE_CXX_FLAGS_DEBUG " -O0 -g")
string(APPEND CMAKE_C_FLAGS " -traceback -fp-model precise -std=gnu99")
string(APPEND CMAKE_CXX_FLAGS " -traceback -fp-model precise")
string(APPEND CMAKE_C_FLAGS " -fp-model precise -std=gnu99")
string(APPEND CMAKE_CXX_FLAGS " -fp-model precise")
string(APPEND CMAKE_Fortran_FLAGS " -traceback -convert big_endian -assume byterecl -assume realloc_lhs -fp-model precise")
string(APPEND CPPDEFS " -DFORTRANUNDERSCORE -DNO_R16 -DCPRINTEL -DHAVE_SLASHPROC -DHIDE_MPI")
string(APPEND CMAKE_Fortran_FORMAT_FIXED_FLAG " -fixed -132")
Expand All @@ -23,6 +23,4 @@ set(MPICXX "mpicxx")
set(SCC "icx")
set(SCXX "icpx")
set(SFC "ifx")
string(APPEND CMAKE_EXE_LINKER_FLAGS " -fiopenmp -fopenmp-targets=spir64")
set(USE_SYCL "TRUE")
string(APPEND SYCL_FLAGS " -\-intel -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend \"-device 12.60.7\"")
7 changes: 7 additions & 0 deletions cime_config/machines/cmake_macros/oneapi-ifxgpu_aurora.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

# Aurora-specific additions for the oneapi-ifxgpu compiler macros.
# Append the MKL libraries named below (mkl_intel_lp64, mkl_sequential, mkl_core)
# to the executable link line.
string(APPEND CMAKE_EXE_LINKER_FLAGS " -lmkl_intel_lp64 -lmkl_sequential -lmkl_core")
# Only when compile_threaded is true: also append the OpenMP flags to the link line.
if (compile_threaded)
string(APPEND CMAKE_EXE_LINKER_FLAGS " -fiopenmp -fopenmp-targets=spir64")
endif()
# Extra options appended to KOKKOS_OPTIONS: C++17, Serial and SYCL enabled,
# Intel PVC architecture selected, explicit instantiation turned off.
string(APPEND KOKKOS_OPTIONS " -DCMAKE_CXX_STANDARD=17 -DKokkos_ENABLE_SERIAL=On -DKokkos_ARCH_INTEL_PVC=On -DKokkos_ENABLE_SYCL=On -DKokkos_ENABLE_EXPLICIT_INSTANTIATION=Off")
# Flags appended to SYCL_FLAGS; "-device 12.60.7" is passed through to the SYCL
# target backend via -Xsycl-target-backend.
# NOTE(review): "\-" is a CMake identity escape and evaluates to "-", so the flag is "--intel".
string(APPEND SYCL_FLAGS " -\-intel -fsycl -fsycl-targets=spir64_gen -mlong-double-64 -Xsycl-target-backend \"-device 12.60.7\"")
9 changes: 9 additions & 0 deletions cime_config/machines/config_batch.xml
Original file line number Diff line number Diff line change
Expand Up @@ -550,6 +550,15 @@
</queues>
</batch_system>

<!-- Batch settings for machine "aurora" (type pbspro).
     Jobs are submitted with the command given in <batch_submit>.
     Three queues are declared, each with a 59-minute walltime limit;
     EarlyAppAccess is the default queue and has the smaller jobmax (2560). -->
<batch_system MACH="aurora" type="pbspro">
<batch_submit>/lus/gecko/projects/CSC249ADSE15_CNDA/tools/qsub/throttle</batch_submit>
<queues>
<queue walltimemax="00:59:00" jobmin="1" jobmax="2560" default="true">EarlyAppAccess</queue>
<queue walltimemax="00:59:00" jobmin="1" jobmax="10624">workq-route</queue>
<queue walltimemax="00:59:00" jobmin="1" jobmax="10624">workq</queue>
</queues>
</batch_system>

<batch_system MACH="cascade" type="slurm">
<directives>
<directive>--output=slurm.out</directive>
Expand Down
109 changes: 109 additions & 0 deletions cime_config/machines/config_machines.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3073,6 +3073,115 @@
</resource_limits>
</machine>

<machine MACH="aurora">
<DESC>ALCF Aurora, 10624 nodes, 2x52c SPR, 6x2s PVC, 2x512GB DDR5, 2x64GB CPU-HBM, 6x128GB GPU-HBM, Slingshot 11, PBSPro</DESC>
<NODENAME_REGEX>aurora-uan-.*</NODENAME_REGEX>
<OS>LINUX</OS>
<COMPILERS>oneapi-ifx,oneapi-ifxgpu,gnu</COMPILERS>
<MPILIBS>mpich</MPILIBS>
<CHARGE_ACCOUNT>CSC249ADSE15_CNDA</CHARGE_ACCOUNT>
<SAVE_TIMING_DIR>/lus/gecko/projects/CSC249ADSE15_CNDA/performance_archive</SAVE_TIMING_DIR>
<SAVE_TIMING_DIR_PROJECTS>.*</SAVE_TIMING_DIR_PROJECTS>
<CIME_OUTPUT_ROOT>/lus/gecko/projects/CSC249ADSE15_CNDA/$USER/scratch</CIME_OUTPUT_ROOT>
<DIN_LOC_ROOT>/lus/gecko/projects/CSC249ADSE15_CNDA/inputdata</DIN_LOC_ROOT>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What project is "CSC249ADSE15_CNDA"?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the ECP Aurora early-access project; it is valid through Apr. 2024.

<DIN_LOC_ROOT_CLMFORC>/lus/gecko/projects/CSC249ADSE15_CNDA/inputdata/atm/datm7</DIN_LOC_ROOT_CLMFORC>
<DOUT_S_ROOT>$CIME_OUTPUT_ROOT/archive/$CASE</DOUT_S_ROOT>
<BASELINE_ROOT>/lus/gecko/projects/CSC249ADSE15_CNDA/baselines/$COMPILER</BASELINE_ROOT>
<CCSM_CPRNC>/lus/gecko/projects/CSC249ADSE15_CNDA/tools/cprnc/cprnc</CCSM_CPRNC>
<GMAKE_J>16</GMAKE_J>
<TESTS>e3sm_developer</TESTS>
<NTEST_PARALLEL_JOBS>4</NTEST_PARALLEL_JOBS>
<BATCH_SYSTEM>pbspro</BATCH_SYSTEM>
<SUPPORTED_BY>e3sm</SUPPORTED_BY>
<MAX_TASKS_PER_NODE>208</MAX_TASKS_PER_NODE>
<MAX_TASKS_PER_NODE compiler="oneapi-ifxgpu">104</MAX_TASKS_PER_NODE>
<MAX_MPITASKS_PER_NODE>104</MAX_MPITASKS_PER_NODE>
<MAX_MPITASKS_PER_NODE compiler="oneapi-ifxgpu">12</MAX_MPITASKS_PER_NODE>
<PROJECT_REQUIRED>FALSE</PROJECT_REQUIRED>
<mpirun mpilib="default">
<executable>mpiexec</executable>
<!--executable>numactl -m 2-3 mpiexec</executable--><!--for HBM runs-->
<arguments>
<arg name="total_num_tasks">-np {{ total_tasks }} --label</arg>
<arg name="ranks_per_node">-ppn {{ tasks_per_node }}</arg>
<arg name="ranks_bind">--cpu-bind $ENV{RANKS_BIND} -envall</arg>
<arg name="threads_per_rank">-d $ENV{OMP_NUM_THREADS}</arg>
<arg name="gpu_maps">$ENV{GPU_TILE_COMPACT}</arg>
</arguments>
</mpirun>
<module_system type="module" allow_error="true">
<init_path lang="sh">/lus/gecko/projects/CSC249ADSE15_CNDA/modules/lmod.sh</init_path>
<init_path lang="csh">/soft/sunspot_migrate/soft/packaging/lmod/lmod/init/csh</init_path>
<init_path lang="python">/soft/sunspot_migrate/soft/packaging/lmod/lmod/init/env_modules_python.py</init_path>
<cmd_path lang="sh">module</cmd_path>
<cmd_path lang="csh">module</cmd_path>
<cmd_path lang="python">/soft/sunspot_migrate/soft/packaging/lmod/lmod/libexec/lmod python</cmd_path>
<modules>
<command name="purge"></command>
<command name="use">/soft/modulefiles</command>
<command name="use">/soft/restricted/CNDA/updates/modulefiles</command>
<command name="load">spack-pe-gcc cmake</command>
</modules>
<modules compiler="!gnu">
<command name="load">oneapi/eng-compiler/2023.05.15.007</command>
</modules>
<modules compiler="gnu">
<command name="unload">spack-pe-gcc cmake</command>
<command name="load">gcc/10.3.0</command>
</modules>
<modules>
<command name="load">cray-pals</command>
<command name="load">libfabric/1.15.2.0</command>
<command name="load">cray-libpals/1.3.2</command>
</modules>
</module_system>
<RUNDIR>$CIME_OUTPUT_ROOT/$CASE/run</RUNDIR>
<EXEROOT>$CIME_OUTPUT_ROOT/$CASE/bld</EXEROOT>
<environment_variables>
<env name="NETCDF_C_PATH">/lus/gecko/projects/CSC249ADSE15_CNDA/software/netcdf-c/4.9.2/oneapi.eng.2023.05.15.007</env>
<env name="NETCDF_FORTRAN_PATH">/lus/gecko/projects/CSC249ADSE15_CNDA/software/netcdf-fortran/4.6.1/oneapi.eng.2023.05.15.007</env>
<env name="PNETCDF_PATH">/lus/gecko/projects/CSC249ADSE15_CNDA/software/pnetcdf/1.12.3/oneapi.eng.2023.05.15.007</env>
<env name="LD_LIBRARY_PATH">/lus/gecko/projects/CSC249ADSE15_CNDA/software/pnetcdf/1.12.3/oneapi.eng.2023.05.15.007/lib:/lus/gecko/projects/CSC249ADSE15_CNDA/software/netcdf-fortran/4.6.1/oneapi.eng.2023.05.15.007/lib:/lus/gecko/projects/CSC249ADSE15_CNDA/software/netcdf-c/4.9.2/oneapi.eng.2023.05.15.007/lib:$ENV{LD_LIBRARY_PATH}</env>
<env name="PATH">/lus/gecko/projects/CSC249ADSE15_CNDA/software/pnetcdf/1.12.3/oneapi.eng.2023.05.15.007/bin:/lus/gecko/projects/CSC249ADSE15_CNDA/software/netcdf-fortran/4.6.1/oneapi.eng.2023.05.15.007/bin:/lus/gecko/projects/CSC249ADSE15_CNDA/software/netcdf-c/4.9.2/oneapi.eng.2023.05.15.007/bin:$ENV{PATH}</env>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are these blocks ("modules", "env variables", ...) executed in the order they appear in the file? If so, does it make sense to append env variables before modules are loaded?

It may be a user error, but I am in a situation where a module is loaded and presumably modifies PATH, but then, I think, the PATH setting above "erases" that module's path, perhaps because the $ENV{PATH} value in use is from before the module was loaded.

<env name="RANKS_BIND">list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203</env>
</environment_variables>
<environment_variables DEBUG="TRUE">
<env name="HYDRA_TOPO_DEBUG">1</env>
</environment_variables>
<environment_variables compiler="oneapi-ifxgpu">
<env name="ONEAPI_DEVICE_SELECTOR">level_zero:gpu</env>
<env name="ONEAPI_MPICH_GPU">NO_GPU</env>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is disabling GPU to GPU MPI?

<env name="MPIR_CVAR_ENABLE_GPU">0</env>
<env name="romio_cb_read">disable</env>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are these flags (romio_cb_*) still causing issues on Aurora?

<env name="romio_cb_write">disable</env>
<env name="SYCL_CACHE_PERSISTENT">1</env>
<env name="GATOR_INITIAL_MB">4000MB</env>
<env name="GATOR_DISABLE">0</env>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't GATOR_DISABLE 0 by default? Perhaps this was set as 1 while debugging?

<env name="GPU_TILE_COMPACT">/soft/tools/mpi_wrapper_utils/gpu_tile_compact.sh</env>
<env name="FI_CXI_DEFAULT_CQ_SIZE">131072</env>
<env name="FI_CXI_CQ_FILL_PERCENT">20</env>
Comment on lines +3161 to +3162
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be good to add context for these flags and explain why we had to steer away from the defaults for these Slingshot network variables.

</environment_variables>
<environment_variables compiler="oneapi-ifx">
<env name="LIBOMPTARGET_DEBUG">0</env><!--default 0, max 5 -->
<env name="OMP_TARGET_OFFLOAD">DISABLED</env><!--default OMP_TARGET_OFFLOAD=MANDATORY-->
<env name="FI_CXI_DEFAULT_CQ_SIZE">131072</env>
<env name="FI_CXI_CQ_FILL_PERCENT">20</env>
<env name="MPIR_CVAR_ENABLE_GPU">0</env>
<env name="GPU_TILE_COMPACT"> </env>
</environment_variables>
<environment_variables SMP_PRESENT="TRUE" compiler="!gnu">
<env name="KMP_AFFINITY">verbose,granularity=thread,balanced</env>
<env name="OMP_STACKSIZE">128M</env>
</environment_variables>
<environment_variables SMP_PRESENT="TRUE" compiler="gnu">
<env name="OMP_PLACES">threads</env>
<env name="OMP_STACKSIZE">128M</env>
</environment_variables>
<resource_limits>
<resource name="RLIMIT_STACK">-1</resource>
</resource_limits>
</machine>

<machine MACH="sooty">
<DESC>PNL cluster, OS is Linux, batch system is SLURM</DESC>
<NODENAME_REGEX>sooty</NODENAME_REGEX>
Expand Down
7 changes: 6 additions & 1 deletion components/cmake/build_model.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -256,12 +256,17 @@ macro(build_model COMP_CLASS COMP_NAME)
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE Fortran)

# A bit hacky, some platforms need help with the fortran linker
if (COMPILER STREQUAL "intel")
if (COMPILER STREQUAL "intel" OR COMPILER STREQUAL "oneapi-ifx")
string(APPEND CMAKE_EXE_LINKER_FLAGS " -cxxlib")
endif()

else()
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)

if (COMPILER STREQUAL "oneapi-ifxgpu")
string(APPEND CMAKE_EXE_LINKER_FLAGS " -Wl,-\-defsym,main=MAIN_\_ -lifcore -\-intel -fsycl -lsycl -Xsycl-target-backend \"-device 12.60.7\" ")
endif()

endif()

else()
Expand Down
17 changes: 14 additions & 3 deletions components/eam/src/physics/crm/pam/pam_debug.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,17 @@

#include "pam_coupler.h"

// PRINTF(format, ...): printf-style output usable from host code and from
// SYCL device code.
//   - When __SYCL_DEVICE_ONLY__ is defined, the format string is copied into an
//     array carrying the opencl_constant attribute and forwarded to
//     sycl::ext::oneapi::experimental::printf.
//   - Otherwise it forwards to the ordinary printf.
// Both branches are wrapped in do { ... } while (0) so that `PRINTF(...);`
// is exactly one statement on every build (safe inside an unbraced if/else).
// `##__VA_ARGS__` drops the preceding comma when no extra arguments are given.
#if defined(__SYCL_DEVICE_ONLY__)
  #define PRINTF(format, ...) \
    do { \
      const __attribute__((opencl_constant)) char fmt[] = (format); \
      sycl::ext::oneapi::experimental::printf(fmt, ##__VA_ARGS__); \
    } while (0)
#else
  #define PRINTF(format, ...) \
    do { \
      printf(format, ##__VA_ARGS__); \
    } while (0)
#endif

// These routines were helpful for debugging the coupling
// between PAM and E3SM, so we kept them here for future use

Expand Down Expand Up @@ -76,7 +87,7 @@ void pam_debug_check_state( pam::PamCoupler &coupler, int id, int nstep ) {
const auto is_nan_q_atm = isnan( rhov(k,j,i,iens) );
if ( is_nan_t_atm || is_nan_r_atm || is_nan_q_atm ) {
auto phis = input_phis(iens)/grav;
printf("PAM-DEBUG nan-found - st:%3.3d id:%2.2d k:%3.3d i:%3.3d n:%3.3d y:%5.1f x:%5.1f ph:%6.1f -- t:%8.2g rd:%8.2g rv:%8.2g rc:%8.2g ri:%8.2g -- t:%8.2g rd:%8.2g rv:%8.2g rc:%8.2g ri:%8.2g \n",
PRINTF("PAM-DEBUG nan-found - st:%3.3d id:%2.2d k:%3.3d i:%3.3d n:%3.3d y:%5.1f x:%5.1f ph:%6.1f -- t:%8.2g rd:%8.2g rv:%8.2g rc:%8.2g ri:%8.2g -- t:%8.2g rd:%8.2g rv:%8.2g rc:%8.2g ri:%8.2g \n",
nstep,id,k,i,iens,lat(iens),lon(iens),phis,
temp(k,j,i,iens),
rhod(k,j,i,iens),
Expand All @@ -96,7 +107,7 @@ void pam_debug_check_state( pam::PamCoupler &coupler, int id, int nstep ) {
const auto is_neg_q_atm = rhov(k,j,i,iens)<0;
if ( is_neg_t_atm || is_neg_r_atm || is_neg_q_atm ) {
auto phis = input_phis(iens)/grav;
printf("PAM-DEBUG neg-found - st:%3.3d id:%2.2d k:%3.3d i:%3.3d n:%3.3d y:%5.1f x:%5.1f ph:%6.1f -- t:%8.2g rd:%8.2g rv:%8.2g rc:%8.2g ri:%8.2g -- t:%8.2g rd:%8.2g rv:%8.2g rc:%8.2g ri:%8.2g \n",
PRINTF("PAM-DEBUG neg-found - st:%3.3d id:%2.2d k:%3.3d i:%3.3d n:%3.3d y:%5.1f x:%5.1f ph:%6.1f -- t:%8.2g rd:%8.2g rv:%8.2g rc:%8.2g ri:%8.2g -- t:%8.2g rd:%8.2g rv:%8.2g rc:%8.2g ri:%8.2g \n",
nstep,id,k,i,iens,lat(iens),lon(iens),phis,
temp(k,j,i,iens),
rhod(k,j,i,iens),
Expand Down Expand Up @@ -200,7 +211,7 @@ void pam_debug_print_state( pam::PamCoupler &coupler, int id ) {
parallel_for("pam_debug_print_state", SimpleBounds<2>(nz,nx), YAKL_LAMBDA (int k, int i) {
int j = 0;
int n = 0;
printf("PAM-DEBUG %d - k:%d i:%d temp : %g rv: %g rc: %g ri: %g \n",
PRINTF("PAM-DEBUG %d - k:%d i:%d temp : %g rv: %g rc: %g ri: %g \n",
id,k,i,
temp(k,j,i,n),
rho_v(k,j,i,n),
Expand Down