-
Notifications
You must be signed in to change notification settings - Fork 374
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add aurora machine to e3sm #6117
Changes from all commits
2bb175a
8e8f093
7fdab8f
8f6720f
2f9c8fb
9774a7e
cdb6325
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
|
||
# Append the MKL libraries (mkl_intel_lp64, mkl_sequential, mkl_core) to the executable link line.
string(APPEND CMAKE_EXE_LINKER_FLAGS " -lmkl_intel_lp64 -lmkl_sequential -lmkl_core") | ||
# When compile_threaded is set, also pass the OpenMP flags at link time, with spir64 as the OpenMP target.
if (compile_threaded) | ||
string(APPEND CMAKE_EXE_LINKER_FLAGS " -fiopenmp -fopenmp-targets=spir64") | ||
endif() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
|
||
# Append the MKL libraries (mkl_intel_lp64, mkl_sequential, mkl_core) to the executable link line.
string(APPEND CMAKE_EXE_LINKER_FLAGS " -lmkl_intel_lp64 -lmkl_sequential -lmkl_core") | ||
# When compile_threaded is set, also pass the OpenMP flags at link time, with spir64 as the OpenMP target.
if (compile_threaded) | ||
string(APPEND CMAKE_EXE_LINKER_FLAGS " -fiopenmp -fopenmp-targets=spir64") | ||
endif() | ||
# Kokkos configuration: C++17, Serial and SYCL backends enabled, Intel PVC architecture,
# explicit instantiation turned off.
string(APPEND KOKKOS_OPTIONS " -DCMAKE_CXX_STANDARD=17 -DKokkos_ENABLE_SERIAL=On -DKokkos_ARCH_INTEL_PVC=On -DKokkos_ENABLE_SYCL=On -DKokkos_ENABLE_EXPLICIT_INSTANTIATION=Off") | ||
# SYCL compile flags. "\-" is a CMake identity escape for "-", so the first flag is "--intel".
# The build targets spir64_gen and forwards "-device 12.60.7" to the SYCL target backend.
# NOTE(review): the device id 12.60.7 is hard-coded; confirm it matches the installed GPUs.
string(APPEND SYCL_FLAGS " -\-intel -fsycl -fsycl-targets=spir64_gen -mlong-double-64 -Xsycl-target-backend \"-device 12.60.7\"") |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3073,6 +3073,115 @@ | |
</resource_limits> | ||
</machine> | ||
|
||
<machine MACH="aurora"> | ||
<!-- Machine identity, supported compilers/MPI, and site paths. Every absolute path in this group sits under /lus/gecko/projects/CSC249ADSE15_CNDA, so the entry is tied to that project area. -->
<DESC>ALCF Aurora, 10624 nodes, 2x52c SPR, 6x2s PVC, 2x512GB DDR5, 2x64GB CPU-HBM, 6x128GB GPU-HBM, Slingshot 11, PBSPro</DESC> | ||
<NODENAME_REGEX>aurora-uan-.*</NODENAME_REGEX> | ||
<OS>LINUX</OS> | ||
<COMPILERS>oneapi-ifx,oneapi-ifxgpu,gnu</COMPILERS> | ||
<MPILIBS>mpich</MPILIBS> | ||
<CHARGE_ACCOUNT>CSC249ADSE15_CNDA</CHARGE_ACCOUNT> | ||
<SAVE_TIMING_DIR>/lus/gecko/projects/CSC249ADSE15_CNDA/performance_archive</SAVE_TIMING_DIR> | ||
<SAVE_TIMING_DIR_PROJECTS>.*</SAVE_TIMING_DIR_PROJECTS> | ||
<CIME_OUTPUT_ROOT>/lus/gecko/projects/CSC249ADSE15_CNDA/$USER/scratch</CIME_OUTPUT_ROOT> | ||
<DIN_LOC_ROOT>/lus/gecko/projects/CSC249ADSE15_CNDA/inputdata</DIN_LOC_ROOT> | ||
<DIN_LOC_ROOT_CLMFORC>/lus/gecko/projects/CSC249ADSE15_CNDA/inputdata/atm/datm7</DIN_LOC_ROOT_CLMFORC> | ||
<DOUT_S_ROOT>$CIME_OUTPUT_ROOT/archive/$CASE</DOUT_S_ROOT> | ||
<BASELINE_ROOT>/lus/gecko/projects/CSC249ADSE15_CNDA/baselines/$COMPILER</BASELINE_ROOT> | ||
<CCSM_CPRNC>/lus/gecko/projects/CSC249ADSE15_CNDA/tools/cprnc/cprnc</CCSM_CPRNC> | ||
<GMAKE_J>16</GMAKE_J> | ||
<TESTS>e3sm_developer</TESTS> | ||
<NTEST_PARALLEL_JOBS>4</NTEST_PARALLEL_JOBS> | ||
<BATCH_SYSTEM>pbspro</BATCH_SYSTEM> | ||
<SUPPORTED_BY>e3sm</SUPPORTED_BY> | ||
<!-- Per-node limits: 208 tasks and 104 MPI ranks by default; the oneapi-ifxgpu compiler lowers these to 104 tasks and 12 MPI ranks. NOTE(review): 12 presumably corresponds to the "6x2s PVC" in DESC; confirm. -->
<MAX_TASKS_PER_NODE>208</MAX_TASKS_PER_NODE> | ||
<MAX_TASKS_PER_NODE compiler="oneapi-ifxgpu">104</MAX_TASKS_PER_NODE> | ||
<MAX_MPITASKS_PER_NODE>104</MAX_MPITASKS_PER_NODE> | ||
<MAX_MPITASKS_PER_NODE compiler="oneapi-ifxgpu">12</MAX_MPITASKS_PER_NODE> | ||
<PROJECT_REQUIRED>FALSE</PROJECT_REQUIRED> | ||
<!-- Job launch line for every MPI library: mpiexec followed by the arguments below. Three arguments read environment variables (RANKS_BIND, OMP_NUM_THREADS, GPU_TILE_COMPACT), so those must be set for the launch line to be complete. -->
<mpirun mpilib="default"> | ||
<executable>mpiexec</executable> | ||
<!--executable>numactl -m 2-3 mpiexec</executable--><!--for HBM runs--> | ||
<arguments> | ||
<arg name="total_num_tasks">-np {{ total_tasks }} --label</arg> | ||
<arg name="ranks_per_node">-ppn {{ tasks_per_node }}</arg> | ||
<arg name="ranks_bind">--cpu-bind $ENV{RANKS_BIND} -envall</arg> | ||
<arg name="threads_per_rank">-d $ENV{OMP_NUM_THREADS}</arg> | ||
<!-- Expands to the value of GPU_TILE_COMPACT, which this machine entry sets to a wrapper script for oneapi-ifxgpu and to a blank for oneapi-ifx. -->
<arg name="gpu_maps">$ENV{GPU_TILE_COMPACT}</arg> | ||
</arguments> | ||
</mpirun> | ||
<!-- Module setup. allow_error="true" is set on the module system. -->
<!-- NOTE(review): the sh init script comes from the project area on /lus/gecko, while the csh and python entries point at /soft/sunspot_migrate; confirm the three are meant to differ. -->
<module_system type="module" allow_error="true"> | ||
<init_path lang="sh">/lus/gecko/projects/CSC249ADSE15_CNDA/modules/lmod.sh</init_path> | ||
<init_path lang="csh">/soft/sunspot_migrate/soft/packaging/lmod/lmod/init/csh</init_path> | ||
<init_path lang="python">/soft/sunspot_migrate/soft/packaging/lmod/lmod/init/env_modules_python.py</init_path> | ||
<cmd_path lang="sh">module</cmd_path> | ||
<cmd_path lang="csh">module</cmd_path> | ||
<cmd_path lang="python">/soft/sunspot_migrate/soft/packaging/lmod/lmod/libexec/lmod python</cmd_path> | ||
<!-- Common to all compilers: purge, add two module search paths, then load spack-pe-gcc and cmake. -->
<modules> | ||
<command name="purge"></command> | ||
<command name="use">/soft/modulefiles</command> | ||
<command name="use">/soft/restricted/CNDA/updates/modulefiles</command> | ||
<command name="load">spack-pe-gcc cmake</command> | ||
</modules> | ||
<!-- Any compiler other than gnu: load the oneapi engineering compiler. -->
<modules compiler="!gnu"> | ||
<command name="load">oneapi/eng-compiler/2023.05.15.007</command> | ||
</modules> | ||
<!-- gnu: unload spack-pe-gcc and cmake, then load gcc/10.3.0. NOTE(review): nothing visible here reloads cmake for gnu; confirm a cmake is still available on that path. -->
<modules compiler="gnu"> | ||
<command name="unload">spack-pe-gcc cmake</command> | ||
<command name="load">gcc/10.3.0</command> | ||
</modules> | ||
<!-- Loaded last for all compilers: cray-pals, libfabric and cray-libpals. -->
<modules> | ||
<command name="load">cray-pals</command> | ||
<command name="load">libfabric/1.15.2.0</command> | ||
<command name="load">cray-libpals/1.3.2</command> | ||
</modules> | ||
</module_system> | ||
<!-- Run and build directories are placed per case under CIME_OUTPUT_ROOT. -->
<RUNDIR>$CIME_OUTPUT_ROOT/$CASE/run</RUNDIR> | ||
<EXEROOT>$CIME_OUTPUT_ROOT/$CASE/bld</EXEROOT> | ||
<environment_variables> | ||
<env name="NETCDF_C_PATH">/lus/gecko/projects/CSC249ADSE15_CNDA/software/netcdf-c/4.9.2/oneapi.eng.2023.05.15.007</env> | ||
<env name="NETCDF_FORTRAN_PATH">/lus/gecko/projects/CSC249ADSE15_CNDA/software/netcdf-fortran/4.6.1/oneapi.eng.2023.05.15.007</env> | ||
<env name="PNETCDF_PATH">/lus/gecko/projects/CSC249ADSE15_CNDA/software/pnetcdf/1.12.3/oneapi.eng.2023.05.15.007</env> | ||
<env name="LD_LIBRARY_PATH">/lus/gecko/projects/CSC249ADSE15_CNDA/software/pnetcdf/1.12.3/oneapi.eng.2023.05.15.007/lib:/lus/gecko/projects/CSC249ADSE15_CNDA/software/netcdf-fortran/4.6.1/oneapi.eng.2023.05.15.007/lib:/lus/gecko/projects/CSC249ADSE15_CNDA/software/netcdf-c/4.9.2/oneapi.eng.2023.05.15.007/lib:$ENV{LD_LIBRARY_PATH}</env> | ||
<env name="PATH">/lus/gecko/projects/CSC249ADSE15_CNDA/software/pnetcdf/1.12.3/oneapi.eng.2023.05.15.007/bin:/lus/gecko/projects/CSC249ADSE15_CNDA/software/netcdf-fortran/4.6.1/oneapi.eng.2023.05.15.007/bin:/lus/gecko/projects/CSC249ADSE15_CNDA/software/netcdf-c/4.9.2/oneapi.eng.2023.05.15.007/bin:$ENV{PATH}</env> | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Are these blocks ("modules", "env variables", ...) executed in the order they appear in the file? If so, does it make sense to append to environment variables before the modules are loaded? It may be a user error, but I am in a situation where a module is loaded and presumably modifies PATH, but then, I think, the PATH setting above "erases" that module's path, perhaps because the $ENV{PATH} value in use is from before the module was loaded. |
||
<env name="RANKS_BIND">list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203</env> | ||
</environment_variables> | ||
<!-- Applied only when DEBUG is TRUE: sets HYDRA_TOPO_DEBUG to 1. -->
<environment_variables DEBUG="TRUE"> | ||
<env name="HYDRA_TOPO_DEBUG">1</env> | ||
</environment_variables> | ||
<environment_variables compiler="oneapi-ifxgpu"> | ||
<env name="ONEAPI_DEVICE_SELECTOR">level_zero:gpu</env> | ||
<env name="ONEAPI_MPICH_GPU">NO_GPU</env> | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this disabling GPU-to-GPU MPI? |
||
<env name="MPIR_CVAR_ENABLE_GPU">0</env> | ||
<env name="romio_cb_read">disable</env> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are these flags (romio_cb_*) still causing issues on Aurora? |
||
<env name="romio_cb_write">disable</env> | ||
<env name="SYCL_CACHE_PERSISTENT">1</env> | ||
<env name="GATOR_INITIAL_MB">4000MB</env> | ||
<env name="GATOR_DISABLE">0</env> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Isn't GATOR_DISABLE 0 by default? Perhaps this was set as 1 while debugging? |
||
<env name="GPU_TILE_COMPACT">/soft/tools/mpi_wrapper_utils/gpu_tile_compact.sh</env> | ||
<env name="FI_CXI_DEFAULT_CQ_SIZE">131072</env> | ||
<env name="FI_CXI_CQ_FILL_PERCENT">20</env> | ||
Comment on lines
+3161
to
+3162
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It would be good to add context for these flags and explain why we had to steer away from the defaults for these Slingshot network variables. |
||
</environment_variables> | ||
<!-- Environment for the oneapi-ifx compiler: OpenMP target offload is disabled, MPIR_CVAR_ENABLE_GPU is 0, and the two FI_CXI_* settings are fixed at 131072 and 20. -->
<environment_variables compiler="oneapi-ifx"> | ||
<env name="LIBOMPTARGET_DEBUG">0</env><!--default 0, max 5 --> | ||
<env name="OMP_TARGET_OFFLOAD">DISABLED</env><!--default OMP_TARGET_OFFLOAD=MANDATORY--> | ||
<env name="FI_CXI_DEFAULT_CQ_SIZE">131072</env> | ||
<env name="FI_CXI_CQ_FILL_PERCENT">20</env> | ||
<env name="MPIR_CVAR_ENABLE_GPU">0</env> | ||
<!-- Set to a blank so that the mpirun "gpu_maps" argument, which expands $ENV{GPU_TILE_COMPACT}, adds no wrapper to the launch line. -->
<env name="GPU_TILE_COMPACT"> </env> | ||
</environment_variables> | ||
<!-- Threading settings, applied only when SMP_PRESENT is TRUE. Both variants set OMP_STACKSIZE to 128M; non-gnu compilers set KMP_AFFINITY, gnu sets OMP_PLACES. -->
<environment_variables SMP_PRESENT="TRUE" compiler="!gnu"> | ||
<env name="KMP_AFFINITY">verbose,granularity=thread,balanced</env> | ||
<env name="OMP_STACKSIZE">128M</env> | ||
</environment_variables> | ||
<environment_variables SMP_PRESENT="TRUE" compiler="gnu"> | ||
<env name="OMP_PLACES">threads</env> | ||
<env name="OMP_STACKSIZE">128M</env> | ||
</environment_variables> | ||
<!-- Sets RLIMIT_STACK to -1. NOTE(review): presumably -1 means "unlimited" here; confirm against the CIME resource_limits handling. -->
<resource_limits> | ||
<resource name="RLIMIT_STACK">-1</resource> | ||
</resource_limits> | ||
</machine> | ||
|
||
<machine MACH="sooty"> | ||
<DESC>PNL cluster, OS is Linux, batch system is SLURM</DESC> | ||
<NODENAME_REGEX>sooty</NODENAME_REGEX> | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What project is "CSC249ADSE15_CNDA"?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the ECP Aurora early-access project; it is valid through Apr. 2024.