E3SM-Project · ndkeen · Feb 6, 2024 · Jan 12, 2024 · Jan 18, 2024
diff --git a/cime_config/allactive/config_pesall.xml b/cime_config/allactive/config_pesall.xml
@@ -107,7 +107,7 @@
         </nthrds>
       </pes>
     </mach>
-    <mach name="theta|pm-cpu|alvarez|pm-gpu|muller|jlse">
+    <mach name="theta|pm-cpu|muller-cpu|alvarez|pm-gpu|muller-gpu|jlse">
       <pes compset="any" pesize="any">
         <comment>allactive: default, 1 node x MAX_MPITASKS_PER_NODE mpi x 1 omp @ root 0</comment>
         <ntasks>
@@ -263,7 +263,7 @@
   <!-- 2000_DATM%JRA_ELM%SPBC_MPASSI_MPASO%DATMFORCED_MOSART_SGLC_SWAV_SIAC_SESP -->
   <!-- ATM_GRID is ne30np4.pg2  ICE_GRID is EC30to60E2r2 -->
   <grid name="a%ne30np4">
-    <mach name="pm-cpu|alvarez">
+    <mach name="pm-cpu|muller-cpu|alvarez">
       <pes compset="JRA_ELM.+MPASSI.+MPASO.+MOSART.+SGLC.+SWAV" pesize="any">
         <comment>"pm-cpu 4 nodes, 256 partition, 128x1"</comment>
         <ntasks>
@@ -547,7 +547,7 @@
     </mach>
   </grid>
   <grid name="a%ne120np4">
-    <mach name="pm-cpu|alvarez">
+    <mach name="pm-cpu|muller-cpu|alvarez">
       <pes compset=".*EAM.+ELM.+MPASSI.+MPASO.+MOSART.+SWAV.*" pesize="any">
         <comment>ne120-wcycl on 42 nodes 128x1 ~0.7 sypd</comment>
         <MAX_MPITASKS_PER_NODE>128</MAX_MPITASKS_PER_NODE>
@@ -1244,7 +1244,7 @@
         </rootpe>
       </pes>
     </mach>
-    <mach name="pm-cpu|alvarez">
+    <mach name="pm-cpu|muller-cpu|alvarez">
       <pes compset=".*EAM.+ELM.+MPASSI.+MPASO.+MOSART.+" pesize="any">
         <comment> -compset A_WCYCL* -res ne30pg2_oECv3 with MPASO on 7 nodes, 128x1 </comment>
         <MAX_MPITASKS_PER_NODE>128</MAX_MPITASKS_PER_NODE>
@@ -1792,7 +1792,7 @@
     </mach>
   </grid>
   <grid name="a%ne30np4">
-    <mach name="pm-gpu|muller">
+    <mach name="pm-gpu|muller-gpu">
       <pes compset="any" pesize="any">
         <comment>"pm-gpu ne30np4 and ne30np4.pg2  2 nodes, 4x16"</comment>
         <ntasks>
@@ -1815,7 +1815,7 @@
         </nthrds>
       </pes>
     </mach>
-    <mach name="pm-cpu|alvarez">
+    <mach name="pm-cpu|muller-cpu|alvarez">
       <pes compset="any" pesize="any">
         <comment>"pm-cpu ne30np4 and ne30np4.pg2 2 nodes 1 thread, 128x1"</comment>
         <ntasks>
@@ -2127,7 +2127,7 @@
         </rootpe>
       </pes>
     </mach>
-    <mach name="pm-cpu|alvarez">
+    <mach name="pm-cpu|muller-cpu|alvarez">
       <pes compset=".*EAM.+ELM.+MPASSI.+MPASO.+MOSART.+" pesize="any">
         <comment> 8 nodes, 128x1</comment>
         <ntasks>
@@ -2336,19 +2336,18 @@
         </nthrds>
       </pes>
     </mach>
-    <mach name="pm-gpu|muller">
+    <mach name="pm-gpu|muller-gpu">
       <pes compset="any" pesize="any">
+        <comment>pm-gpu conus 2 nodes, 4x1 except 16 threads in LND</comment>
         <MAX_MPITASKS_PER_NODE>4</MAX_MPITASKS_PER_NODE>
         <MAX_TASKS_PER_NODE>16</MAX_TASKS_PER_NODE>
         <ntasks>
-          <ntasks_atm>-4</ntasks_atm>
-          <ntasks_lnd>-4</ntasks_lnd>
-          <ntasks_rof>-4</ntasks_rof>
-          <ntasks_ice>-4</ntasks_ice>
-          <ntasks_ocn>-4</ntasks_ocn>
-          <ntasks_glc>-4</ntasks_glc>
-          <ntasks_wav>-4</ntasks_wav>
-          <ntasks_cpl>-4</ntasks_cpl>
+          <ntasks_atm>-2</ntasks_atm>
+          <ntasks_lnd>-2</ntasks_lnd>
+          <ntasks_rof>-2</ntasks_rof>
+          <ntasks_ice>-2</ntasks_ice>
+          <ntasks_ocn>-2</ntasks_ocn>
+          <ntasks_cpl>-2</ntasks_cpl>
         </ntasks>
         <nthrds>
           <nthrds_atm>1</nthrds_atm>

diff --git a/cime_config/customize/provenance.py b/cime_config/customize/provenance.py
@@ -786,7 +786,7 @@ def _get_batch_job_id_for_syslog(case):
     """
     mach = case.get_value("MACH")
     try:
-        if mach in ["anvil", "chrysalis", "compy", "cori-haswell", "cori-knl", "pm-cpu", "pm-gpu", "alvarez","frontier","crusher"]:
+        if mach in ["anvil", "chrysalis", "compy", "pm-cpu", "pm-gpu", "muller-cpu", "muller-gpu", "alvarez","frontier","frontier-scream-gpu","crusher"]:
             # Note: Besides, SLURM_JOB_ID, equivalent SLURM_JOBID is also present on some systems (Frontier).
             return os.environ["SLURM_JOB_ID"]
         elif mach in ["theta"]:

diff --git a/...fig/machines/Depends.pm-cpu.alvarez.cmake → ...config/machines/Depends.alvarez.gnu.cmake b/...fig/machines/Depends.pm-cpu.alvarez.cmake → ...config/machines/Depends.alvarez.gnu.cmake
@@ -4,6 +4,6 @@ set(NOOPT
 
 if (NOT DEBUG)
   foreach(ITEM IN LISTS NOOPT)
-    e3sm_deoptimize_file(${ITEM})
+    e3sm_deoptimize_file("${ITEM}")  # quoted so each list entry is passed as exactly one argument
   endforeach()
 endif()
diff --git a/cime_config/machines/Depends.alvarez.intel.cmake b/cime_config/machines/Depends.alvarez.intel.cmake
@@ -0,0 +1,10 @@
+# debug_info.F90 hits an internal compiler error with ifx (via the intel-oneapi module) on pm-cpu with -O2.
+# The -O0 override below is commented out for now: the intel module currently in use does not show the build issue.
+#set(NOOPT
+#  eam/src/physics/cam/debug_info.F90)
+
+#if (NOT DEBUG)
+#  foreach(ITEM IN LISTS NOOPT)
+#    e3sm_add_flags("${ITEM}" "-O0")
+#  endforeach()
+#endif()
diff --git a/cime_config/machines/Depends.alvarez.nvidia.cmake b/cime_config/machines/Depends.alvarez.nvidia.cmake
@@ -0,0 +1,29 @@
+# -Mnovect avoids an internal compiler error for this file
+# (seen with nvidia/21.11, still needed with nvidia/22.5).
+list(APPEND REDUCE_OPT_LIST
+  homme/src/share/derivative_mod_base.F90
+)
+
+# Files already found to benefit from increased optimization in the Intel Depends file; they get -O2 below.
+set(PERFOBJS
+  homme/src/share/prim_advection_base.F90
+  homme/src/share/vertremap_base.F90
+  homme/src/share/edge_mod_base.F90
+  homme/src/share/bndry_mod_base.F90
+  homme/src/theta-l/share/prim_advance_mod.F90
+  homme/src/preqx/share/prim_advance_mod.F90
+  homme/src/preqx/share/viscosity_preqx_base.F90
+  homme/src/share/viscosity_base.F90
+  homme/src/theta-l/share/viscosity_theta.F90
+  homme/src/theta-l/share/eos.F90
+  eam/src/physics/cam/uwshcu.F90)
+
+# Per-file flags are only added for non-DEBUG builds; -Mnovect entries are processed before the -O2 entries.
+if (NOT DEBUG)
+  foreach(ITEM IN LISTS REDUCE_OPT_LIST)
+    e3sm_add_flags("${ITEM}" " -Mnovect")
+  endforeach()
+  foreach(ITEM IN LISTS PERFOBJS)
+    e3sm_add_flags("${ITEM}" "-O2")
+  endforeach()
+endif()
diff --git a/cime_config/machines/Depends.muller-cpu.gnu.cmake b/cime_config/machines/Depends.muller-cpu.gnu.cmake
@@ -0,0 +1,9 @@
+# Deoptimize zm_conv.F90: fixes non-BFB behavior of stealth feature on pm-cpu with -O2
+set(NOOPT
+  eam/src/physics/cam/zm_conv.F90)
+
+if (NOT DEBUG)
+  foreach(ITEM IN LISTS NOOPT)
+    e3sm_deoptimize_file("${ITEM}")
+  endforeach()
+endif()
diff --git a/cime_config/machines/Depends.muller-cpu.intel.cmake b/cime_config/machines/Depends.muller-cpu.intel.cmake
@@ -0,0 +1,10 @@
+# debug_info.F90 hits an internal compiler error with ifx (via the intel-oneapi module) on pm-cpu with -O2.
+# The -O0 override below is commented out for now: the intel module currently in use does not show the build issue.
+#set(NOOPT
+#  eam/src/physics/cam/debug_info.F90)
+
+#if (NOT DEBUG)
+#  foreach(ITEM IN LISTS NOOPT)
+#    e3sm_add_flags("${ITEM}" "-O0")
+#  endforeach()
+#endif()
diff --git a/cime_config/machines/Depends.muller-cpu.nvidia.cmake b/cime_config/machines/Depends.muller-cpu.nvidia.cmake
@@ -0,0 +1,29 @@
+# -Mnovect avoids an internal compiler error for this file
+# (seen with nvidia/21.11, still needed with nvidia/22.5).
+list(APPEND REDUCE_OPT_LIST
+  homme/src/share/derivative_mod_base.F90
+)
+
+# Files already found to benefit from increased optimization in the Intel Depends file; they get -O2 below.
+set(PERFOBJS
+  homme/src/share/prim_advection_base.F90
+  homme/src/share/vertremap_base.F90
+  homme/src/share/edge_mod_base.F90
+  homme/src/share/bndry_mod_base.F90
+  homme/src/theta-l/share/prim_advance_mod.F90
+  homme/src/preqx/share/prim_advance_mod.F90
+  homme/src/preqx/share/viscosity_preqx_base.F90
+  homme/src/share/viscosity_base.F90
+  homme/src/theta-l/share/viscosity_theta.F90
+  homme/src/theta-l/share/eos.F90
+  eam/src/physics/cam/uwshcu.F90)
+
+# Per-file flags are only added for non-DEBUG builds; -Mnovect entries are processed before the -O2 entries.
+if (NOT DEBUG)
+  foreach(ITEM IN LISTS REDUCE_OPT_LIST)
+    e3sm_add_flags("${ITEM}" " -Mnovect")
+  endforeach()
+  foreach(ITEM IN LISTS PERFOBJS)
+    e3sm_add_flags("${ITEM}" "-O2")
+  endforeach()
+endif()
diff --git a/...nfig/machines/Depends.muller.nvidia.cmake → .../machines/Depends.muller-gpu.nvidia.cmake b/...nfig/machines/Depends.muller.nvidia.cmake → .../machines/Depends.muller-gpu.nvidia.cmake
diff --git a/...g/machines/Depends.muller.nvidiagpu.cmake → ...chines/Depends.muller-gpu.nvidiagpu.cmake b/...g/machines/Depends.muller.nvidiagpu.cmake → ...chines/Depends.muller-gpu.nvidiagpu.cmake
diff --git a/cime_config/machines/cmake_macros/amdclang_muller-cpu.cmake b/cime_config/machines/cmake_macros/amdclang_muller-cpu.cmake
@@ -0,0 +1,26 @@
+# Serial compiler drivers.
+set(SCC "clang")
+set(SCXX "clang++")
+set(SFC "flang")
+
+# Extra CPPDEFS applied only when the component being built is gptl.
+if (COMP_NAME STREQUAL gptl)
+  string(APPEND CPPDEFS " -DHAVE_NANOTIME -DBIT64 -DHAVE_SLASHPROC -DHAVE_GETTIMEOFDAY")
+endif()
+
+# Release builds: -O2 -g for Fortran, C++ and C alike.
+string(APPEND CMAKE_Fortran_FLAGS_RELEASE " -O2 -g")
+string(APPEND CMAKE_CXX_FLAGS_RELEASE " -O2 -g")
+string(APPEND CMAKE_C_FLAGS_RELEASE " -O2 -g")
+#string(APPEND FFLAGS " -march=znver3")
+
+# Fortran source-form switches and the Fortran flags used for every build type.
+string(APPEND CMAKE_Fortran_FORMAT_FREE_FLAG " -Mfreeform")
+string(APPEND CMAKE_Fortran_FORMAT_FIXED_FLAG " -Mfixed")
+string(APPEND CMAKE_Fortran_FLAGS " -Mflushz ")
+
+# Threaded builds add -mp to both the Fortran compile flags and the executable link flags.
+if (compile_threaded)
+  string(APPEND CMAKE_EXE_LINKER_FLAGS " -mp")
+  string(APPEND CMAKE_Fortran_FLAGS " -mp")
+endif()
diff --git a/...ig/machines/cmake_macros/gnu_muller.cmake → ...achines/cmake_macros/gnu_muller-cpu.cmake b/...ig/machines/cmake_macros/gnu_muller.cmake → ...achines/cmake_macros/gnu_muller-cpu.cmake
@@ -2,7 +2,6 @@ string(APPEND CONFIG_ARGS " --host=cray")
 if (COMP_NAME STREQUAL gptl)
   string(APPEND CPPDEFS " -DHAVE_NANOTIME -DBIT64 -DHAVE_SLASHPROC -DHAVE_GETTIMEOFDAY")
 endif()
-set(E3SM_LINK_WITH_FORTRAN "ON")
 string(APPEND CMAKE_C_FLAGS_RELEASE " -O2 -g")
 string(APPEND CMAKE_Fortran_FLAGS_RELEASE " -O2 -g")
 set(MPICC "cc")

diff --git a/cime_config/machines/cmake_macros/gnu_muller-gpu.cmake b/cime_config/machines/cmake_macros/gnu_muller-gpu.cmake
@@ -0,0 +1,21 @@
+# MPI compiler drivers.
+set(MPIFC "ftn")
+set(MPICXX "CC")
+set(MPICC "cc")
+
+# Serial compiler drivers.
+set(SFC "gfortran")
+set(SCXX "g++")
+set(SCC "gcc")
+
+# Append --host=cray to CONFIG_ARGS.
+string(APPEND CONFIG_ARGS " --host=cray")
+
+# Extra CPPDEFS applied only when the component being built is gptl.
+if (COMP_NAME STREQUAL gptl)
+  string(APPEND CPPDEFS " -DHAVE_NANOTIME -DBIT64 -DHAVE_SLASHPROC -DHAVE_GETTIMEOFDAY")
+endif()
+
+# Release flags for Fortran and C.
+string(APPEND CMAKE_Fortran_FLAGS_RELEASE " -O2 -g")
+string(APPEND CMAKE_C_FLAGS_RELEASE " -O2 -g")
diff --git a/...machines/cmake_macros/gnugpu_muller.cmake → ...ines/cmake_macros/gnugpu_muller-gpu.cmake b/...machines/cmake_macros/gnugpu_muller.cmake → ...ines/cmake_macros/gnugpu_muller-gpu.cmake
diff --git a/cime_config/machines/cmake_macros/intel_muller-cpu.cmake b/cime_config/machines/cmake_macros/intel_muller-cpu.cmake
@@ -0,0 +1,32 @@
+string(APPEND CONFIG_ARGS " --host=cray")
+if (COMP_NAME STREQUAL gptl)
+  string(APPEND CPPDEFS " -DHAVE_NANOTIME -DBIT64 -DHAVE_SLASHPROC -DHAVE_GETTIMEOFDAY")
+endif()
+
+set(MPICC "cc")
+set(MPICXX "CC")
+set(MPIFC "ftn")
+set(SCC "icx")
+set(SCXX "icpx")
+set(SFC "ifx")
+
+# Workaround: the intel version on pm-cpu (both the intel and intel-oneapi modules, and both icpc and icpx)
+# does not seem to accept the -fp-model=source flag, even though the docs still show it. No reliable way was found
+# to test on the compiler ID or version, so for now the C++ flags for pm-cpu/intel are adjusted by hand here:
+# -fp-model=source is dropped from the C++ flags and -fp-model=precise is used in its place.
+#message(STATUS "ndk CXXFLAGS=${CXXFLAGS}")
+set(CMAKE_CXX_FLAGS " ") # reset to blank here, then try to redo the same settings as intel.cmake
+if (compile_threaded)
+  string(APPEND CMAKE_CXX_FLAGS " -qopenmp")
+endif()
+string(APPEND CMAKE_CXX_FLAGS_DEBUG " -O0 -g")
+string(APPEND CMAKE_CXX_FLAGS_RELEASE " -O2")
+string(APPEND CMAKE_CXX_FLAGS " -fp-model=precise") # manual replacement for -fp-model=source
+#message(STATUS "ndk CXXFLAGS=${CXXFLAGS}")
+
+string(APPEND CMAKE_Fortran_FLAGS " -fp-model=consistent -fimf-use-svml")
+   #  string(APPEND FFLAGS " -qno-opt-dynamic-align")
+ string(APPEND CMAKE_Fortran_FLAGS_RELEASE " -g -traceback")
+ string(APPEND CMAKE_CXX_FLAGS_RELEASE " -g -traceback")
+string(APPEND CMAKE_Fortran_FLAGS " -DHAVE_ERF_INTRINSICS")
+string(APPEND CMAKE_CXX_FLAGS " -fp-model=consistent") # NOTE(review): CMAKE_CXX_FLAGS already carries -fp-model=precise from above; confirm both are intended
diff --git a/cime_config/machines/cmake_macros/nvidia_muller-cpu.cmake b/cime_config/machines/cmake_macros/nvidia_muller-cpu.cmake
@@ -0,0 +1,27 @@
+# MPI compiler drivers.
+set(MPIFC "ftn")
+set(MPICXX "CC")
+set(MPICC "cc")
+
+# Serial compiler drivers (the same names as the MPI ones).
+set(SFC "ftn")
+set(SCXX "CC")
+set(SCC "cc")
+
+# Append --host=cray to CONFIG_ARGS.
+string(APPEND CONFIG_ARGS " --host=cray")
+
+# Extra CPPDEFS applied only when the component being built is gptl.
+if (COMP_NAME STREQUAL gptl)
+  string(APPEND CPPDEFS " -DHAVE_NANOTIME -DBIT64 -DHAVE_SLASHPROC -DHAVE_GETTIMEOFDAY")
+endif()
+
+# Release flags: -g for Fortran, -O2 for C++ and C.
+string(APPEND CMAKE_Fortran_FLAGS_RELEASE " -g")
+string(APPEND CMAKE_CXX_FLAGS_RELEASE " -O2")
+string(APPEND CMAKE_C_FLAGS_RELEASE " -O2")
+
+# Work-around for nvidia: kokkos is not passing "-mp" for threaded builds, so Kokkos OpenMP is turned off there.
+if (compile_threaded)
+  string(APPEND KOKKOS_OPTIONS " -DKokkos_ENABLE_OPENMP=Off")
+endif()
diff --git a/...machines/cmake_macros/nvidia_muller.cmake → ...ines/cmake_macros/nvidia_muller-gpu.cmake b/...machines/cmake_macros/nvidia_muller.cmake → ...ines/cmake_macros/nvidia_muller-gpu.cmake
diff --git a/...hines/cmake_macros/nvidiagpu_muller.cmake → ...s/cmake_macros/nvidiagpu_muller-gpu.cmake b/...hines/cmake_macros/nvidiagpu_muller.cmake → ...s/cmake_macros/nvidiagpu_muller-gpu.cmake
@@ -1,11 +1,13 @@
 string(APPEND CONFIG_ARGS " --host=cray")
 set(USE_CUDA "TRUE")
-string(APPEND CPPDEFS " -DGPU")
+string(APPEND CPPDEFS " -DGPU -DMPAS_OPENACC")
 if (COMP_NAME STREQUAL gptl)
   string(APPEND CPPDEFS " -DHAVE_NANOTIME -DBIT64 -DHAVE_SLASHPROC -DHAVE_GETTIMEOFDAY")
 endif()
 string(APPEND CPPDEFS " -DTHRUST_IGNORE_CUB_VERSION_CHECK")
 string(APPEND CMAKE_CUDA_FLAGS " -ccbin CC -O2 -arch sm_80 --use_fast_math")
+string(APPEND CMAKE_EXE_LINKER_FLAGS " -acc -gpu=cc80 -Minfo=accel")  # OpenACC target kept in step with -arch sm_80 above
 set(SCC "cc")
 set(SCXX "CC")
 set(SFC "ftn")
+string(APPEND CMAKE_Fortran_FLAGS " -acc -gpu=cc80 -Minfo=accel")  # must match the -gpu target used on the link line
diff --git a/cime_config/machines/config_batch.xml b/cime_config/machines/config_batch.xml
@@ -458,19 +458,19 @@
     </queues>
   </batch_system>
 
-  <batch_system MACH="pm-cpu" type="nersc_slurm">
+  <batch_system MACH="muller-cpu" type="nersc_slurm">
     <directives>
       <directive> --constraint=cpu</directive>
     </directives>
     <queues>
       <!-- Note: walltime is not the max walltime, but the default - see NERSC docs for Q limits, https://docs.nersc.gov/jobs/policy/ -->
-      <queue walltimemax="00:30:00" nodemax="4096" default="true">regular</queue>
-      <queue walltimemax="00:30:00" nodemax="4096" strict="true">preempt</queue>
+      <queue walltimemax="00:30:00" nodemax="16" default="true">regular</queue>
+      <queue walltimemax="00:30:00" nodemax="16" strict="true">preempt</queue>
       <queue walltimemax="00:30:00" nodemax="8" strict="true">debug</queue>
     </queues>
   </batch_system>
 
-  <batch_system MACH="muller" type="nersc_slurm">
+  <batch_system MACH="muller-gpu" type="nersc_slurm">
     <directives>
       <directive> --constraint=gpu</directive>
     </directives>
@@ -496,10 +496,22 @@
     <queues>
       <queue walltimemax="00:45:00" nodemax="64" default="true">regular</queue>
       <queue walltimemax="00:45:00" nodemax="64" strict="true">preempt</queue>
-      <queue walltimemax="00:15:00" nodemax="8" strict="true">debug</queue>
+      <queue walltimemax="00:15:00" nodemax="4" strict="true">debug</queue>
     </queues>
   </batch_system>
 
+  <batch_system MACH="pm-cpu" type="nersc_slurm">
+    <directives>
+      <directive> --constraint=cpu</directive>
+    </directives>
+    <queues>
+      <!-- Note: walltime is not the max walltime, but the default - see NERSC docs for Q limits, https://docs.nersc.gov/jobs/policy/ -->
+      <queue walltimemax="00:30:00" nodemax="4096" default="true">regular</queue>
+      <queue walltimemax="00:30:00" nodemax="4096" strict="true">preempt</queue>
+      <queue walltimemax="00:30:00" nodemax="8" strict="true">debug</queue>
+    </queues>
+  </batch_system>
+
   <batch_system MACH="alvarez" type="nersc_slurm">
     <directives>
       <directive> --constraint=cpu</directive>