From 4411d44834873c7d988f4af4b47e2c54a7483133 Mon Sep 17 00:00:00 2001 From: dqwu Date: Fri, 26 Jul 2024 18:06:26 -0500 Subject: [PATCH] Add SCREAM_SYSTEM_WORKAROUND macro This macro is used to optionally run hipInit prior to MPI_Init to avoid occasional segfaults. It can be turned off for ROCm 5.5.1 or newer releases. --- .../cmake_macros/crayclang-scream_frontier-scream-gpu.cmake | 2 +- components/eamxx/src/mct_coupling/atm_comp_mct.F90 | 4 ++++ .../eamxx/src/mct_coupling/scream_cxx_f90_interface.cpp | 4 ++++ components/eamxx/src/mct_coupling/scream_f2c_mod.F90 | 2 ++ driver-mct/main/cime_comp_mod.F90 | 4 ++++ 5 files changed, 15 insertions(+), 1 deletion(-) diff --git a/cime_config/machines/cmake_macros/crayclang-scream_frontier-scream-gpu.cmake b/cime_config/machines/cmake_macros/crayclang-scream_frontier-scream-gpu.cmake index cc90b369244..a5c89c3318e 100644 --- a/cime_config/machines/cmake_macros/crayclang-scream_frontier-scream-gpu.cmake +++ b/cime_config/machines/cmake_macros/crayclang-scream_frontier-scream-gpu.cmake @@ -5,7 +5,7 @@ set(SCC "cc") set(SCXX "hipcc") set(SFC "ftn") -string(APPEND CPPDEFS " -DLINUX") +string(APPEND CPPDEFS " -DLINUX -DSCREAM_SYSTEM_WORKAROUND=1") if (COMP_NAME STREQUAL gptl) string(APPEND CPPDEFS " -DHAVE_NANOTIME -DBIT64 -DHAVE_SLASHPROC -DHAVE_COMM_F2C -DHAVE_TIMES -DHAVE_GETTIMEOFDAY") endif() diff --git a/components/eamxx/src/mct_coupling/atm_comp_mct.F90 b/components/eamxx/src/mct_coupling/atm_comp_mct.F90 index bc79f6c11a7..dc57d828bbd 100644 --- a/components/eamxx/src/mct_coupling/atm_comp_mct.F90 +++ b/components/eamxx/src/mct_coupling/atm_comp_mct.F90 @@ -25,7 +25,9 @@ module atm_comp_mct ! 
Public interfaces !-------------------------------------------------------------------------- +#if defined(SCREAM_SYSTEM_WORKAROUND) && (SCREAM_SYSTEM_WORKAROUND == 1) public :: atm_init_hip_mct +#endif public :: atm_init_mct public :: atm_run_mct public :: atm_final_mct @@ -47,6 +49,7 @@ module atm_comp_mct CONTAINS !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +#if defined(SCREAM_SYSTEM_WORKAROUND) && (SCREAM_SYSTEM_WORKAROUND == 1) !=============================================================================== subroutine atm_init_hip_mct() use scream_f2c_mod, only: scream_init_hip_atm @@ -54,6 +57,7 @@ subroutine atm_init_hip_mct() call scream_init_hip_atm() end subroutine atm_init_hip_mct +#endif !=============================================================================== subroutine atm_init_mct( EClock, cdata, x2a, a2x, NLFilename ) diff --git a/components/eamxx/src/mct_coupling/scream_cxx_f90_interface.cpp b/components/eamxx/src/mct_coupling/scream_cxx_f90_interface.cpp index 99a7ff0d824..e9129e6fcea 100644 --- a/components/eamxx/src/mct_coupling/scream_cxx_f90_interface.cpp +++ b/components/eamxx/src/mct_coupling/scream_cxx_f90_interface.cpp @@ -21,7 +21,9 @@ #include "ekat/ekat_pack.hpp" #include "ekat/ekat_assert.hpp" +#if defined(SCREAM_SYSTEM_WORKAROUND) && (SCREAM_SYSTEM_WORKAROUND == 1) #include <hip/hip_runtime.h> +#endif // Anonymous namespace, for some utility functions namespace { @@ -204,9 +206,11 @@ void scream_setup_surface_coupling (const char*& import_field_names, int*& impor }); } +#if defined(SCREAM_SYSTEM_WORKAROUND) && (SCREAM_SYSTEM_WORKAROUND == 1) void scream_init_hip_atm () { hipInit(0); } +#endif void scream_init_atm (const char* caseid, const char* hostname, diff --git a/components/eamxx/src/mct_coupling/scream_f2c_mod.F90 b/components/eamxx/src/mct_coupling/scream_f2c_mod.F90 index 25abb1495ba..f7a0bc9c379 100644 --- a/components/eamxx/src/mct_coupling/scream_f2c_mod.F90 +++ 
b/components/eamxx/src/mct_coupling/scream_f2c_mod.F90 @@ -69,8 +69,10 @@ subroutine scream_setup_surface_coupling (import_field_names, import_cpl_indices integer(kind=c_int), intent(in) :: import_field_size, export_field_size end subroutine scream_setup_surface_coupling +#if defined(SCREAM_SYSTEM_WORKAROUND) && (SCREAM_SYSTEM_WORKAROUND == 1) subroutine scream_init_hip_atm () bind(c) end subroutine scream_init_hip_atm +#endif ! This subroutine performs completes the initialization of the atm instance. ! In particular, this routine must be called *after* scream_create_atm_instance, diff --git a/driver-mct/main/cime_comp_mod.F90 b/driver-mct/main/cime_comp_mod.F90 index 8282a76109c..7cf442b9413 100644 --- a/driver-mct/main/cime_comp_mod.F90 +++ b/driver-mct/main/cime_comp_mod.F90 @@ -714,7 +714,9 @@ module cime_comp_mod subroutine cime_pre_init1(esmf_log_option) use shr_pio_mod, only : shr_pio_init1, shr_pio_init2 use seq_comm_mct, only: num_inst_driver +#if defined(SCREAM_SYSTEM_WORKAROUND) && (SCREAM_SYSTEM_WORKAROUND == 1) use atm_comp_mct, only: atm_init_hip_mct +#endif !---------------------------------------------------------- !| Initialize MCT and MPI communicators and IO !---------------------------------------------------------- @@ -737,7 +739,9 @@ subroutine cime_pre_init1(esmf_log_option) beg_count = shr_sys_irtc(irtc_rate) +#if defined(SCREAM_SYSTEM_WORKAROUND) && (SCREAM_SYSTEM_WORKAROUND == 1) call atm_init_hip_mct() +#endif call mpi_init(ierr) call shr_mpi_chkerr(ierr,subname//' mpi_init')