Skip to content

Commit

Permalink
Add SCREAM_SYSTEM_WORKAROUND macro
Browse files Browse the repository at this point in the history
This macro is used to optionally run hipInit prior to MPI_Init to
avoid occasional segfaults.

It can be turned off for ROCm 5.5.1 or newer releases.
  • Loading branch information
dqwu committed Jul 27, 2024
1 parent d4d7e0e commit 4411d44
Show file tree
Hide file tree
Showing 5 changed files with 15 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ set(SCC "cc")
set(SCXX "hipcc")
set(SFC "ftn")

string(APPEND CPPDEFS " -DLINUX")
string(APPEND CPPDEFS " -DLINUX -DSCREAM_SYSTEM_WORKAROUND=1")
# Extra preprocessor definitions appended only when COMP_NAME is gptl.
if (COMP_NAME STREQUAL gptl)
  string(APPEND CPPDEFS
    " -DHAVE_NANOTIME"
    " -DBIT64"
    " -DHAVE_SLASHPROC"
    " -DHAVE_COMM_F2C"
    " -DHAVE_TIMES"
    " -DHAVE_GETTIMEOFDAY")
endif()
Expand Down
4 changes: 4 additions & 0 deletions components/eamxx/src/mct_coupling/atm_comp_mct.F90
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ module atm_comp_mct
! Public interfaces
!--------------------------------------------------------------------------

#if defined(SCREAM_SYSTEM_WORKAROUND) && (SCREAM_SYSTEM_WORKAROUND == 1)
public :: atm_init_hip_mct
#endif
public :: atm_init_mct
public :: atm_run_mct
public :: atm_final_mct
Expand All @@ -47,13 +49,15 @@ module atm_comp_mct
CONTAINS
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#if defined(SCREAM_SYSTEM_WORKAROUND) && (SCREAM_SYSTEM_WORKAROUND == 1)
!===============================================================================
subroutine atm_init_hip_mct()
  ! Fortran entry point for the HIP pre-initialization workaround.
  ! Takes no arguments and simply forwards to the C-bound routine
  ! scream_init_hip_atm exported by scream_f2c_mod. Only compiled when
  ! SCREAM_SYSTEM_WORKAROUND is defined as 1.
  use scream_f2c_mod, only: init_hip => scream_init_hip_atm

  call init_hip()
end subroutine atm_init_hip_mct
#endif

!===============================================================================
subroutine atm_init_mct( EClock, cdata, x2a, a2x, NLFilename )
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
#include "ekat/ekat_pack.hpp"
#include "ekat/ekat_assert.hpp"

#if defined(SCREAM_SYSTEM_WORKAROUND) && (SCREAM_SYSTEM_WORKAROUND == 1)
#include <cstdio>
#include <hip/hip_runtime.h>
#endif

// Anonymous namespace, for some utility functions
namespace {
Expand Down Expand Up @@ -204,9 +206,11 @@ void scream_setup_surface_coupling (const char*& import_field_names, int*& impor
});
}

#if defined(SCREAM_SYSTEM_WORKAROUND) && (SCREAM_SYSTEM_WORKAROUND == 1)
void scream_init_hip_atm () {
hipInit(0);
}
#endif

void scream_init_atm (const char* caseid,
const char* hostname,
Expand Down
2 changes: 2 additions & 0 deletions components/eamxx/src/mct_coupling/scream_f2c_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,10 @@ subroutine scream_setup_surface_coupling (import_field_names, import_cpl_indices
integer(kind=c_int), intent(in) :: import_field_size, export_field_size
end subroutine scream_setup_surface_coupling

#if defined(SCREAM_SYSTEM_WORKAROUND) && (SCREAM_SYSTEM_WORKAROUND == 1)
! C-interoperable interface to scream_init_hip_atm. Takes no arguments.
! The binding label is spelled out; it matches the default label bind(c)
! would generate from the procedure name.
subroutine scream_init_hip_atm() bind(c, name="scream_init_hip_atm")
end subroutine scream_init_hip_atm
#endif

! This subroutine completes the initialization of the atm instance.
! In particular, this routine must be called *after* scream_create_atm_instance,
Expand Down
4 changes: 4 additions & 0 deletions driver-mct/main/cime_comp_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -714,7 +714,9 @@ module cime_comp_mod
subroutine cime_pre_init1(esmf_log_option)
use shr_pio_mod, only : shr_pio_init1, shr_pio_init2
use seq_comm_mct, only: num_inst_driver
#if defined(SCREAM_SYSTEM_WORKAROUND) && (SCREAM_SYSTEM_WORKAROUND == 1)
use atm_comp_mct, only: atm_init_hip_mct
#endif
!----------------------------------------------------------
!| Initialize MCT and MPI communicators and IO
!----------------------------------------------------------
Expand All @@ -737,7 +739,9 @@ subroutine cime_pre_init1(esmf_log_option)

beg_count = shr_sys_irtc(irtc_rate)

#if defined(SCREAM_SYSTEM_WORKAROUND) && (SCREAM_SYSTEM_WORKAROUND == 1)
call atm_init_hip_mct()
#endif
call mpi_init(ierr)
call shr_mpi_chkerr(ierr,subname//' mpi_init')

Expand Down

0 comments on commit 4411d44

Please sign in to comment.