Skip to content

Commit

Permalink
Merge Pull Request E3SM-Project#2929 from E3SM-Project/scream/bartgol…
Browse files Browse the repository at this point in the history
…/eamxx/hist-restart-fix

Automatically Merged using E3SM Pull Request AutoTester
PR Title: Fix a couple of issues in history restart
PR Author: bartgol
PR LABELS: BFB, I/O, AT: AUTOMERGE, bugfix
  • Loading branch information
E3SM-Bot authored Aug 7, 2024
2 parents 05a500e + cd19cdf commit 395a55a
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 8 deletions.
39 changes: 31 additions & 8 deletions components/eamxx/src/share/io/scream_output_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -242,17 +242,18 @@ setup (const ekat::Comm& io_comm, const ekat::ParameterList& params,
const auto& last_output_filename = get_attribute<std::string>(rhist_file,"GLOBAL","last_output_filename");
m_resume_output_file = last_output_filename!="" and not restart_pl.get("force_new_file",false);
if (m_resume_output_file) {
scorpio::register_file(last_output_filename,scorpio::Read,m_output_file_specs.iotype);
int num_snaps = scorpio::get_dimlen(last_output_filename,"time");
scorpio::release_file(last_output_filename);
int num_snaps = scorpio::get_attribute<int>(rhist_file,"GLOBAL","last_output_file_num_snaps");

m_output_file_specs.filename = last_output_filename;
m_output_file_specs.is_open = true;
m_output_file_specs.storage.num_snapshots_in_file = num_snaps;
// The setup_file call will not register any new variable (the file is in Append mode,
// so all dims/vars must already be in the file). However, it will register decompositions,
// since those are a property of the run, not of the file.
setup_file(m_output_file_specs,m_output_control);

if (m_output_file_specs.storage.snapshot_fits(m_output_control.next_write_ts)) {
// The setup_file call will not register any new variable (the file is in Append mode,
// so all dims/vars must already be in the file). However, it will register decompositions,
// since those are a property of the run, not of the file.
setup_file(m_output_file_specs,m_output_control);
}
}
scorpio::release_file(rhist_file);
}
Expand Down Expand Up @@ -392,7 +393,7 @@ void OutputManager::run(const util::TimeStamp& timestamp)
snapshot_start = m_case_t0;
snapshot_start += m_time_bnds[0];
}
if (not filespecs.storage.snapshot_fits(snapshot_start)) {
if (filespecs.is_open and not filespecs.storage.snapshot_fits(snapshot_start)) {
release_file(filespecs.filename);
filespecs.close();
}
Expand Down Expand Up @@ -494,6 +495,10 @@ void OutputManager::run(const util::TimeStamp& timestamp)
write_timestamp (filespecs.filename,"last_write",m_output_control.last_write_ts,true);
scorpio::set_attribute (filespecs.filename,"GLOBAL","last_output_filename",m_output_file_specs.filename);
scorpio::set_attribute (filespecs.filename,"GLOBAL","num_snapshots_since_last_write",m_output_control.nsamples_since_last_write);

int nsnaps = m_output_file_specs.is_open
? scorpio::get_dimlen(m_output_file_specs.filename,"time") : 0;
scorpio::set_attribute (filespecs.filename,"GLOBAL","last_output_file_num_snaps",nsnaps);
}
// Write these in both output and rhist file. The former, b/c we need these info when we postprocess
// output, and the latter b/c we want to make sure these params don't change across restarts
Expand Down Expand Up @@ -789,6 +794,24 @@ setup_file ( IOFileSpecs& filespecs,
auto mode = m_resume_output_file ? scorpio::Append : scorpio::Write;
scorpio::register_file(filename,mode,filespecs.iotype);
if (m_resume_output_file) {
// We may have resumed an output file that contains extra snapshots *after* the restart time.
// E.g., if we output every step and the run crashed a few steps after writing the restart.
// In that case, we need to reset the time dimension in the output file, so that the extra
// snapshots will be overwritten.
const auto all_times = scorpio::get_all_times(filename);
int ntimes = all_times.size();
int ngood = 0;
for (const auto& t : all_times) {
auto keep = t<=m_output_control.last_write_ts.days_from(m_case_t0);
if (keep) {
++ngood;
} else {
break;
}
}
if (ngood<ntimes) {
scorpio::reset_unlimited_dim_len(filename,ngood);
}
scorpio::redef(filename);
} else {
// Register time (and possibly time_bnds) var(s)
Expand Down
24 changes: 24 additions & 0 deletions components/eamxx/src/share/io/scream_scorpio_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -717,6 +717,30 @@ std::string get_time_name (const std::string& filename)
return pf.file->time_dim->name;
}

// Shrink the stored length of the unlimited (time) dimension of a registered file.
//
// Sets the length recorded in the file's time_dim entry to new_length, and sets the
// num_records counter of every time-dependent variable of the file to the same value.
//
// Inputs:
//  - filename:   name of the file, looked up via impl::get_file
//  - new_length: new length of the time dim; must satisfy 0 <= new_length < current length
//
// Each requirement below is enforced with EKAT_REQUIRE_MSG
// (NOTE(review): presumably throws on failure -- confirm against EKAT).
void reset_unlimited_dim_len(const std::string& filename, const int new_length)
{
  auto& f = impl::get_file(filename,"scorpio::reset_unlimited_dim_len");

  // We can only reset the unlimited dim if the file has one
  EKAT_REQUIRE_MSG (f.time_dim!=nullptr,
      "Error! Cannot reset unlimited dim length. No unlimited dim stored.\n"
      " - file name: " + filename + "\n");

  // A negative length is never meaningful. Without this check, a negative value would
  // pass the "shorter than current" test below and be stored as dim length/record count.
  EKAT_REQUIRE_MSG (new_length>=0,
      "Error! New time dimension length must be non-negative.\n"
      " - file name: " + filename + "\n"
      " - new len  : " + std::to_string(new_length) + "\n");

  // This routine only shrinks the time dim; it never extends it (nor leaves it as is)
  EKAT_REQUIRE_MSG (new_length<f.time_dim->length,
      "Error! New time dimension length must be shorter than the current one.\n"
      " - file name: " + filename + "\n"
      " - curr len : " + std::to_string(f.time_dim->length) + "\n"
      " - new len  : " + std::to_string(new_length) + "\n");
  f.time_dim->length = new_length;

  // Reset number of records counter for each time dep var.
  // Bind by const reference, to avoid copying each (name,var) entry of the container.
  for (const auto& it : f.vars) {
    auto& v = *it.second;
    if (v.time_dep) {
      v.num_records = new_length;
    }
  }
}

// =================== Decompositions operations ==================== //

// NOTES:
Expand Down
1 change: 1 addition & 0 deletions components/eamxx/src/share/io/scream_scorpio_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ bool is_dim_unlimited (const std::string& filename,
// NOTE: these throw if time dim is not present. Use has_dim to check first.
int get_time_len (const std::string& filename);
// Returns the name stored for the file's time dimension.
std::string get_time_name (const std::string& filename);
// Shrinks the stored length of the file's unlimited (time) dim to new_length, and sets
// the record counter of every time-dependent variable in the file to new_length.
// Requires new_length to be strictly smaller than the current time dim length.
void reset_unlimited_dim_len(const std::string& filename, const int new_length);

// =================== Decompositions operations ==================== //

Expand Down

0 comments on commit 395a55a

Please sign in to comment.