From eb6bcca697ccadb0fee647091d9f44520d970b74 Mon Sep 17 00:00:00 2001 From: Luca Bertagna <lbertag@sandia.gov> Date: Tue, 30 Jul 2024 17:19:29 -0600 Subject: [PATCH 1/3] EAMxx: add routine to reset unlimited dim length This allows future snapshots to effectively overwrite what was already stored --- .../src/share/io/scream_scorpio_interface.cpp | 24 +++++++++++++++++++ .../src/share/io/scream_scorpio_interface.hpp | 1 + 2 files changed, 25 insertions(+) diff --git a/components/eamxx/src/share/io/scream_scorpio_interface.cpp b/components/eamxx/src/share/io/scream_scorpio_interface.cpp index cb2b501f2b5..8d2f64994dd 100644 --- a/components/eamxx/src/share/io/scream_scorpio_interface.cpp +++ b/components/eamxx/src/share/io/scream_scorpio_interface.cpp @@ -717,6 +717,30 @@ std::string get_time_name (const std::string& filename) return pf.file->time_dim->name; } +void reset_unlimited_dim_len(const std::string& filename, const int new_length) +{ + auto& f = impl::get_file(filename,"scorpio::reset_unlimited_dim_len"); + + // Reset dim length + EKAT_REQUIRE_MSG (f.time_dim!=nullptr, + "Error! Cannot reset unlimited dim length. No unlimited dim stored.\n" + " - file name: " + filename + "\n"); + EKAT_REQUIRE_MSG (new_length<f.time_dim->length, + "Error! New time dimension length must be shorter than the current one.\n" + " - file name: " + filename + "\n" + " - curr len : " + std::to_string(f.time_dim->length) + "\n" + " - new len : " + std::to_string(new_length) + "\n"); + f.time_dim->length = new_length; + + // Reset number of records counter for each time dep var + for (auto it : f.vars) { + auto& v = *it.second; + if (v.time_dep) { + v.num_records = new_length; + } + } +} + // =================== Decompositions operations ==================== // // NOTES: diff --git a/components/eamxx/src/share/io/scream_scorpio_interface.hpp b/components/eamxx/src/share/io/scream_scorpio_interface.hpp index 549de680fe4..54e3d902765 100644 --- a/components/eamxx/src/share/io/scream_scorpio_interface.hpp +++ b/components/eamxx/src/share/io/scream_scorpio_interface.hpp @@ -108,6 +108,7 @@ bool is_dim_unlimited (const std::string& filename, // NOTE: these throw if time dim is not present. Use has_dim to check first. int get_time_len (const std::string& filename); std::string get_time_name (const std::string& filename); +void reset_unlimited_dim_len(const std::string& filename, const int new_length); // =================== Decompositions operations ==================== // From 0bdead86c0f3a3b240d92efd3076a62cdc67dd65 Mon Sep 17 00:00:00 2001 From: Luca Bertagna <lbertag@sandia.gov> Date: Tue, 30 Jul 2024 17:21:19 -0600 Subject: [PATCH 2/3] EAMxx: two fixes to hist restart logic * Only open last output file if the next write timestamp will fit * When resuming a file, reset time dim length to what was stored in the rhist file (will overwrite any timestamp that was written to file after rhist was written) --- .../src/share/io/scream_output_manager.cpp | 36 +++++++++++++++---- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/components/eamxx/src/share/io/scream_output_manager.cpp b/components/eamxx/src/share/io/scream_output_manager.cpp index 13e2e97101b..d6be54d0918 100644 --- a/components/eamxx/src/share/io/scream_output_manager.cpp +++ b/components/eamxx/src/share/io/scream_output_manager.cpp @@ -242,17 +242,18 @@ setup (const ekat::Comm& io_comm, const ekat::ParameterList& params, const auto& last_output_filename = get_attribute<std::string>(rhist_file,"GLOBAL","last_output_filename"); m_resume_output_file = last_output_filename!="" and not restart_pl.get("force_new_file",false); if (m_resume_output_file) { - scorpio::register_file(last_output_filename,scorpio::Read,m_output_file_specs.iotype); - int num_snaps = scorpio::get_dimlen(last_output_filename,"time"); - scorpio::release_file(last_output_filename); + int num_snaps = scorpio::get_attribute<int>(rhist_file,"GLOBAL","last_output_file_num_snaps"); m_output_file_specs.filename = last_output_filename; m_output_file_specs.is_open = true; m_output_file_specs.storage.num_snapshots_in_file = num_snaps; - // The setup_file call will not register any new variable (the file is in Append mode, - // so all dims/vars must already be in the file). However, it will register decompositions, - // since those are a property of the run, not of the file. - setup_file(m_output_file_specs,m_output_control); + + if (m_output_file_specs.storage.snapshot_fits(m_output_control.next_write_ts)) { + // The setup_file call will not register any new variable (the file is in Append mode, + // so all dims/vars must already be in the file). However, it will register decompositions, + // since those are a property of the run, not of the file. + setup_file(m_output_file_specs,m_output_control); + } } scorpio::release_file(rhist_file); } @@ -494,6 +495,9 @@ void OutputManager::run(const util::TimeStamp& timestamp) write_timestamp (filespecs.filename,"last_write",m_output_control.last_write_ts,true); scorpio::set_attribute (filespecs.filename,"GLOBAL","last_output_filename",m_output_file_specs.filename); scorpio::set_attribute (filespecs.filename,"GLOBAL","num_snapshots_since_last_write",m_output_control.nsamples_since_last_write); + + int nsnaps = scorpio::get_dimlen(m_output_file_specs.filename,"time"); + scorpio::set_attribute (filespecs.filename,"GLOBAL","last_output_file_num_snaps",nsnaps); } // Write these in both output and rhist file. The former, b/c we need these info when we postprocess // output, and the latter b/c we want to make sure these params don't change across restarts @@ -789,6 +793,24 @@ setup_file ( IOFileSpecs& filespecs, auto mode = m_resume_output_file ? scorpio::Append : scorpio::Write; scorpio::register_file(filename,mode,filespecs.iotype); if (m_resume_output_file) { + // We may have resumed an output file that contains extra snapshots *after* the restart time. + // E.g., if we output every step and the run crashed a few steps after writing the restart. + // In that case, we need to reset the time dimension in the output file, so that the extra + // snapshots will be overwritten. + const auto all_times = scorpio::get_all_times(filename); + int ntimes = all_times.size(); + int ngood = 0; + for (const auto& t : all_times) { + auto keep = t<=m_output_control.last_write_ts.days_from(m_case_t0); + if (keep) { + ++ngood; + } else { + break; + } + } + if (ngood<ntimes) { + scorpio::reset_unlimited_dim_len(filename,ngood); + } scorpio::redef(filename); } else { // Register time (and possibly time_bnds) var(s) From cd19cdfe58780516c55f7776ff9f4c61f379a719 Mon Sep 17 00:00:00 2001 From: Luca Bertagna <lbertag@sandia.gov> Date: Wed, 7 Aug 2024 10:10:31 -0600 Subject: [PATCH 3/3] EAMxx: fix access to output file in IO Avoid calling scorpio interfaces on a file not open --- components/eamxx/src/share/io/scream_output_manager.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/components/eamxx/src/share/io/scream_output_manager.cpp b/components/eamxx/src/share/io/scream_output_manager.cpp index d6be54d0918..0b738efe648 100644 --- a/components/eamxx/src/share/io/scream_output_manager.cpp +++ b/components/eamxx/src/share/io/scream_output_manager.cpp @@ -393,7 +393,7 @@ void OutputManager::run(const util::TimeStamp& timestamp) snapshot_start = m_case_t0; snapshot_start += m_time_bnds[0]; } - if (not filespecs.storage.snapshot_fits(snapshot_start)) { + if (filespecs.is_open and not filespecs.storage.snapshot_fits(snapshot_start)) { release_file(filespecs.filename); filespecs.close(); } @@ -496,7 +496,8 @@ void OutputManager::run(const util::TimeStamp& timestamp) scorpio::set_attribute (filespecs.filename,"GLOBAL","last_output_filename",m_output_file_specs.filename); scorpio::set_attribute (filespecs.filename,"GLOBAL","num_snapshots_since_last_write",m_output_control.nsamples_since_last_write); - int nsnaps = scorpio::get_dimlen(m_output_file_specs.filename,"time"); + int nsnaps = m_output_file_specs.is_open + ? scorpio::get_dimlen(m_output_file_specs.filename,"time") : 0; scorpio::set_attribute (filespecs.filename,"GLOBAL","last_output_file_num_snaps",nsnaps); } // Write these in both output and rhist file. The former, b/c we need these info when we postprocess