From e0c17ca2b474521e489ef9edd7e0494db85e176b Mon Sep 17 00:00:00 2001 From: lkotipal Date: Mon, 5 Feb 2024 14:30:50 +0200 Subject: [PATCH 01/10] Failing to write bulk file is fatal --- vlasiator.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vlasiator.cpp b/vlasiator.cpp index d8175d691..a4e92c517 100644 --- a/vlasiator.cpp +++ b/vlasiator.cpp @@ -590,7 +590,7 @@ int main(int argn,char* args[]) { writeGhosts ) == false ) { - cerr << "FAILED TO WRITE GRID AT " << __FILE__ << " " << __LINE__ << endl; + abort_mpi("FAILED TO WRITE GRID", 1); } phiprof::stop("Initialization"); @@ -729,7 +729,7 @@ int main(int argn,char* args[]) { writeGhosts ) == false ) { - cerr << "FAILED TO WRITE GRID AT " << __FILE__ << " " << __LINE__ << endl; + abort_mpi("FAILED TO WRITE GRID", 1); } P::systemWriteDistributionWriteStride.pop_back(); @@ -946,7 +946,7 @@ int main(int argn,char* args[]) { writeGhosts ) == false ) { - cerr << "FAILED TO WRITE GRID AT" << __FILE__ << " " << __LINE__ << endl; + abort_mpi("FAILED TO WRITE GRID", 1); } P::systemWrites[i]++; // Special case for large timesteps From 8c09856711667135bbc6e8403e946a8ecca045c9 Mon Sep 17 00:00:00 2001 From: lkotipal Date: Mon, 5 Feb 2024 14:31:50 +0200 Subject: [PATCH 02/10] Attempt to clean up failed restart writes --- iowrite.cpp | 5 ++++- parameters.cpp | 2 ++ parameters.h | 2 ++ vlasiator.cpp | 4 ++++ 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/iowrite.cpp b/iowrite.cpp index 583e6ab7a..6fd497803 100644 --- a/iowrite.cpp +++ b/iowrite.cpp @@ -1557,6 +1557,7 @@ bool writeRestart( fname.width(7); fname.fill('0'); fname << fileIndex << "." << currentDate << ".vlsv"; + P::lastRestart = fname.str(); phiprof::Timer openTimer {"open"}; //Open the file with vlsvWriter: @@ -1588,7 +1589,9 @@ bool writeRestart( MPI_Info_set(MPIinfo, factor, stripeChar); } - if( vlsvWriter.open( fname.str(), MPI_COMM_WORLD, masterProcessId, MPIinfo ) == false) return false; + if (vlsvWriter.open( fname.str(), MPI_COMM_WORLD, masterProcessId, MPIinfo ) == false) { + return false; + } if( MPIinfo != MPI_INFO_NULL ) { MPI_Info_free(&MPIinfo); diff --git a/parameters.cpp b/parameters.cpp index 15ab4ea4f..61264da85 100644 --- a/parameters.cpp +++ b/parameters.cpp @@ -194,6 +194,8 @@ std::array P::overrideReadFsGridDecomposition = {0,0,0}; std::string tracerString; /*!< Fieldline tracer to use for coupling ionosphere and magnetosphere */ bool P::computeCurvature; +std::string P::lastRestart {""}; + bool P::addParameters() { typedef Readparameters RP; // the other default parameters we read through the add/get interface diff --git a/parameters.h b/parameters.h index 6fc3556e9..0cacbc10f 100644 --- a/parameters.h +++ b/parameters.h @@ -224,6 +224,8 @@ struct Parameters { static bool computeCurvature; /* Date: Mon, 22 Apr 2024 13:47:58 +0300 Subject: [PATCH 03/10] dummy From 25c8877981fdfc32b6ffc18f428df29dcf3b0daa Mon Sep 17 00:00:00 2001 From: lkotipal Date: Tue, 23 Apr 2024 12:37:30 +0300 Subject: [PATCH 04/10] Don't remove restart if we don't have a filename --- vlasiator.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vlasiator.cpp b/vlasiator.cpp index cee06acb1..b907774d1 100644 --- a/vlasiator.cpp +++ b/vlasiator.cpp @@ -1031,8 +1031,10 @@ int main(int argn,char* args[]) { outputReducer,"restart",(uint)P::t,P::restartStripeFactor) == false ) { // If restart write fails, remove the malformed file and hope someone clears space soon MPI_Barrier(MPI_COMM_WORLD); - std::remove(P::lastRestart.c_str()); - P::lastRestart = ""; + if(!P::lastRestart.empty()) { + std::remove(P::lastRestart.c_str()); + P::lastRestart.clear(); + } logFile << "(IO): ERROR Failed to write restart!" << endl << writeVerbose; cerr << "FAILED TO WRITE RESTART" << endl; } From 8d7deadbbb228070a9e6d0bc83727cd158c58883 Mon Sep 17 00:00:00 2001 From: lkotipal Date: Tue, 23 Apr 2024 13:00:34 +0300 Subject: [PATCH 05/10] Write proper file and line --- vlasiator.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vlasiator.cpp b/vlasiator.cpp index b907774d1..727972e6e 100644 --- a/vlasiator.cpp +++ b/vlasiator.cpp @@ -590,7 +590,8 @@ int main(int argn,char* args[]) { writeGhosts ) == false ) { - abort_mpi("FAILED TO WRITE GRID", 1); + // TODO make this std::format when we get C++20 + abort_mpi(std::string(__FILE__) + ":" + std::to_string(__LINE__) + ": FAILED TO WRITE GRID", 1); } phiprof::stop("Initialization"); From 3abd197e4b6d9e555739361a276db6655951fc81 Mon Sep 17 00:00:00 2001 From: lkotipal Date: Tue, 23 Apr 2024 13:04:21 +0300 Subject: [PATCH 06/10] More line numbers --- vlasiator.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vlasiator.cpp b/vlasiator.cpp index 727972e6e..790224b34 100644 --- a/vlasiator.cpp +++ b/vlasiator.cpp @@ -730,7 +730,8 @@ int main(int argn,char* args[]) { writeGhosts ) == false ) { - abort_mpi("FAILED TO WRITE GRID", 1); + // TODO make this std::format when we get C++20 + abort_mpi(std::string(__FILE__) + ":" + std::to_string(__LINE__) + ": FAILED TO WRITE GRID", 1); } P::systemWriteDistributionWriteStride.pop_back(); @@ -947,7 +948,8 @@ int main(int argn,char* args[]) { writeGhosts ) == false ) { - abort_mpi("FAILED TO WRITE GRID", 1); + // TODO make this std::format when we get C++20 + abort_mpi(std::string(__FILE__) + ":" + std::to_string(__LINE__) + ": FAILED TO WRITE GRID", 1); } P::systemWrites[i]++; // Special case for large timesteps From b5104440c3506dba97c9d19a969c9a480d7503ed Mon Sep 17 00:00:00 2001 From: lkotipal Date: Tue, 23 Apr 2024 13:31:41 +0300 Subject: [PATCH 07/10] Reduction of restart write success --- vlasiator.cpp | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/vlasiator.cpp b/vlasiator.cpp index 790224b34..5c3b30367 100644 --- a/vlasiator.cpp +++ b/vlasiator.cpp @@ -1018,31 +1018,21 @@ int main(int argn,char* args[]) { if (myRank == MASTER_RANK) logFile << "(IO): Writing restart data to disk, tstep = " << P::tstep << " t = " << P::t << endl << writeVerbose; //Write the restart: - if( writeRestart(mpiGrid, - perBGrid, // TODO: Merge all the fsgrids passed here into one meta-object - EGrid, - EHallGrid, - EGradPeGrid, - momentsGrid, - dPerBGrid, - dMomentsGrid, - BgBGrid, - volGrid, - technicalGrid, - version, - config, - outputReducer,"restart",(uint)P::t,P::restartStripeFactor) == false ) { - // If restart write fails, remove the malformed file and hope someone clears space soon - MPI_Barrier(MPI_COMM_WORLD); - if(!P::lastRestart.empty()) { - std::remove(P::lastRestart.c_str()); + // TODO: Merge all the fsgrids passed here into one meta-object + bool restartSuccess {writeRestart(mpiGrid, perBGrid, EGrid, EHallGrid, EGradPeGrid, momentsGrid, dPerBGrid, dMomentsGrid, BgBGrid, volGrid, technicalGrid, version, config, outputReducer,"restart",(uint)P::t,P::restartStripeFactor)}; + MPI_Reduce(myRank == MASTER_RANK ? MPI_IN_PLACE : &restartSuccess, &restartSuccess, 1, MPI_CXX_BOOL, MPI_LAND, MASTER_RANK, MPI_COMM_WORLD); + if (myRank == MASTER_RANK) { + if(!restartSuccess) { + // If restart write fails, remove the malformed file and hope a human clears space soon + if(!P::lastRestart.empty()) { + std::remove(P::lastRestart.c_str()); + } P::lastRestart.clear(); + logFile << "(IO): ERROR Failed to write restart!" << endl << writeVerbose; + cerr << "FAILED TO WRITE RESTART" << endl; + } else { + logFile << "(IO): .... done!"<< endl << writeVerbose; } - logFile << "(IO): ERROR Failed to write restart!" << endl << writeVerbose; - cerr << "FAILED TO WRITE RESTART" << endl; - } - if (myRank == MASTER_RANK) { - logFile << "(IO): .... done!"<< endl << writeVerbose; } timer.stop(); } From 17423eb6ba6e991a5516d9fe34ff4eb7226e8aee Mon Sep 17 00:00:00 2001 From: lkotipal Date: Tue, 23 Apr 2024 13:37:15 +0300 Subject: [PATCH 08/10] Reduce scope of restart filename --- iowrite.cpp | 4 ++-- iowrite.h | 2 +- parameters.cpp | 2 -- parameters.h | 2 -- vlasiator.cpp | 9 ++++----- 5 files changed, 7 insertions(+), 12 deletions(-) diff --git a/iowrite.cpp b/iowrite.cpp index 6fd497803..f25e38534 100644 --- a/iowrite.cpp +++ b/iowrite.cpp @@ -1522,7 +1522,7 @@ bool writeRestart( const std::string& versionInfo, const std::string& configInfo, DataReducer& dataReducer, - const string& name, + string& name, const uint& fileIndex, const int& stripe) { @@ -1557,7 +1557,7 @@ bool writeRestart( fname.width(7); fname.fill('0'); fname << fileIndex << "." << currentDate << ".vlsv"; - P::lastRestart = fname.str(); + name = fname.str(); phiprof::Timer openTimer {"open"}; //Open the file with vlsvWriter: diff --git a/iowrite.h b/iowrite.h index e0d23885c..dfe9b45e1 100644 --- a/iowrite.h +++ b/iowrite.h @@ -85,7 +85,7 @@ bool writeRestart( const std::string& versionInfo, const std::string& configInfo, DataReducer& dataReducer, - const std::string& name, + std::string& name, const uint& fileIndex, const int& stripe ); diff --git a/parameters.cpp b/parameters.cpp index 61264da85..15ab4ea4f 100644 --- a/parameters.cpp +++ b/parameters.cpp @@ -194,8 +194,6 @@ std::array P::overrideReadFsGridDecomposition = {0,0,0}; std::string tracerString; /*!< Fieldline tracer to use for coupling ionosphere and magnetosphere */ bool P::computeCurvature; -std::string P::lastRestart {""}; - bool P::addParameters() { typedef Readparameters RP; // the other default parameters we read through the add/get interface diff --git a/parameters.h b/parameters.h index 0cacbc10f..6fc3556e9 100644 --- a/parameters.h +++ b/parameters.h @@ -224,8 +224,6 @@ struct Parameters { static bool computeCurvature; /* Date: Tue, 23 Apr 2024 14:19:38 +0300 Subject: [PATCH 09/10] Don't assert --- vlasiator.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/vlasiator.cpp b/vlasiator.cpp index 9b7282ea0..84dfaa95d 100644 --- a/vlasiator.cpp +++ b/vlasiator.cpp @@ -1025,7 +1025,6 @@ int main(int argn,char* args[]) { if (myRank == MASTER_RANK) { if(!restartSuccess) { // If restart write fails, remove the malformed file and hope a human clears space soon - assert(restartFilename != "restart"); // Sanity check, this should be set before writeRestart returns std::remove(restartFilename.c_str()); logFile << "(IO): ERROR Failed to write restart!" << endl << writeVerbose; cerr << "FAILED TO WRITE RESTART" << endl; From ffc00d2e3e5b463b569f06f953e26c1fabbb4917 Mon Sep 17 00:00:00 2001 From: lkotipal Date: Tue, 23 Apr 2024 14:20:59 +0300 Subject: [PATCH 10/10] Do sanity check, actually --- vlasiator.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vlasiator.cpp b/vlasiator.cpp index 84dfaa95d..e9bf8cfa0 100644 --- a/vlasiator.cpp +++ b/vlasiator.cpp @@ -1025,7 +1025,10 @@ int main(int argn,char* args[]) { if (myRank == MASTER_RANK) { if(!restartSuccess) { // If restart write fails, remove the malformed file and hope a human clears space soon - std::remove(restartFilename.c_str()); + // Sanity check, this should be set before writeRestart returns + if (restartFilename != "restart") { + std::remove(restartFilename.c_str()); + } logFile << "(IO): ERROR Failed to write restart!" << endl << writeVerbose; cerr << "FAILED TO WRITE RESTART" << endl; } else {