Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hypergraph #778

Open
wants to merge 38 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
dfcdc0a
First attempt
markusbattarbee May 11, 2020
3f97feb
Changed ioread to not complain about single LB value on readin
markusbattarbee May 12, 2020
5fbea60
Merge branch 'dev' into three-dimensional-weights
markusbattarbee Sep 28, 2021
5dde4e7
cleanup, support restarts with both vector and scalar LBweight values
markusbattarbee Sep 28, 2021
0ef3aa2
Provide dccrg communication weight
lkotipal Feb 15, 2023
84cc0b4
Scale LB weight to time everywhere
lkotipal Feb 28, 2023
5d93ed0
Better Vlasov neighbors
lkotipal Mar 3, 2023
0ca9608
Parameter for HIER
lkotipal Jun 1, 2023
d1591df
Revert "Parameter for HIER"
lkotipal Jun 13, 2023
23f30cf
Dummy commit for CI
lkotipal Jun 1, 2023
f6ca52e
Dummy
lkotipal Jun 13, 2023
0ac5e73
Hypergraph branch submodule
lkotipal Feb 8, 2024
a6d0aef
Fix errors in merge
lkotipal Feb 8, 2024
7b874e6
LB algorithm for CI
lkotipal Feb 8, 2024
6054a54
Move stencil initialization before load balancing
lkotipal Feb 8, 2024
7ea579d
Change partitioning neighborhood
lkotipal Feb 8, 2024
1a55ec5
Parametrize partitioning neighborhood
lkotipal Feb 9, 2024
a1c7d5d
Correct submodule version
lkotipal Feb 9, 2024
aeb6bb7
Remove orphaned parameter
lkotipal Feb 9, 2024
6891d6a
Update submodule
lkotipal Feb 12, 2024
9a61f21
Multi-neighborhood partitioning
lkotipal Feb 12, 2024
70d0702
Attempt to fix crashes, remove debug message
lkotipal Feb 12, 2024
6864a87
Update submodule
lkotipal Feb 12, 2024
418cb14
Use RIB by default
lkotipal Feb 13, 2024
89b6197
Output reducer for system boundary communication
lkotipal Feb 26, 2024
e794fc6
Per-cell partitioning neighborhoods
lkotipal Feb 26, 2024
e951630
Merge remote-tracking branch 'markusbattarbee/three-dimensional-weigh…
lkotipal Mar 21, 2024
a7bb565
Better multi-weight support
lkotipal Apr 5, 2024
0edcc75
Submodule update
lkotipal Apr 16, 2024
5d9a192
Default weight dimension 1
lkotipal Apr 16, 2024
af2e100
Submodule update
lkotipal Apr 17, 2024
afb2cb2
Merge branch 'hypergraph-3d-weights' into hypergraph
lkotipal Apr 17, 2024
05441b6
Merge branch 'dev' into hypergraph
lkotipal Apr 17, 2024
c3f2ab0
Write LB weight when writing all DROs
lkotipal Apr 17, 2024
1a1ceb7
Change LB approach after initialization
lkotipal Jun 6, 2024
0a42980
Merge branch 'dev' into hypergraph
lkotipal Nov 26, 2024
a53b551
Manual revert of multi-dimensional cell weights
lkotipal Nov 26, 2024
3292e97
Don't initialize stencils twice
lkotipal Nov 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions common.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,8 @@ namespace CellParams {
FSGRID_BOUNDARYTYPE, /*!< Boundary type of this cell, as stored in the fsGrid */
CELLID, /*! < DCCRG cell index */
REFINEMENT_LEVEL, /*! < Refinement level */
AMR_TRANSLATE_COMM, /*! < Flag to include this cell in AMR pre-translate communication */
SYSBOUNDARIES_COMM, // < Flag if cell is included in sysboundaries communication
CONNECTION, /*!< Magnetic connection. See TracingPointConnectionType for assigned values. */
CONNECTION_FW_X, /*!< Endpoint x (forward-propagated) for the magnetic connection tracer*/
CONNECTION_FW_Y, /*!< Endpoint y (forward-propagated) for the magnetic connection tracer*/
Expand Down
18 changes: 16 additions & 2 deletions datareduction/datareducer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,20 @@ void initializeDataReducers(DataReducer * outputReducer, DataReducer * diagnosti
continue;
}
}
if(P::systemWriteAllDROs || lowercase == "vg_amr_translate_comm") { // Flag for AMR translation communication
outputReducer->addOperator(new DRO::DataReductionOperatorCellParams("vg_amr_translate_comm",CellParams::AMR_TRANSLATE_COMM, 1));
outputReducer->addMetadata(outputReducer->size()-1,"","","AMRtranslate","1.0");
if(!P::systemWriteAllDROs) {
continue;
}
}
if(P::systemWriteAllDROs || lowercase == "vg_sysboundaries_comm") { // Flag for sysboundaries vspace communication
outputReducer->addOperator(new DRO::DataReductionOperatorCellParams("vg_sysboundaries_comm",CellParams::SYSBOUNDARIES_COMM, 1));
outputReducer->addMetadata(outputReducer->size()-1,"","","SysboundariesComm","1.0");
if(!P::systemWriteAllDROs) {
continue;
}
}
if(P::systemWriteAllDROs || lowercase == "fg_rhom") { // Overall mass density (summed over all populations)
outputReducer->addOperator(new DRO::DataReductionOperatorFsGrid("fg_rhom",[](
FsGrid< std::array<Real, fsgrids::bfield::N_BFIELD>, FS_STENCIL_WIDTH> & perBGrid,
Expand Down Expand Up @@ -432,7 +446,7 @@ void initializeDataReducers(DataReducer * outputReducer, DataReducer * diagnosti
}
if(P::systemWriteAllDROs || lowercase == "lbweight" || lowercase == "vg_lbweight" || lowercase == "vg_loadbalanceweight" || lowercase == "vg_loadbalance_weight") {
// Load balance metric for LB debugging
outputReducer->addOperator(new DRO::DataReductionOperatorCellParams("vg_loadbalance_weight",CellParams::LBWEIGHTCOUNTER,1));
outputReducer->addOperator(new DRO::DataReductionOperatorCellParams("vg_loadbalance_weight",CellParams::LBWEIGHTCOUNTER, 1));
outputReducer->addMetadata(outputReducer->size()-1,"","","$\\mathrm{LB weight}$","");
if(!P::systemWriteAllDROs) {
continue;
Expand Down Expand Up @@ -3645,7 +3659,7 @@ void initializeDataReducers(DataReducer * outputReducer, DataReducer * diagnosti
}
}
if(P::diagnosticWriteAllDROs || lowercase == "lbweight" || lowercase == "vg_lbweight" || lowercase == "vg_loadbalanceweight" || lowercase == "vg_loadbalance_weight" || lowercase == "loadbalance_weight") {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P::diagnosticWriteAllDROs lost here

diagnosticReducer->addOperator(new DRO::DataReductionOperatorCellParams("vg_loadbalance_weight",CellParams::LBWEIGHTCOUNTER,1));
diagnosticReducer->addOperator(new DRO::DataReductionOperatorCellParams("vg_loadbalance_weight",CellParams::LBWEIGHTCOUNTER, 1));
if(!P::diagnosticWriteAllDROs) {
continue;
}
Expand Down
47 changes: 44 additions & 3 deletions grid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,10 +138,23 @@ void initializeGrids(
sysBoundaries.isPeriodic(2))
.initialize(comm)
.set_geometry(geom_params);

mpiGrid.set_load_balance_norm(P::loadBalanceNorm);

for (const auto& [key, value] : P::loadBalanceOptions) {
mpiGrid.set_partitioning_option(key, value);
}

mpiGrid.set_partitioning_option("LB_APPROACH", "PARTITION");

// Hypergraph partitioning needs stencils initialized
initializeStencils(mpiGrid);

phiprof::Timer refineTimer {"Refine spatial cells"};
// We need this first as well
recalculateLocalCellsCache(mpiGrid);

setPartitioningNeighborhoods(mpiGrid);

if (!P::isRestart) {
// Note call to project.refineSpatialCells below
if (P::amrMaxSpatialRefLevel > 0 && project.refineSpatialCells(mpiGrid)) {
Expand All @@ -163,13 +176,19 @@ void initializeGrids(

// Init velocity mesh on all cells
initVelocityGridGeometry(mpiGrid);
initializeStencils(mpiGrid);

mpiGrid.set_partitioning_option("OBJ_WEIGHTS_COMPARABLE", "1");
/** RCB_MULTICRITERIA_NORM
Norm used in multicriteria algorithm; this determines how to balance the different weight constraints. Valid values are 1, 2, and 3. Roughly, if the weights correspond to different phases, then the value 1 (1-norm) tries to minimize the total time (sum over all phases) while the value 3 (max-norm) attempts to minimize the worst imbalance in any phase. The 2-norm does something in between. Try a different value if you're not happy with the balance. **/

for (const auto& [key, value] : P::loadBalanceOptions) {
mpiGrid.set_partitioning_option(key, value);
}
phiprof::Timer initialLBTimer {"Initial load-balancing"};
if (myRank == MASTER_RANK) logFile << "(INIT): Starting initial load balance." << endl << writeVerbose;

// TODO: do we really need two initial LB?
setPartitioningNeighborhoods(mpiGrid);
mpiGrid.balance_load(); // Direct DCCRG call, recalculate cache afterwards
recalculateLocalCellsCache(mpiGrid);

Expand Down Expand Up @@ -322,7 +341,7 @@ void initializeGrids(
} else if (P::writeFullBGB) {
// If, instead of starting a regular simulation, we are only writing out the background field, it is enough to set a dummy load balance value of 1 here.
for (size_t i=0; i<cells.size(); ++i) {
mpiGrid[cells[i]]->parameters[CellParams::LBWEIGHTCOUNTER] = 1;
mpiGrid[cells[i]]->parameters[CellParams::LBWEIGHTCOUNTER] = 1.0;
}
}

Expand Down Expand Up @@ -399,6 +418,8 @@ void initializeGrids(
P::dt = P::bailout_min_dt;
}

mpiGrid.set_partitioning_option("LB_APPROACH", P::loadBalanceOptions.count("LB_APPROACH") ? P::loadBalanceOptions["LB_APPROACH"] : "REPARTITION");

// With all cell data in place, make preparations for translation
prepareAMRLists(mpiGrid);
initialStateTimer.stop();
Expand Down Expand Up @@ -484,6 +505,24 @@ void setFaceNeighborRanks( dccrg::Dccrg<SpatialCell,dccrg::Cartesian_Geometry>&
}
}

/*!
 \brief Rebuild the partitioning neighborhoods of every local cell.

 For each cell returned by getLocalCells(), the previously registered
 partitioning neighborhoods are cleared and then every neighborhood ID listed
 in P::partitioningNeighborhoods is registered again. The only exception is
 SYSBOUNDARIES_EXTENDED_NEIGHBORHOOD_ID, which is registered only for cells
 whose sysBoundaryLayer is 1 or 2.

 \param mpiGrid The DCCRG grid whose per-cell partitioning neighborhoods are reset
 */
void setPartitioningNeighborhoods(dccrg::Dccrg<SpatialCell,dccrg::Cartesian_Geometry>& mpiGrid) {
   const vector<CellID>& localCells = getLocalCells();
   for (const CellID& cellId : localCells) {
      // Start from a clean slate so stale neighborhoods never accumulate
      mpiGrid.clear_partitioning_neighborhoods(cellId);

      for (const int neighborhoodId : P::partitioningNeighborhoods) {
         // TODO: for now, layer 1 cells communicate in the extended neighborhood
         // If this is ever fixed, SYSBOUNDARIES_NEIGHBORHOOD_ID needs a case
         if (neighborhoodId == SYSBOUNDARIES_EXTENDED_NEIGHBORHOOD_ID) {
            const auto boundaryLayer = mpiGrid[cellId]->sysBoundaryLayer;
            if (boundaryLayer != 1 && boundaryLayer != 2) {
               // Cells outside boundary layers 1 and 2 skip this neighborhood
               continue;
            }
         }
         mpiGrid.add_partitioning_neighborhood(cellId, neighborhoodId);
      }
   }
}

void balanceLoad(dccrg::Dccrg<SpatialCell,dccrg::Cartesian_Geometry>& mpiGrid, SysBoundary& sysBoundaries, FsGrid<fsgrids::technical, FS_STENCIL_WIDTH> & technicalGrid, bool doTranslationLists){
// Invalidate cached cell lists
Parameters::meshRepartitioned = true;
Expand All @@ -497,6 +536,7 @@ void balanceLoad(dccrg::Dccrg<SpatialCell,dccrg::Cartesian_Geometry>& mpiGrid, S

deallocTimer.stop();
//set weights based on each cells LB weight counter
setPartitioningNeighborhoods(mpiGrid);
const vector<CellID>& cells = getLocalCells();
for (size_t i=0; i<cells.size(); ++i){
// Set cell weight. We could use different counters or number of blocks if different solvers are active.
Expand Down Expand Up @@ -1104,6 +1144,7 @@ void initializeStencils(dccrg::Dccrg<SpatialCell,dccrg::Cartesian_Geometry>& mpi
neighborhood.push_back({{0, 0, d}});
}
}

if( !mpiGrid.add_neighborhood(VLASOV_SOLVER_NEIGHBORHOOD_ID, neighborhood)){
std::cerr << "Failed to add neighborhood VLASOV_SOLVER_NEIGHBORHOOD_ID \n";
abort();
Expand Down
2 changes: 2 additions & 0 deletions grid.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ void initializeGrids(
Project& project
);

void setPartitioningNeighborhoods(dccrg::Dccrg<SpatialCell,dccrg::Cartesian_Geometry>& mpiGrid);

/*!
\brief Balance load

Expand Down
22 changes: 22 additions & 0 deletions ioread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1085,6 +1085,28 @@ bool readIonosphereNodeVariable(
return true;
}

/*!
 \brief Read the vector size of a SpatialGrid variable from the file's array metadata
 \param file The ParallelReader file to use
 \param variableName Name of the variable to check
 \return The vector size reported by getArrayInfo for the variable, or -1 if
         the array information could not be read (an error is logged in that case)
 */
int readVectorSize(vlsv::ParallelReader& file,
                   const std::string& variableName){
   // Out-parameters of getArrayInfo; zero-initialised so they never hold garbage on failure
   uint64_t arraySize = 0;
   uint64_t vectorSize = 0;
   vlsv::datatype::type dataType;
   uint64_t byteSize = 0;

   // Select the VARIABLE array with the requested name on the SpatialGrid mesh
   list<pair<string,string> > attribs;
   attribs.push_back(make_pair("name",variableName));
   attribs.push_back(make_pair("mesh","SpatialGrid"));

   if (!file.getArrayInfo("VARIABLE",attribs,arraySize,vectorSize,dataType,byteSize)) {
      // Name the variable so the log tells the user which read actually failed
      logFile << "(RESTART) ERROR: Failed to read array info for variable '" << variableName << "'" << endl << write;
      return -1;
   }
   return static_cast<int>(vectorSize);
}

/*!
\brief Read in state from a vlsv file in order to restart simulations
\param mpiGrid Vlasiator's grid
Expand Down
20 changes: 18 additions & 2 deletions parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,9 @@ bool P::isRestart = false;
int P::writeAsFloat = false;
int P::writeRestartAsFloat = false;
string P::loadBalanceAlgorithm = string("");
std::vector<int> P::partitioningNeighborhoods {};
std::map<std::string, std::string> P::loadBalanceOptions;
int P::loadBalanceNorm {1};
uint P::rebalanceInterval = numeric_limits<uint>::max();

vector<string> P::outputVariableList;
Expand Down Expand Up @@ -388,8 +390,11 @@ bool P::addParameters() {
RP::add("vlasovsolver.GhostTranslateExtent","Stencil size in all-local ghost translation (default: VLASOV_STENCIL_WIDTH+1",0);

// Load balancing parameters
RP::add("loadBalance.algorithm", "Load balancing algorithm to be used", string("RCB"));
RP::add("loadBalance.algorithm", "Load balancing algorithm to be used", string("RIB"));
RP::addComposing("loadBalance.partitioning_neighborhood", "Neighborhood ID for (hyper)graph partitioning, see definitions.h");
RP::add("loadBalance.tolerance", "Load imbalance tolerance", string("1.05"));
RP::add("loadBalance.weight_dim", "Dimension of object weight", string("1"));
RP::add("loadBalance.norm", "Norm to use for cell weights in load balance, default 1-norm (sum)", 1);
RP::add("loadBalance.rebalanceInterval", "Load rebalance interval (steps)", 10);

RP::addComposing("loadBalance.optionKey", "Zoltan option key. Has to be matched by loadBalance.optionValue.");
Expand Down Expand Up @@ -422,7 +427,8 @@ bool P::addParameters() {
"ig_precipitation ig_deltaphi "+
"ig_inplanecurrent ig_b ig_e vg_drift vg_ionospherecoupling vg_connection vg_fluxrope fg_curvature "+
"vg_amr_drho vg_amr_du vg_amr_dpsq vg_amr_dbsq vg_amr_db vg_amr_alpha1 vg_amr_reflevel vg_amr_alpha2 "+
"vg_gridcoordinates fg_gridcoordinates vg_pressure_anisotropy vg_amr_vorticity");
"vg_gridcoordinates fg_gridcoordinates vg_pressure_anisotropy vg_amr_vorticity " +
"vg_amr_translate_comm vg_sysboundaries_comm");

RP::addComposing(
"variables_deprecated.output",
Expand Down Expand Up @@ -1030,10 +1036,20 @@ void Parameters::getParameters() {
}
// Get load balance parameters
RP::get("loadBalance.algorithm", P::loadBalanceAlgorithm);
RP::get("loadBalance.partitioning_neighborhood", P::partitioningNeighborhoods);

if(P::partitioningNeighborhoods.empty() && (P::loadBalanceAlgorithm == "GRAPH" || P::loadBalanceAlgorithm == "HYPERGRAPH")) {
P::partitioningNeighborhoods.push_back(FULL_NEIGHBORHOOD_ID);
}

loadBalanceOptions["IMBALANCE_TOL"] = "";
RP::get("loadBalance.tolerance", loadBalanceOptions["IMBALANCE_TOL"]);
RP::get("loadBalance.norm", P::loadBalanceNorm);
RP::get("loadBalance.rebalanceInterval", P::rebalanceInterval);

loadBalanceOptions["OBJ_WEIGHT_DIM"] = "";
RP::get("loadBalance.weight_dim", loadBalanceOptions["OBJ_WEIGHT_DIM"]);

std::vector<std::string> loadBalanceKeys;
std::vector<std::string> loadBalanceValues;
RP::get("loadBalance.optionKey", loadBalanceKeys);
Expand Down
2 changes: 2 additions & 0 deletions parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,9 @@ struct Parameters {
in the Lorentz force and in the field solver.*/

static std::string loadBalanceAlgorithm; /*!< Algorithm to be used for load balance.*/
static std::vector<int> partitioningNeighborhoods; /*!< Partitioning neighborhood for (hyper)graph.*/
static std::map<std::string, std::string> loadBalanceOptions; // Other Load balancing options
static int loadBalanceNorm; // Norm to use for cell weights
static uint rebalanceInterval; /*!< Load rebalance interval (steps). */
static bool prepareForRebalance; /**< If true, propagators should measure their time consumption in preparation
* for mesh repartitioning.*/
Expand Down
2 changes: 1 addition & 1 deletion submodules/dccrg
Submodule dccrg updated 1 files
+419 −128 dccrg.hpp
9 changes: 9 additions & 0 deletions sysboundary/sysboundary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,15 @@ void SysBoundary::applySysBoundaryVlasovConditions(
SpatialCell::set_mpi_transfer_type(Transfer::CELL_PARAMETERS | Transfer::POP_METADATA | Transfer::CELL_SYSBOUNDARYFLAG, true);
mpiGrid.update_copies_of_remote_neighbors(SYSBOUNDARIES_EXTENDED_NEIGHBORHOOD_ID);

// Mark cells that are communicating velocity blocks on system boundaries
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will this in fact now provide incorrect information to Zoltan? This tells it that a sysboundary cell which currently is not at a process boundary won't cost anything to communicate, even though in the end it would.

Copy link
Contributor Author

@lkotipal lkotipal Apr 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not used for determining partitioning neighborhoods (see setPartitioningNeighborhoods in grid.cpp), just for data analysis of supposed communication weight from vlsv files.

for (auto& cell : mpiGrid.get_local_cells_on_process_boundary(SYSBOUNDARIES_EXTENDED_NEIGHBORHOOD_ID)) {
mpiGrid[cell]->parameters[CellParams::SYSBOUNDARIES_COMM] = mpiGrid[cell]->sysBoundaryLayer == 1 || mpiGrid[cell]->sysBoundaryLayer == 2;
}

for (auto& cell : mpiGrid.get_local_cells_not_on_process_boundary(SYSBOUNDARIES_EXTENDED_NEIGHBORHOOD_ID)) {
mpiGrid[cell]->parameters[CellParams::SYSBOUNDARIES_COMM] = false;
}

// Loop over existing particle species
for (uint popID = 0; popID < getObjectWrapper().particleSpecies.size(); ++popID) {
SpatialCell::setCommunicatedSpecies(popID);
Expand Down
4 changes: 2 additions & 2 deletions vlasovsolver/cpu_acc_semilag.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ void cpu_accelerate_cell(SpatialCell* spatial_cell,
const uint popID,
const uint map_order,
const Real& dt) {
//double t1 = MPI_Wtime();
double t1 = MPI_Wtime();

vmesh::VelocityMesh<vmesh::GlobalID,vmesh::LocalID>& vmesh = spatial_cell->get_velocity_mesh(popID);
//vmesh::VelocityBlockContainer<vmesh::LocalID>& blockContainer = spatial_cell->get_velocity_blocks(popID);
Expand Down Expand Up @@ -159,6 +159,6 @@ void cpu_accelerate_cell(SpatialCell* spatial_cell,
}

if (Parameters::prepareForRebalance == true) {
// spatial_cell->parameters[CellParams::LBWEIGHTCOUNTER] += (MPI_Wtime() - t1);
//spatial_cell->parameters[CellParams::LBWEIGHTCOUNTER] += (MPI_Wtime() - t1);
}
}
2 changes: 1 addition & 1 deletion vlasovsolver/vlasovmover.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ void calculateSpatialTranslation(

phiprof::Timer semilagTimer {"semilag-trans"};

//double t1 = MPI_Wtime();
double t1 = MPI_Wtime();

const vector<CellID>& localCells = getLocalCells();
vector<CellID> remoteTargetCellsx;
Expand Down
Loading