Fix Subfiling VFD IOC assignment bug (#3456) (#3518)
jhendersonHDF authored Sep 7, 2023
1 parent 60040c4 commit fcf2e65
Showing 4 changed files with 84 additions and 19 deletions.
13 changes: 13 additions & 0 deletions release_docs/RELEASE.txt
@@ -109,6 +109,19 @@ Bug Fixes since HDF5-1.14.2 release
 ===================================
 Library
 -------
+    - Fixed a bug with the way the Subfiling VFD assigns I/O concentrators
+
+      During a file open operation, the Subfiling VFD determines the topology
+      of the application and uses that to select a subset of MPI ranks that
+      I/O will be forwarded to, called I/O concentrators. The code for this
+      had previously assumed that the parallel job launcher application (e.g.,
+      mpirun, srun, etc.) would distribute MPI ranks sequentially across a
+      node until all processors on that node had been assigned, before moving
+      on to the next node. When the launcher application mapped MPI ranks to
+      nodes in a different fashion, such as round-robin, this could cause the
+      Subfiling VFD to incorrectly identify MPI ranks as I/O concentrators,
+      leading to missing subfiles.
+
     - Fixed a file space allocation bug in the parallel library for chunked
       datasets
 
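To make the failure mode in the release note concrete, here is a minimal standalone C sketch (not HDF5 code; the node count, rank count, and names are invented for illustration). It contrasts a block ("sequential") rank-to-node mapping with a round-robin one and shows how a rank index computed under the block assumption can land on the wrong node:

/*
 * Hypothetical illustration: 8 MPI ranks on 2 nodes. Under the old
 * assumption, rank (node * ranks_per_node) is the first rank on each
 * node; round-robin placement breaks that.
 */
#include <stdio.h>

#define WORLD_SIZE 8
#define NUM_NODES  2

int
main(void)
{
    int node_of_rank_block[WORLD_SIZE];
    int node_of_rank_rr[WORLD_SIZE];
    int ranks_per_node = WORLD_SIZE / NUM_NODES;

    for (int rank = 0; rank < WORLD_SIZE; rank++) {
        node_of_rank_block[rank] = rank / ranks_per_node; /* 0 0 0 0 1 1 1 1 */
        node_of_rank_rr[rank]    = rank % NUM_NODES;      /* 0 1 0 1 0 1 0 1 */
    }

    for (int node = 0; node < NUM_NODES; node++) {
        int assumed_first = node * ranks_per_node;
        printf("node %d: assumed lead rank %d is on node %d (block) but node %d (round-robin)\n",
               node, assumed_first, node_of_rank_block[assumed_first],
               node_of_rank_rr[assumed_first]);
    }
    return 0;
}

Under round-robin placement, rank 4 actually lives on node 0, so both chosen I/O concentrators would sit on node 0 and node 1 would get none, which is how subfiles end up missing.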
5 changes: 0 additions & 5 deletions src/H5FDsubfiling/H5FDsubfiling_priv.h
@@ -40,11 +40,6 @@
 #include "H5subfiling_common.h"
 #include "H5subfiling_err.h"
 
-/*
- * Some definitions for debugging the Subfiling VFD
- */
-/* #define H5FD_SUBFILING_DEBUG */
-
 #define DRIVER_INFO_MESSAGE_MAX_INFO   65536
 #define DRIVER_INFO_MESSAGE_MAX_LENGTH 65552 /* MAX_INFO + sizeof(info_header_t) */
 
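A side note on the deletion above: the commented-out H5FD_SUBFILING_DEBUG macro appears to be a stale variant spelling of the H5_SUBFILING_DEBUG macro that the code in H5subfiling_common.c (below) actually checks. A minimal standalone sketch of that compile-time gating pattern, using a plain printf as a placeholder for the library's H5_subfiling_log:

#include <stdio.h>

/* #define H5_SUBFILING_DEBUG */

static void
do_work(long context_id)
{
#ifdef H5_SUBFILING_DEBUG
    printf("debug: working on context %ld\n", context_id);
#else
    /* Parameter exists only for logging; cast suppresses unused warnings */
    (void)context_id;
#endif
}

int
main(void)
{
    do_work(42);
    return 0;
}

This is the same (void)sf_context_id pattern the fix adds to identify_ioc_ranks(), whose new sf_context_id parameter is only consumed by the debug logging path.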
66 changes: 52 additions & 14 deletions src/H5FDsubfiling/H5subfiling_common.c
@@ -55,16 +55,16 @@ static herr_t H5_free_subfiling_topology(sf_topology_t *topology);
 static herr_t init_subfiling(const char *base_filename, uint64_t file_id,
                              H5FD_subfiling_params_t *subfiling_config, int file_acc_flags, MPI_Comm comm,
                              int64_t *context_id_out);
-static herr_t init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_Comm node_comm,
-                                sf_topology_t **app_topology_out);
+static herr_t init_app_topology(int64_t sf_context_id, H5FD_subfiling_params_t *subfiling_config,
+                                MPI_Comm comm, MPI_Comm node_comm, sf_topology_t **app_topology_out);
 static herr_t get_ioc_selection_criteria_from_env(H5FD_subfiling_ioc_select_t *ioc_selection_type,
                                                   char **ioc_sel_info_str);
 static herr_t find_cached_topology_info(MPI_Comm comm, H5FD_subfiling_params_t *subf_config,
                                         long iocs_per_node, sf_topology_t **app_topology);
 static herr_t init_app_layout(sf_topology_t *app_topology, MPI_Comm comm, MPI_Comm node_comm);
 static herr_t gather_topology_info(app_layout_t *app_layout, MPI_Comm comm, MPI_Comm intra_comm);
 static int    compare_layout_nodelocal(const void *layout1, const void *layout2);
-static herr_t identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride);
+static herr_t identify_ioc_ranks(int64_t sf_context_id, sf_topology_t *app_topology, int rank_stride);
 static herr_t init_subfiling_context(subfiling_context_t *sf_context, const char *base_filename,
                                      uint64_t file_id, H5FD_subfiling_params_t *subfiling_config,
                                      sf_topology_t *app_topology, MPI_Comm file_comm);
@@ -886,7 +886,7 @@ init_subfiling(const char *base_filename, uint64_t file_id, H5FD_subfiling_param
      * Setup the application topology information, including the computed
      * number and distribution map of the set of I/O concentrators
      */
-    if (init_app_topology(subfiling_config, comm, node_comm, &app_topology) < 0)
+    if (init_app_topology(context_id, subfiling_config, comm, node_comm, &app_topology) < 0)
         H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't initialize application topology");
 
     new_context->sf_context_id = context_id;
@@ -938,8 +938,8 @@ init_subfiling(const char *base_filename, uint64_t file_id, H5FD_subfiling_param
  *-------------------------------------------------------------------------
  */
 static herr_t
-init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_Comm node_comm,
-                  sf_topology_t **app_topology_out)
+init_app_topology(int64_t sf_context_id, H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm,
+                  MPI_Comm node_comm, sf_topology_t **app_topology_out)
 {
     H5FD_subfiling_ioc_select_t ioc_selection_type;
     sf_topology_t              *app_topology = NULL;
@@ -1142,7 +1142,7 @@ init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_
      * Determine which ranks are I/O concentrator ranks, based on the
      * given IOC selection strategy and MPI information.
      */
-    if (identify_ioc_ranks(app_topology, rank_multiple) < 0)
+    if (identify_ioc_ranks(sf_context_id, app_topology, rank_multiple) < 0)
         H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL,
                                 "couldn't determine which MPI ranks are I/O concentrators");
     }
@@ -1600,7 +1600,7 @@ compare_layout_nodelocal(const void *layout1, const void *layout2)
  *-------------------------------------------------------------------------
  */
 static herr_t
-identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride)
+identify_ioc_ranks(int64_t sf_context_id, sf_topology_t *app_topology, int rank_stride)
 {
     app_layout_t *app_layout       = NULL;
     int          *io_concentrators = NULL;
@@ -1613,6 +1613,11 @@ identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride)
     assert(app_topology->app_layout);
     assert(app_topology->app_layout->layout);
     assert(app_topology->app_layout->node_count > 0);
+    assert(app_topology->app_layout->node_count <= app_topology->app_layout->world_size);
+
+#ifndef H5_SUBFILING_DEBUG
+    (void)sf_context_id;
+#endif
 
     app_layout = app_topology->app_layout;
 
@@ -1629,36 +1634,52 @@ identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride)
         case SELECT_IOC_ONE_PER_NODE: {
             int total_ioc_count = 0;
             int iocs_per_node   = 1;
+            int last_lead_rank;
 
             if (app_topology->n_io_concentrators > app_layout->node_count)
                 iocs_per_node = app_topology->n_io_concentrators / app_layout->node_count;
 
-            assert(app_layout->node_ranks);
+            /*
+             * NOTE: The below code assumes that the app_layout->layout
+             * array was sorted according to the node_lead_rank field,
+             * such that entries for MPI ranks on the same node will occur
+             * together in the array.
+             */
 
-            for (size_t i = 0; i < (size_t)app_layout->node_count; i++) {
-                int node_index = app_layout->node_ranks[i];
-                int local_size = app_layout->layout[node_index].node_local_size;
+            last_lead_rank = app_layout->layout[0].node_lead_rank;
+            for (size_t i = 0, layout_idx = 0; i < (size_t)app_layout->node_count; i++) {
+                int local_size = app_layout->layout[layout_idx].node_local_size;
 
                 /* Assign first I/O concentrator from this node */
                 assert(total_ioc_count < app_topology->n_io_concentrators);
-                io_concentrators[total_ioc_count] = app_layout->layout[node_index++].rank;
+                io_concentrators[total_ioc_count] = app_layout->layout[layout_idx++].rank;
 
                 if (app_layout->world_rank == io_concentrators[total_ioc_count]) {
                     assert(!app_topology->rank_is_ioc);
 
                     app_topology->ioc_idx     = total_ioc_count;
                     app_topology->rank_is_ioc = TRUE;
                 }
 
                 total_ioc_count++;
 
                 /* Assign any additional I/O concentrators from this node */
                 for (size_t j = 1; j < (size_t)iocs_per_node; j++) {
                     if (total_ioc_count >= max_iocs)
                         break;
                     if (j >= (size_t)local_size)
                         break;
 
+                    /* Sanity check to make sure this rank is on the same node */
+                    assert(app_layout->layout[layout_idx].node_lead_rank ==
+                           app_layout->layout[layout_idx - 1].node_lead_rank);
+
                     assert(total_ioc_count < app_topology->n_io_concentrators);
-                    io_concentrators[total_ioc_count] = app_layout->layout[node_index++].rank;
+                    io_concentrators[total_ioc_count] = app_layout->layout[layout_idx++].rank;
 
                     if (app_layout->world_rank == io_concentrators[total_ioc_count]) {
                         assert(!app_topology->rank_is_ioc);
 
                         app_topology->ioc_idx     = total_ioc_count;
                         app_topology->rank_is_ioc = TRUE;
                     }
@@ -1668,8 +1689,25 @@ identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride)

                 if (total_ioc_count >= max_iocs)
                     break;
+
+                /* Find the block of layout structures for the next node */
+                while ((layout_idx < (size_t)app_layout->world_size) &&
+                       (last_lead_rank == app_layout->layout[layout_idx].node_lead_rank))
+                    layout_idx++;
+
+                if (layout_idx >= (size_t)app_layout->world_size)
+                    break;
+
+                last_lead_rank = app_layout->layout[layout_idx].node_lead_rank;
             }
 
+#ifdef H5_SUBFILING_DEBUG
+            if (app_topology->n_io_concentrators != total_ioc_count)
+                H5_subfiling_log(sf_context_id,
+                                 "%s: **WARN** Number of I/O concentrators adjusted from %d to %d", __func__,
+                                 app_topology->n_io_concentrators, total_ioc_count);
+#endif
+
             /* Set final number of I/O concentrators after adjustments */
             app_topology->n_io_concentrators = total_ioc_count;
 
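The heart of the fix is visible in the last two hunks: rather than indexing app_layout->layout through the node_ranks array (which encoded the old sequential-placement assumption), the code now walks the layout array directly, relying on it being sorted by node_lead_rank so that all entries for a node are contiguous, and skips ahead to the next node's block after taking that node's I/O concentrators. A standalone sketch of that walk for the one-IOC-per-node case, with a simplified stand-in for the library's layout struct and a hard-coded round-robin layout:

#include <stdio.h>

typedef struct {
    int rank;           /* world rank of this entry */
    int node_lead_rank; /* lowest world rank on this entry's node */
} layout_entry_t;

int
main(void)
{
    /* 8 ranks launched round-robin across 2 nodes, pre-sorted by lead rank */
    layout_entry_t layout[] = {{0, 0}, {2, 0}, {4, 0}, {6, 0},
                               {1, 1}, {3, 1}, {5, 1}, {7, 1}};
    size_t         world_size = sizeof(layout) / sizeof(layout[0]);

    size_t layout_idx = 0;
    while (layout_idx < world_size) {
        int last_lead_rank = layout[layout_idx].node_lead_rank;

        /* First entry in this node's block becomes its I/O concentrator */
        printf("node led by rank %d -> I/O concentrator is rank %d\n",
               last_lead_rank, layout[layout_idx].rank);

        /* Find the block of layout structures for the next node */
        while (layout_idx < world_size &&
               layout[layout_idx].node_lead_rank == last_lead_rank)
            layout_idx++;
    }
    return 0;
}

With this layout the sketch selects ranks 0 and 1, one per node, no matter how the launcher interleaved the ranks; the old node_ranks-based indexing could instead pick two ranks from the same node.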
19 changes: 19 additions & 0 deletions src/H5FDsubfiling/H5subfiling_common.h
@@ -155,6 +155,25 @@ typedef enum io_ops {
  *                 will be broadcast to all MPI ranks and will
  *                 provide a basis for determining which MPI ranks
  *                 will host an I/O concentrator.
+ *
+ * - rank
+ *     The MPI rank value for this processor
+ *
+ * - node_local_rank
+ *     The MPI rank value for this processor in an MPI
+ *     communicator that only involves MPI ranks on the
+ *     same node as this processor
+ *
+ * - node_local_size
+ *     The number of MPI ranks on the same node as this
+ *     processor, including this processor itself
+ *
+ * - node_lead_rank
+ *     The lowest MPI rank value for processors on the
+ *     same node as this processor (possibly the MPI
+ *     rank value for this processor); denotes a "lead"
+ *     MPI rank for certain operations.
+ *
  */
 typedef struct {
     int rank;
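For reference, here is a standalone MPI sketch (an illustration, not the library's implementation) of one way the four fields documented above could be computed, using MPI_Comm_split_type to form the node-local communicator the comment describes:

#include <mpi.h>
#include <stdio.h>

int
main(int argc, char **argv)
{
    int      rank, node_local_rank, node_local_size, node_lead_rank;
    MPI_Comm node_comm;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Group the ranks that share a node into one communicator */
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank,
                        MPI_INFO_NULL, &node_comm);
    MPI_Comm_rank(node_comm, &node_local_rank);
    MPI_Comm_size(node_comm, &node_local_size);

    /* The lowest world rank on this node serves as the node's "lead" rank */
    MPI_Allreduce(&rank, &node_lead_rank, 1, MPI_INT, MPI_MIN, node_comm);

    printf("rank %d: node_local_rank=%d node_local_size=%d node_lead_rank=%d\n",
           rank, node_local_rank, node_local_size, node_lead_rank);

    MPI_Comm_free(&node_comm);
    MPI_Finalize();
    return 0;
}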
