From 13b9f2a2a8d22fabe7564671be5130990b1a52f0 Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Fri, 21 Jun 2024 16:56:50 +0200 Subject: [PATCH 01/20] Started to implement general "Cluster Manager" class, which takes care of the HPC environment, i.e. gets the list of Hosts, identifies Head and worker nodes and launches the Orchestrator. Allows for easier implementation of other schedulers, in particular SLURM. --- src/relexi/smartsim/cluster_manager.py | 230 +++++++++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 src/relexi/smartsim/cluster_manager.py diff --git a/src/relexi/smartsim/cluster_manager.py b/src/relexi/smartsim/cluster_manager.py new file mode 100644 index 0000000..bea7bd1 --- /dev/null +++ b/src/relexi/smartsim/cluster_manager.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python3 + +"""The Cluster Manager contains all information about the HPC environment.""" + +import socket +import rlxout +import smartsim + +import sys + +class ClusterManager(): + """Base class for cluster managers. + + This class defines the interface for cluster managers. which contain all + information of the HPC environment used for the training environments. In + particular, it provides properties to access the hostnames and entails the + launched Orchestrator. + + Properties: + type (str): Type of the cluster manager. + hosts (list): List of hostnames. + dedicated_head (bool): Whether dedicated head and worker nodes are + available or everything runs on single shared node. + head (str): Hostname of Head node. + workers (list): List of worker nodes. + job_id (str): Job ID of the current job. + + Methods: + print_info: Print information about the current environment. + generate_rankfiles: Generate rank files for OpenMPI process binding. + + Raises: + NotImplementedError: If the methods are not implemented for the + provided scheduler type. + ValueError: If the scheduler type is not supported. + """ + + TYPES = ['local', 'pbs', 'slurm'] # All possible types of cluster managers + + def __init__( + self, + scheduler_type='local', + db_port=6790, + db_network_interface='lo', + ): + + # Check if the scheduler type is supported + self.type = scheduler_type.casefold().strip() + + rlxout.info('Trying to identify {self.type} training environment...') + try: + self._hosts = self._get_hostlist() + except: + if self.type == 'local': + raise RuntimeError('Failed to setup local training environment!') + rlxout.error('Failed!') + rlxout.info('Trying to run in local mode instead...') + try: + self.type = 'local' + self._hosts = self._get_hostlist() + except: + raise RuntimeError('Also failed to setup local environment!') + rlxout.info('Success!') + + self.exp, self.db = self._launch_orchestrator(port=db_port, + network_interface=db_network_interface) + + def _launch_orchestrator(self, port, network_interface): + """Launches the SmartSim Orchestrator for the current job. + + Args: + port (int): Port to start the Orchestrator on. + network_interface (str): Network interface to use for the Orchestrator. + + Returns: + Experiment: The Experiment object. + Orchestrator: The Orchestrator object. 
+ """ + rlxout.small_banner('Starting Orchestrator...') + + # Generate flexi experiment + exp = smartsim.Experiment('flexi', launcher=launcher_type) + + # Initialize the orchestrator based on the orchestrator_type + if self.type == 'local': + db = smartsim.Orchestrator(launcher=self.type, port=port, interface='lo') + elif self.type in {'pbs','slurm'}: + db = smartsim.Orchestrator(launcher=self.type, port=port, interface=network_interface) + else: + raise NotImplementedError(f"Orchestrator type {self.type} not implemented.") + + # startup Orchestrator + rlxout.info("Starting the Database...", newline=False) + try: + exp.start(db) + except Exception as e: + raise RuntimeError(f"Failed to start the Orchestrator: {e}") + rlxout.info(" Success!") + + # get the database nodes and select the first one + entry_db = socket.gethostbyname(db.hosts[0]) + rlxout.info("If the SmartRedis database isn't stopping properly you can use this command to stop it from the command line:") + rlxout.info(f"$(smart dbcli) -h {db.hosts[0]} -p {port} shutdown", newline=False) + + return exp, db + + def print_info(self): + """Print information about the current job.""" + rlxout.info("Found the following environment information:") + rlxout.info(f"Scheduler Type: {self.type}") + rlxout.info(f"Job ID: {self.job_id}") + rlxout.info(f"Hosts: {self.hosts}") + rlxout.info(f"Head node: {self.head}") + rlxout.info(f"Worker nodes: {self.workers}") + + def _get_hostlist(self): + """Get the list of hosts the script is executed on. + + Uses the scheduler type to determine the hostlist via the environment + variables set by the scheduler. + + Returns: + List containing the hostnames as strings. + + Raises: + NotImplementedError: If the method is not implemented for the + scheduler type. + """ + if self.type == 'local': + return [socket.gethostname()] + elif self.type == 'pbs': + return os.environ['PBS_NODEFILE'] + elif self.type == 'slurm': + return os.environ['SLURM_NODELIST'] + else: + raise NotImplementedError(f"Method get_hostlist not implemented for scheduler type {self.type}") + + @property + def type(self): + """Get the type of scheduler environment used for the cluster manager. + + Returns: + Type of the cluster manager as a string. + """ + return self._type + + @type.setter + def type(self, value): + """Set the type of scheduler environment used for the cluster manager. + Ensure that the type is supported. + + Args: + value (str): Type of the cluster manager as a string. + """ + if not value in self.TYPES: + raise ValueError(f"Scheduler type {value} not supported.") + self._type = value + + @property + def hosts(self): + """Get the list of hosts the script is executed on. + + Returns: + List containing the hostnames as strings. + """ + return self._hosts + + @property + def dedicated_head(self): + """Return whether dedicated head and worker nodes are available or + everything runs on single shared node. + + Returns: + Bool indicating whether dedicated head is used or not. + """ + return len(self._hosts) > 1 + + @property + def head(self): + """Get head node, which is first node, i.e. node Relexi is actually + started on. + + Returns: + Hostname of Head node as string. + """ + return self._hosts[0] + + @property + def workers(self): + """Get a list of worker nodes. + + Returns: + List containing the hostnames of worker nodes as strings. + """ + if self.dedicated_head: + return self._hosts[1:] + return self._hosts + + @property + def job_id(self): + """Get the jobID of the current job. 
+ + Returns: + Job ID of the current job. + """ + return self._job_id + + def generate_rankfile(self, n_par_env, ranks_per_env, base_path=None): + """Generate rank file for OpenMPI process binding. + + Args: + cores_per_node (int): Number of cores per node. + n_par_env (int): Number of parallel environments to be launched. + ranks_per_env (int): Number of ranks per environments. + base_path (str): (Optional.) Path to the directory of the rank files. + + Returns: + List of filenames of the rankfiles. + """ + if self.type == 'local': + return generate_rankfile_ompi(self.hosts, cores_per_node, n_par_env, ranks_per_env, base_path) + if self.type == 'pbs': + return generate_rankfile_ompi(self.hosts, cores_per_node, n_par_env, ranks_per_env, base_path) + if self.type == 'slurm': + return generate_rankfile_ompi(self.hosts, cores_per_node, n_par_env, ranks_per_env, base_path) + else: + raise NotImplementedError(f"Method generate_rankfile not implemented for scheduler type {self.type}") + +if __name__ == '__main__': + db = ClusterManager() From 48c4e12f975d8c674fdf78940309defe587f671b Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Fri, 21 Jun 2024 23:12:02 +0200 Subject: [PATCH 02/20] Now first running version with new ClusterManager --- src/relexi/{smartsim => }/cluster_manager.py | 64 +++++++++----------- src/relexi/rl/ppo/train.py | 30 +++++---- 2 files changed, 44 insertions(+), 50 deletions(-) rename src/relexi/{smartsim => }/cluster_manager.py (82%) diff --git a/src/relexi/smartsim/cluster_manager.py b/src/relexi/cluster_manager.py similarity index 82% rename from src/relexi/smartsim/cluster_manager.py rename to src/relexi/cluster_manager.py index bea7bd1..057b022 100644 --- a/src/relexi/smartsim/cluster_manager.py +++ b/src/relexi/cluster_manager.py @@ -2,11 +2,14 @@ """The Cluster Manager contains all information about the HPC environment.""" +import os +import sys import socket -import rlxout + import smartsim -import sys +import relexi.io.output as rlxout + class ClusterManager(): """Base class for cluster managers. @@ -23,7 +26,6 @@ class ClusterManager(): available or everything runs on single shared node. head (str): Hostname of Head node. workers (list): List of worker nodes. - job_id (str): Job ID of the current job. Methods: print_info: Print information about the current environment. @@ -35,7 +37,7 @@ class ClusterManager(): ValueError: If the scheduler type is not supported. 
""" - TYPES = ['local', 'pbs', 'slurm'] # All possible types of cluster managers + TYPES = ['local', 'pbs', 'slurm'] # All implemented types of cluster managers def __init__( self, @@ -47,24 +49,29 @@ def __init__( # Check if the scheduler type is supported self.type = scheduler_type.casefold().strip() - rlxout.info('Trying to identify {self.type} training environment...') + rlxout.info(f'Trying to identify {self.type} training environment...') try: self._hosts = self._get_hostlist() - except: + except Exception as e: if self.type == 'local': raise RuntimeError('Failed to setup local training environment!') - rlxout.error('Failed!') - rlxout.info('Trying to run in local mode instead...') + rlxout.warning(f'Failed: {e}') + rlxout.info('Trying to run in local mode instead...', newline=False) try: self.type = 'local' self._hosts = self._get_hostlist() except: raise RuntimeError('Also failed to setup local environment!') - rlxout.info('Success!') + rlxout.info('Success!', newline=False) - self.exp, self.db = self._launch_orchestrator(port=db_port, + self.db = None + self.exp, self.db, self.entry_db = self._launch_orchestrator(port=db_port, network_interface=db_network_interface) + def __del__(self): + if self.db: + self.exp.stop(self.db) + def _launch_orchestrator(self, port, network_interface): """Launches the SmartSim Orchestrator for the current job. @@ -79,13 +86,13 @@ def _launch_orchestrator(self, port, network_interface): rlxout.small_banner('Starting Orchestrator...') # Generate flexi experiment - exp = smartsim.Experiment('flexi', launcher=launcher_type) + exp = smartsim.Experiment('flexi', launcher=self.type) # Initialize the orchestrator based on the orchestrator_type if self.type == 'local': - db = smartsim.Orchestrator(launcher=self.type, port=port, interface='lo') + db = exp.create_database(port=port, interface='lo') elif self.type in {'pbs','slurm'}: - db = smartsim.Orchestrator(launcher=self.type, port=port, interface=network_interface) + db = exp.create_database(hosts=self.head, port=port, interface=network_interface) else: raise NotImplementedError(f"Orchestrator type {self.type} not implemented.") @@ -95,23 +102,22 @@ def _launch_orchestrator(self, port, network_interface): exp.start(db) except Exception as e: raise RuntimeError(f"Failed to start the Orchestrator: {e}") - rlxout.info(" Success!") + rlxout.info("Success!", newline=False) # get the database nodes and select the first one entry_db = socket.gethostbyname(db.hosts[0]) rlxout.info("If the SmartRedis database isn't stopping properly you can use this command to stop it from the command line:") rlxout.info(f"$(smart dbcli) -h {db.hosts[0]} -p {port} shutdown", newline=False) - return exp, db + return exp, db, entry_db - def print_info(self): + def info(self): """Print information about the current job.""" rlxout.info("Found the following environment information:") - rlxout.info(f"Scheduler Type: {self.type}") - rlxout.info(f"Job ID: {self.job_id}") - rlxout.info(f"Hosts: {self.hosts}") - rlxout.info(f"Head node: {self.head}") - rlxout.info(f"Worker nodes: {self.workers}") + rlxout.info(f"Scheduler: {self.type}", newline=False) + rlxout.info(f"Hosts: {self.hosts}", newline=False) + rlxout.info(f"Head node: {self.head}", newline=False) + rlxout.info(f"Workers: {self.workers}", newline=False) def _get_hostlist(self): """Get the list of hosts the script is executed on. @@ -196,22 +202,12 @@ def workers(self): return self._hosts[1:] return self._hosts - @property - def job_id(self): - """Get the jobID of the current job. 
- - Returns: - Job ID of the current job. - """ - return self._job_id - - def generate_rankfile(self, n_par_env, ranks_per_env, base_path=None): + def generate_rankfiles(self, n_models, n_ranks_per_model, base_path=None): """Generate rank file for OpenMPI process binding. Args: - cores_per_node (int): Number of cores per node. - n_par_env (int): Number of parallel environments to be launched. - ranks_per_env (int): Number of ranks per environments. + n_models (int): Number of models to be launched. + n_ranks_per_model (int): Number of ranks used for each model. base_path (str): (Optional.) Path to the directory of the rank files. Returns: diff --git a/src/relexi/rl/ppo/train.py b/src/relexi/rl/ppo/train.py index d47be33..2019f25 100644 --- a/src/relexi/rl/ppo/train.py +++ b/src/relexi/rl/ppo/train.py @@ -21,7 +21,7 @@ import relexi.rl.models import relexi.rl.tf_helpers import relexi.env.flexiEnvSmartSim -import relexi.smartsim.init_smartsim +import relexi.cluster_manager import relexi.io.readin as rlxin import relexi.io.output as rlxout from relexi.smartsim.helpers import generate_rankfile_ompi, copy_to_nodes, parser_flexi_parameters @@ -107,16 +107,16 @@ def train( config_file tf.config.optimizer.set_jit(True) # Initialize SmartSim - exp, worker_nodes, db, entry_db, is_db_cluster = relexi.smartsim.init_smartsim.init_smartsim(port = smartsim_port - ,num_dbs = smartsim_num_dbs - ,launcher_type = smartsim_launcher - ,orchestrator_type = smartsim_orchestrator - ) + resources = relexi.cluster_manager.ClusterManager( + scheduler_type=smartsim_orchestrator, + db_port=smartsim_port, + ) + resources.info() # generating rankfiles for OpenMPI if mpi_launch_mpmd: # If all MPI jobs are run with single mpirun command, all jobs are allocated based on single rankfile - rank_files = generate_rankfile_ompi(worker_nodes + rank_files = generate_rankfile_ompi(resources.workers ,n_procs_per_node ,n_par_env=1 ,ranks_per_env=num_parallel_environments*num_procs_per_environment @@ -124,7 +124,7 @@ def train( config_file else: # Otherwise every MPI job gets its own rankfile - rank_files = generate_rankfile_ompi(worker_nodes + rank_files = generate_rankfile_ompi(resources.workers ,n_procs_per_node ,num_parallel_environments ,num_procs_per_environment @@ -165,14 +165,13 @@ def train( config_file # Instantiate parallel collection environment my_env = tf_py_environment.TFPyEnvironment( - relexi.env.flexiEnvSmartSim.flexiEnv(exp + relexi.env.flexiEnvSmartSim.flexiEnv(resources.exp ,executable_path ,parameter_file ,tag = 'train' ,port = smartsim_port - ,entry_db = entry_db - ,is_db_cluster = is_db_cluster - ,hosts = worker_nodes + ,entry_db = resources.entry_db + ,hosts = resources.workers ,n_envs = num_parallel_environments ,n_procs = num_procs_per_environment ,n_procs_per_node = n_procs_per_node @@ -193,14 +192,13 @@ def train( config_file eval_files = train_files my_eval_env = tf_py_environment.TFPyEnvironment( - relexi.env.flexiEnvSmartSim.flexiEnv(exp + relexi.env.flexiEnvSmartSim.flexiEnv(resources.exp ,executable_path ,parameter_file ,tag = 'eval' ,port = smartsim_port - ,entry_db = entry_db - ,is_db_cluster = is_db_cluster - ,hosts = worker_nodes + ,entry_db = resources.entry_db + ,hosts = resources.workers ,n_procs = num_procs_per_environment ,n_procs_per_node = n_procs_per_node ,spectra_file = reward_spectrum_file From 8c52ccc8a01eee6dee4a5f13879b1c7673309aae Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Fri, 21 Jun 2024 23:13:34 +0200 Subject: [PATCH 03/20] Remove Init SmartSim file, since functionality has 
now moved to ClusterManager --- src/relexi/smartsim/init_smartsim.py | 201 --------------------------- 1 file changed, 201 deletions(-) delete mode 100644 src/relexi/smartsim/init_smartsim.py diff --git a/src/relexi/smartsim/init_smartsim.py b/src/relexi/smartsim/init_smartsim.py deleted file mode 100644 index 337f4dd..0000000 --- a/src/relexi/smartsim/init_smartsim.py +++ /dev/null @@ -1,201 +0,0 @@ -#!/usr/bin/env python3 - -"""Helpers for launching the SmartSim Orchestrator.""" - -import os -import json -import socket -import subprocess - -from smartsim import Experiment -from smartsim.database import Orchestrator - -import relexi.io.output as rlxout - - -def get_host(): - """Get the host the script is executed on from the env variable. - - Returns: - Hostname as string - """ - return socket.gethostname() - - -def get_pbs_hosts(): - """Get the host list from the PBS Nodefile. - - Returns: - List containing the hostnames as strings - """ - nodefile_path = os.environ["PBS_NODEFILE"] - with open(nodefile_path, "r", encoding='ascii') as f: - hostlist = [] - for line in f: - # only take the name not the entire ip-address otherwise there will be an error - # it will set the command line flag "mpirun ... -host " - # This only works with the hostname shorthand - full_host_ip = line.strip() # e.g. abc.ib0...de - hostname = full_host_ip.split(".")[0] # e.g. abc - if not hostname in hostlist: - hostlist.append(hostname) - return hostlist - - -def get_pbs_walltime(): - """Get the walltime of the current PBS job. - - Returns: - Walltime of current PBS job. - """ - job_id = os.environ["PBS_JOBID"] - cmd = f"qstat -xfF json {job_id}" - stat_json_str = subprocess.check_output(cmd, shell=True, text=True) - stat_json = json.loads(stat_json_str) - return stat_json["Jobs"][job_id]["Resource_List"]["walltime"] - - -def init_smartsim( - port=6790, - num_dbs=1, - network_interface="ib0", - launcher_type="local", - orchestrator_type="local" -): - """Starts the orchestrator, launches an experiment and gets list of hosts. - - Args: - port (int): (Optional.) Port number on which Orchestrator will be - launched. - num_dbs (int): (Optional.) Number of databases should be launched. - `num_dbs>1` imply that the database is clustered , i.e. distributed - across multiple instances. - network_interface (string) = (Optional.) Name of network interface to - be used to establish communication to clients. - launcher_type (string): (Optional.) Launcher to be used to start the - executable. Currently implemented are: - * local - * mpirun - orchestrator_type (string): Scheduler environment in which the - orchestrator is launched. Currently implemented are: - * local - * pbs - Returns: - smartsim.Experiment: The experiments in which the Orchestrator was - started - list: List of names of the nodes used as workers to run the simulations - smarsim.Orchestrator: The launched Orchestrator - string: The IP address and port used to access the Orchestrator - bool: Flag to indicate whether Orchestrator is clustered. - - Note: - Admissable combinations of Experiment launcher and orchestrator type: - * laun.: local, orch.: pbs = incompatible. - * laun.: local, orch.: local = only 1 in-memory database possible. - `mpirun` will still distribute the flexi instances to other - nodes. - * laun.: pbs, orch.: pbs = does not support clusters of size 2 - otherwise works flawlessly (warning: orchestrator doesn't find - the cluster configuration). - * laun.: pbs, orch.: local = not supported error: not supported by - PBSPro. 
- - TODO: - * Add support for SLURM. - * Clean implementation and nesting. - * Make object out of this. - * Allow to reconnect to already started Orchestrator - * Or closue Orchestrator still open from previous run - """ - - rlxout.small_banner('Starting SmartSim...') - - # Check whether launcher and orchestrator are identical (case-insensitive) - if not launcher_type.casefold() == orchestrator_type.casefold(): - rlxout.warning(f'Chosen Launcher {launcher_type} and orchestrator {orchestrator_type} are incompatible! Please choose identical types for both!') - - # Is database clustered, i.e. hosted on different nodes? - db_is_clustered = num_dbs > 1 - - # First try PBS if necessary. Use local configuration as backup - pbs_failed = False - if launcher_type.casefold() == 'pbs': - try: - # try to load the batch settings from the batch job environment - # variables like PBS_JOBID and PBS_NODEFILE - walltime = get_pbs_walltime() - hosts = get_pbs_hosts() - num_hosts = len(hosts) - rlxout.info(f"Identified available nodes: {hosts}") - - # Maximum of 1 DB per node allowed for PBS Orchestrator - if num_hosts < num_dbs: - rlxout.warning(f"You selected {num_dbs} databases and {num_hosts} nodes, but maximum is 1 database per node. Setting number of databases to {num_hosts}") - num_dbs = num_hosts - - # Clustered DB with PBS orchestrator requires at least 3 nodes for reasons - if db_is_clustered: - if num_dbs < 3: - rlxout.warning(f"Only {num_dbs} databases requested, but clustered orchestrator requires 3 or more databases. Non-clustered orchestrator is launched instead!") - db_is_clustered = False - else: - rlxout.info(f"Using a clustered database with {num_dbs} instances.") - else: - rlxout.info("Using an UNclustered database on root node.") - - except Exception: - # If no env. variables for batchjob, use the local launcher - rlxout.warning("Didn't find pbs batch environment. Switching to local setup.") - pbs_failed = True - - # If local configuration is required or if scheduler-based launcher failed. 
- if (launcher_type.casefold() == 'local') or pbs_failed: - launcher_type = "local" - orchestrator_type = "local" - db_is_clustered = False - hosts = [get_host()] - - # Generate flexi experiment - exp = Experiment("flexi", launcher=launcher_type) - - # Initialize the orchestrator based on the orchestrator_type - if orchestrator_type.casefold() == "local": - db = Orchestrator( - port=port, - interface='lo' - ) - - elif orchestrator_type.casefold() == "pbs": - db = Orchestrator( - launcher='pbs', - port=port, - db_nodes=num_dbs, - batch=False, # false if it is launched in an interactive batch job - time=walltime, # this is necessary, otherwise the orchestrator wont run properly - interface=network_interface, - hosts=hosts, # this must be the hostnames of the nodes, it mustn't be the ip-addresses - run_command="mpirun" - ) - else: - rlxout.warning(f"Orchester type {orchestrator_type} not implemented!") - raise NotImplementedError - - # startup Orchestrator - rlxout.info("Starting the Database...", newline=False) - exp.start(db) - - # get the database nodes and select the first one - entry_db = socket.gethostbyname(db.hosts[0]) - rlxout.info(f"Identified 1 of {len(db.hosts)} database hosts to later connect clients to: {entry_db}", newline=False) - rlxout.info("If the SmartRedis database isn't stopping properly you can use this command to stop it from the command line:") - for db_host in db.hosts: - rlxout.info(f"$(smart dbcli) -h {db_host} -p {port} shutdown", newline=False) - - # If multiple nodes are available, the first executes Relexi, while - # all worker processes are started on different nodes. - if len(hosts) > 1: - worker_nodes = hosts[1:] - else: # Only single node - worker_nodes = hosts - - return exp, worker_nodes, db, entry_db, db_is_clustered From 6a008d510da2513f2d2d78c5e1323179979e1cf4 Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Mon, 24 Jun 2024 14:37:20 +0200 Subject: [PATCH 04/20] Improve Docstrings for ClusterManager, explicit typing and consistent use of "Distributed Mode" and "Local Mode". --- src/relexi/cluster_manager.py | 194 +++++++++++++++++++--------------- 1 file changed, 108 insertions(+), 86 deletions(-) diff --git a/src/relexi/cluster_manager.py b/src/relexi/cluster_manager.py index 057b022..f3e7070 100644 --- a/src/relexi/cluster_manager.py +++ b/src/relexi/cluster_manager.py @@ -1,29 +1,45 @@ #!/usr/bin/env python3 -"""The Cluster Manager contains all information about the HPC environment.""" +"""The ClusterManager class for managing the HPC environment.""" import os -import sys import socket +from typing import List, Optional import smartsim import relexi.io.output as rlxout -class ClusterManager(): - """Base class for cluster managers. +class ClusterManager: + """Class containing information about and handling the HPC environment. - This class defines the interface for cluster managers. which contain all - information of the HPC environment used for the training environments. In - particular, it provides properties to access the hostnames and entails the - launched Orchestrator. + This class defines the interface for cluster managers, which contain all + information of the HPC environment used for the training environments and + method to manage it. This include in particular to identify the scheduler + environment, identify the hostnames of the available nodes as well as + launching and managing the SmartSim experiment including the Orchestrator. - Properties: - type (str): Type of the cluster manager. - hosts (list): List of hostnames. 
- dedicated_head (bool): Whether dedicated head and worker nodes are + Two possible modes are available for using the available compute resources: + + - **Distributed Mode**: The localhost running the main training script + becomes the dedicated Head node that hosts the database and runs + the model evaluation and training loop. All training environments + are distributed to the available Worker nodes. + + - **Local Mode**: The training script, the database and the training + environments are all placed on the localhost. + + For **Distributed Mode**, more than 1 node has to be available. Otherwise + **Local Mode** will be used. The mode of the `ClusterManager` can be + retrieved via the `is_distributed` attribute. + + Attributes: + type (str): Type of the cluster manager. Must be one of + **{'local', 'pbs', 'slurm'}**. + is_distributed (bool): Whether dedicated head and worker nodes are available or everything runs on single shared node. + hosts (list): List of hostnames. head (str): Hostname of Head node. workers (list): List of worker nodes. @@ -32,47 +48,58 @@ class ClusterManager(): generate_rankfiles: Generate rank files for OpenMPI process binding. Raises: + ValueError: If the scheduler type is not supported. + RuntimeError: If the following conditions are met: + - The scheduler environment cannot be identified, or + - Launching the Orchestrator fails. NotImplementedError: If the methods are not implemented for the provided scheduler type. - ValueError: If the scheduler type is not supported. """ TYPES = ['local', 'pbs', 'slurm'] # All implemented types of cluster managers def __init__( self, - scheduler_type='local', - db_port=6790, - db_network_interface='lo', + scheduler_type: Optional[str] = 'local', + db_port: Optional[int] = 6790, + db_network_interface: Optional[str] = 'lo' ): - - # Check if the scheduler type is supported self.type = scheduler_type.casefold().strip() - rlxout.info(f'Trying to identify {self.type} training environment...') + try: self._hosts = self._get_hostlist() except Exception as e: - if self.type == 'local': - raise RuntimeError('Failed to setup local training environment!') rlxout.warning(f'Failed: {e}') - rlxout.info('Trying to run in local mode instead...', newline=False) - try: - self.type = 'local' - self._hosts = self._get_hostlist() - except: - raise RuntimeError('Also failed to setup local environment!') + if self.type != 'local': + rlxout.info('Trying to run in local mode instead...', newline=False) + try: + self.type = 'local' + self._hosts = self._get_hostlist() + except Exception as e: + raise RuntimeError('Also failed to setup local environment!') from e + else: + raise RuntimeError('Failed to setup local training environment!') from e rlxout.info('Success!', newline=False) self.db = None - self.exp, self.db, self.entry_db = self._launch_orchestrator(port=db_port, - network_interface=db_network_interface) + self.exp, self.db, self.entry_db = self._launch_orchestrator( + port=db_port, + network_interface=db_network_interface, + ) def __del__(self): if self.db: - self.exp.stop(self.db) + try: + self.exp.stop(self.db) + except Exception as e: + raise RuntimeError('Failed to stop the Orchestrator!') from e - def _launch_orchestrator(self, port, network_interface): + def _launch_orchestrator( + self, + port: int, + network_interface: str + ) -> tuple[smartsim.Experiment, smartsim.Database, str]: """Launches the SmartSim Orchestrator for the current job. 
Args: @@ -80,8 +107,7 @@ def _launch_orchestrator(self, port, network_interface): network_interface (str): Network interface to use for the Orchestrator. Returns: - Experiment: The Experiment object. - Orchestrator: The Orchestrator object. + tuple: The Experiment object, the Orchestrator object, and the entry database hostname. """ rlxout.small_banner('Starting Orchestrator...') @@ -89,22 +115,19 @@ def _launch_orchestrator(self, port, network_interface): exp = smartsim.Experiment('flexi', launcher=self.type) # Initialize the orchestrator based on the orchestrator_type - if self.type == 'local': - db = exp.create_database(port=port, interface='lo') - elif self.type in {'pbs','slurm'}: - db = exp.create_database(hosts=self.head, port=port, interface=network_interface) - else: - raise NotImplementedError(f"Orchestrator type {self.type} not implemented.") + db = exp.create_database( + port=port, + interface='lo' if self.type == 'local' else network_interface, + hosts=self.head if self.type in {'pbs', 'slurm'} else None, + ) - # startup Orchestrator rlxout.info("Starting the Database...", newline=False) try: exp.start(db) except Exception as e: - raise RuntimeError(f"Failed to start the Orchestrator: {e}") + raise RuntimeError(f"Failed to start the Orchestrator: {e}") from e rlxout.info("Success!", newline=False) - # get the database nodes and select the first one entry_db = socket.gethostbyname(db.hosts[0]) rlxout.info("If the SmartRedis database isn't stopping properly you can use this command to stop it from the command line:") rlxout.info(f"$(smart dbcli) -h {db.hosts[0]} -p {port} shutdown", newline=False) @@ -112,21 +135,25 @@ def _launch_orchestrator(self, port, network_interface): return exp, db, entry_db def info(self): - """Print information about the current job.""" - rlxout.info("Found the following environment information:") - rlxout.info(f"Scheduler: {self.type}", newline=False) - rlxout.info(f"Hosts: {self.hosts}", newline=False) - rlxout.info(f"Head node: {self.head}", newline=False) - rlxout.info(f"Workers: {self.workers}", newline=False) - - def _get_hostlist(self): + """Print information about the current environment.""" + rlxout.info("Found the following environment:") + rlxout.info(f" Scheduler: {self.type}", newline=False) + rlxout.info(f" Hosts: {self.hosts}", newline=False) + if self.is_distributed: + rlxout.info(f"Relexi is running in distributed mode:") + rlxout.info(f" Head node: {self.head}", newline=False) + rlxout.info(f" Workers: {self.workers}", newline=False) + else: + rlxout.info(f"Relexi is running in local mode on: {self.head}") + + def _get_hostlist(self) -> List[str]: """Get the list of hosts the script is executed on. Uses the scheduler type to determine the hostlist via the environment variables set by the scheduler. Returns: - List containing the hostnames as strings. + list: List containing the hostnames as strings. Raises: NotImplementedError: If the method is not implemented for the @@ -142,85 +169,80 @@ def _get_hostlist(self): raise NotImplementedError(f"Method get_hostlist not implemented for scheduler type {self.type}") @property - def type(self): + def type(self) -> str: """Get the type of scheduler environment used for the cluster manager. Returns: - Type of the cluster manager as a string. + str: Type of the cluster manager. """ return self._type @type.setter - def type(self, value): + def type(self, value: str): """Set the type of scheduler environment used for the cluster manager. Ensure that the type is supported. 
Args: - value (str): Type of the cluster manager as a string. + value (str): Type of the cluster manager. """ - if not value in self.TYPES: + if value not in self.TYPES: raise ValueError(f"Scheduler type {value} not supported.") self._type = value @property - def hosts(self): + def hosts(self) -> List[str]: """Get the list of hosts the script is executed on. Returns: - List containing the hostnames as strings. + list: List containing the hostnames as strings. """ return self._hosts @property - def dedicated_head(self): - """Return whether dedicated head and worker nodes are available or - everything runs on single shared node. + def is_distributed(self) -> bool: + """Returns whether ClusterManager runs in distributed or local mode. Returns: - Bool indicating whether dedicated head is used or not. + bool: Indicates whether cluster runs in distributed or local mode. + - True: A dedicated head node is used for training and the + database and at least one additional worker node to run + simulations. + - False: Only single machine is available and training, + database and simulation will be performed on the localhost. """ return len(self._hosts) > 1 @property - def head(self): - """Get head node, which is first node, i.e. node Relexi is actually - started on. + def head(self) -> str: + """Get head node, which is where Relexi is actually runs on. Returns: - Hostname of Head node as string. + str: Hostname of the head node. """ - return self._hosts[0] + return self.hosts[0] @property - def workers(self): - """Get a list of worker nodes. + def workers(self) -> List[str]: + """Get a list of worker nodes depending on mode. Returns: - List containing the hostnames of worker nodes as strings. + list: List containing the hostnames of worker nodes as strings. """ - if self.dedicated_head: - return self._hosts[1:] - return self._hosts + if self.is_distributed: + return self.hosts[1:] + return self.hosts - def generate_rankfiles(self, n_models, n_ranks_per_model, base_path=None): + def generate_rankfiles(self, n_models: int, n_ranks_per_model: int, base_path: Optional[str] = None) -> List[str]: """Generate rank file for OpenMPI process binding. Args: n_models (int): Number of models to be launched. n_ranks_per_model (int): Number of ranks used for each model. - base_path (str): (Optional.) Path to the directory of the rank files. + base_path (str, optional): Path to the directory of the rank files. Returns: - List of filenames of the rankfiles. + list: List of filenames of the rankfiles. """ - if self.type == 'local': - return generate_rankfile_ompi(self.hosts, cores_per_node, n_par_env, ranks_per_env, base_path) - if self.type == 'pbs': - return generate_rankfile_ompi(self.hosts, cores_per_node, n_par_env, ranks_per_env, base_path) - if self.type == 'slurm': - return generate_rankfile_ompi(self.hosts, cores_per_node, n_par_env, ranks_per_env, base_path) - else: + if self.type not in self.TYPES: raise NotImplementedError(f"Method generate_rankfile not implemented for scheduler type {self.type}") - -if __name__ == '__main__': - db = ClusterManager() + return generate_rankfile_ompi(self.hosts, n_models, n_ranks_per_model, base_path) From a6a74a83b1e80dc02df5fb0a368d992460fe1a54 Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Mon, 24 Jun 2024 14:46:50 +0200 Subject: [PATCH 05/20] Fix bug in importing/naming the SmartSim Experiment and Orchestrator classes. 
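
For reference, a minimal sketch of the corrected import and usage
pattern (the import paths are the ones used in the diff below; the
launcher type and port are placeholder values for illustration only):

    from smartsim import Experiment
    from smartsim.database.orchestrator import Orchestrator

    # Create the Experiment and let it construct the Orchestrator (database).
    exp = Experiment('flexi', launcher='local')
    db: Orchestrator = exp.create_database(port=6790, interface='lo')
    exp.start(db)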
--- src/relexi/cluster_manager.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/relexi/cluster_manager.py b/src/relexi/cluster_manager.py index f3e7070..94a7982 100644 --- a/src/relexi/cluster_manager.py +++ b/src/relexi/cluster_manager.py @@ -6,7 +6,8 @@ import socket from typing import List, Optional -import smartsim +from smartsim import Experiment +from smartsim.database.orchestrator import Orchestrator import relexi.io.output as rlxout @@ -99,7 +100,7 @@ def _launch_orchestrator( self, port: int, network_interface: str - ) -> tuple[smartsim.Experiment, smartsim.Database, str]: + ) -> tuple[Experiment, Orchestrator, str]: """Launches the SmartSim Orchestrator for the current job. Args: @@ -112,7 +113,7 @@ def _launch_orchestrator( rlxout.small_banner('Starting Orchestrator...') # Generate flexi experiment - exp = smartsim.Experiment('flexi', launcher=self.type) + exp = Experiment('flexi', launcher=self.type) # Initialize the orchestrator based on the orchestrator_type db = exp.create_database( From 0b318714a20045b4c7172998894086b5266963b8 Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Mon, 24 Jun 2024 15:29:02 +0200 Subject: [PATCH 06/20] Improve documentation of Cluster Manager once again --- src/relexi/cluster_manager.py | 92 ++++++++++++++++++++--------------- 1 file changed, 53 insertions(+), 39 deletions(-) diff --git a/src/relexi/cluster_manager.py b/src/relexi/cluster_manager.py index 94a7982..2102ff7 100644 --- a/src/relexi/cluster_manager.py +++ b/src/relexi/cluster_manager.py @@ -17,36 +17,40 @@ class ClusterManager: This class defines the interface for cluster managers, which contain all information of the HPC environment used for the training environments and - method to manage it. This include in particular to identify the scheduler - environment, identify the hostnames of the available nodes as well as - launching and managing the SmartSim experiment including the Orchestrator. + methods to manage it. This includes in particular to identify the scheduler + environment, the hostnames of the available nodes and launching and + managing the SmartSim `Experiment` including the `Orchestrator`. Two possible modes are available for using the available compute resources: - - **Distributed Mode**: The localhost running the main training script - becomes the dedicated Head node that hosts the database and runs - the model evaluation and training loop. All training environments - are distributed to the available Worker nodes. + - **Distributed Mode**: The `localhost` running the main training script + becomes the dedicated **Head** node that hosts the database, + evaluates the model and runs the training loop. All training + environments are distributed to the available **Worker** nodes. - **Local Mode**: The training script, the database and the training - environments are all placed on the localhost. + environments are all placed on the `localhost`. For **Distributed Mode**, more than 1 node has to be available. Otherwise **Local Mode** will be used. The mode of the `ClusterManager` can be retrieved via the `is_distributed` attribute. Attributes: - type (str): Type of the cluster manager. Must be one of - **{'local', 'pbs', 'slurm'}**. - is_distributed (bool): Whether dedicated head and worker nodes are - available or everything runs on single shared node. - hosts (list): List of hostnames. - head (str): Hostname of Head node. - workers (list): List of worker nodes. - - Methods: - print_info: Print information about the current environment. 
- generate_rankfiles: Generate rank files for OpenMPI process binding. + type (str): Type of the cluster manager. Must be `'local'`, `'pbs'`, + or `'slurm'`. + is_distributed (bool): Indicates whether cluster runs in + **Distributed Mode** or **Local Mode**. + hosts (list): List of hostnames of available nodes. + head (str): Hostname of Head node (is name of `localhost` if in + **Local Mode**). + workers (list): List of worker nodes (contains only `localhost` if in + **Local Mode**). + db (Orchestrator): The launched `Orchestrator` database from the + `smartsim` package. + exp (Experiment): The `Experiment` object the `Orchestrator` is + launched with. + entry_db (str): IP address of the host of the database. Required to + connect a client to the database. Raises: ValueError: If the scheduler type is not supported. @@ -62,9 +66,19 @@ class ClusterManager: def __init__( self, scheduler_type: Optional[str] = 'local', - db_port: Optional[int] = 6790, - db_network_interface: Optional[str] = 'lo' + db_network_interface: Optional[str] = 'lo', + db_port: Optional[int] = 6790 ): + """Initialize the ClusterManager. + + Args: + scheduler_type (str, optional): Type of the cluster manager. + Must be `'local'`, `'pbs'`, or `'slurm'`. Defaults to `'local'`. + db_network_interface (str, optional): Network interface to use for + the Orchestrator. Defaults to `'lo'`. + db_port (int, optional): Port to start the Orchestrator on. + Defaults to `6790`. + """ self.type = scheduler_type.casefold().strip() rlxout.info(f'Trying to identify {self.type} training environment...') @@ -135,18 +149,6 @@ def _launch_orchestrator( return exp, db, entry_db - def info(self): - """Print information about the current environment.""" - rlxout.info("Found the following environment:") - rlxout.info(f" Scheduler: {self.type}", newline=False) - rlxout.info(f" Hosts: {self.hosts}", newline=False) - if self.is_distributed: - rlxout.info(f"Relexi is running in distributed mode:") - rlxout.info(f" Head node: {self.head}", newline=False) - rlxout.info(f" Workers: {self.workers}", newline=False) - else: - rlxout.info(f"Relexi is running in local mode on: {self.head}") - def _get_hostlist(self) -> List[str]: """Get the list of hosts the script is executed on. @@ -205,8 +207,8 @@ def is_distributed(self) -> bool: Returns: bool: Indicates whether cluster runs in distributed or local mode. - - True: A dedicated head node is used for training and the - database and at least one additional worker node to run + - True: A dedicated Head node is used for training and the + database and at least one additional Worker node to run simulations. - False: Only single machine is available and training, database and simulation will be performed on the localhost. @@ -215,24 +217,36 @@ def is_distributed(self) -> bool: @property def head(self) -> str: - """Get head node, which is where Relexi is actually runs on. + """Get Head node, which is where Relexi is actually runs on. Returns: - str: Hostname of the head node. + str: Hostname of the Head node. """ return self.hosts[0] @property def workers(self) -> List[str]: - """Get a list of worker nodes depending on mode. + """Get a list of Worker nodes depending on mode. Returns: - list: List containing the hostnames of worker nodes as strings. + list: List containing the hostnames of Worker nodes as strings. 
""" if self.is_distributed: return self.hosts[1:] return self.hosts + def info(self): + """Print information about the current environment.""" + rlxout.info("Found the following environment:") + rlxout.info(f" Scheduler: {self.type}", newline=False) + rlxout.info(f" Hosts: {self.hosts}", newline=False) + if self.is_distributed: + rlxout.info(f"Relexi is running in distributed mode:") + rlxout.info(f" Head: {self.head}", newline=False) + rlxout.info(f" Workers: {self.workers}", newline=False) + else: + rlxout.info(f"Relexi is running in local mode on: {self.head}") + def generate_rankfiles(self, n_models: int, n_ranks_per_model: int, base_path: Optional[str] = None) -> List[str]: """Generate rank file for OpenMPI process binding. From e0575c756da875101651cc89b8cb66c5469c50df Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Mon, 24 Jun 2024 16:22:55 +0200 Subject: [PATCH 07/20] Some linter fixes and remove "generate_ompi_rankfiles", since the method is still in the old "relexi.smartsim.helpers" module --- src/relexi/cluster_manager.py | 45 ++++++++++------------------------- 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/src/relexi/cluster_manager.py b/src/relexi/cluster_manager.py index 2102ff7..f63cee4 100644 --- a/src/relexi/cluster_manager.py +++ b/src/relexi/cluster_manager.py @@ -10,6 +10,7 @@ from smartsim.database.orchestrator import Orchestrator import relexi.io.output as rlxout +from relexi.smartsim.helpers import generate_rankfile_ompi class ClusterManager: @@ -91,8 +92,8 @@ def __init__( try: self.type = 'local' self._hosts = self._get_hostlist() - except Exception as e: - raise RuntimeError('Also failed to setup local environment!') from e + except Exception as f: + raise RuntimeError('Also failed to setup local environment!') from f else: raise RuntimeError('Failed to setup local training environment!') from e rlxout.info('Success!', newline=False) @@ -144,7 +145,7 @@ def _launch_orchestrator( rlxout.info("Success!", newline=False) entry_db = socket.gethostbyname(db.hosts[0]) - rlxout.info("If the SmartRedis database isn't stopping properly you can use this command to stop it from the command line:") + rlxout.info("Use this command to shutdown database if not terminated correctly:") rlxout.info(f"$(smart dbcli) -h {db.hosts[0]} -p {port} shutdown", newline=False) return exp, db, entry_db @@ -164,12 +165,12 @@ def _get_hostlist(self) -> List[str]: """ if self.type == 'local': return [socket.gethostname()] - elif self.type == 'pbs': + if self.type == 'pbs': return os.environ['PBS_NODEFILE'] - elif self.type == 'slurm': + if self.type == 'slurm': return os.environ['SLURM_NODELIST'] - else: - raise NotImplementedError(f"Method get_hostlist not implemented for scheduler type {self.type}") + raise NotImplementedError( + f"Method get_hostlist not implemented for scheduler type {self.type}") @property def type(self) -> str: @@ -203,21 +204,16 @@ def hosts(self) -> List[str]: @property def is_distributed(self) -> bool: - """Returns whether ClusterManager runs in distributed or local mode. + """Whether `ClusterManager` runs in **Distributed** or **Local** mode. Returns: - bool: Indicates whether cluster runs in distributed or local mode. - - True: A dedicated Head node is used for training and the - database and at least one additional Worker node to run - simulations. - - False: Only single machine is available and training, - database and simulation will be performed on the localhost. + bool: `True` if in **Distributed Mode**, `False` otherwise. 
""" return len(self._hosts) > 1 @property def head(self) -> str: - """Get Head node, which is where Relexi is actually runs on. + """Return name of Head node, which is where Relexi actually runs on. Returns: str: Hostname of the Head node. @@ -226,7 +222,7 @@ def head(self) -> str: @property def workers(self) -> List[str]: - """Get a list of Worker nodes depending on mode. + """Returns list of Workers used for running training environments. Returns: list: List containing the hostnames of Worker nodes as strings. @@ -241,23 +237,8 @@ def info(self): rlxout.info(f" Scheduler: {self.type}", newline=False) rlxout.info(f" Hosts: {self.hosts}", newline=False) if self.is_distributed: - rlxout.info(f"Relexi is running in distributed mode:") + rlxout.info("Relexi is running in distributed mode:") rlxout.info(f" Head: {self.head}", newline=False) rlxout.info(f" Workers: {self.workers}", newline=False) else: rlxout.info(f"Relexi is running in local mode on: {self.head}") - - def generate_rankfiles(self, n_models: int, n_ranks_per_model: int, base_path: Optional[str] = None) -> List[str]: - """Generate rank file for OpenMPI process binding. - - Args: - n_models (int): Number of models to be launched. - n_ranks_per_model (int): Number of ranks used for each model. - base_path (str, optional): Path to the directory of the rank files. - - Returns: - list: List of filenames of the rankfiles. - """ - if self.type not in self.TYPES: - raise NotImplementedError(f"Method generate_rankfile not implemented for scheduler type {self.type}") - return generate_rankfile_ompi(self.hosts, n_models, n_ranks_per_model, base_path) From b084a32f134e117d9e982176310c07e4f35a35b1 Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Tue, 25 Jun 2024 09:17:15 +0200 Subject: [PATCH 08/20] Proper way to get hosts and available MPI slots on worker nodes within a local, pbs and slurm environment. Also resort class methods and attributes to match the common sorting convention. --- src/relexi/cluster_manager.py | 268 +++++++++++++++++++++++++--------- 1 file changed, 201 insertions(+), 67 deletions(-) diff --git a/src/relexi/cluster_manager.py b/src/relexi/cluster_manager.py index f63cee4..09b0bc6 100644 --- a/src/relexi/cluster_manager.py +++ b/src/relexi/cluster_manager.py @@ -48,16 +48,16 @@ class ClusterManager: **Local Mode**). db (Orchestrator): The launched `Orchestrator` database from the `smartsim` package. + db_entry (str): IP address of the host of the database. Required to + connect a client to the database. exp (Experiment): The `Experiment` object the `Orchestrator` is launched with. - entry_db (str): IP address of the host of the database. Required to - connect a client to the database. Raises: ValueError: If the scheduler type is not supported. RuntimeError: If the following conditions are met: - The scheduler environment cannot be identified, or - - Launching the Orchestrator fails. + - Launching the `Orchestrator` fails. NotImplementedError: If the methods are not implemented for the provided scheduler type. 
""" @@ -99,7 +99,7 @@ def __init__( rlxout.info('Success!', newline=False) self.db = None - self.exp, self.db, self.entry_db = self._launch_orchestrator( + self._exp, self._db, self._db_entry = self._launch_orchestrator( port=db_port, network_interface=db_network_interface, ) @@ -111,6 +111,161 @@ def __del__(self): except Exception as e: raise RuntimeError('Failed to stop the Orchestrator!') from e + def info(self): + """Prints information about the current environment.""" + rlxout.info("Found the following environment:") + rlxout.info(f" Scheduler: {self.type}", newline=False) + rlxout.info(f" Hosts: {self.hosts}", newline=False) + if self.is_distributed: + rlxout.info("Relexi is running in distributed mode:") + rlxout.info(f" Head: {self.head}", newline=False) + rlxout.info(f" Workers: {self.workers}", newline=False) + else: + rlxout.info(f"Relexi is running in local mode on: {self.head}") + + def get_worker_slots(self) -> List[str]: + """Gets the list of available MPI slots on the Worker nodes. + + To obtain a list of all available MPI slots on the Worker nodes, the + following strategy is used depending on the type of environment: + - `local`: All CPU cores from localhost are used except one to run + training script and the database. + - `pbs`: The number of slots is determined by accessing the + `PBS_NODEFILE` environment variable and removing the Head node. + - `slurm`: The number of slots is determined by accessing the + `SLURM_JOB_CPUS_PER_NODE` environment variable and counting the + number of Worker nodes. + + Returns: + list: List containing hostname and host-local slot number of each + free slot on the Worker nodes. + """ + if self.type == 'local': + n_cpus = os.cpu_count()-1 # Save 1 CPU core for the Head tasks + return [[self.head, str(i)] for i in range(n_cpus)] + + if self.type == 'pbs': + # Get PBS_NODEFILE count number of slots per node and return list + # of slots per node. + nodes = self._read_pbs_nodefile() + worker_slots = [] + for worker in self.workers: + n_slots = sum(1 for nodename in nodes if worker in nodename) + #worker_slots.append({worker: str(i)}) # Dict of slots per node + for i in range(n_slots): + worker_slots.append([worker, str(i)]) + return worker_slots + + if self.type == 'slurm': + cpus_per_node = os.environ['SLURM_JOB_CPUS_PER_NODE'] + if cpus_per_node is None: + raise KeyError("Environment variable 'SLURM_JOB_CPUS_PER_NODE' is not set!") + for worker in self.workers: + for i in range(cpus_per_node): + worker_slots.append([worker, str(i)]) + return worker_slots + + raise NotImplementedError( + f"Method 'get_worker_slots' not implemented for scheduler type {self.type}") + + @property + def type(self) -> str: + """Get the type of scheduler environment used for the cluster manager. + + Returns: + str: Type of the cluster manager. + """ + return self._type + + @type.setter + def type(self, value: str): + """Set the type of scheduler environment used for the cluster manager. + Ensure that the type is supported. + + Args: + value (str): Type of the cluster manager. + """ + if value not in self.TYPES: + raise ValueError(f"Scheduler type {value} not supported.") + self._type = value + + @property + def hosts(self) -> List[str]: + """Get the list of hosts the script is executed on. + + Returns: + list: List containing the hostnames as strings. + """ + return self._hosts + + @property + def is_distributed(self) -> bool: + """Whether `ClusterManager` runs in **Distributed** or **Local** mode. + + Checks for the number of hosts available. 
If more than one host is + used, the `ClusterManager` runs in **Distributed Mode**. + + Returns: + bool: `True` if in **Distributed Mode**, `False` otherwise. + """ + return len(self._hosts) > 1 + + @property + def head(self) -> str: + """Return name of Head node, which is where Relexi actually runs on. + + Returns: + str: Hostname of the Head node. + """ + return self._get_local_hostname() + + @property + def workers(self) -> List[str]: + """Returns list of Workers used for running training environments. + + Obtains Workers by removing the Head node from the list of hosts. + + Returns: + list: List containing the hostnames of Workers as strings. + """ + if self.is_distributed: + local_host = self._get_local_hostname() + workers = self.hosts.copy() + if local_host in workers: + workers.remove(local_host) + else: + rlxout.warning(f"Localhost '{local_host}' not found in hosts list:") + rlxout.warning(f" {workers}") + return workers + return self.hosts + + @property + def db(self) -> Orchestrator: + """Get the Orchestrator database object. + + Returns: + Orchestrator: The Orchestrator database object. + """ + return self._db + + @property + def db_entry(self) -> str: + """Get the IP address of the host of the database. + + Returns: + str: IP address of the host of the database. + """ + return self._db_entry + + @property + def exp(self) -> Experiment: + """Get the Experiment object the Orchestrator is launched with. + + Returns: + Experiment: The Experiment object. + """ + return self._exp + def _launch_orchestrator( self, port: int, @@ -164,81 +319,60 @@ def _get_hostlist(self) -> List[str]: scheduler type. """ if self.type == 'local': - return [socket.gethostname()] + return [self._get_local_hostname()] if self.type == 'pbs': - return os.environ['PBS_NODEFILE'] + nodes = self._read_pbs_nodefile() + # Get the list of unique nodes via casting into set and list again + return list(set(nodes)) if self.type == 'slurm': - return os.environ['SLURM_NODELIST'] + return self._get_slurm_nodelist() raise NotImplementedError( - f"Method get_hostlist not implemented for scheduler type {self.type}") + f"Method `get_hostlist` not implemented for scheduler type {self.type}!") - @property - def type(self) -> str: - """Get the type of scheduler environment used for the cluster manager. + def _read_pbs_nodefile() -> List[str]: + """Read the PBS_NODEFILE and return the list of nodes. - Returns: - str: Type of the cluster manager. - """ - return self._type - - @type.setter - def type(self, value: str): - """Set the type of scheduler environment used for the cluster manager. - Ensure that the type is supported. - - Args: - value (str): Type of the cluster manager. - """ - if value not in self.TYPES: - raise ValueError(f"Scheduler type {value} not supported.") - self._type = value - - @property - def hosts(self) -> List[str]: - """Get the list of hosts the script is executed on. + NOTE: + The PBS_NODEFILE contains the list of nodes allocated to the job. + If a node provides multiple MPI slots, it is the corresponding + number of times in the file. Returns: list: List containing the hostnames as strings. """ - return self._hosts - - @property - def is_distributed(self) -> bool: - """Whether `ClusterManager` runs in **Distributed** or **Local** mode. - - Returns: - bool: `True` if in **Distributed Mode**, `False` otherwise. - """ - return len(self._hosts) > 1 - - @property - def head(self) -> str: - """Return name of Head node, which is where Relexi actually runs on. 
+ if self.type != 'pbs': + raise ValueError("Method 'read_pbs_nodefile' only available for PBS scheduler!") + node_file = os.environ['PBS_NODEFILE'] + if node_file is None: + raise KeyError("Environment variable 'PBS_NODEFILE' is not set!") + with open(node_file, 'r', encoding='utf-8') as f: + nodes = [line.strip() for line in f.readlines()] + return nodes + + def _get_slurm_nodelist() -> List[str]: + """Get the list of hosts from the SLURM_NODELIST environment variable. Returns: - str: Hostname of the Head node. + list: List containing the unique hostnames as strings. """ - return self.hosts[0] - - @property - def workers(self) -> List[str]: - """Returns list of Workers used for running training environments. + if self.type != 'slurm': + raise ValueError("Method 'get_slurm_nodelist' only available for SLURM scheduler!") + # Get the compressed list of nodes from SLURM_NODELIST + node_list = os.getenv('SLURM_NODELIST') + if node_list is None: + raise KeyError("Environment variable 'SLURM_NODELIST' is not set!") + # Use scontrol to expand the node list + result = subprocess.run(['scontrol', 'show', 'hostname', node_list], capture_output=True, text=True) + # Check if the command was successful + if result.returncode != 0: + raise RuntimeError(f"scontrol command failed: {result.stderr.strip()}") + # Split the output into individual hostnames + return result.stdout.strip().split('\n') + + def _get_local_hostname(self) -> str: + """Get the hostname of the machine executing the Python script. Returns: - list: List containing the hostnames of Worker nodes as strings. + str: Hostname of the local machine executing the script. """ - if self.is_distributed: - return self.hosts[1:] - return self.hosts - - def info(self): - """Print information about the current environment.""" - rlxout.info("Found the following environment:") - rlxout.info(f" Scheduler: {self.type}", newline=False) - rlxout.info(f" Hosts: {self.hosts}", newline=False) - if self.is_distributed: - rlxout.info("Relexi is running in distributed mode:") - rlxout.info(f" Head: {self.head}", newline=False) - rlxout.info(f" Workers: {self.workers}", newline=False) - else: - rlxout.info(f"Relexi is running in local mode on: {self.head}") + return socket.gethostname() From 6a5bebceee3d8e19cf1990cd6f93790705eeef59 Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Tue, 25 Jun 2024 09:17:44 +0200 Subject: [PATCH 09/20] Change encoding of files to UTF-8 which is backwards compatible to ASCII anyways. 
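
A short illustration of the backwards compatibility (hypothetical
snippet, not part of the change itself): any purely ASCII-encoded file,
such as the existing parameter and spectra files, decodes identically
when read as UTF-8.

    data = b'N = 6\n'  # pure-ASCII bytes, e.g. a line from an existing .ini file
    assert data.decode('utf-8') == data.decode('ascii')  # UTF-8 is a superset of ASCII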
--- src/relexi/env/flexiEnvSmartSim.py | 2 +- src/relexi/io/readin.py | 4 ++-- src/relexi/smartsim/helpers.py | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/relexi/env/flexiEnvSmartSim.py b/src/relexi/env/flexiEnvSmartSim.py index 7b0e1df..9562836 100644 --- a/src/relexi/env/flexiEnvSmartSim.py +++ b/src/relexi/env/flexiEnvSmartSim.py @@ -95,7 +95,7 @@ def __init__(self, # Read target DNS spectra from file if spectra_file: - with open(spectra_file, 'r', encoding='ascii') as csvfile: + with open(spectra_file, 'r', encoding='utf-8') as csvfile: reader = csv.reader(csvfile, delimiter=',') col_e = next(reader).index('E') e = [] diff --git a/src/relexi/io/readin.py b/src/relexi/io/readin.py index 6446327..efaf90c 100644 --- a/src/relexi/io/readin.py +++ b/src/relexi/io/readin.py @@ -20,7 +20,7 @@ def read_config(file_in, flatten=True): Returns: dict: Dictionary containing the contents of the file """ - with open(file_in, 'r', encoding='ascii') as stream: + with open(file_in, 'r', encoding='utf-8') as stream: config = yaml.safe_load(stream) if flatten: @@ -83,7 +83,7 @@ def read_file(filename, newline=None): Returns: str: Returns single string with the content of the file. """ - with open(filename, 'r', encoding='ascii') as myfile: + with open(filename, 'r', encoding='utf-8') as myfile: data = myfile.read() if newline: return data.replace('\n', newline) diff --git a/src/relexi/smartsim/helpers.py b/src/relexi/smartsim/helpers.py index 840b2ae..1fbbf06 100644 --- a/src/relexi/smartsim/helpers.py +++ b/src/relexi/smartsim/helpers.py @@ -44,7 +44,7 @@ def generate_rankfile_ompi(hosts, cores_per_node, n_par_env, ranks_per_env, base for env_idx in range(n_par_env): filename = os.path.join(rankfile_dir, f"par_env_{env_idx:05d}") rankfiles.append(filename) - with open(filename, 'w', encoding='ascii') as rankfile: + with open(filename, 'w', encoding='utf-8') as rankfile: for i in range(ranks_per_env): rankfile.write(f"rank {i}={hosts[n_cores_used//cores_per_node]} slot={next_free_slot}\n") next_free_slot = next_free_slot + 1 @@ -76,8 +76,8 @@ def parser_flexi_parameters(parameter_file, keyword, value): pbs_job_id = os.environ['PBS_JOBID'] parameter_file_out = f"parameter_flexi-{pbs_job_id[0:7]}.ini" - with open(parameter_file_out, 'w', encoding='ascii') as new_file: - with open(parameter_file_in, 'r', encoding='ascii') as old_file: + with open(parameter_file_out, 'w', encoding='utf-8') as new_file: + with open(parameter_file_in, 'r', encoding='utf-8') as old_file: for line in old_file: new_file.write(pattern.sub(subst, line)) return parameter_file_out From 5de020fd1614922af647763f857e8dcb03fc8161 Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Wed, 26 Jun 2024 08:45:59 +0200 Subject: [PATCH 10/20] Add "auto" type that detects environment automatically using SmartSim. Also fixes few bugs. 
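The automatic detection added below is a thin wrapper around SmartSim's workload-manager utilities. As a rough sketch of the behaviour this patch introduces (the printed message is illustrative only):

    import smartsim

    # Detect the scheduler of the surrounding allocation, e.g. 'local', 'pbs' or 'slurm'.
    scheduler = smartsim.wlm.detect_launcher()
    print(f'Detected launcher: {scheduler}')
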
--- src/relexi/cluster_manager.py | 25 ++++++++++++++++++------- src/relexi/rl/ppo/train.py | 7 ++++--- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/relexi/cluster_manager.py b/src/relexi/cluster_manager.py index 09b0bc6..1c2e291 100644 --- a/src/relexi/cluster_manager.py +++ b/src/relexi/cluster_manager.py @@ -4,8 +4,10 @@ import os import socket +import subprocess from typing import List, Optional +import smartsim from smartsim import Experiment from smartsim.database.orchestrator import Orchestrator @@ -66,7 +68,7 @@ class ClusterManager: def __init__( self, - scheduler_type: Optional[str] = 'local', + scheduler_type: Optional[str] = 'auto', db_network_interface: Optional[str] = 'lo', db_port: Optional[int] = 6790 ): @@ -74,15 +76,24 @@ def __init__( Args: scheduler_type (str, optional): Type of the cluster manager. - Must be `'local'`, `'pbs'`, or `'slurm'`. Defaults to `'local'`. + Must be `'local'`, `'pbs'`, `'slurm'` or `'auto'`. Defaults to + `'auto'`, for which the type of cluster environment iss + identified automatically. db_network_interface (str, optional): Network interface to use for the Orchestrator. Defaults to `'lo'`. db_port (int, optional): Port to start the Orchestrator on. Defaults to `6790`. """ - self.type = scheduler_type.casefold().strip() - rlxout.info(f'Trying to identify {self.type} training environment...') + # Using SmartSim utility to identify type automatically + if scheduler_type == 'auto': + rlxout.info('Trying to identify cluster environment...') + scheduler = smartsim.wlm.detect_launcher() + rlxout.info(f'Found "{scheduler}" environment!', newline=False) + self.type = scheduler.casefold().strip() + else: + self.type = scheduler_type.casefold().strip() + rlxout.info(f'Trying to setup "{self.type}" environment...') try: self._hosts = self._get_hostlist() except Exception as e: @@ -98,7 +109,7 @@ def __init__( raise RuntimeError('Failed to setup local training environment!') from e rlxout.info('Success!', newline=False) - self.db = None + self._db = None self._exp, self._db, self._db_entry = self._launch_orchestrator( port=db_port, network_interface=db_network_interface, @@ -329,7 +340,7 @@ def _get_hostlist(self) -> List[str]: raise NotImplementedError( f"Method `get_hostlist` not implemented for scheduler type {self.type}!") - def _read_pbs_nodefile() -> List[str]: + def _read_pbs_nodefile(self) -> List[str]: """Read the PBS_NODEFILE and return the list of nodes. NOTE: @@ -349,7 +360,7 @@ def _read_pbs_nodefile() -> List[str]: nodes = [line.strip() for line in f.readlines()] return nodes - def _get_slurm_nodelist() -> List[str]: + def _get_slurm_nodelist(self) -> List[str]: """Get the list of hosts from the SLURM_NODELIST environment variable. 
Returns: diff --git a/src/relexi/rl/ppo/train.py b/src/relexi/rl/ppo/train.py index 2019f25..3b0fc05 100644 --- a/src/relexi/rl/ppo/train.py +++ b/src/relexi/rl/ppo/train.py @@ -110,6 +110,7 @@ def train( config_file resources = relexi.cluster_manager.ClusterManager( scheduler_type=smartsim_orchestrator, db_port=smartsim_port, + db_network_interface='ibp94s0', ) resources.info() @@ -170,7 +171,7 @@ def train( config_file ,parameter_file ,tag = 'train' ,port = smartsim_port - ,entry_db = resources.entry_db + ,entry_db = resources.db_entry ,hosts = resources.workers ,n_envs = num_parallel_environments ,n_procs = num_procs_per_environment @@ -197,7 +198,7 @@ def train( config_file ,parameter_file ,tag = 'eval' ,port = smartsim_port - ,entry_db = resources.entry_db + ,entry_db = resources.db_entry ,hosts = resources.workers ,n_procs = num_procs_per_environment ,n_procs_per_node = n_procs_per_node @@ -389,5 +390,5 @@ def train( config_file del my_env del my_eval_env - exp.stop(db) + del resources time.sleep(2.) # Wait for orchestrator to be properly closed From 53f6eedc07afa35df62eb91023652c300f080462 Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Wed, 26 Jun 2024 13:35:57 +0200 Subject: [PATCH 11/20] Rename cluster manager to "Runtime", since it better conveys the meaning. Also rename the "smartsim" sub-package to "runtime". --- src/relexi/rl/ppo/train.py | 28 +-- src/relexi/rl/tf_helpers.py | 4 +- src/relexi/{smartsim => runtime}/__init__.py | 0 src/relexi/{smartsim => runtime}/helpers.py | 31 ++- .../runtime.py} | 209 +++++++----------- 5 files changed, 115 insertions(+), 157 deletions(-) rename src/relexi/{smartsim => runtime}/__init__.py (100%) rename src/relexi/{smartsim => runtime}/helpers.py (87%) rename src/relexi/{cluster_manager.py => runtime/runtime.py} (52%) diff --git a/src/relexi/rl/ppo/train.py b/src/relexi/rl/ppo/train.py index 3b0fc05..5e68108 100644 --- a/src/relexi/rl/ppo/train.py +++ b/src/relexi/rl/ppo/train.py @@ -21,10 +21,10 @@ import relexi.rl.models import relexi.rl.tf_helpers import relexi.env.flexiEnvSmartSim -import relexi.cluster_manager +import relexi.runtime import relexi.io.readin as rlxin import relexi.io.output as rlxout -from relexi.smartsim.helpers import generate_rankfile_ompi, copy_to_nodes, parser_flexi_parameters +from relexi.runtime.helpers import generate_rankfile_ompi, copy_to_nodes, parser_flexi_parameters def train( config_file @@ -107,17 +107,17 @@ def train( config_file tf.config.optimizer.set_jit(True) # Initialize SmartSim - resources = relexi.cluster_manager.ClusterManager( - scheduler_type=smartsim_orchestrator, + runtime = relexi.runtime.Runtime( + type_=smartsim_orchestrator, db_port=smartsim_port, db_network_interface='ibp94s0', ) - resources.info() + runtime.info() # generating rankfiles for OpenMPI if mpi_launch_mpmd: # If all MPI jobs are run with single mpirun command, all jobs are allocated based on single rankfile - rank_files = generate_rankfile_ompi(resources.workers + rank_files = generate_rankfile_ompi(runtime.workers ,n_procs_per_node ,n_par_env=1 ,ranks_per_env=num_parallel_environments*num_procs_per_environment @@ -125,7 +125,7 @@ def train( config_file else: # Otherwise every MPI job gets its own rankfile - rank_files = generate_rankfile_ompi(resources.workers + rank_files = generate_rankfile_ompi(runtime.workers ,n_procs_per_node ,num_parallel_environments ,num_procs_per_environment @@ -166,13 +166,13 @@ def train( config_file # Instantiate parallel collection environment my_env = tf_py_environment.TFPyEnvironment( - 
relexi.env.flexiEnvSmartSim.flexiEnv(resources.exp + relexi.env.flexiEnvSmartSim.flexiEnv(runtime.exp ,executable_path ,parameter_file ,tag = 'train' ,port = smartsim_port - ,entry_db = resources.db_entry - ,hosts = resources.workers + ,entry_db = runtime.db_entry + ,hosts = runtime.workers ,n_envs = num_parallel_environments ,n_procs = num_procs_per_environment ,n_procs_per_node = n_procs_per_node @@ -193,13 +193,13 @@ def train( config_file eval_files = train_files my_eval_env = tf_py_environment.TFPyEnvironment( - relexi.env.flexiEnvSmartSim.flexiEnv(resources.exp + relexi.env.flexiEnvSmartSim.flexiEnv(runtime.exp ,executable_path ,parameter_file ,tag = 'eval' ,port = smartsim_port - ,entry_db = resources.db_entry - ,hosts = resources.workers + ,entry_db = runtime.db_entry + ,hosts = runtime.workers ,n_procs = num_procs_per_environment ,n_procs_per_node = n_procs_per_node ,spectra_file = reward_spectrum_file @@ -390,5 +390,5 @@ def train( config_file del my_env del my_eval_env - del resources + del runtime time.sleep(2.) # Wait for orchestrator to be properly closed diff --git a/src/relexi/rl/tf_helpers.py b/src/relexi/rl/tf_helpers.py index 89ee787..18e811b 100644 --- a/src/relexi/rl/tf_helpers.py +++ b/src/relexi/rl/tf_helpers.py @@ -4,7 +4,7 @@ import tensorflow as tf import relexi.io.output as rlxout -import relexi.smartsim.helpers +import relexi.runtime.helpers def write_metrics(metrics, step, category_name): @@ -101,7 +101,7 @@ def collect_trajectories(driver, env): # Stop FLEXI instances env.stop() # Cleanup OMP files - relexi.smartsim.helpers.clean_ompi_tmpfiles() + relexi.runtime.helpers.clean_ompi_tmpfiles() @tf.function diff --git a/src/relexi/smartsim/__init__.py b/src/relexi/runtime/__init__.py similarity index 100% rename from src/relexi/smartsim/__init__.py rename to src/relexi/runtime/__init__.py diff --git a/src/relexi/smartsim/helpers.py b/src/relexi/runtime/helpers.py similarity index 87% rename from src/relexi/smartsim/helpers.py rename to src/relexi/runtime/helpers.py index 1fbbf06..dc8daea 100644 --- a/src/relexi/smartsim/helpers.py +++ b/src/relexi/runtime/helpers.py @@ -17,7 +17,7 @@ def generate_rankfile_ompi(hosts, cores_per_node, n_par_env, ranks_per_env, base_path=None): """Generate rank file for OpenMPI process binding. 
- Args: + Args: hosts (list): List of hostnames cores_per_node (int): Number of cores per node n_par_env (int): Number of parallel environments to be launched @@ -30,9 +30,9 @@ def generate_rankfile_ompi(hosts, cores_per_node, n_par_env, ranks_per_env, base # If no base_path given, use CWD if base_path: - rankfile_dir = os.path.join(base_path, "ompi-rankfiles") + rankfile_dir = os.path.join(base_path, 'ompi-rankfiles') else: - rankfile_dir = "ompi-rankfiles" + rankfile_dir = 'ompi-rankfiles' if os.path.exists(rankfile_dir): shutil.rmtree(rankfile_dir) @@ -42,11 +42,11 @@ def generate_rankfile_ompi(hosts, cores_per_node, n_par_env, ranks_per_env, base next_free_slot = 0 n_cores_used = 0 for env_idx in range(n_par_env): - filename = os.path.join(rankfile_dir, f"par_env_{env_idx:05d}") + filename = os.path.join(rankfile_dir, f'par_env_{env_idx:05d}') rankfiles.append(filename) with open(filename, 'w', encoding='utf-8') as rankfile: for i in range(ranks_per_env): - rankfile.write(f"rank {i}={hosts[n_cores_used//cores_per_node]} slot={next_free_slot}\n") + rankfile.write(f'rank {i}={hosts[n_cores_used//cores_per_node]} slot={next_free_slot}\n') next_free_slot = next_free_slot + 1 n_cores_used = n_cores_used + 1 if next_free_slot > (cores_per_node - 1): @@ -70,11 +70,11 @@ def parser_flexi_parameters(parameter_file, keyword, value): Returns: str: Path to new (modified) parameter file """ - pattern = re.compile(fr"({keyword})\s*=.*", re.IGNORECASE) - subst = keyword + "=" + value + pattern = re.compile(fr'({keyword})\s*=.*', re.IGNORECASE) + subst = keyword + '=' + value parameter_file_in = parameter_file pbs_job_id = os.environ['PBS_JOBID'] - parameter_file_out = f"parameter_flexi-{pbs_job_id[0:7]}.ini" + parameter_file_out = f'parameter_flexi-{pbs_job_id[0:7]}.ini' with open(parameter_file_out, 'w', encoding='utf-8') as new_file: with open(parameter_file_in, 'r', encoding='utf-8') as old_file: @@ -83,7 +83,7 @@ def parser_flexi_parameters(parameter_file, keyword, value): return parameter_file_out -def clean_ompi_tmpfiles(env_variable="TMPDIR"): +def clean_ompi_tmpfiles(env_variable='TMPDIR'): """Cleans up temporary files created by OpenMPI. OpenMPI creates temporary files with each invocation, which might cause the @@ -96,13 +96,12 @@ def clean_ompi_tmpfiles(env_variable="TMPDIR"): folder for termporary files is stored. Returns: - int: - * 1 if operation was successfull, - * -1 otherwise. + int: Returns + - `1` if operation was successfull, + - `-1` otherwise. 
""" - try: - tmpdir = os.environ[env_variable] - except Exception: + tmpdir = os.geten(env_variable) + if tmpdir is None: return -1 path = os.path.join(tmpdir, 'ompi.*') @@ -160,7 +159,7 @@ def copy_to_nodes(my_files, base_path, hosts, subfolder=None): os.system(f'ssh {host} mkdir -p {target}') # Copy files for my_file in my_files: - os.system(f'scp -q "{my_file}" "{host}:{target}"') + os.system(f'scp -q {my_file} {host}:{target}') # Get new path of files my_files_new = [] diff --git a/src/relexi/cluster_manager.py b/src/relexi/runtime/runtime.py similarity index 52% rename from src/relexi/cluster_manager.py rename to src/relexi/runtime/runtime.py index 1c2e291..4185c03 100644 --- a/src/relexi/cluster_manager.py +++ b/src/relexi/runtime/runtime.py @@ -1,24 +1,24 @@ #!/usr/bin/env python3 -"""The ClusterManager class for managing the HPC environment.""" +"""The Runtime class for managing the HPC runtime environment.""" import os import socket import subprocess -from typing import List, Optional +from typing import List, Optional, Union import smartsim from smartsim import Experiment from smartsim.database.orchestrator import Orchestrator import relexi.io.output as rlxout -from relexi.smartsim.helpers import generate_rankfile_ompi +from relexi.runtime.helpers import generate_rankfile_ompi -class ClusterManager: +class Runtime: """Class containing information about and handling the HPC environment. - This class defines the interface for cluster managers, which contain all + This class defines the interface for an HPC runtime, which contains all information of the HPC environment used for the training environments and methods to manage it. This includes in particular to identify the scheduler environment, the hostnames of the available nodes and launching and @@ -26,23 +26,23 @@ class ClusterManager: Two possible modes are available for using the available compute resources: - - **Distributed Mode**: The `localhost` running the main training script + - **Distributed**: The `localhost` running the main training script becomes the dedicated **Head** node that hosts the database, evaluates the model and runs the training loop. All training environments are distributed to the available **Worker** nodes. - - **Local Mode**: The training script, the database and the training + - **Local**: The training script, the database and the training environments are all placed on the `localhost`. - For **Distributed Mode**, more than 1 node has to be available. Otherwise - **Local Mode** will be used. The mode of the `ClusterManager` can be - retrieved via the `is_distributed` attribute. + More than 1 node has to be available in order to initiate a + **Distributed** runtime. Otherwise, if only a single node is available, + **Local** mode will be used. The mode of the `Runtime` can be retrieved + via the `is_distributed` attribute. Attributes: - type (str): Type of the cluster manager. Must be `'local'`, `'pbs'`, - or `'slurm'`. - is_distributed (bool): Indicates whether cluster runs in - **Distributed Mode** or **Local Mode**. + type (str): Type of runtime. Must be `'local'`, `'pbs'`, or `'slurm'`. + is_distributed (bool): Indicates whether a **Distributed** or **Local** + runtime is used. hosts (list): List of hostnames of available nodes. head (str): Hostname of Head node (is name of `localhost` if in **Local Mode**). @@ -64,49 +64,49 @@ class ClusterManager: provided scheduler type. 
""" - TYPES = ['local', 'pbs', 'slurm'] # All implemented types of cluster managers + TYPES = ['local', 'pbs', 'slurm'] + """Supported types of runtime environments.""" def __init__( self, - scheduler_type: Optional[str] = 'auto', + type_: Optional[str] = 'auto', db_network_interface: Optional[str] = 'lo', db_port: Optional[int] = 6790 ): - """Initialize the ClusterManager. + """Initialize the Runtime. Args: - scheduler_type (str, optional): Type of the cluster manager. - Must be `'local'`, `'pbs'`, `'slurm'` or `'auto'`. Defaults to - `'auto'`, for which the type of cluster environment iss - identified automatically. + type_ (str, optional): Type of runtime. Must be `'local'`, `'pbs'`, + `'slurm'` or `'auto'`. Defaults to `'auto'`, for which the type + of runtime environment is identified automatically. db_network_interface (str, optional): Network interface to use for the Orchestrator. Defaults to `'lo'`. db_port (int, optional): Port to start the Orchestrator on. Defaults to `6790`. """ # Using SmartSim utility to identify type automatically - if scheduler_type == 'auto': - rlxout.info('Trying to identify cluster environment...') - scheduler = smartsim.wlm.detect_launcher() - rlxout.info(f'Found "{scheduler}" environment!', newline=False) - self.type = scheduler.casefold().strip() - else: - self.type = scheduler_type.casefold().strip() - - rlxout.info(f'Trying to setup "{self.type}" environment...') try: + if type_ == 'auto': + rlxout.info('Identifying environment...') + scheduler = smartsim.wlm.detect_launcher() + rlxout.info(f'Found "{scheduler}" environment!', newline=False) + self.type = scheduler.casefold().strip() + else: + self.type = type_.casefold().strip() + + rlxout.info(f'Setting up "{self.type}" runtime...') self._hosts = self._get_hostlist() except Exception as e: rlxout.warning(f'Failed: {e}') if self.type != 'local': - rlxout.info('Trying to run in local mode instead...', newline=False) + rlxout.info('Trying to setup LOCAL runtime instead...', newline=False) try: self.type = 'local' self._hosts = self._get_hostlist() except Exception as f: - raise RuntimeError('Also failed to setup local environment!') from f + raise RuntimeError('Also failed to setup LOCAL environment!') from f else: - raise RuntimeError('Failed to setup local training environment!') from e + raise RuntimeError('Failed to setup LOCAL training environment!') from e rlxout.info('Success!', newline=False) self._db = None @@ -123,86 +123,42 @@ def __del__(self): raise RuntimeError('Failed to stop the Orchestrator!') from e def info(self): - """Prints information about the current environment.""" - rlxout.info("Found the following environment:") - rlxout.info(f" Scheduler: {self.type}", newline=False) - rlxout.info(f" Hosts: {self.hosts}", newline=False) + """Prints information about the current runtime environment.""" + rlxout.info('Configuration of runtime environment:') + rlxout.info(f' Scheduler: {self.type}', newline=False) + rlxout.info(f' Hosts: {self.hosts}', newline=False) if self.is_distributed: - rlxout.info("Relexi is running in distributed mode:") - rlxout.info(f" Head: {self.head}", newline=False) - rlxout.info(f" Workers: {self.workers}", newline=False) + rlxout.info('Running in DISTRIBUTED mode:') + rlxout.info(f' Head: {self.head}', newline=False) + rlxout.info(f' Workers: {self.workers}', newline=False) else: - rlxout.info(f"Relexi is running in local mode on: {self.head}") - - def get_worker_slots(self) -> List[str]: - """Gets the list of available MPI slots on the Worker nodes. 
- - To obtain a list of all available MPI slots on the Worker nodes, the - following strategy is used depending on the type of environment: - - `local`: All CPU cores from localhost are used except one to run - training script and the database. - - `pbs`: The number of slots is determined by accessing the - `PBS_NODEFILE` environment variable and removing the Head node. - - `slurm`: The number of slots is determined by accessing the - `SLURM_JOB_CPUS_PER_NODE` environment variable and counting the - number of Worker nodes. - - Returns: - list: List containing hostname and host-local slot number of each - free slot on the Worker nodes. - """ - if self.type == 'local': - n_cpus = os.cpu_count()-1 # Save 1 CPU core for the Head tasks - return [[self.head, str(i)] for i in range(n_cpus)] - - if self.type == 'pbs': - # Get PBS_NODEFILE count number of slots per node and return list - # of slots per node. - nodes = self._read_pbs_nodefile() - worker_slots = [] - for worker in self.workers: - n_slots = sum(1 for nodename in nodes if worker in nodename) - #worker_slots.append({worker: str(i)}) # Dict of slots per node - for i in range(n_slots): - worker_slots.append([worker, str(i)]) - return worker_slots - - if self.type == 'slurm': - cpus_per_node = os.environ['SLURM_JOB_CPUS_PER_NODE'] - if cpus_per_node is None: - raise KeyError("Environment variable 'SLURM_JOB_CPUS_PER_NODE' is not set!") - for worker in self.workers: - for i in range(cpus_per_node): - worker_slots.append([worker, str(i)]) - return worker_slots - - raise NotImplementedError( - f"Method 'get_worker_slots' not implemented for scheduler type {self.type}") + rlxout.info(f'Running in LOCAL mode on: {self.head}') @property def type(self) -> str: - """Get the type of scheduler environment used for the cluster manager. + """Get the type of the runtime environment. Returns: - str: Type of the cluster manager. + str: Type of the runtime environment. """ return self._type @type.setter def type(self, value: str): - """Set the type of scheduler environment used for the cluster manager. - Ensure that the type is supported. + """Set the type of environment used for the runtime. + + Validates that the type is actually supported. Args: - value (str): Type of the cluster manager. + value (str): Type of the runtime environment. """ if value not in self.TYPES: - raise ValueError(f"Scheduler type {value} not supported.") + raise ValueError(f'Runtime of type {value} not supported.') self._type = value @property def hosts(self) -> List[str]: - """Get the list of hosts the script is executed on. + """Get the list of hosts within the runtime environment. Returns: list: List containing the hostnames as strings. @@ -211,19 +167,20 @@ def hosts(self) -> List[str]: @property def is_distributed(self) -> bool: - """Whether `ClusterManager` runs in **Distributed** or **Local** mode. + """Whether runtime is **Distributed** or **Local**. Checks for the number of hosts available. If more than one host is - used, the `ClusterManager` runs in **Distributed Mode**. + found in runtime, it runs in **Distributed** mode, otherwise it runs + in **Local** mode. Returns: - bool: `True` if in **Distributed Mode**, `False` otherwise. + bool: `True` if **Distributed**, `False` otherwise. """ return len(self._hosts) > 1 @property def head(self) -> str: - """Return name of Head node, which is where Relexi actually runs on. + """Return name of Head node, which is where this instance is located. Returns: str: Hostname of the Head node. 
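As a plain illustration of the Head/Worker split encoded in the properties above, assuming a hypothetical three-node allocation with Relexi started on 'node01':

    hosts = ['node01', 'node02', 'node03']   # hypothetical allocation
    head = 'node01'                          # local hostname of the training script
    workers = [h for h in hosts if h != head]
    # -> workers == ['node02', 'node03']; with a single host the Workers fall
    #    back to the full host list and everything runs in local mode.
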
@@ -232,7 +189,7 @@ def head(self) -> str: @property def workers(self) -> List[str]: - """Returns list of Workers used for running training environments. + """Returns list of Workers found in the current runtime environment. Obtains Workers by removing the Head node from the list of hosts. @@ -245,17 +202,17 @@ def workers(self) -> List[str]: if local_host in workers: workers.remove(local_host) else: - rlxout.warning(f"Localhost '{local_host}' not found in hosts list:") - rlxout.warning(f" {workers}") + rlxout.warning(f'Localhost "{local_host}" not found in hosts list:') + rlxout.warning(f' {workers}') return workers return self.hosts @property def db(self) -> Orchestrator: - """Get the Orchestrator database object. + """Get the Orchestrator database instance. Returns: - Orchestrator: The Orchestrator database object. + Orchestrator: The `Orchestrator` database instance. """ return self._db @@ -270,10 +227,10 @@ def db_entry(self) -> str: @property def exp(self) -> Experiment: - """Get the Experiment object the Orchestrator is launched with. + """Get the `Experiment` instance the `Orchestrator` is launched in. Returns: - Experiment: The Experiment object. + Experiment: The `Experiment` instance. """ return self._exp @@ -282,19 +239,21 @@ def _launch_orchestrator( port: int, network_interface: str ) -> tuple[Experiment, Orchestrator, str]: - """Launches the SmartSim Orchestrator for the current job. + """Launches a SmartSim `Orchestratori` in the current runtime. Args: - port (int): Port to start the Orchestrator on. - network_interface (str): Network interface to use for the Orchestrator. + port (int): Port to start the `Orchestrator` on. + network_interface (str): Network interface to use for the + `Orchestrator`. Returns: - tuple: The Experiment object, the Orchestrator object, and the entry database hostname. + tuple: The `Experiment` instance, the `Orchestrator` instance and + the IP address of the host of the database. """ rlxout.small_banner('Starting Orchestrator...') # Generate flexi experiment - exp = Experiment('flexi', launcher=self.type) + exp = Experiment('relexi', launcher=self.type) # Initialize the orchestrator based on the orchestrator_type db = exp.create_database( @@ -303,24 +262,24 @@ def _launch_orchestrator( hosts=self.head if self.type in {'pbs', 'slurm'} else None, ) - rlxout.info("Starting the Database...", newline=False) + rlxout.info('Starting the Orchestrator...', newline=False) try: exp.start(db) except Exception as e: - raise RuntimeError(f"Failed to start the Orchestrator: {e}") from e - rlxout.info("Success!", newline=False) + raise RuntimeError(f'Failed to start the Orchestrator: {e}') from e + rlxout.info('Success!', newline=False) entry_db = socket.gethostbyname(db.hosts[0]) - rlxout.info("Use this command to shutdown database if not terminated correctly:") - rlxout.info(f"$(smart dbcli) -h {db.hosts[0]} -p {port} shutdown", newline=False) + rlxout.info('Use this command to shutdown database if not terminated correctly:') + rlxout.info(f'$(smart dbcli) -h {db.hosts[0]} -p {port} shutdown', newline=False) return exp, db, entry_db def _get_hostlist(self) -> List[str]: """Get the list of hosts the script is executed on. - Uses the scheduler type to determine the hostlist via the environment - variables set by the scheduler. + Uses the runtime type to determine the hostlist via the environment + variables set by the corresponding scheduler environment. Returns: list: List containing the hostnames as strings. 
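A condensed sketch of the launch sequence implemented above, assuming a local run with the default port and network interface used elsewhere in this series:

    from smartsim import Experiment

    exp = Experiment('relexi', launcher='local')
    db = exp.create_database(port=6790, interface='lo')
    exp.start(db)      # database is now reachable for SmartRedis clients
    # ... run the training environments against the database ...
    exp.stop(db)       # shut the Orchestrator down again
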
@@ -338,13 +297,13 @@ def _get_hostlist(self) -> List[str]: if self.type == 'slurm': return self._get_slurm_nodelist() raise NotImplementedError( - f"Method `get_hostlist` not implemented for scheduler type {self.type}!") + f'Method `get_hostlist` not implemented for runtime "{self.type}"!') def _read_pbs_nodefile(self) -> List[str]: - """Read the PBS_NODEFILE and return the list of nodes. + """Read the `PBS_NODEFILE` and return the list of nodes. NOTE: - The PBS_NODEFILE contains the list of nodes allocated to the job. + The `PBS_NODEFILE` contains the list of nodes allocated to the job. If a node provides multiple MPI slots, it is the corresponding number of times in the file. @@ -352,10 +311,10 @@ def _read_pbs_nodefile(self) -> List[str]: list: List containing the hostnames as strings. """ if self.type != 'pbs': - raise ValueError("Method 'read_pbs_nodefile' only available for PBS scheduler!") - node_file = os.environ['PBS_NODEFILE'] + raise ValueError('Method "read_pbs_nodefile" only available for PBS scheduler!') + node_file = os.getenv('PBS_NODEFILE') if node_file is None: - raise KeyError("Environment variable 'PBS_NODEFILE' is not set!") + raise KeyError('Environment variable "PBS_NODEFILE" is not set!') with open(node_file, 'r', encoding='utf-8') as f: nodes = [line.strip() for line in f.readlines()] return nodes @@ -367,16 +326,16 @@ def _get_slurm_nodelist(self) -> List[str]: list: List containing the unique hostnames as strings. """ if self.type != 'slurm': - raise ValueError("Method 'get_slurm_nodelist' only available for SLURM scheduler!") + raise ValueError('Method "get_slurm_nodelist" only available for SLURM scheduler!') # Get the compressed list of nodes from SLURM_NODELIST node_list = os.getenv('SLURM_NODELIST') if node_list is None: - raise KeyError("Environment variable 'SLURM_NODELIST' is not set!") + raise KeyError('Environment variable "SLURM_NODELIST" is not set!') # Use scontrol to expand the node list result = subprocess.run(['scontrol', 'show', 'hostname', node_list], capture_output=True, text=True) # Check if the command was successful if result.returncode != 0: - raise RuntimeError(f"scontrol command failed: {result.stderr.strip()}") + raise RuntimeError(f'scontrol command failed: {result.stderr.strip()}') # Split the output into individual hostnames return result.stdout.strip().split('\n') From b55a034bf63d4cc496430daaf9eba72ec97b8afc Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Wed, 26 Jun 2024 16:39:20 +0200 Subject: [PATCH 12/20] Add correct package layout for "runtime" sub-package --- src/relexi/runtime/__init__.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/relexi/runtime/__init__.py b/src/relexi/runtime/__init__.py index e69de29..f23bcd8 100644 --- a/src/relexi/runtime/__init__.py +++ b/src/relexi/runtime/__init__.py @@ -0,0 +1,18 @@ +"""Provides functionalities to create and manage an HPC runtime environment. + +This module provides the necessary functionalities to create and manage a +runtime environment on distributed HPC systems for distributed Reinforcement +Learning (RL) algorithms. The main class is `Runtime`, which is used to +identify the resources available on the system, create the necessary +environment variables, and run the given program. The `helpers` module provides +some helper functions to facilitate the process of creating and managing the +runtime environment. + +The public classes and functions are: + - `Runtime`: The main class to create and manage a runtime environment. 
+ - `helpers`: A module with helper functions to facilitate the process. +""" +from .runtime import Runtime +from . import helpers + +__all__ = ['Runtime', 'helpers'] From b9eb845d11e4c392d7c5d6399aee5b0e1d5e8235 Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Thu, 4 Jul 2024 17:35:53 +0200 Subject: [PATCH 13/20] First working version for SLURM. However, Rankfiles for PBS still not working... --- src/relexi/env/flexiEnvSmartSim.py | 196 +++++++-------------------- src/relexi/rl/ppo/train.py | 12 +- src/relexi/runtime/helpers.py | 2 +- src/relexi/runtime/runtime.py | 206 +++++++++++++++++++++++++++-- 4 files changed, 246 insertions(+), 170 deletions(-) diff --git a/src/relexi/env/flexiEnvSmartSim.py b/src/relexi/env/flexiEnvSmartSim.py index 9562836..48844aa 100644 --- a/src/relexi/env/flexiEnvSmartSim.py +++ b/src/relexi/env/flexiEnvSmartSim.py @@ -38,7 +38,7 @@ class flexiEnv(py_environment.PyEnvironment): """ def __init__(self, - exp, + runtime, flexi_path, prm_file, spectra_file, @@ -47,15 +47,10 @@ def __init__(self, reward_scale, n_procs=1, n_envs=1, - n_procs_per_node=1, restart_files=None, random_restart_file=True, - entry_db="127.0.0.1", - port=6780, - is_db_cluster=False, debug=0, tag=None, - hosts=None, rankfiles=None, mpi_launch_mpmd=False, env_launcher='mpirun' @@ -64,10 +59,8 @@ def __init__(self, # Path to FLEXI executable self.n_envs = n_envs self.n_procs = n_procs - self.n_procs_per_node = n_procs_per_node self.prm_file = prm_file self.flexi_path = flexi_path - self.hosts = hosts self.rankfiles = rankfiles # Save values for reward function @@ -103,30 +96,22 @@ def __init__(self, e.append(float(rows[col_e])) self.e_dns = e - # Get experiment handle and port of db - self.exp = exp - self.port = port - # Should be IP address not hostname, since "-.,'" in hostname will cause a crash - self.entry_db = entry_db - self.is_db_cluster = is_db_cluster + # Get runtime environment + self.runtime = runtime # Connect python redis client to an orchestrator database - self.client = Client(address=f"{self.entry_db}:{str(self.port)}", cluster=self.is_db_cluster) + self.client = Client(address=self.runtime.db_entry, cluster=False) # Build tag from tag plus env number if tag: - self.tag = [tag+str(i)+'_' for i in range(self.n_envs)] + self.tag = [f'{tag}{i:03d}_' for i in range(self.n_envs)] else: self.tag = None - # Startup FLEXI instances inside experiment to get state size - self.flexi = self._start_flexi(self.exp, self.n_procs, self.n_envs) - - # Get current state from FLEXI environment + # Startup FLEXI instances to get state size + self.flexi = self._start_flexi() self._state = self._get_current_state() - - # End FLEXI again. Otherwise it will compute the entire run... 
- self._end_flexi() + self._stop_flexi() # Specify action and observation dimensions (neglect first batch dimension) self._action_spec = array_spec.ArraySpec( @@ -141,6 +126,16 @@ def __del__(self): """Finalize launched FLEXI instances if deleted.""" self.stop() + def stop(self): + """Stops all flexi instances inside launched in this environment.""" + if self.flexi: + self._stop_flexi() + + def start(self): + """Starts all flexi instances with configuration specified in initialization.""" + self.flexi = self._start_flexi() + self._state = self._get_current_state() + @property def batched(self): """Override batched property to indicate that environment is batched.""" @@ -151,128 +146,45 @@ def batch_size(self): """Override batch size property according to chosen batch size.""" return self.n_envs - def stop(self): - """Stops all flexi instances inside launched in this environment.""" - if self.exp: - self._stop_flexi_instances(self.exp) - - def start(self): - """Starts all flexi instances with configuration specified in initialization.""" - # Start new FLEXI instance and get initial state - self.flexi = self._start_flexi(self.exp, self.n_procs, self.n_envs) - self._state = self._get_current_state() - - def _stop_flexi_instances(self, exp): - """Stop all FLEXI instances. - - Uses the current SmartSim experiment to loop over all FLEXI instances - and stop them if they have not finished yet. - - Args: - exp (smartsim.Experiment): Experiment in which the Orchestrator and - the FLEXI instances were launched. - - Returns: - None - - TODO: - * `exp` should be a property of the class. - """ - if self.flexi: - for flexi in self.flexi: - if not exp.finished(flexi): - exp.stop(flexi) - - def _start_flexi(self, exp, n_procs, n_envs): - """Start FLEXI instances within SmartSim experiment. + def _start_flexi(self): + """Start FLEXI instances within runtime environment. - Args: - exp (smartsim.Experiment): Experiment in which the Orchestrator - was launched. - n_procs (int): Number of processors used to run each simulation - environment. - n_envs (int): Number of environments to be launched. - Returns: List of `smartsim` handles for each started FLEXI environment. """ - - # Build hostlist to specify on which hosts each flexi is started - # First check: Are there enough free ranks? - ranks_avail = self.n_procs_per_node*len(self.hosts) - ranks_needed = n_envs*n_procs - if ranks_needed > ranks_avail: - rlxout.warning(f'Only {ranks_avail} ranks are available, but {ranks_needed} would be required '+ - 'to start {n_envs} on {n_procs} each.') - - # Distribute ranks to instances in a round robin fashion - # TODO: Get ranks directly from hostfile for PBS Orchestrator - hosts_per_flexi = np.zeros((n_envs, 2), dtype=np.int8) - n_cores_used = 0 - for i in range(n_envs): - # 1. Get first node the instance has ranks on - hosts_per_flexi[i, 0] = n_cores_used // self.n_procs_per_node - # 2. Increase amount of used cores accordingly - n_cores_used = n_cores_used + n_procs - # 3. 
Get last node the instance has ranks on - hosts_per_flexi[i,1] = (n_cores_used-1) // self.n_procs_per_node # last node - - flexi = [] - # Build list of individual FLEXI instances - for i in range(n_envs): - + exe_args = [] + exe_name = [] + # Build list of arguments for each executable + for i in range(self.n_envs): + # First argument is parameter file + exe_args.append([self.prm_file]) # Select (possibly random drawn) restart file for the run if self.random_restart_file: - restart_file = random.choice(self.restart_files) + exe_args[i].append(random.choice(self.restart_files)) else: - restart_file = self.restart_files[0] - - args = [self.prm_file, restart_file] + exe_args[i].append(self.restart_files[0]) + # Tags are given to FLEXI with the Syntax "--tag [value]" if self.tag[i]: - # Tags are given to FLEXI with the Syntax "--tag [value]" - args.append('--tag') - args.append(self.tag[i]) - - if self.env_launcher == 'mpirun': - run_args = { - "rankfile": self.rankfiles[i], - "report-bindings": "" - } - run = MpirunSettings( - exe=self.flexi_path, - exe_args=args, - run_args=run_args) - run.set_tasks(n_procs) - - # Create MPMD Settings and start later in single command - if self.mpi_launch_mpmd: - if i == 0: - f_mpmd = run - else: - f_mpmd.make_mpmd(run) - - else: # Otherwise do not use launcher - run = RunSettings(exe=self.flexi_path, exe_args=args) - - # Create and directly start FLEXI instances - if not self.mpi_launch_mpmd: - flexi_instance = exp.create_model(self.tag[i]+"flexi", run) - exp.start(flexi_instance, block=False, summary=False) - flexi.append(flexi_instance) - - # Create MPMD Model from settings and start - if self.mpi_launch_mpmd: - flexi = exp.create_model(self.tag[0]+"flexi", f_mpmd) - exp.start(flexi, block=False, summary=False) - flexi = [flexi] - - return flexi - - def _end_flexi(self): - """Stop FLEXI experiment with SmartSim.""" + exe_args[i].append('--tag') + exe_args[i].append(self.tag[i]) + # And create name of executable + exe_name.append(self.tag[i]+'flexi') + + # Launch executables in runtime + return self.runtime.launch_models( + self.flexi_path, + exe_args, + exe_name, + self.n_procs, + self.n_envs, + launcher=self.env_launcher + ) + + def _stop_flexi(self): + """Stop all FLEXI instances currently running.""" for flexi in self.flexi: - if not self.exp.finished(flexi): - self.exp.stop(flexi) + if not self.runtime.exp.finished(flexi): + self.runtime.exp.stop(flexi) def _reset(self): """Resets the FLEXI environment. @@ -287,15 +199,7 @@ def _reset(self): functions "start()" and "stop()" manually. This function is thus deprecated. """ - - # Close FLEXI instance - # self._end_flexi() self._episode_ended = False - - # Start new FLEXI instance and get initial state - # self.flexi = self._start_flexi(self.exp,self.n_procs,self.n_envs) - # self._state = self._get_current_state() - return ts.restart(self._state, batch_size=self.n_envs) def _step(self, action): @@ -311,9 +215,9 @@ def _step(self, action): Returns: Transition containing (state, reward, discount) - .""" + """ if self._episode_ended: - # The last action ended the episode. Ignore the current action and start a new episode. + # The last action ended the episode. Ignore the current action and start new one. 
return self.reset() # Update Prediction diff --git a/src/relexi/rl/ppo/train.py b/src/relexi/rl/ppo/train.py index 5e68108..4f57472 100644 --- a/src/relexi/rl/ppo/train.py +++ b/src/relexi/rl/ppo/train.py @@ -166,16 +166,12 @@ def train( config_file # Instantiate parallel collection environment my_env = tf_py_environment.TFPyEnvironment( - relexi.env.flexiEnvSmartSim.flexiEnv(runtime.exp + relexi.env.flexiEnvSmartSim.flexiEnv(runtime ,executable_path ,parameter_file ,tag = 'train' - ,port = smartsim_port - ,entry_db = runtime.db_entry - ,hosts = runtime.workers ,n_envs = num_parallel_environments ,n_procs = num_procs_per_environment - ,n_procs_per_node = n_procs_per_node ,spectra_file = reward_spectrum_file ,reward_kmin = reward_kmin ,reward_kmax = reward_kmax @@ -193,15 +189,11 @@ def train( config_file eval_files = train_files my_eval_env = tf_py_environment.TFPyEnvironment( - relexi.env.flexiEnvSmartSim.flexiEnv(runtime.exp + relexi.env.flexiEnvSmartSim.flexiEnv(runtime ,executable_path ,parameter_file ,tag = 'eval' - ,port = smartsim_port - ,entry_db = runtime.db_entry - ,hosts = runtime.workers ,n_procs = num_procs_per_environment - ,n_procs_per_node = n_procs_per_node ,spectra_file = reward_spectrum_file ,reward_kmin = reward_kmin ,reward_kmax = reward_kmax diff --git a/src/relexi/runtime/helpers.py b/src/relexi/runtime/helpers.py index dc8daea..53965ca 100644 --- a/src/relexi/runtime/helpers.py +++ b/src/relexi/runtime/helpers.py @@ -100,7 +100,7 @@ def clean_ompi_tmpfiles(env_variable='TMPDIR'): - `1` if operation was successfull, - `-1` otherwise. """ - tmpdir = os.geten(env_variable) + tmpdir = os.getenv(env_variable) if tmpdir is None: return -1 diff --git a/src/relexi/runtime/runtime.py b/src/relexi/runtime/runtime.py index 4185c03..beddfc0 100644 --- a/src/relexi/runtime/runtime.py +++ b/src/relexi/runtime/runtime.py @@ -7,6 +7,8 @@ import subprocess from typing import List, Optional, Union +import numpy as np + import smartsim from smartsim import Experiment from smartsim.database.orchestrator import Orchestrator @@ -49,9 +51,9 @@ class Runtime: workers (list): List of worker nodes (contains only `localhost` if in **Local Mode**). db (Orchestrator): The launched `Orchestrator` database from the - `smartsim` package. - db_entry (str): IP address of the host of the database. Required to - connect a client to the database. + `smartsim` packagev. + db_entry (str): IP address and port of the host of the database. + Takes the form `IP_ADDRESS:PORT`. exp (Experiment): The `Experiment` object the `Orchestrator` is launched with. @@ -134,6 +136,104 @@ def info(self): else: rlxout.info(f'Running in LOCAL mode on: {self.head}') + def launch_models( + self, + exe: Union[str, List[str]], + exe_args: Union[str, List[str]], + exe_name: Union[str, List[str]], + n_procs: Union[int, List[int]], + n_exe: Optional[int] = 1, + launcher: Optional[str] = 'local' + ) -> List[smartsim.entity.model.Model]: + """Launch the models on the available nodes. + + Args: + exe (str, List(str)): Path to the executable to launch. Can either + be a single path or a list of length `n_exe`. If only a single + path is provided, it is used for all executables. + exe_args (str, List(str)): Arguments to pass to the executable. Can + either be a single string or a list of length `n_exe`. If only + a single string is provided, it is used for all executables. + exe_name (str, List(str)): Name of the executable used to identify + launched model in the SmartSim context. 
Can either be a single + string or a list of length `n_exe`. If only a single string is + provided, it is used for all executables. + n_procs (int, List(int)): Number of processes to launch. Can either + be a single integer or a list of length `n_exe`. If only a + single integer is provided, it is used for all executables. + n_exe (int): Number of executables to launch. Defaults to `1`. + launcher (str): Launcher to use for the executable. Must be one of + `'mpirun'`, `'srun'`, or `'local'`. + """ + def _validate_args(arg, n): + """Validate the length of the arguments.""" + if isinstance(arg, list) and not len(arg) == n: + raise ValueError(f'Expected {n} entries, but got {len(arg)}!') + if not isinstance(arg, list): + return [arg] * n + return arg + + # Validate that arguments are of correct length + exe = _validate_args(exe, n_exe) + exe_args = _validate_args(exe_args, n_exe) + exe_name = _validate_args(exe_name, n_exe) + n_procs = _validate_args(n_procs, n_exe) + + # Check compatibility of launcher and scheduler type + if launcher == 'local': + if any(n_procs > 1): + raise ValueError('Local launcher only supports single process execution!') + if (launcher == 'srun') and (self.type != 'slurm'): + raise ValueError('srun launcher only supported for SLURM scheduler!') + + # Ensure we have sufficient procs available + procs_avail = self._get_total_worker_slots() + procs_requested = sum(n_procs) + if procs_requested > procs_avail: + raise ValueError(f'Not enough processes available! Requested: {procs_requested}, Available: {procs_avail}') + if min(n_procs) < 1: + raise ValueError('At least one process must be requested per executable!') + + # Distribute the executables to the available nodes for SLURM + # TODO: Precompute and store placement + if self.type == 'slurm': + slurm_hosts_per_exe = self._distribute_workers_slurm( + n_procs, + n_exe, + procs_avail + ) + + models = [] + for i in range(n_exe): + if launcher == 'local': + run_args = None + elif launcher == 'mpirun': + run_args = { + #'rankfile': self.rankfiles[i], + 'report-bindings': None + } + elif launcher == 'srun': + run_args = { + 'mpi': 'pmix', + 'nodelist': ','.join(slurm_hosts_per_exe[i]), + 'distribution': 'block:block:block,Pack', + 'cpu-bind': 'verbose', + 'exclusive': None, + } + run = self.exp.create_run_settings( + exe=exe[i], + exe_args=exe_args[i], + run_command=launcher, + run_args=run_args + ) + run.set_tasks(n_procs[i]) + + model = self.exp.create_model(exe_name[i], run) + self.exp.start(model, block=False, summary=False) + models.append(model) + + return models + @property def type(self) -> str: """Get the type of the runtime environment. @@ -218,10 +318,10 @@ def db(self) -> Orchestrator: @property def db_entry(self) -> str: - """Get the IP address of the host of the database. + """Get IP address of database. Returns: - str: IP address of the host of the database. + str: Address of the database. Takes the form `IP_ADDRESS:PORT`. """ return self._db_entry @@ -269,11 +369,11 @@ def _launch_orchestrator( raise RuntimeError(f'Failed to start the Orchestrator: {e}') from e rlxout.info('Success!', newline=False) - entry_db = socket.gethostbyname(db.hosts[0]) + db_entry = socket.gethostbyname(db.hosts[0]) rlxout.info('Use this command to shutdown database if not terminated correctly:') rlxout.info(f'$(smart dbcli) -h {db.hosts[0]} -p {port} shutdown', newline=False) - return exp, db, entry_db + return exp, db, f'{db_entry}:{port}' def _get_hostlist(self) -> List[str]: """Get the list of hosts the script is executed on. 
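For orientation, a hedged usage sketch of the `launch_models` interface added above, mirroring how the FLEXI environment calls it; the executable path, file names and counts are placeholders:

    models = runtime.launch_models(
        '/path/to/flexi',                         # executable (placeholder)
        [['run.ini', 'restart_01.h5'], ['run.ini', 'restart_02.h5']],
        ['train000_flexi', 'train001_flexi'],     # names identifying the models
        4,                                        # MPI ranks per instance
        n_exe=2,
        launcher='mpirun',
    )
    # Each entry of the returned list is a SmartSim Model handle that can be
    # polled or stopped via runtime.exp.
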
@@ -291,15 +391,15 @@ def _get_hostlist(self) -> List[str]: if self.type == 'local': return [self._get_local_hostname()] if self.type == 'pbs': - nodes = self._read_pbs_nodefile() + nodes = self._get_hostlist_pbs() # Get the list of unique nodes via casting into set and list again return list(set(nodes)) if self.type == 'slurm': - return self._get_slurm_nodelist() + return self._get_hostlist_slurm() raise NotImplementedError( f'Method `get_hostlist` not implemented for runtime "{self.type}"!') - def _read_pbs_nodefile(self) -> List[str]: + def _get_hostlist_pbs(self) -> List[str]: """Read the `PBS_NODEFILE` and return the list of nodes. NOTE: @@ -314,12 +414,12 @@ def _read_pbs_nodefile(self) -> List[str]: raise ValueError('Method "read_pbs_nodefile" only available for PBS scheduler!') node_file = os.getenv('PBS_NODEFILE') if node_file is None: - raise KeyError('Environment variable "PBS_NODEFILE" is not set!') + raise KeyError('Environment variable "PBS_NODEFILE" not found!') with open(node_file, 'r', encoding='utf-8') as f: nodes = [line.strip() for line in f.readlines()] return nodes - def _get_slurm_nodelist(self) -> List[str]: + def _get_hostlist_slurm(self) -> List[str]: """Get the list of hosts from the SLURM_NODELIST environment variable. Returns: @@ -330,7 +430,7 @@ def _get_slurm_nodelist(self) -> List[str]: # Get the compressed list of nodes from SLURM_NODELIST node_list = os.getenv('SLURM_NODELIST') if node_list is None: - raise KeyError('Environment variable "SLURM_NODELIST" is not set!') + raise KeyError('Environment variable "SLURM_NODELIST" not found!') # Use scontrol to expand the node list result = subprocess.run(['scontrol', 'show', 'hostname', node_list], capture_output=True, text=True) # Check if the command was successful @@ -339,6 +439,86 @@ def _get_slurm_nodelist(self) -> List[str]: # Split the output into individual hostnames return result.stdout.strip().split('\n') + def _get_total_worker_slots(self) -> int: + """Get the total number of worker slots available in the runtime. + + Returns: + int: Number of slots per worker node. + """ + if self.type == 'local': + # Leave one core for the head node + return os.cpu_count()-1 + + if self.type == 'pbs': + # Get all slots from PBS_NODEFILE. Multiple slots per node + # correspond to multiple entries in the file. + nodes = self._get_hostlist_pbs() + slots = np.zeros(len(self.workers), dtype=int) + for i, worker in enumerate(self.workers): + slots[i] = nodes.count(worker) + return np.sum(slots) + + if self.type == 'slurm': + # Get the number of slots per node from SLURM_CPUS_PER_NODE + slots = os.getenv('SLURM_JOB_CPUS_PER_NODE') + if slots is None: + raise KeyError('Environment variable "SLURM_JOB_CPUS_PER_NODE" not found!') + # Parse only first integer part from string (has form '24(x4)'). + slots_int = "" + for char in slots: + if char.isdigit(): + slots_int += char + else: + break + return int(slots_int)*len(self.workers) + + raise NotImplementedError( + f'Method `get_slots_per_worker` not implemented for runtime "{self.type}"!') + + def _distribute_workers_slurm(self, n_procs: List[int], n_exe: int, procs_avail: int) -> List[List[str]]: + """Distribute the executables to the available nodes for SLURM. + + Uses two different strategies to distribute the executables to the + available nodes. Either multiple executables per node or multiple nodes + per executable. However, a single executable cannot be placed on parts + of multiple nodes, since this causes problems with SLURM. 
Either + executable spans multiple whole nodes, or single partial node. + + Args: + n_procs (List[int]): Number of processes to launch per executable. + n_exe (int): Number of executables to launch. + procs_avail (int): Number of available processes. + + Returns: + List[List[str]]: List of lists containing the hostnames for each + executable. + """ + procs_per_worker = procs_avail//len(self.workers) + nodes_avail = self.workers + slurm_hosts_per_exe = [] + # Either multiple executables per node or multiple nodes per executable + if max(n_procs) > procs_per_worker: + # Use whole nodes per executable + for i in range(n_exe): + n_nodes_req = int(np.ceil(n_procs[i]/procs_per_worker)) + current_hosts = [] + for _ in range(n_nodes_req): + current_hosts.append(nodes_avail.pop(0)) + slurm_hosts_per_exe.append(current_hosts) + else: + # Distribute processes to nodes + cores_avail = procs_per_worker + for i in range(n_exe): + # Exe does not fit on node + if n_procs[i] > cores_avail: + if len(nodes_avail) <= 1: + raise RuntimeError('Failed to distribute models to resources!') + # Take next node + nodes_avail.pop(0) + cores_avail = procs_per_worker + slurm_hosts_per_exe.append([nodes_avail[0]]) + return slurm_hosts_per_exe + def _get_local_hostname(self) -> str: """Get the hostname of the machine executing the Python script. From b587ff3319240ce063c0cbc7c11bae3f4e43e66c Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Wed, 10 Jul 2024 11:34:20 +0200 Subject: [PATCH 14/20] Initial implementation of `LaunchConfig` class, which specifies the distribution of resources for a given amount of executables and resources and creates the required pinning files or commandline arguments depending no the chosen runtime environment and launcher. --- src/relexi/env/flexiEnvSmartSim.py | 9 +- src/relexi/rl/ppo/train.py | 77 ++---- src/relexi/runtime/__init__.py | 13 +- src/relexi/runtime/helpers.py | 40 --- src/relexi/runtime/launch_configuration.py | 288 +++++++++++++++++++++ src/relexi/runtime/runtime.py | 130 ++++------ 6 files changed, 371 insertions(+), 186 deletions(-) create mode 100644 src/relexi/runtime/launch_configuration.py diff --git a/src/relexi/env/flexiEnvSmartSim.py b/src/relexi/env/flexiEnvSmartSim.py index 48844aa..351dbaf 100644 --- a/src/relexi/env/flexiEnvSmartSim.py +++ b/src/relexi/env/flexiEnvSmartSim.py @@ -51,7 +51,6 @@ def __init__(self, random_restart_file=True, debug=0, tag=None, - rankfiles=None, mpi_launch_mpmd=False, env_launcher='mpirun' ): @@ -61,7 +60,6 @@ def __init__(self, self.n_procs = n_procs self.prm_file = prm_file self.flexi_path = flexi_path - self.rankfiles = rankfiles # Save values for reward function self.reward_kmin = reward_kmin @@ -73,9 +71,8 @@ def __init__(self, # Sanity Check Launcher self.env_launcher = env_launcher if ((self.env_launcher == 'local') and (n_procs != 1)): - rlxout.warning("For env_launcher 'local', only single execution is allowed! Setting 'n_procs=1'!") - rlxout.warning("To run evironments in parallel with MPI, use env_launcher='mpi'!") - n_procs = 1 + rlxout.warning("For env_launcher 'local', only single execution is allowed! 
Setting 'n_procs=1'") + self.n_procs = 1 if self.env_launcher == 'mpirun': self.mpi_launch_mpmd = mpi_launch_mpmd @@ -274,7 +271,7 @@ def _get_current_state(self): do_init = True key = "state" for tag in self.tag: - self.client.poll_tensor(tag+key, 10, 10000) + self.client.poll_tensor(tag+key, 10, 1000) try: data = self.client.get_tensor(tag+key) except Exception: diff --git a/src/relexi/rl/ppo/train.py b/src/relexi/rl/ppo/train.py index 4f57472..eb08961 100644 --- a/src/relexi/rl/ppo/train.py +++ b/src/relexi/rl/ppo/train.py @@ -24,7 +24,7 @@ import relexi.runtime import relexi.io.readin as rlxin import relexi.io.output as rlxout -from relexi.runtime.helpers import generate_rankfile_ompi, copy_to_nodes, parser_flexi_parameters +from relexi.runtime.helpers import copy_to_nodes, parser_flexi_parameters def train( config_file @@ -114,54 +114,33 @@ def train( config_file ) runtime.info() - # generating rankfiles for OpenMPI - if mpi_launch_mpmd: - # If all MPI jobs are run with single mpirun command, all jobs are allocated based on single rankfile - rank_files = generate_rankfile_ompi(runtime.workers - ,n_procs_per_node - ,n_par_env=1 - ,ranks_per_env=num_parallel_environments*num_procs_per_environment - ) + ## Copy all local files into local directory, possibly fast RAM-Disk or similar + ## for performance and to reduce Filesystem access + #if local_dir: + # # Prefix with PBS Job ID if PBS job + # if smartsim_launcher.casefold() == 'pbs': + # pbs_job_id = os.environ['PBS_JOBID'] + # local_dir = os.path.join(local_dir, pbs_job_id) - else: - # Otherwise every MPI job gets its own rankfile - rank_files = generate_rankfile_ompi(runtime.workers - ,n_procs_per_node - ,num_parallel_environments - ,num_procs_per_environment - ) - - # Copy all local files into local directory, possibly fast RAM-Disk or similar - # for performance and to reduce Filesystem access - if local_dir: - # Prefix with PBS Job ID if PBS job - if smartsim_launcher.casefold() == 'pbs': - pbs_job_id = os.environ['PBS_JOBID'] - local_dir = os.path.join(local_dir, pbs_job_id) - - rlxout.info(f"Moving local files to {local_dir} ..." ) - - # Get list of all nodes - nodes = copy.deepcopy(worker_nodes) - ai_node = os.environ['HOSTNAME'] - nodes.insert(0, ai_node) - - # Move all files to local dir - # TODO: control which files are copied by 'local_files' variable! - train_files = copy_to_nodes(train_files, local_dir,nodes,subfolder='train_files') - eval_files = copy_to_nodes(eval_files, local_dir,nodes,subfolder='eval_files') - reward_spectrum_file = copy_to_nodes(reward_spectrum_file,local_dir,nodes,subfolder='reward_files') - rank_files = copy_to_nodes(rank_files, local_dir,nodes,subfolder='ompi_rank_files') - mesh_file = copy_to_nodes(mesh_file, local_dir,nodes,subfolder='ompi_rank_files') - - # We have to update the meshfile in the parameter file before copying - parameter_file = parser_flexi_parameters(parameter_file, 'MeshFile', mesh_file) - parameter_file = copy_to_nodes(parameter_file,local_dir,nodes,subfolder='parameter_files') - - rlxout.info(" DONE! ",newline=False) - - if mpi_launch_mpmd: - rank_files = [rank_files[0] for _ in range(num_parallel_environments)] + # rlxout.info(f"Moving local files to {local_dir} ..." ) + + # # Get list of all nodes + # nodes = copy.deepcopy(runtime.workers) + # ai_node = os.environ['HOSTNAME'] + # nodes.insert(0, ai_node) + + # # Move all files to local dir + # # TODO: control which files are copied by 'local_files' variable! 
+ # train_files = copy_to_nodes(train_files, local_dir,nodes,subfolder='train_files') + # eval_files = copy_to_nodes(eval_files, local_dir,nodes,subfolder='eval_files') + # reward_spectrum_file = copy_to_nodes(reward_spectrum_file,local_dir,nodes,subfolder='reward_files') + # mesh_file = copy_to_nodes(mesh_file, local_dir,nodes,subfolder='meshf_file') + + # # We have to update the meshfile in the parameter file before copying + # parameter_file = parser_flexi_parameters(parameter_file, 'MeshFile', mesh_file) + # parameter_file = copy_to_nodes(parameter_file,local_dir,nodes,subfolder='parameter_files') + + # rlxout.info(" DONE! ",newline=False) # Instantiate parallel collection environment @@ -177,7 +156,6 @@ def train( config_file ,reward_kmax = reward_kmax ,reward_scale = reward_scale ,restart_files = train_files - ,rankfiles = rank_files ,env_launcher = env_launcher ,mpi_launch_mpmd = mpi_launch_mpmd ,debug = debug @@ -200,7 +178,6 @@ def train( config_file ,reward_scale = reward_scale ,restart_files = eval_files ,random_restart_file = False - ,rankfiles = rank_files ,env_launcher = env_launcher ,debug = debug )) diff --git a/src/relexi/runtime/__init__.py b/src/relexi/runtime/__init__.py index f23bcd8..c19c6fa 100644 --- a/src/relexi/runtime/__init__.py +++ b/src/relexi/runtime/__init__.py @@ -4,15 +4,20 @@ runtime environment on distributed HPC systems for distributed Reinforcement Learning (RL) algorithms. The main class is `Runtime`, which is used to identify the resources available on the system, create the necessary -environment variables, and run the given program. The `helpers` module provides -some helper functions to facilitate the process of creating and managing the -runtime environment. +environment variables, and run the given program. The `LaunchConfig` class +provides a configuration for launching a batch of executables in the runtime. +This include most importantly the distribution of the executables across the +available resources. The `helpers` module provides some helper functions to +facilitate the process of creating and managing the runtime environment. The public classes and functions are: - `Runtime`: The main class to create and manage a runtime environment. + - `LaunchConfig`: A class to define the launch configuration for a batch of + executables in a runtime. - `helpers`: A module with helper functions to facilitate the process. """ +from .launch_configuration import LaunchConfig from .runtime import Runtime from . import helpers -__all__ = ['Runtime', 'helpers'] +__all__ = ['Runtime', 'LaunchConfig', 'helpers'] diff --git a/src/relexi/runtime/helpers.py b/src/relexi/runtime/helpers.py index 53965ca..f6b2acb 100644 --- a/src/relexi/runtime/helpers.py +++ b/src/relexi/runtime/helpers.py @@ -14,46 +14,6 @@ import shutil -def generate_rankfile_ompi(hosts, cores_per_node, n_par_env, ranks_per_env, base_path=None): - """Generate rank file for OpenMPI process binding. - - Args: - hosts (list): List of hostnames - cores_per_node (int): Number of cores per node - n_par_env (int): Number of parallel environments to be launched - ranks_per_env (int): Number of ranks per environments - base_path (str): (Optional.) 
Path to the directory of the rank files - - Returns: - list: List of filenames of the rankfiles - """ - - # If no base_path given, use CWD - if base_path: - rankfile_dir = os.path.join(base_path, 'ompi-rankfiles') - else: - rankfile_dir = 'ompi-rankfiles' - - if os.path.exists(rankfile_dir): - shutil.rmtree(rankfile_dir) - os.makedirs(rankfile_dir, exist_ok=True) - - rankfiles = [] - next_free_slot = 0 - n_cores_used = 0 - for env_idx in range(n_par_env): - filename = os.path.join(rankfile_dir, f'par_env_{env_idx:05d}') - rankfiles.append(filename) - with open(filename, 'w', encoding='utf-8') as rankfile: - for i in range(ranks_per_env): - rankfile.write(f'rank {i}={hosts[n_cores_used//cores_per_node]} slot={next_free_slot}\n') - next_free_slot = next_free_slot + 1 - n_cores_used = n_cores_used + 1 - if next_free_slot > (cores_per_node - 1): - next_free_slot = 0 - return rankfiles - - def parser_flexi_parameters(parameter_file, keyword, value): """Changes the value for a keyword in a FLEXI parameter file. diff --git a/src/relexi/runtime/launch_configuration.py b/src/relexi/runtime/launch_configuration.py new file mode 100644 index 0000000..4fc5b41 --- /dev/null +++ b/src/relexi/runtime/launch_configuration.py @@ -0,0 +1,288 @@ +#/usr/bin/env python3 + +"""Launch configuration for a batch of executables in a runtime.""" + +from __future__ import annotations + +from typing import List + +import numpy as np + +class LaunchConfig(): + """Launch configuration for a batch of executables in a runtime. + + This class provides a launch configuration for a batch of executables in a + runtime. It contains the specific configuration to distribute the + executables to the available resources. The configuration can be of three + types: 'local', 'mpirun', and 'srun'. The 'local' configuration is used + for local execution, 'mpirun' for OpenMPI, and 'srun' for SLURM. + + Attributes: + type (str): Type of the launch configuration. + n_exe (int): Number of executables to launch. + n_procs (List[int]): Number of processes to launch per executable. Must + be of length `n_exe`. + workers (List[str]): List of worker nodes available. + n_worker_slots (int): Number of available worker slots. + config (dict): Configuration dictionary. + rankfiles (List[str]): List of rankfiles if `type=='mpirun'`, is `None` + otherwise. + hosts_per_exe (List[List[str]]): List of lists containing the hostnames + for each executable if `type=='srun'`, is `None` otherwise. + + Methods: + from_dict(cls, config: dict, runtime: Runtime) -> LaunchConfiguration: + Instantiate a launch configuration from a configuration dictionary. + as_dict() -> dict: + Return the launch configuration as a dictionary. + is_compatible(config: dict) -> bool: + Check if other launch configuration is compatible based on dict. + config_is_valid(config: dict) -> bool: + Check if the given configuration is valid. + + Raises: + ValueError: If the requested configuration is invalid. + RuntimeError: If the configuration cannot be generated. + """ + + TYPES = ['local', 'mpirun', 'srun'] + """Supported types of launch configurations.""" + + CONFIG_KEYS = ['type', 'n_exe', 'n_procs', 'workers'] + """Keys for the configuration dictionary.""" + + def __init__(self, type_: str, runtime, n_exe: int, n_procs: List[int]): + """Initialize the launch configuration. + + Args: + type_ (str): Type of the launch configuration. + runtime (Runtime): Runtime instance for which launch configuration + should be generated. + n_exe (int): Number of executables to launch. 
+ n_procs (List[int]): Number of processes to launch per executable. + """ + self.type = type_ + self.n_exe = n_exe + self.n_procs = n_procs + self.workers = runtime.workers + self.n_worker_slots = runtime.n_worker_slots + # Set with property setter to check for validity + self.config = {'type': self.type, + 'n_exe': self.n_exe, + 'n_procs': self.n_procs, + 'workers': self.workers} + + # Generate rankfiles for OpenMPI + self._rankfiles = None + if self.type == 'mpirun': + slots_per_node = runtime.n_worker_slots//len(self.workers) + self._rankfiles = self._generate_rankfile_ompi(self.workers, + slots_per_node, + n_exe, + n_procs) + # Distribute workers for SLURM + self._hosts_per_exe = None + if self.type == 'srun': + self._hosts_per_exe = self._distribute_workers_slurm(n_procs, + n_exe, + runtime.workers, + runtime.n_worker_slots) + + @property + def config(self) -> dict: + """Return the current launch configuration as dict.""" + return self._config + + @config.setter + def config(self, config: dict): + """Set a launch configuration.""" + if not self.config_is_valid(config): + raise ValueError('Invalid configuration dictionary!') + if sum(config['n_procs']) > self.n_worker_slots: + raise ValueError('Not enough processes available!') + self._config = config + + @property + def type(self) -> str: + """Return the type of the launch configuration.""" + return self._type + + @type.setter + def type(self, value): + """Set the type of the launch configuration.""" + if value not in self.TYPES: + raise ValueError('Invalid launch configuration type!') + self._type = value + + @property + def rankfiles(self) -> List[str]: + """Return paths to rankfiles for `mpirun` launcher.""" + if self._rankfiles is None: + raise ValueError('Rankfiles not yet generated!') + return self._rankfiles + + @property + def hosts_per_exe(self) -> List[List[str]]: + """Return the hosts for each executable for `srun`.""" + if self._hosts_per_exe is None: + raise ValueError('Hosts not yet generated!') + return self._hosts_per_exe + + def as_dict(self) -> dict: + """Return the launch configuration as a dictionary.""" + return self._config + + def is_compatible(self, config: dict) -> bool: + """Check if other launch configuration is compatible based on dict. + + Another launch configuration is compatible if the first `n_exe` + executables can be launched on the same resources as the first `n_exe` + executables of the existing launch configuration. + + Args: + config (dict): Dictionary of the other launch configuration. + + Returns: + bool: `True` if the configurations are compatible, `False` otherwise. + """ + if self.type != config['type']: + return False + if self.n_exe != config['n_exe']: + return False + if self.n_procs != config['n_procs']: + return False + return True + + @classmethod + def from_dict(cls, config: dict, runtime) -> LaunchConfig: + """Instantiate a launch configuration from a configuration dictionary. + + The dictionary has to take the form of: + ``` + { + 'type': str, + 'n_exe': int, + 'n_procs': List[int] + 'workers': List[str] + } + ``` + + Args: + config (dict): Configuration dictionary. + runtime (Runtime): Runtime object. + + Returns: + LaunchConfig: Launch configuration instance. + """ + if not cls.config_is_valid(config): + raise ValueError('Invalid configuration dictionary!') + return cls(config['type'], runtime, config['n_exe'], config['n_procs']) + + @classmethod + def config_is_valid(cls, config: dict) -> bool: + """Check if the given configuration is valid. 
+ + The configuration is valid if it contains all necessary keys and the + values are valid. However, the availability of the resources is not + checked! + + Args: + config (dict): Configuration dictionary. + + Returns: + bool: `True` if the configuration is valid, `False` otherwise. + """ + if not all(key in config for key in cls.CONFIG_KEYS): + raise ValueError('Configuration dictionary does not contain all neccessary keys!') + if config['type'] not in cls.TYPES: + return False + if len(config['n_procs']) != config['n_exe']: + return False + if len(config['workers']) < 1: + return False + if min(config['n_procs']) < 1: + return False + if config['n_exe'] < 1: + return False + return True + + @staticmethod + def _distribute_workers_slurm( + n_procs: List[int], + n_exe: int, + workers: List[str], + procs_avail: int + ) -> List[List[str]]: + """Distribute the executables to the available nodes for SLURM. + + Uses two different strategies to distribute the executables to the + available nodes. Either multiple executables per node or multiple nodes + per executable. However, a single executable cannot be placed on parts + of multiple nodes, since this causes problems with SLURM. Either + executable spans multiple whole nodes, or single partial node. + + Args: + n_procs (List[int]): Number of processes to launch per executable. + n_exe (int): Number of executables to launch. + workers (List[str]): List of worker nodes available. + procs_avail (int): Number of available processes. + + Returns: + List[List[str]]: List of lists containing the hostnames for each + executable. + """ + procs_per_worker = procs_avail//len(workers) + nodes_avail = workers + slurm_hosts_per_exe = [] + # Either multiple executables per node or multiple nodes per executable + if max(n_procs) > procs_per_worker: + # Use whole nodes per executable + for i in range(n_exe): + n_nodes_req = int(np.ceil(n_procs[i]/procs_per_worker)) + current_hosts = [] + for _ in range(n_nodes_req): + current_hosts.append(nodes_avail.pop(0)) + slurm_hosts_per_exe.append(current_hosts) + else: + # Distribute processes to nodes + cores_avail = procs_per_worker + for i in range(n_exe): + # Exe does not fit on node + if n_procs[i] > cores_avail: + if len(nodes_avail) <= 1: + raise RuntimeError('Failed to distribute models to resources!') + # Take next node + nodes_avail.pop(0) + cores_avail = procs_per_worker + slurm_hosts_per_exe.append([nodes_avail[0]]) + return slurm_hosts_per_exe + + @staticmethod + def _generate_rankfile_ompi(workers: List[str], + n_slots_per_worker: int, + n_exe: int, + n_procs: List[int],) -> List[str]: + """Generate rank file for OpenMPI process binding. 
+ + Args: + workers (list): List of hostnames + n_exe (int): Number of executables to be launched + n_procs (int): Number of ranks per environments + + Returns: + list: List of filenames of the rankfiles + """ + rankfiles = [] + next_free_slot = 0 + n_cores_used = 0 + for i_exe in range(n_exe): + filename = f'.env_{i_exe:05d}.txt' + rankfiles.append(filename) + with open(filename, 'w', encoding='utf-8') as rankfile: + for i in range(n_procs[i_exe]): + rankfile.write(f'rank {i}={workers[n_cores_used//n_slots_per_worker]} slot={next_free_slot}\n') + next_free_slot += 1 + n_cores_used += 1 + if next_free_slot >= n_slots_per_worker: + next_free_slot = 0 + return rankfiles diff --git a/src/relexi/runtime/runtime.py b/src/relexi/runtime/runtime.py index beddfc0..4637199 100644 --- a/src/relexi/runtime/runtime.py +++ b/src/relexi/runtime/runtime.py @@ -14,10 +14,10 @@ from smartsim.database.orchestrator import Orchestrator import relexi.io.output as rlxout -from relexi.runtime.helpers import generate_rankfile_ompi +from relexi.runtime import LaunchConfig -class Runtime: +class Runtime(): """Class containing information about and handling the HPC environment. This class defines the interface for an HPC runtime, which contains all @@ -50,12 +50,15 @@ class Runtime: **Local Mode**). workers (list): List of worker nodes (contains only `localhost` if in **Local Mode**). + n_worker_slots (int): Total number of slots available on workers. db (Orchestrator): The launched `Orchestrator` database from the `smartsim` packagev. db_entry (str): IP address and port of the host of the database. Takes the form `IP_ADDRESS:PORT`. exp (Experiment): The `Experiment` object the `Orchestrator` is launched with. + launch_config (LaunchConfig): CurrentcConfiguration for launching a + batch of executables in the runtime. Raises: ValueError: If the scheduler type is not supported. @@ -116,6 +119,8 @@ def __init__( port=db_port, network_interface=db_network_interface, ) + self.launch_config = None + self.n_worker_slots = self._get_total_worker_slots() def __del__(self): if self.db: @@ -180,53 +185,50 @@ def _validate_args(arg, n): n_procs = _validate_args(n_procs, n_exe) # Check compatibility of launcher and scheduler type - if launcher == 'local': - if any(n_procs > 1): - raise ValueError('Local launcher only supports single process execution!') + if (launcher == 'local') and (max(n_procs) > 1): + raise ValueError('Local launcher only supports single process execution!') if (launcher == 'srun') and (self.type != 'slurm'): raise ValueError('srun launcher only supported for SLURM scheduler!') - # Ensure we have sufficient procs available - procs_avail = self._get_total_worker_slots() - procs_requested = sum(n_procs) - if procs_requested > procs_avail: - raise ValueError(f'Not enough processes available! 
Requested: {procs_requested}, Available: {procs_avail}') - if min(n_procs) < 1: - raise ValueError('At least one process must be requested per executable!') - - # Distribute the executables to the available nodes for SLURM - # TODO: Precompute and store placement - if self.type == 'slurm': - slurm_hosts_per_exe = self._distribute_workers_slurm( - n_procs, - n_exe, - procs_avail - ) + # Check if launch config is up-to-date and create or update if required + config_dict = {'type': launcher, + 'n_exe': n_exe, + 'n_procs': n_procs, + 'workers': self.workers} + if self.launch_config is None: + self.launch_config = LaunchConfig.from_dict(config_dict, self) + else: + if not self.launch_config.is_compatible(config_dict): + self.launch_config.config = config_dict models = [] for i in range(n_exe): if launcher == 'local': - run_args = None - elif launcher == 'mpirun': - run_args = { - #'rankfile': self.rankfiles[i], - 'report-bindings': None - } - elif launcher == 'srun': - run_args = { - 'mpi': 'pmix', - 'nodelist': ','.join(slurm_hosts_per_exe[i]), - 'distribution': 'block:block:block,Pack', - 'cpu-bind': 'verbose', - 'exclusive': None, - } - run = self.exp.create_run_settings( - exe=exe[i], - exe_args=exe_args[i], - run_command=launcher, - run_args=run_args - ) - run.set_tasks(n_procs[i]) + run = self.exp.create_run_settings( + exe=exe[i], + exe_args=exe_args[i], + ) + else: + if launcher == 'mpirun': + run_args = { + 'rankfile': self.launch_config.rankfiles[i], + 'report-bindings': None + } + elif launcher == 'srun': + run_args = { + 'mpi': 'pmix', + 'nodelist': ','.join(self.launch_config.hosts_per_exe[i]), + 'distribution': 'block:block:block,Pack', + 'cpu-bind': 'verbose', + 'exclusive': None, + } + run = self.exp.create_run_settings( + exe=exe[i], + exe_args=exe_args[i], + run_command=launcher, + run_args=run_args + ) + run.set_tasks(n_procs[i]) model = self.exp.create_model(exe_name[i], run) self.exp.start(model, block=False, summary=False) @@ -475,50 +477,6 @@ def _get_total_worker_slots(self) -> int: raise NotImplementedError( f'Method `get_slots_per_worker` not implemented for runtime "{self.type}"!') - def _distribute_workers_slurm(self, n_procs: List[int], n_exe: int, procs_avail: int) -> List[List[str]]: - """Distribute the executables to the available nodes for SLURM. - - Uses two different strategies to distribute the executables to the - available nodes. Either multiple executables per node or multiple nodes - per executable. However, a single executable cannot be placed on parts - of multiple nodes, since this causes problems with SLURM. Either - executable spans multiple whole nodes, or single partial node. - - Args: - n_procs (List[int]): Number of processes to launch per executable. - n_exe (int): Number of executables to launch. - procs_avail (int): Number of available processes. - - Returns: - List[List[str]]: List of lists containing the hostnames for each - executable. 
- """ - procs_per_worker = procs_avail//len(self.workers) - nodes_avail = self.workers - slurm_hosts_per_exe = [] - # Either multiple executables per node or multiple nodes per executable - if max(n_procs) > procs_per_worker: - # Use whole nodes per executable - for i in range(n_exe): - n_nodes_req = int(np.ceil(n_procs[i]/procs_per_worker)) - current_hosts = [] - for _ in range(n_nodes_req): - current_hosts.append(nodes_avail.pop(0)) - slurm_hosts_per_exe.append(current_hosts) - else: - # Distribute processes to nodes - cores_avail = procs_per_worker - for i in range(n_exe): - # Exe does not fit on node - if n_procs[i] > cores_avail: - if len(nodes_avail) <= 1: - raise RuntimeError('Failed to distribute models to resources!') - # Take next node - nodes_avail.pop(0) - cores_avail = procs_per_worker - slurm_hosts_per_exe.append([nodes_avail[0]]) - return slurm_hosts_per_exe - def _get_local_hostname(self) -> str: """Get the hostname of the machine executing the Python script. From 0c1a3ba648791d9b3752478eceb7422279add72c Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Wed, 10 Jul 2024 13:30:39 +0200 Subject: [PATCH 15/20] Fix bug in distribution of cores for SLURM --- src/relexi/runtime/launch_configuration.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/relexi/runtime/launch_configuration.py b/src/relexi/runtime/launch_configuration.py index 4fc5b41..ed65dd4 100644 --- a/src/relexi/runtime/launch_configuration.py +++ b/src/relexi/runtime/launch_configuration.py @@ -244,16 +244,17 @@ def _distribute_workers_slurm( current_hosts.append(nodes_avail.pop(0)) slurm_hosts_per_exe.append(current_hosts) else: - # Distribute processes to nodes + # Use multiple executables peper cores_avail = procs_per_worker for i in range(n_exe): - # Exe does not fit on node + # Does not fit on remaining slots on node if n_procs[i] > cores_avail: if len(nodes_avail) <= 1: raise RuntimeError('Failed to distribute models to resources!') # Take next node nodes_avail.pop(0) cores_avail = procs_per_worker + cores_avail -= n_procs[i] slurm_hosts_per_exe.append([nodes_avail[0]]) return slurm_hosts_per_exe From 5c3608184aa97288c01c7785c31c7a2570ef9923 Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Thu, 18 Jul 2024 11:54:15 +0200 Subject: [PATCH 16/20] Properly use HEAD node name to remove from hosts, since names provided by schedulers node list and returned by socket might not be identical. this ensures that the naming of head and hosts is consistent. --- src/relexi/runtime/runtime.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/relexi/runtime/runtime.py b/src/relexi/runtime/runtime.py index 4637199..8046c15 100644 --- a/src/relexi/runtime/runtime.py +++ b/src/relexi/runtime/runtime.py @@ -287,7 +287,7 @@ def head(self) -> str: Returns: str: Hostname of the Head node. """ - return self._get_local_hostname() + return self.hosts[0] @property def workers(self) -> List[str]: @@ -299,12 +299,11 @@ def workers(self) -> List[str]: list: List containing the hostnames of Workers as strings. 
""" if self.is_distributed: - local_host = self._get_local_hostname() workers = self.hosts.copy() - if local_host in workers: - workers.remove(local_host) + if self.head in workers: + workers.remove(self.head) else: - rlxout.warning(f'Localhost "{local_host}" not found in hosts list:') + rlxout.warning(f'Localhost "{self.head}" not found in hosts list:') rlxout.warning(f' {workers}') return workers return self.hosts @@ -352,9 +351,7 @@ def _launch_orchestrator( tuple: The `Experiment` instance, the `Orchestrator` instance and the IP address of the host of the database. """ - rlxout.small_banner('Starting Orchestrator...') - - # Generate flexi experiment + # Generate relexi experiment exp = Experiment('relexi', launcher=self.type) # Initialize the orchestrator based on the orchestrator_type From f60257f3b880f7c7fd570d69624ba930d0160b50 Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Mon, 22 Jul 2024 15:10:29 +0200 Subject: [PATCH 17/20] Multiple improvements. Use SmartSIm functionality to get Scheduler Hosts, get number of slots for each host (for heterogeneous jobs) and add extensive testing suite for Runtime and LaunchConfig classes, and adapt remaining test to new implementation. --- src/relexi/runtime/launch_configuration.py | 2 + src/relexi/runtime/runtime.py | 128 ++++++++++----------- tests/test_env_flexi.py | 15 ++- tests/test_helpers.py | 29 ----- tests/test_launch_configuration.py | 99 ++++++++++++++++ tests/test_runtime.py | 125 ++++++++++++++++++++ 6 files changed, 296 insertions(+), 102 deletions(-) delete mode 100644 tests/test_helpers.py create mode 100644 tests/test_launch_configuration.py create mode 100644 tests/test_runtime.py diff --git a/src/relexi/runtime/launch_configuration.py b/src/relexi/runtime/launch_configuration.py index ed65dd4..78ee0e2 100644 --- a/src/relexi/runtime/launch_configuration.py +++ b/src/relexi/runtime/launch_configuration.py @@ -231,6 +231,8 @@ def _distribute_workers_slurm( List[List[str]]: List of lists containing the hostnames for each executable. """ + if sum(n_procs) > procs_avail: + raise RuntimeError('Failed to distribute models to resources!') procs_per_worker = procs_avail//len(workers) nodes_avail = workers slurm_hosts_per_exe = [] diff --git a/src/relexi/runtime/runtime.py b/src/relexi/runtime/runtime.py index 8046c15..163220a 100644 --- a/src/relexi/runtime/runtime.py +++ b/src/relexi/runtime/runtime.py @@ -76,7 +76,8 @@ def __init__( self, type_: Optional[str] = 'auto', db_network_interface: Optional[str] = 'lo', - db_port: Optional[int] = 6790 + db_port: Optional[int] = 6790, + do_launch_orchestrator: Optional[bool] = True ): """Initialize the Runtime. @@ -88,6 +89,8 @@ def __init__( the Orchestrator. Defaults to `'lo'`. db_port (int, optional): Port to start the Orchestrator on. Defaults to `6790`. + do_launch_orchestrator (bool, optional): Whether to launch the + `Orchestrator` immediately. Defaults to `True`. 
""" # Using SmartSim utility to identify type automatically try: @@ -101,9 +104,12 @@ def __init__( rlxout.info(f'Setting up "{self.type}" runtime...') self._hosts = self._get_hostlist() + # Check that actually sufficient hosts found + if type_ != 'local' and len(self._hosts) < 2: + raise ValueError('Less than 2 hosts found in environment!') except Exception as e: rlxout.warning(f'Failed: {e}') - if self.type != 'local': + if type_ != 'local': rlxout.info('Trying to setup LOCAL runtime instead...', newline=False) try: self.type = 'local' @@ -114,11 +120,17 @@ def __init__( raise RuntimeError('Failed to setup LOCAL training environment!') from e rlxout.info('Success!', newline=False) + self._exp = None self._db = None - self._exp, self._db, self._db_entry = self._launch_orchestrator( - port=db_port, - network_interface=db_network_interface, - ) + self._db_entry = None + if do_launch_orchestrator: + try: + self._exp, self._db, self._db_entry = self._launch_orchestrator( + port=db_port, + network_interface=db_network_interface, + ) + except Exception as e: + raise RuntimeError('Failed to launch the Orchestrator!') from e self.launch_config = None self.n_worker_slots = self._get_total_worker_slots() @@ -287,7 +299,7 @@ def head(self) -> str: Returns: str: Hostname of the Head node. """ - return self.hosts[0] + return self._get_local_hostname() @property def workers(self) -> List[str]: @@ -374,12 +386,10 @@ def _launch_orchestrator( return exp, db, f'{db_entry}:{port}' + def _get_hostlist(self) -> List[str]: """Get the list of hosts the script is executed on. - Uses the runtime type to determine the hostlist via the environment - variables set by the corresponding scheduler environment. - Returns: list: List containing the hostnames as strings. @@ -390,53 +400,55 @@ def _get_hostlist(self) -> List[str]: if self.type == 'local': return [self._get_local_hostname()] if self.type == 'pbs': - nodes = self._get_hostlist_pbs() - # Get the list of unique nodes via casting into set and list again - return list(set(nodes)) + return smartsim.wlm.pbs.get_hosts() if self.type == 'slurm': - return self._get_hostlist_slurm() + return smartsim.wlm.slurm.get_hosts() raise NotImplementedError( f'Method `get_hostlist` not implemented for runtime "{self.type}"!') - def _get_hostlist_pbs(self) -> List[str]: - """Read the `PBS_NODEFILE` and return the list of nodes. + def _get_slots_per_node_slurm(self) -> List[int]: + """Get the number of slots per node for the SLURM scheduler. + + Returns: + list(int): List containing the number of slots per node. + """ + if self.type != 'slurm': + raise ValueError('Method only available for SLURM scheduler!') + # 1. Get the nodelist + slots = os.getenv('SLURM_JOB_CPUS_PER_NODE') + if slots is None: + raise ValueError("SLURM_JOB_CPUS_PER_NODE is not set!") + # 2. split all entries at comma + nodelist = slots.split(',') + # 3. expand all compressed entries + expanded_list = [] + for entry in nodelist: + if '(' in entry: + num_cpus, count = entry.split('(x') + num_cpus = int(num_cpus) + count = int(count[:-1]) # remove trailing ')' + expanded_list.extend([num_cpus] * count) + else: + expanded_list.append(int(entry)) + return expanded_list - NOTE: - The `PBS_NODEFILE` contains the list of nodes allocated to the job. - If a node provides multiple MPI slots, it is the corresponding - number of times in the file. + def _get_slots_per_node_pbs(self) -> List[int]: + """Get the number of slots per node for the PBS scheduler. Returns: - list: List containing the hostnames as strings. 
+ list(int): List containing the number of slots per node. """ if self.type != 'pbs': - raise ValueError('Method "read_pbs_nodefile" only available for PBS scheduler!') + raise ValueError('Method only available for PBS scheduler!') + # 1. Get the nodelist node_file = os.getenv('PBS_NODEFILE') if node_file is None: raise KeyError('Environment variable "PBS_NODEFILE" not found!') + # 2. Read the nodelist with open(node_file, 'r', encoding='utf-8') as f: - nodes = [line.strip() for line in f.readlines()] - return nodes - - def _get_hostlist_slurm(self) -> List[str]: - """Get the list of hosts from the SLURM_NODELIST environment variable. - - Returns: - list: List containing the unique hostnames as strings. - """ - if self.type != 'slurm': - raise ValueError('Method "get_slurm_nodelist" only available for SLURM scheduler!') - # Get the compressed list of nodes from SLURM_NODELIST - node_list = os.getenv('SLURM_NODELIST') - if node_list is None: - raise KeyError('Environment variable "SLURM_NODELIST" not found!') - # Use scontrol to expand the node list - result = subprocess.run(['scontrol', 'show', 'hostname', node_list], capture_output=True, text=True) - # Check if the command was successful - if result.returncode != 0: - raise RuntimeError(f'scontrol command failed: {result.stderr.strip()}') - # Split the output into individual hostnames - return result.stdout.strip().split('\n') + nodes = [line.strip().split('.')[0] for line in f.readlines()] + # 3. Count the number of slots (i.e. lines) per node + return [nodes.count(host) for host in self.hosts] def _get_total_worker_slots(self) -> int: """Get the total number of worker slots available in the runtime. @@ -447,30 +459,12 @@ def _get_total_worker_slots(self) -> int: if self.type == 'local': # Leave one core for the head node return os.cpu_count()-1 - if self.type == 'pbs': - # Get all slots from PBS_NODEFILE. Multiple slots per node - # correspond to multiple entries in the file. - nodes = self._get_hostlist_pbs() - slots = np.zeros(len(self.workers), dtype=int) - for i, worker in enumerate(self.workers): - slots[i] = nodes.count(worker) - return np.sum(slots) - + slots_per_node = self._get_slots_per_node_pbs() + return np.sum(slots_per_node[1:]) if self.type == 'slurm': - # Get the number of slots per node from SLURM_CPUS_PER_NODE - slots = os.getenv('SLURM_JOB_CPUS_PER_NODE') - if slots is None: - raise KeyError('Environment variable "SLURM_JOB_CPUS_PER_NODE" not found!') - # Parse only first integer part from string (has form '24(x4)'). - slots_int = "" - for char in slots: - if char.isdigit(): - slots_int += char - else: - break - return int(slots_int)*len(self.workers) - + slots_per_node = self._get_slots_per_node_slurm() + return np.sum(slots_per_node[1:]) raise NotImplementedError( f'Method `get_slots_per_worker` not implemented for runtime "{self.type}"!') @@ -480,4 +474,4 @@ def _get_local_hostname(self) -> str: Returns: str: Hostname of the local machine executing the script. 
""" - return socket.gethostname() + return socket.gethostname().split('.')[0] diff --git a/tests/test_env_flexi.py b/tests/test_env_flexi.py index 7fbf311..85d7a51 100644 --- a/tests/test_env_flexi.py +++ b/tests/test_env_flexi.py @@ -1,19 +1,22 @@ #!/usr/bin/env python3 +import os.path +import unittest +from unittest.mock import patch + +from smartsim import Experiment + from .context import relexi import relexi.env.flexiEnvSmartSim as rlxenv -from smartsim import Experiment -from unittest.mock import patch -import os.path from relexi.env.flexiEnvSmartSim import Client """ Contains pytest - tests for the functionalities of the relexi.env.flexiEnv module """ @patch.object(rlxenv.flexiEnv,'_start_flexi') @patch.object(rlxenv.flexiEnv,'_get_current_state') -@patch.object(rlxenv.flexiEnv,'_end_flexi') +@patch.object(rlxenv.flexiEnv,'stop') @patch('relexi.env.flexiEnvSmartSim.Client') -def init_flexi_env(mock__start_flexi, mock__get_current_state, mock__end_flexi, mock_Client): +def init_flexi_env(mock__start_flexi, mock__get_current_state, mock_stop, mock_Client): smartsim_port = 6780 smartsim_num_dbs = 1 @@ -59,7 +62,6 @@ def init_flexi_env(mock__start_flexi, mock__get_current_state, mock__end_flexi, ,tag = 'eval' ,port = smartsim_port ,entry_db = entry_db - ,is_db_cluster = is_db_cluster ,hosts = worker_nodes ,n_procs = num_procs_per_environment ,n_envs = num_parallel_environments @@ -77,6 +79,7 @@ def init_flexi_env(mock__start_flexi, mock__get_current_state, mock__end_flexi, return flexi_env +@unittest.skip('Has to be adapted to new Runtime implementation.') @patch('os.path.isfile') @patch('os.access') @patch.object(Experiment, 'start') diff --git a/tests/test_helpers.py b/tests/test_helpers.py deleted file mode 100644 index e2efce3..0000000 --- a/tests/test_helpers.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python3 - -from .context import relexi -import relexi.smartsim.helpers as rlxhelpers - -""" Contains pytest - tests for the functionalities of the relexi.smartsim.helpers module """ - -def test_generate_rankfile_ompi(tmp_path): - - hosts = ["r1n1c1n1", "r1n1c1n2"] - cores_per_node = 4 - n_par_env = 4 - ranks_per_env = 2 - base_path = tmp_path - - expected = list() - - expected.append("rank 0=r1n1c1n1 slot=0\nrank 1=r1n1c1n1 slot=1") - expected.append("rank 0=r1n1c1n1 slot=2\nrank 1=r1n1c1n1 slot=3") - expected.append("rank 0=r1n1c1n2 slot=0\nrank 1=r1n1c1n2 slot=1") - expected.append("rank 0=r1n1c1n2 slot=2\nrank 1=r1n1c1n2 slot=3") - - rankfiles_out = rlxhelpers.generate_rankfile_ompi(hosts, cores_per_node, n_par_env, ranks_per_env, base_path) - - i = 0 - for rankfile in rankfiles_out: - with open(rankfile, 'r') as fh: - assert fh.read().rstrip()==expected[i], f"Rankfile for rank {i} is wrong" - i = i+1 diff --git a/tests/test_launch_configuration.py b/tests/test_launch_configuration.py new file mode 100644 index 0000000..1cfc6a5 --- /dev/null +++ b/tests/test_launch_configuration.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 + +"""Test for the launchconfig module based on `unittest` package.""" + +import unittest + +from .context import relexi +from relexi.runtime import LaunchConfig, Runtime + +class TestLaunchConfig(unittest.TestCase): + """Tests for the launchconfig module.""" + + #@unittest.mock.patch('runtime.Runtime.n_worker_slots', return_value=4) + #def test_launchconfig_init_local(self): + # """Test init of launchconfig with database.""" + # launchconfig = LaunchConfig(type_='local') + # assert launchconfig is not None + # assert launchconfig.type == 'local' + + def 
test_generate_rankfile_ompi(self): + """Test generate_rankfile_ompi.""" + # Specify configuration + workers = ['r1n1c1n1', 'r1n1c1n2'] + n_slots_per_worker = 4 + n_par_env = 4 + n_procs = [2, 2, 1, 3] + # Generate rankfiles + rankfiles = LaunchConfig._generate_rankfile_ompi( + workers, + n_slots_per_worker, + n_par_env, + n_procs + ) + # Prepare expected content + expected = [ + 'rank 0=r1n1c1n1 slot=0\nrank 1=r1n1c1n1 slot=1', + 'rank 0=r1n1c1n1 slot=2\nrank 1=r1n1c1n1 slot=3', + 'rank 0=r1n1c1n2 slot=0', + 'rank 0=r1n1c1n2 slot=1\nrank 1=r1n1c1n2 slot=2\nrank 2=r1n1c1n2 slot=3' + ] + # Check that rankfiles are correct + for i, rankfile in enumerate(rankfiles): + with open(rankfile, 'r', encoding='utf-8') as f: + file_lines = f.read().rstrip() + print(f'Rankfile {i}:\n{file_lines}') + assert file_lines == expected[i] + + def test_distribute_workers_slurm_1(self): + """Test distribute_workers_slurm.""" + # Specify configuration + n_procs = [2, 2, 1, 3] + n_exe = 4 + workers = ['r1n1c1n1', 'r1n1c1n2'] + procs_avail = 8 + # Distribute workers + hosts_per_exe = LaunchConfig._distribute_workers_slurm( + n_procs, + n_exe, + workers, + procs_avail + ) + # Check that workers are correct + expected = [['r1n1c1n1'], ['r1n1c1n1'], ['r1n1c1n2'], ['r1n1c1n2']] + assert expected == hosts_per_exe + + def test_distribute_workers_slurm_2(self): + """Test distribute_workers_slurm.""" + # Specify configuration + n_procs = [3, 3] + n_exe = 2 + workers = ['r1n1c1n1', 'r1n1c1n2'] + procs_avail = 8 + # Distribute workers + hosts_per_exe = LaunchConfig._distribute_workers_slurm( + n_procs, + n_exe, + workers, + procs_avail + ) + # Check that workers are correct + expected = [['r1n1c1n1'], ['r1n1c1n2']] + assert expected == hosts_per_exe + + def test_distribute_workers_slurm_3(self): + """Test errors raised when not sufficient resources.""" + # Specify configuration + n_procs = [3, 3] + n_exe = 2 + workers = ['r1n1c1n1', 'r1n1c1n2'] + procs_avail = 4 + # Check that workers are correct + self.assertRaises( + RuntimeError, + LaunchConfig._distribute_workers_slurm, + n_procs, + n_exe, + workers, + procs_avail + ) diff --git a/tests/test_runtime.py b/tests/test_runtime.py new file mode 100644 index 0000000..8e59da4 --- /dev/null +++ b/tests/test_runtime.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 + +"""Tests for the runtime module based on `unittest` package.""" + +import os +import socket +import unittest +from unittest import mock + +import smartsim + +from .context import relexi +from relexi.runtime import Runtime + + +class TestRuntime(unittest.TestCase): + """Tests for the runtime module.""" + def test_runtime_auto(self): + """Test auto-detection of runtime type.""" + runtime = Runtime(type_='auto', do_launch_orchestrator=False) + assert runtime is not None + assert runtime.type == smartsim.wlm.detect_launcher() + + def test_runtime_init(self): + """Test init of runtime with database.""" + runtime = Runtime(type_='local') + assert runtime is not None + assert runtime.type == 'local' + + def test_runtime_info_local(self): + """For Info, we just want to check if it runs without errors.""" + try: + runtime = Runtime(type_='local') + runtime.info() + except Exception as e: + self.fail(f"Runtime.info() raised an exception: {e}") + + def test_runtime_env_local(self): + runtime = Runtime(type_='local', db_port=6780) + assert runtime is not None + assert runtime.type == 'local' + assert not runtime.is_distributed + assert runtime.hosts == [runtime.head] + assert runtime.hosts == runtime.workers + assert 
runtime.db_entry == '127.0.0.1:6780' + assert runtime.db is not None + assert runtime.exp is not None + + @mock.patch.dict(os.environ, {'SLURM_JOB_CPUS_PER_NODE': '4(x2),8,16(x2)'}) + def test_runtime_init_slurm(self): + """Test setup based on mocked SLURM_JOB_CPUS_PER_NODE.""" + # Also mock retrieval of hostnames, since smartsim util needs + # `scontrol` for that, which is not installed on non-SLURM systems. + with mock.patch('smartsim.wlm.slurm.get_hosts', + return_value=['node1', 'node2', 'node3', 'node4', 'node5']): + # Set localhost to correct name + with mock.patch('socket.gethostname', return_value='node1'): + runtime = Runtime(type_='slurm', do_launch_orchestrator=False) + assert runtime is not None + assert runtime.is_distributed + assert runtime.type == 'slurm' + assert runtime.hosts == ['node1', 'node2', 'node3', 'node4', 'node5'] + assert runtime.workers == ['node2','node3', 'node4', 'node5'] + assert runtime.head == 'node1' + assert runtime.n_worker_slots == 44 + assert runtime._get_slots_per_node_slurm() == [4, 4, 8, 16, 16] + + @mock.patch.dict(os.environ, {'PBS_NODEFILE': '.nodefile.mock'}) + def test_runtime_env_pbs_1(self): + """Test setup based on mocked nodefile.""" + # Prepare rank file + with open('.nodefile.mock', 'w', encoding='utf-8') as f: + f.write('node1\nnode2\nnode2\nnode2\nnode3\nnode3\n') + # Set localhost to correct name + with mock.patch('socket.gethostname', return_value='node1'): + runtime = Runtime(type_='pbs', do_launch_orchestrator=False) + assert runtime is not None + assert runtime.is_distributed + assert runtime.type == 'pbs' + assert runtime.hosts == ['node1', 'node2', 'node3'] + assert runtime.workers == ['node2','node3'] + assert runtime.head == 'node1' + assert runtime.n_worker_slots == 5 + assert runtime._get_slots_per_node_pbs() == [1, 3, 2] + + @mock.patch.dict(os.environ, {'PBS_NODEFILE': '.nodefile.mock'}) + def test_runtime_env_pbs_2(self): + """Test long-form hostnames in nodefile.""" + # Prepare rank file + with open('.nodefile.mock', 'w', encoding='utf-8') as f: + f.write('node1.some.thing\nnode1.some.thing\nnode2.some.thing\nnode2.some.thing\nnode2.some.thing\nnode3.some.thing\n') + # Set localhost to correct name + with mock.patch('socket.gethostname', return_value='node1'): + runtime = Runtime(type_='pbs', do_launch_orchestrator=False) + assert runtime is not None + assert runtime.is_distributed + assert runtime.type == 'pbs' + assert runtime.hosts == ['node1', 'node2', 'node3'] + assert runtime.workers == ['node2','node3'] + assert runtime.head == 'node1' + assert runtime.n_worker_slots == 4 + assert runtime._get_slots_per_node_pbs() == [2, 3, 1] + + @mock.patch.dict(os.environ, {'PBS_NODEFILE': '.nodefile.mock'}) + def test_runtime_env_pbs_3(self): + """Test fallback to 'local' mode for empty nodefile""" + # Prepare rank file + with open ('.nodefile.mock', 'w', encoding='utf-8') as f: + f.write('') + runtime = Runtime(type_='pbs', do_launch_orchestrator=False) + assert runtime is not None + assert runtime.type == 'local' + assert not runtime.is_distributed + assert runtime.hosts == [runtime.head] + assert runtime.hosts == runtime.workers + + def test_runtime_init_wrong(self): + """Test fallback for invalid runtime type""" + runtime = Runtime(type_='wrong') + assert runtime.type == 'local' + assert not runtime.is_distributed + assert runtime.hosts == [runtime.head] + assert runtime.hosts == runtime.workers + assert runtime.db is not None + assert runtime.exp is not None From ab0c2f3a3bb7693f3f35cca5dcbf4fb92588461e Mon Sep 17 
00:00:00 2001 From: Marius Kurz Date: Mon, 22 Jul 2024 15:21:08 +0200 Subject: [PATCH 18/20] Update dependencies because of known bugs in setuptools-70.*. Makes !26 redundant. --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4c6712e..8363819 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ smartsim>=0.4,<0.7 smartredis -tensorflow>=2.9,<2.16 +tensorflow>=2.15,<2.16 tf-agents cmake pyyaml @@ -8,3 +8,4 @@ matplotlib pdoc pytest pytest-cov +setuptools!=70.* From 6304690c9c6bd0916c475c02e7e3298b163b1c32 Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Wed, 24 Jul 2024 08:52:18 +0200 Subject: [PATCH 19/20] Add network_interface for Orchestrator as user parameter --- examples/HIT_24_DOF/prm.yaml | 7 ++----- examples/HIT_32_DOF/prm.yaml | 7 ++----- src/relexi/rl/ppo/train.py | 8 ++++---- 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/examples/HIT_24_DOF/prm.yaml b/examples/HIT_24_DOF/prm.yaml index 0124dbd..bffbfeb 100644 --- a/examples/HIT_24_DOF/prm.yaml +++ b/examples/HIT_24_DOF/prm.yaml @@ -14,8 +14,6 @@ environment: executable_path: ../../../flexi-extensions/build/bin/flexi parameter_file: ./simulation_files/parameter_flexi.ini mesh_file: ./simulation_files/CART_HEX_PERIODIC_004_mesh.h5 - #local_dir: /var/tmp - mpi_launch_mpmd: False env_launcher: local # All parameters for setting up the reward @@ -64,6 +62,5 @@ performance: # SmartSim smartsim: smartsim_port: 6780 - smartsim_num_dbs: 1 - smartsim_launcher: pbs - smartsim_orchestrator: pbs + smartsim_network_interface: local + smartsim_orchestrator: auto diff --git a/examples/HIT_32_DOF/prm.yaml b/examples/HIT_32_DOF/prm.yaml index 70c3368..ee16652 100644 --- a/examples/HIT_32_DOF/prm.yaml +++ b/examples/HIT_32_DOF/prm.yaml @@ -14,8 +14,6 @@ environment: executable_path: ../../../flexi-extensions/build/bin/flexi parameter_file: ./simulation_files/parameter_flexi.ini mesh_file: ./simulation_files/CART_HEX_PERIODIC_004_mesh.h5 - #local_dir: /var/tmp - mpi_launch_mpmd: False env_launcher: local # All parameters for setting up the reward @@ -64,6 +62,5 @@ performance: # SmartSim smartsim: smartsim_port: 6780 - smartsim_num_dbs: 1 - smartsim_launcher: pbs - smartsim_orchestrator: pbs + smartsim_network_interface: local + smartsim_orchestrator: auto diff --git a/src/relexi/rl/ppo/train.py b/src/relexi/rl/ppo/train.py index eb08961..f2bacc7 100644 --- a/src/relexi/rl/ppo/train.py +++ b/src/relexi/rl/ppo/train.py @@ -62,9 +62,9 @@ def train( config_file ,do_profile = False ,smartsim_port = 6780 ,smartsim_num_dbs = 1 - ,smartsim_launcher = "local" - ,smartsim_orchestrator = "local" - ,env_launcher = "mpirun" + ,smartsim_orchestrator = 'local' + ,smartsim_network_interface = 'local' + ,env_launcher = 'mpirun' ,mpi_launch_mpmd = False ,local_dir = None ,n_procs_per_node=128 # Hawk @@ -110,7 +110,7 @@ def train( config_file runtime = relexi.runtime.Runtime( type_=smartsim_orchestrator, db_port=smartsim_port, - db_network_interface='ibp94s0', + db_network_interface=smartsim_network_interface, ) runtime.info() From 03f574a218fce1de5b98bff6b59b63cdd3e6ffea Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Wed, 24 Jul 2024 09:18:51 +0200 Subject: [PATCH 20/20] Fix bug when using "auto" environment detection in a "local" environment. 
--- src/relexi/runtime/runtime.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/relexi/runtime/runtime.py b/src/relexi/runtime/runtime.py index 163220a..3e91da4 100644 --- a/src/relexi/runtime/runtime.py +++ b/src/relexi/runtime/runtime.py @@ -92,8 +92,8 @@ def __init__( do_launch_orchestrator (bool, optional): Whether to launch the `Orchestrator` immediately. Defaults to `True`. """ - # Using SmartSim utility to identify type automatically try: + # Using SmartSim utility to identify type automatically if type_ == 'auto': rlxout.info('Identifying environment...') scheduler = smartsim.wlm.detect_launcher() @@ -105,7 +105,7 @@ def __init__( rlxout.info(f'Setting up "{self.type}" runtime...') self._hosts = self._get_hostlist() # Check that actually sufficient hosts found - if type_ != 'local' and len(self._hosts) < 2: + if self.type != 'local' and len(self._hosts) < 2: raise ValueError('Less than 2 hosts found in environment!') except Exception as e: rlxout.warning(f'Failed: {e}')
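
The node-placement logic that this series introduces in `LaunchConfig._distribute_workers_slurm` (including the slot-accounting fix from PATCH 15/20) can be summarized by the following standalone sketch. This is an illustrative sketch only, assuming homogeneous worker nodes; the host names and process counts in the usage example are hypothetical and not taken from the patches.

#!/usr/bin/env python3
# Sketch of the two SLURM placement strategies used by LaunchConfig:
# either an executable spans one or more whole nodes, or several
# executables are packed onto a node, never splitting one executable
# across partial nodes.
import math
from typing import List


def distribute_workers_slurm(n_procs: List[int], workers: List[str],
                             procs_avail: int) -> List[List[str]]:
    """Return the list of host names assigned to each executable."""
    if sum(n_procs) > procs_avail:
        raise RuntimeError('Failed to distribute models to resources!')
    procs_per_worker = procs_avail // len(workers)
    nodes_avail = list(workers)
    hosts_per_exe = []
    if max(n_procs) > procs_per_worker:
        # Strategy 1: each executable gets one or more whole nodes.
        for n in n_procs:
            n_nodes_req = math.ceil(n / procs_per_worker)
            hosts_per_exe.append([nodes_avail.pop(0) for _ in range(n_nodes_req)])
    else:
        # Strategy 2: pack executables onto nodes; move to the next node
        # once the remaining slots on the current one are exhausted.
        cores_avail = procs_per_worker
        for n in n_procs:
            if n > cores_avail:
                if len(nodes_avail) <= 1:
                    raise RuntimeError('Failed to distribute models to resources!')
                nodes_avail.pop(0)
                cores_avail = procs_per_worker
            cores_avail -= n
            hosts_per_exe.append([nodes_avail[0]])
    return hosts_per_exe


if __name__ == '__main__':
    # Four executables with 2, 2, 1 and 3 ranks on two 4-slot nodes.
    print(distribute_workers_slurm([2, 2, 1, 3], ['node1', 'node2'], 8))
    # -> [['node1'], ['node1'], ['node2'], ['node2']]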