diff --git a/deepmd/tf/cluster/__init__.py b/deepmd/tf/cluster/__init__.py
index 3c15778fe5..6735ce92f4 100644
--- a/deepmd/tf/cluster/__init__.py
+++ b/deepmd/tf/cluster/__init__.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
-"""Module that reads node resources, auto detects if running local or on SLURM."""
+"""Module that reads node resources; host names are gathered via MPI when available."""
 
-import os
 from typing import (
     List,
     Optional,
@@ -9,7 +8,6 @@
 )
 
 from .local import get_resource as get_local_res
-from .slurm import get_resource as get_slurm_res
 
 __all__ = ["get_resource"]
 
@@ -22,7 +20,4 @@ def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
     Tuple[str, List[str], Optional[List[int]]]
         nodename, nodelist, and gpus
     """
-    if "SLURM_JOB_NODELIST" in os.environ:
-        return get_slurm_res()
-    else:
-        return get_local_res()
+    return get_local_res()
diff --git a/deepmd/tf/cluster/local.py b/deepmd/tf/cluster/local.py
index bd0e4c86aa..60961a0d65 100644
--- a/deepmd/tf/cluster/local.py
+++ b/deepmd/tf/cluster/local.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 """Get local GPU resources."""
 
-import socket
 import subprocess as sp
 import sys
 from typing import (
@@ -13,6 +12,9 @@
 from deepmd.tf.env import (
     tf,
 )
+from deepmd.utils.hostlist import (
+    get_host_names,
+)
 
 __all__ = ["get_gpus", "get_resource"]
 
@@ -57,7 +59,6 @@ def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
     Tuple[str, List[str], Optional[List[int]]]
         nodename, nodelist, and gpus
     """
-    nodename = socket.gethostname()
-    nodelist = [nodename]
+    nodename, nodelist = get_host_names()
     gpus = get_gpus()
     return nodename, nodelist, gpus
diff --git a/deepmd/tf/cluster/slurm.py b/deepmd/tf/cluster/slurm.py
deleted file mode 100644
index 7a7ebcee3e..0000000000
--- a/deepmd/tf/cluster/slurm.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# SPDX-License-Identifier: LGPL-3.0-or-later
-"""MOdule to get resources on SLURM cluster.
-
-References
-----------
-https://github.com/deepsense-ai/tensorflow_on_slurm ####
-"""
-
-import os
-from typing import (
-    List,
-    Optional,
-    Tuple,
-)
-
-import hostlist
-
-from deepmd.tf.cluster import (
-    local,
-)
-
-__all__ = ["get_resource"]
-
-
-def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
-    """Get SLURM resources: nodename, nodelist, and gpus.
-
-    Returns
-    -------
-    Tuple[str, List[str], Optional[List[int]]]
-        nodename, nodelist, and gpus
-
-    Raises
-    ------
-    RuntimeError
-        if number of nodes could not be retrieved
-    ValueError
-        list of nodes is not of the same length sa number of nodes
-    ValueError
-        if current nodename is not found in node list
-    """
-    nodelist = hostlist.expand_hostlist(os.environ["SLURM_JOB_NODELIST"])
-    nodename = os.environ["SLURMD_NODENAME"]
-    num_nodes_env = os.getenv("SLURM_JOB_NUM_NODES")
-    if num_nodes_env:
-        num_nodes = int(num_nodes_env)
-    else:
-        raise RuntimeError("Could not get SLURM number of nodes")
-
-    if len(nodelist) != num_nodes:
-        raise ValueError(
-            f"Number of slurm nodes {len(nodelist)} not equal to {num_nodes}"
-        )
-    if nodename not in nodelist:
-        raise ValueError(
-            f"Nodename({nodename}) not in nodelist({nodelist}). This should not happen!"
-        )
-    gpus = local.get_gpus()
-    return nodename, nodelist, gpus
diff --git a/deepmd/utils/hostlist.py b/deepmd/utils/hostlist.py
new file mode 100644
index 0000000000..d09a8d8bf1
--- /dev/null
+++ b/deepmd/utils/hostlist.py
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import socket
+from typing import (
+    List,
+    Tuple,
+)
+
+
+def get_host_names() -> Tuple[str, List[str]]:
+    """Get host names of all nodes in the cluster.
+
+    If mpi4py is not installed or MPI is not used, then the
+    host name of the current node is returned as those of all nodes.
+
+    Returns
+    -------
+    str
+        Host name of the current node
+    List[str]
+        Sorted list of unique host names of all nodes in the cluster
+    """
+    host_name = socket.gethostname()
+    try:
+        from mpi4py import (
+            MPI,
+        )
+    except ImportError:
+        return host_name, [host_name]
+
+    comm = MPI.COMM_WORLD
+    if comm.Get_size() == 1:
+        return host_name, [host_name]
+    host_names = comm.allgather(host_name)
+    # Sort so that every rank sees the node list in the same order; the
+    # iteration order of a set of str is not stable across processes.
+    return host_name, sorted(set(host_names))
diff --git a/pyproject.toml b/pyproject.toml
index e4096b37b4..1701e10bb8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,7 +39,6 @@ dependencies = [
     'scipy',
     'pyyaml',
     'dargs >= 0.4.1',
-    'python-hostlist >= 1.21',
     'typing_extensions; python_version < "3.8"',
     'importlib_metadata>=1.4; python_version < "3.8"',
     'h5py',
diff --git a/source/tests/tf/test_cluster.py b/source/tests/tf/test_cluster.py
index 27526a3ccf..ea90e1ea6d 100644
--- a/source/tests/tf/test_cluster.py
+++ b/source/tests/tf/test_cluster.py
@@ -6,7 +6,6 @@
 
 from deepmd.tf.cluster import (
     local,
-    slurm,
 )
 
 kHostName = "compute-b24-1"
@@ -70,75 +69,3 @@ def test_resource(self, mock_gethostname):
         nodename, nodelist, _ = local.get_resource()
         self.assertEqual(nodename, kHostName)
         self.assertEqual(nodelist, [kHostName])
-
-
-class TestSlurm(unittest.TestCase):
-    @mock.patch.dict(
-        "os.environ",
-        values={
-            "SLURM_JOB_NODELIST": kHostName,
-            "SLURMD_NODENAME": kHostName,
-            "SLURM_JOB_NUM_NODES": "1",
-        },
-    )
-    def test_single(self):
-        nodename, nodelist, _ = slurm.get_resource()
-        self.assertEqual(nodename, kHostName)
-        self.assertEqual(nodelist, [kHostName])
-
-    @mock.patch.dict(
-        "os.environ",
-        values={
-            "SLURM_JOB_NODELIST": "compute-b24-[1-3,5-9],compute-b25-[4,8]",
-            "SLURMD_NODENAME": "compute-b24-2",
-            "SLURM_JOB_NUM_NODES": "10",
-        },
-    )
-    def test_multiple(self):
-        nodename, nodelist, _ = slurm.get_resource()
-        self.assertEqual(nodename, "compute-b24-2")
-        self.assertEqual(
-            nodelist,
-            [
-                "compute-b24-1",
-                "compute-b24-2",
-                "compute-b24-3",
-                "compute-b24-5",
-                "compute-b24-6",
-                "compute-b24-7",
-                "compute-b24-8",
-                "compute-b24-9",
-                "compute-b25-4",
-                "compute-b25-8",
-            ],
-        )
-
-    def test_illegal(self):
-        environ = {
-            "SLURM_JOB_NODELIST": "compute-b24-[3-5]",
-            "SLURMD_NODENAME": "compute-b24-4",
-        }
-        with mock.patch.dict("os.environ", environ):
-            with self.assertRaises(RuntimeError) as cm:
-                _ = slurm.get_resource()
-                self.assertIn("Could not get SLURM number", str(cm.exception))
-
-        environ = {
-            "SLURM_JOB_NODELIST": "compute-b24-1,compute-b25-2",
-            "SLURMD_NODENAME": "compute-b25-2",
-            "SLURM_JOB_NUM_NODES": "4",
-        }
-        with mock.patch.dict("os.environ", environ):
-            with self.assertRaises(ValueError) as cm:
-                _ = slurm.get_resource()
-                self.assertIn("Number of slurm nodes 2", str(cm.exception))
-
-        environ = {
-            "SLURM_JOB_NODELIST": "compute-b24-1,compute-b25-3",
-            "SLURMD_NODENAME": "compute-b25-2",
-            "SLURM_JOB_NUM_NODES": "2",
-        }
-        with mock.patch.dict("os.environ", environ):
-            with self.assertRaises(ValueError) as cm:
-                _ = slurm.get_resource()
-                self.assertIn("Nodename(compute-b25-2", str(cm.exception))