From 28726b2a78be1d542b076123567389bcff54caa2 Mon Sep 17 00:00:00 2001 From: Fabrice Normandin Date: Thu, 25 Apr 2024 10:21:05 -0400 Subject: [PATCH] Minor touchups for code reviews Signed-off-by: Fabrice Normandin --- milatools/cli/utils.py | 2 +- milatools/utils/compute_node.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/milatools/cli/utils.py b/milatools/cli/utils.py index e210106b..dfe32b45 100644 --- a/milatools/cli/utils.py +++ b/milatools/cli/utils.py @@ -264,7 +264,7 @@ def hoststring(self, host: str) -> str: def get_hostname_to_use_for_compute_node( node_name: str, cluster: str = "mila", ssh_config_path: Path = SSH_CONFIG_FILE ) -> str: - """Return the hostname to use to connect to this compute note via ssh.""" + """Return the hostname to use to connect to this compute node via ssh.""" if not ssh_config_path.exists(): # If the SSH config file doesn't exist, we can't do much. raise MilatoolsUserError( diff --git a/milatools/utils/compute_node.py b/milatools/utils/compute_node.py index f1894e33..36b88cc8 100644 --- a/milatools/utils/compute_node.py +++ b/milatools/utils/compute_node.py @@ -45,17 +45,21 @@ class ComputeNode(Runner): """ login_node: RemoteV2 + """The login node of the SLURM cluster.""" + job_id: int + """The job ID of the job running on the compute node.""" + salloc_subprocess: asyncio.subprocess.Process | None = None + """A handle to the subprocess that is running the `salloc` command.""" + _closed: bool = dataclasses.field(default=False, init=False, repr=False) def __post_init__(self): - # The hostname will be of the compute node, not the login node. - # NOTE: On DRAC clusters, we don't actually want the full hostname here, because - # the SSH config we make with `mila init` matches the node names like `cdr****`, - # but not the fully qualified hostnames (`cdr2052.int.cedar.computecanada.ca`). 
- cluster = self.login_node.hostname node_name = self.get_output("echo $SLURMD_NODENAME", display=False, hide=True) + # We show the hostname of the compute node during commands, even though we're + # actually running `ssh <login_node> srun --jobid <job_id>` + cluster = self.login_node.hostname self.hostname: str = get_hostname_to_use_for_compute_node( node_name, cluster=cluster,