Skip to content

Commit

Permalink
Minor touchups for code reviews
Browse files Browse the repository at this point in the history
Signed-off-by: Fabrice Normandin <[email protected]>
  • Loading branch information
lebrice committed Apr 25, 2024
1 parent 23df2c8 commit 28726b2
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 6 deletions.
2 changes: 1 addition & 1 deletion milatools/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ def hoststring(self, host: str) -> str:
def get_hostname_to_use_for_compute_node(
node_name: str, cluster: str = "mila", ssh_config_path: Path = SSH_CONFIG_FILE
) -> str:
"""Return the hostname to use to connect to this compute note via ssh."""
"""Return the hostname to use to connect to this compute node via ssh."""
if not ssh_config_path.exists():
# If the SSH config file doesn't exist, we can't do much.
raise MilatoolsUserError(
Expand Down
14 changes: 9 additions & 5 deletions milatools/utils/compute_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,21 @@ class ComputeNode(Runner):
"""

login_node: RemoteV2
"""The login node of the SLURM cluster."""

job_id: int
"""The job ID of the job running on the compute node."""

salloc_subprocess: asyncio.subprocess.Process | None = None
"""A handle to the subprocess that is running the `salloc` command."""

_closed: bool = dataclasses.field(default=False, init=False, repr=False)

def __post_init__(self):
# The hostname will be that of the compute node, not the login node.
# NOTE: On DRAC clusters, we don't actually want the full hostname here, because
# the SSH config we make with `mila init` matches the node names like `cdr****`,
# but not the fully qualified hostnames (`cdr2052.int.cedar.computecanada.ca`).
cluster = self.login_node.hostname
node_name = self.get_output("echo $SLURMD_NODENAME", display=False, hide=True)
# We show the hostname of the compute node during commands, even though we're
# actually running `ssh <login-node> srun --jobid <job-id> <command>`
cluster = self.login_node.hostname
self.hostname: str = get_hostname_to_use_for_compute_node(
node_name,
cluster=cluster,
Expand Down

0 comments on commit 28726b2

Please sign in to comment.