Cancel jobs explicitly in test_slurm_remote.py
Signed-off-by: Fabrice Normandin <[email protected]>
lebrice committed Apr 23, 2024
1 parent 6a8149d commit 1b6f095
Showing 2 changed files with 71 additions and 66 deletions.
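Both tests now wrap the assertions that run after a job has been submitted in a try/finally block, so the SLURM job is cancelled with scancel even if an assertion fails partway through. A minimal sketch of that pattern, assuming hypothetical submit_test_job and check_job helpers in place of the real test logic (only login_node.run and scancel are taken from the diff below):

    def run_job_test(login_node):
        # Hypothetical helper: submits a job on the cluster and returns its SLURM job id.
        job_id = submit_test_job(login_node)
        try:
            # Hypothetical helper: assertions about the job's state; any of them may raise.
            check_job(login_node, job_id)
        finally:
            # Always cancel the job so a failed assertion doesn't leave it running.
            login_node.run(f"scancel {job_id}", display=True)
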
28 changes: 16 additions & 12 deletions tests/integration/test_code_command.py
@@ -16,6 +16,7 @@
from milatools.utils.remote_v2 import RemoteV2

from ..cli.common import skip_param_if_on_github_cloud_ci
from ..conftest import launches_jobs
from .conftest import (
    skip_if_not_already_logged_in,
    skip_param_if_not_already_logged_in,
@@ -63,6 +64,7 @@ def test_check_disk_quota(


@pytest.mark.slow
@launches_jobs
@PARAMIKO_SSH_BANNER_BUG
@pytest.mark.parametrize("persist", [True, False])
def test_code(
@@ -128,15 +130,17 @@ def test_code(
    # before submitting the job)
    workdir = job_info["WorkDir"]
    assert workdir == scratch

    if persist:
        # Job should still be running since we're using `persist` (that's the whole
        # point.)
        assert job_info["State"] == "RUNNING"
    else:
        # Job should have been cancelled by us after the `echo` process finished.
        # NOTE: This check is a bit flaky, perhaps our `scancel` command hasn't
        # completed yet, or sacct doesn't show the change in status quick enough.
        # Relaxing it a bit for now.
        # assert "CANCELLED" in job_info["State"]
        assert "CANCELLED" in job_info["State"] or job_info["State"] == "RUNNING"
    try:
        if persist:
            # Job should still be running since we're using `persist` (that's the whole
            # point.)
            assert job_info["State"] == "RUNNING"
        else:
            # Job should have been cancelled by us after the `echo` process finished.
            # NOTE: This check is a bit flaky, perhaps our `scancel` command hasn't
            # completed yet, or sacct doesn't show the change in status quick enough.
            # Relaxing it a bit for now.
            # assert "CANCELLED" in job_info["State"]
            assert "CANCELLED" in job_info["State"] or job_info["State"] == "RUNNING"
    finally:
        login_node.run(f"scancel {job_id}", display=True)
109 changes: 55 additions & 54 deletions tests/integration/test_slurm_remote.py
@@ -23,7 +23,7 @@

from ..cli.common import on_windows
from ..conftest import launches_jobs
from .conftest import MAX_JOB_DURATION, SLURM_CLUSTER, hangs_in_github_CI
from .conftest import SLURM_CLUSTER, hangs_in_github_CI

logger = get_logger(__name__)

@@ -219,52 +219,46 @@ def test_ensure_allocation(
"""
data, remote_runner = salloc_slurm_remote.ensure_allocation()
assert isinstance(remote_runner, fabric.runners.Remote)

assert "node_name" in data
# NOTE: it very well could be if we also extracted it from the salloc output.
assert "jobid" not in data
compute_node_from_salloc_output = data["node_name"]

# Check that salloc was called. This would be printed to stderr by fabric if we were
# using `run`, but `ensure_allocation` uses `extract` which somehow doesn't print it
salloc_stdout, salloc_stderr = capsys.readouterr()
# assert not stdout
# assert not stderr
assert "salloc: Granted job allocation" in salloc_stdout
assert (
f"salloc: Nodes {compute_node_from_salloc_output} are ready for job"
in salloc_stdout
)
assert not salloc_stderr

# Check that the returned remote runner is indeed connected to a *login* node (?!)
# NOTE: This is a fabric.runners.Remote, not a Remote or SlurmRemote of milatools,
# so calling `.run` doesn't launch a job.
result = remote_runner.run("hostname", echo=True, in_stream=False)
assert result
hostname_from_remote_runner = result.stdout.strip()
# BUG: If the `login_node` is a RemoteV2, then this will fail because the hostname
# might differ between the two (multiple login nodes in the Mila cluster).
result2 = login_node.run("hostname", display=True, hide=False)
assert result2
hostname_from_login_node_runner = result2.stdout.strip()

if isinstance(login_node, RemoteV2) and login_node.hostname == "mila":
assert hostname_from_remote_runner.startswith("login-")
assert hostname_from_login_node_runner.startswith("login-")
elif isinstance(login_node, RemoteV1):
assert hostname_from_remote_runner == hostname_from_login_node_runner

print(f"Sleeping for {MAX_JOB_DURATION.total_seconds()}s until job finishes...")
time.sleep(MAX_JOB_DURATION.total_seconds())

# todo: This check is flaky. (the test itself is outdated because it's for RemoteV1)
# sacct_output = get_recent_jobs_info(login_node, fields=("JobName", "Node", "State"))
# assert (JOB_NAME, compute_node_from_salloc_output, "COMPLETED") in sacct_output or (
# JOB_NAME,
# compute_node_from_salloc_output,
# "TIMEOUT",
# ) in sacct_output
try:
assert "node_name" in data
# NOTE: it very well could be if we also extracted it from the salloc output.
assert "jobid" not in data
compute_node_from_salloc_output = data["node_name"]

# Check that salloc was called. This would be printed to stderr by fabric if we were
# using `run`, but `ensure_allocation` uses `extract` which somehow doesn't print it
salloc_stdout, salloc_stderr = capsys.readouterr()
# assert not stdout
# assert not stderr
assert "salloc: Granted job allocation" in salloc_stdout
assert (
f"salloc: Nodes {compute_node_from_salloc_output} are ready for job"
in salloc_stdout
)
assert not salloc_stderr

# Check that the returned remote runner is indeed connected to a *login* node (?!)
# NOTE: This is a fabric.runners.Remote, not a Remote or SlurmRemote of milatools,
# so calling `.run` doesn't launch a job.
result = remote_runner.run("hostname", echo=True, in_stream=False)
assert result
hostname_from_remote_runner = result.stdout.strip()
# BUG: If the `login_node` is a RemoteV2, then this will fail because the hostname
# might differ between the two (multiple login nodes in the Mila cluster).
result2 = login_node.run("hostname", display=True, hide=False)
assert result2
hostname_from_login_node_runner = result2.stdout.strip()

if isinstance(login_node, RemoteV2) and login_node.hostname == "mila":
assert hostname_from_remote_runner.startswith("login-")
assert hostname_from_login_node_runner.startswith("login-")
elif isinstance(login_node, RemoteV1):
assert hostname_from_remote_runner == hostname_from_login_node_runner
finally:
result = remote_runner.run("echo $SLURM_JOB_ID", echo=True, in_stream=False)
assert result
job_id = int(result.stdout.strip())
login_node.run(f"scancel {job_id}", display=True, hide=False)


@launches_jobs
@@ -289,11 +283,18 @@ def test_ensure_allocation_sbatch(
node_hostname = job_data["node_name"]
assert "jobid" in job_data
job_id_from_sbatch_extract = job_data["jobid"]
try:
sleep_so_sacct_can_update()

sleep_so_sacct_can_update()

job_infos = get_recent_jobs_info(login_node, fields=("JobId", "JobName", "Node"))
# NOTE: `.extract`, used by `ensure_allocation`, actually returns the full node
# hostname as an output (e.g. cn-a001.server.mila.quebec), but sacct only shows the
# node name.
assert (job_id_from_sbatch_extract, job_name, node_hostname) in job_infos
job_infos = get_recent_jobs_info(
login_node, fields=("JobId", "JobName", "Node")
)
# NOTE: `.extract`, used by `ensure_allocation`, actually returns the full node
# hostname as an output (e.g. cn-a001.server.mila.quebec), but sacct only shows the
# node name.
assert (job_id_from_sbatch_extract, job_name, node_hostname) in job_infos
finally:
job_id_from_sbatch_extract = int(job_id_from_sbatch_extract)
login_node.run(
f"scancel {job_id_from_sbatch_extract}", display=True, hide=False
)
