diff --git a/tests/integration/test_code_command.py b/tests/integration/test_code_command.py
index 6ca94650..d3727cb1 100644
--- a/tests/integration/test_code_command.py
+++ b/tests/integration/test_code_command.py
@@ -16,6 +16,7 @@
 from milatools.utils.remote_v2 import RemoteV2
 
 from ..cli.common import skip_param_if_on_github_cloud_ci
+from ..conftest import launches_jobs
 from .conftest import (
     skip_if_not_already_logged_in,
     skip_param_if_not_already_logged_in,
@@ -63,6 +64,7 @@ def test_check_disk_quota(
 
 
 @pytest.mark.slow
+@launches_jobs
 @PARAMIKO_SSH_BANNER_BUG
 @pytest.mark.parametrize("persist", [True, False])
 def test_code(
@@ -128,15 +130,17 @@ def test_code(
     # before submitting the job)
     workdir = job_info["WorkDir"]
     assert workdir == scratch
-
-    if persist:
-        # Job should still be running since we're using `persist` (that's the whole
-        # point.)
-        assert job_info["State"] == "RUNNING"
-    else:
-        # Job should have been cancelled by us after the `echo` process finished.
-        # NOTE: This check is a bit flaky, perhaps our `scancel` command hasn't
-        # completed yet, or sacct doesn't show the change in status quick enough.
-        # Relaxing it a bit for now.
-        # assert "CANCELLED" in job_info["State"]
-        assert "CANCELLED" in job_info["State"] or job_info["State"] == "RUNNING"
+    try:
+        if persist:
+            # Job should still be running since we're using `persist` (that's the whole
+            # point.)
+            assert job_info["State"] == "RUNNING"
+        else:
+            # Job should have been cancelled by us after the `echo` process finished.
+            # NOTE: This check is a bit flaky, perhaps our `scancel` command hasn't
+            # completed yet, or sacct doesn't show the change in status quick enough.
+            # Relaxing it a bit for now.
+            # assert "CANCELLED" in job_info["State"]
+            assert "CANCELLED" in job_info["State"] or job_info["State"] == "RUNNING"
+    finally:
+        login_node.run(f"scancel {job_id}", display=True)
diff --git a/tests/integration/test_slurm_remote.py b/tests/integration/test_slurm_remote.py
index 731a9f22..0531acab 100644
--- a/tests/integration/test_slurm_remote.py
+++ b/tests/integration/test_slurm_remote.py
@@ -23,7 +23,7 @@
 
 from ..cli.common import on_windows
 from ..conftest import launches_jobs
-from .conftest import MAX_JOB_DURATION, SLURM_CLUSTER, hangs_in_github_CI
+from .conftest import SLURM_CLUSTER, hangs_in_github_CI
 
 logger = get_logger(__name__)
 
@@ -219,52 +219,46 @@ def test_ensure_allocation(
     """
     data, remote_runner = salloc_slurm_remote.ensure_allocation()
    assert isinstance(remote_runner, fabric.runners.Remote)
-
-    assert "node_name" in data
-    # NOTE: it very well could be if we also extracted it from the salloc output.
-    assert "jobid" not in data
-    compute_node_from_salloc_output = data["node_name"]
-
-    # Check that salloc was called. This would be printed to stderr by fabric if we were
-    # using `run`, but `ensure_allocation` uses `extract` which somehow doesn't print it
-    salloc_stdout, salloc_stderr = capsys.readouterr()
-    # assert not stdout
-    # assert not stderr
-    assert "salloc: Granted job allocation" in salloc_stdout
-    assert (
-        f"salloc: Nodes {compute_node_from_salloc_output} are ready for job"
-        in salloc_stdout
-    )
-    assert not salloc_stderr
-
-    # Check that the returned remote runner is indeed connected to a *login* node (?!)
-    # NOTE: This is a fabric.runners.Remote, not a Remote or SlurmRemote of milatools,
-    # so calling `.run` doesn't launch a job.
-    result = remote_runner.run("hostname", echo=True, in_stream=False)
-    assert result
-    hostname_from_remote_runner = result.stdout.strip()
-    # BUG: If the `login_node` is a RemoteV2, then this will fail because the hostname
-    # might differ between the two (multiple login nodes in the Mila cluster).
-    result2 = login_node.run("hostname", display=True, hide=False)
-    assert result2
-    hostname_from_login_node_runner = result2.stdout.strip()
-
-    if isinstance(login_node, RemoteV2) and login_node.hostname == "mila":
-        assert hostname_from_remote_runner.startswith("login-")
-        assert hostname_from_login_node_runner.startswith("login-")
-    elif isinstance(login_node, RemoteV1):
-        assert hostname_from_remote_runner == hostname_from_login_node_runner
-
-    print(f"Sleeping for {MAX_JOB_DURATION.total_seconds()}s until job finishes...")
-    time.sleep(MAX_JOB_DURATION.total_seconds())
-
-    # todo: This check is flaky. (the test itself is outdated because it's for RemoteV1)
-    # sacct_output = get_recent_jobs_info(login_node, fields=("JobName", "Node", "State"))
-    # assert (JOB_NAME, compute_node_from_salloc_output, "COMPLETED") in sacct_output or (
-    #     JOB_NAME,
-    #     compute_node_from_salloc_output,
-    #     "TIMEOUT",
-    # ) in sacct_output
+    try:
+        assert "node_name" in data
+        # NOTE: it very well could be if we also extracted it from the salloc output.
+        assert "jobid" not in data
+        compute_node_from_salloc_output = data["node_name"]
+
+        # Check that salloc was called. This would be printed to stderr by fabric if we were
+        # using `run`, but `ensure_allocation` uses `extract` which somehow doesn't print it
+        salloc_stdout, salloc_stderr = capsys.readouterr()
+        # assert not stdout
+        # assert not stderr
+        assert "salloc: Granted job allocation" in salloc_stdout
+        assert (
+            f"salloc: Nodes {compute_node_from_salloc_output} are ready for job"
+            in salloc_stdout
+        )
+        assert not salloc_stderr
+
+        # Check that the returned remote runner is indeed connected to a *login* node (?!)
+        # NOTE: This is a fabric.runners.Remote, not a Remote or SlurmRemote of milatools,
+        # so calling `.run` doesn't launch a job.
+        result = remote_runner.run("hostname", echo=True, in_stream=False)
+        assert result
+        hostname_from_remote_runner = result.stdout.strip()
+        # BUG: If the `login_node` is a RemoteV2, then this will fail because the hostname
+        # might differ between the two (multiple login nodes in the Mila cluster).
+        result2 = login_node.run("hostname", display=True, hide=False)
+        assert result2
+        hostname_from_login_node_runner = result2.stdout.strip()
+
+        if isinstance(login_node, RemoteV2) and login_node.hostname == "mila":
+            assert hostname_from_remote_runner.startswith("login-")
+            assert hostname_from_login_node_runner.startswith("login-")
+        elif isinstance(login_node, RemoteV1):
+            assert hostname_from_remote_runner == hostname_from_login_node_runner
+    finally:
+        result = remote_runner.run("echo $SLURM_JOB_ID", echo=True, in_stream=False)
+        assert result
+        job_id = int(result.stdout.strip())
+        login_node.run(f"scancel {job_id}", display=True, hide=False)
 
 
 @launches_jobs
@@ -289,11 +283,18 @@ def test_ensure_allocation_sbatch(
     node_hostname = job_data["node_name"]
     assert "jobid" in job_data
     job_id_from_sbatch_extract = job_data["jobid"]
+    try:
+        sleep_so_sacct_can_update()
 
-    sleep_so_sacct_can_update()
-
-    job_infos = get_recent_jobs_info(login_node, fields=("JobId", "JobName", "Node"))
-    # NOTE: `.extract`, used by `ensure_allocation`, actually returns the full node
-    # hostname as an output (e.g. cn-a001.server.mila.quebec), but sacct only shows the
-    # node name.
-    assert (job_id_from_sbatch_extract, job_name, node_hostname) in job_infos
+        job_infos = get_recent_jobs_info(
+            login_node, fields=("JobId", "JobName", "Node")
+        )
+        # NOTE: `.extract`, used by `ensure_allocation`, actually returns the full node
+        # hostname as an output (e.g. cn-a001.server.mila.quebec), but sacct only shows the
+        # node name.
+        assert (job_id_from_sbatch_extract, job_name, node_hostname) in job_infos
+    finally:
+        job_id_from_sbatch_extract = int(job_id_from_sbatch_extract)
+        login_node.run(
+            f"scancel {job_id_from_sbatch_extract}", display=True, hide=False
+        )
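Note on the pattern: every test touched by this patch now wraps its assertions in `try:` and runs `scancel` in a `finally:` block, so a failing assertion can no longer leak a SLURM allocation. The same idea can be expressed once as a teardown fixture; the sketch below is purely illustrative and is not the actual `launches_jobs` marker/fixture from `tests/conftest.py`. It assumes a `login_node` fixture exposing the `.run(cmd, display=...)` interface used in the patch, and the names `cancel_jobs_at_teardown` and `test_example` are made up.

```python
# Illustrative sketch only, not the repo's `launches_jobs` implementation.
# Assumes a `login_node` fixture with `.run(cmd, display=...)` returning an
# object with `.stdout`, as the tests above already rely on.
import pytest


@pytest.fixture
def cancel_jobs_at_teardown(login_node):
    """Collect SLURM job ids during a test and `scancel` them afterwards."""
    job_ids: list[int] = []
    yield job_ids
    # Teardown runs even if the test body raised, like the `finally:` blocks above.
    for job_id in job_ids:
        login_node.run(f"scancel {job_id}", display=True)


def test_example(cancel_jobs_at_teardown, login_node):
    # Submit a short job and register it for cleanup.
    # `sbatch --parsable` prints the job id (optionally followed by ";cluster").
    result = login_node.run("sbatch --parsable --wrap 'sleep 60'", display=True)
    cancel_jobs_at_teardown.append(int(result.stdout.strip().split(";")[0]))
    # ... assertions can fail here without leaking the allocation ...
```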