From 27fbd5ad86045a8e9aa24c86aada617373735f9e Mon Sep 17 00:00:00 2001
From: Fabrice Normandin <normandf@mila.quebec>
Date: Mon, 3 Jun 2024 10:58:34 -0400
Subject: [PATCH] Simplify `test_code.py`'s `existing_job` fixture

Signed-off-by: Fabrice Normandin <normandf@mila.quebec>
---
 tests/integration/test_code.py | 46 ++++++++++++++--------------------
 1 file changed, 19 insertions(+), 27 deletions(-)

diff --git a/tests/integration/test_code.py b/tests/integration/test_code.py
index 40f3e3f5..ec580ae8 100644
--- a/tests/integration/test_code.py
+++ b/tests/integration/test_code.py
@@ -26,7 +26,7 @@
 from milatools.utils.compute_node import (
     ComputeNode,
     get_queued_milatools_job_ids,
-    sbatch,
+    salloc,
 )
 from milatools.utils.disk_quota import check_disk_quota, check_disk_quota_v1
 from milatools.utils.remote_v1 import RemoteV1
@@ -232,53 +232,45 @@ def mock_which(command: str) -> str | None:
             )
 
 
-@pytest.fixture(scope="session")
-def persistent_job_name(job_name: str) -> str | None:
-    """Name of a job that should be used for tests and not be cancelled when exiting."""
-    return job_name + "_persistent"
-
-
 @pytest_asyncio.fixture(scope="session")
 async def existing_job(
     cluster: str,
     login_node_v2: RemoteV2,
     allocation_flags: list[str],
     job_name: str,
-    persistent_job_name: str,
 ) -> ComputeNode:
     """Gets a compute node connecting to a running job on the cluster.
 
-    This avoids making an allocation if possible, by reusing an already-running job
-    with the name `persistent_job_name` or `job_name`, or any running job if found.
-
-    This will try to use either persistent or temporary jobs (small jobs that are used
-    across test runs, or jobs that are spawned just for a single test run) to avoid
-    making a new allocation on the cluster.
+    This avoids making an allocation if possible, by reusing an already-running job with
+    the name `job_name` if it exists.
     """
     if cluster == "localhost":
         pytest.skip(
             "This test doesn't yet work with the slurm cluster spun up in the GitHub CI."
         )
-    # Try to find a dedicated test job, then a test job from this current test run, then
-    # any running job.
-    for job_name in [persistent_job_name, job_name, None]:
-        existing_test_jobs_on_cluster = await get_queued_milatools_job_ids(
-            login_node_v2, job_name=job_name
-        )
-        # todo: filter to use only the ones that are expected to be up for a little while
-        # longer (e.g. 2-3 minutes)
-        for job_id in existing_test_jobs_on_cluster:
+
+    existing_test_jobs_on_cluster = await get_queued_milatools_job_ids(
+        login_node_v2, job_name=job_name
+    )
+    # todo: filter to use only the ones that are expected to be up for a little while
+    # longer (e.g. 2-3 minutes)
+    for job_id in existing_test_jobs_on_cluster:
+        try:
+            # Note: Connecting to a compute node runs a command with `srun`, so it will
+            # raise an error if the job is no longer running.
             compute_node = await ComputeNode.connect(login_node_v2, job_id)
+        except Exception as exc:
+            logger.debug(f"Unable to reuse job {job_id}: {exc}")
+        else:
             logger.info(
                 f"Reusing existing test job with name {job_name} on the cluster: {job_id}"
             )
             return compute_node
     logger.info(
-        "Unable to find existing test jobs on the cluster. Allocating a new one that "
-        "will persist after tests are finished."
+        "Unable to find existing test jobs on the cluster. Allocating a new one."
     )
-    compute_node = await sbatch(
-        login_node_v2, sbatch_flags=allocation_flags, job_name=persistent_job_name
+    compute_node = await salloc(
+        login_node_v2, salloc_flags=allocation_flags, job_name=job_name
     )
     return compute_node