Skip to content

Commit

Permalink
Make bsub tests faster (#9350)
Browse files Browse the repository at this point in the history
* Remove needless wait in driver execute

Do not wait after the last possible attempt in the driver's _execute_with_retry

* Fix naming of max attempts in driver

* Set max attempts to 1 in the LSF faulty-bsub test

Remove retries caused by missing stdout
  • Loading branch information
JHolba authored Nov 26, 2024
1 parent d7829ac commit ca1806e
Show file tree
Hide file tree
Showing 6 changed files with 16 additions and 15 deletions.
6 changes: 3 additions & 3 deletions src/ert/scheduler/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ async def _execute_with_retry(
_logger = driverlogger or logging.getLogger(__name__)
error_message: Optional[str] = None

for _ in range(total_attempts):
for i in range(total_attempts):
try:
process = await asyncio.create_subprocess_exec(
*cmd_with_args,
Expand Down Expand Up @@ -150,8 +150,8 @@ async def _execute_with_retry(
)
_logger.error(error_message)
return False, error_message

await asyncio.sleep(retry_interval)
if i < (total_attempts - 1):
await asyncio.sleep(retry_interval)
error_message = (
f'Command "{shlex.join(cmd_with_args)}" failed after {total_attempts} attempts '
f"with {outputs}"
Expand Down
4 changes: 2 additions & 2 deletions src/ert/scheduler/lsf_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ def __init__(
self._max_attempt: int = 100
self._sleep_time_between_bkills = 30
self._sleep_time_between_cmd_retries = 3
self._bsub_retries = 10
self._max_bsub_attempts = 10

self._poll_period = _POLL_PERIOD

Expand Down Expand Up @@ -363,7 +363,7 @@ async def submit(
bsub_with_args,
retry_on_empty_stdout=True,
retry_codes=(FLAKY_SSH_RETURNCODE,),
total_attempts=self._bsub_retries,
total_attempts=self._max_bsub_attempts,
retry_interval=self._sleep_time_between_cmd_retries,
error_on_msgs=BSUB_FAILURE_MESSAGES,
)
Expand Down
6 changes: 3 additions & 3 deletions src/ert/scheduler/openpbs_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def __init__(
self._num_cpus_per_node: Optional[int] = num_cpus_per_node
self._cluster_label: Optional[str] = cluster_label
self._job_prefix = job_prefix
self._num_pbs_cmd_retries = 10
self._max_pbs_cmd_attempts = 10
self._sleep_time_between_cmd_retries = 2
self._poll_period = _POLL_PERIOD

Expand Down Expand Up @@ -268,7 +268,7 @@ async def submit(
QSUB_CONNECTION_REFUSED,
),
stdin=script.encode(encoding="utf-8"),
total_attempts=self._num_pbs_cmd_retries,
total_attempts=self._max_pbs_cmd_attempts,
retry_interval=self._sleep_time_between_cmd_retries,
driverlogger=logger,
)
Expand Down Expand Up @@ -298,7 +298,7 @@ async def kill(self, iens: int) -> None:
[str(self._qdel_cmd), str(job_id)],
retry_codes=(QDEL_REQUEST_INVALID,),
accept_codes=(QDEL_JOB_HAS_FINISHED,),
total_attempts=self._num_pbs_cmd_retries,
total_attempts=self._max_pbs_cmd_attempts,
retry_interval=self._sleep_time_between_cmd_retries,
driverlogger=logger,
)
Expand Down
4 changes: 2 additions & 2 deletions src/ert/scheduler/slurm_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def __init__(
self._include_hosts = include_hosts

self._sbatch = sbatch_cmd
self._sbatch_retries = 1
self._max_sbatch_attempts = 1

self._scancel = scancel_cmd
self._squeue = squeue_cmd
Expand Down Expand Up @@ -217,7 +217,7 @@ async def submit(
sbatch_with_args,
retry_on_empty_stdout=True,
retry_codes=(),
total_attempts=self._sbatch_retries,
total_attempts=self._max_sbatch_attempts,
retry_interval=self._sleep_time_between_cmd_retries,
)
if not process_success:
Expand Down
5 changes: 3 additions & 2 deletions tests/ert/unit_tests/scheduler/test_lsf_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,7 @@ async def test_faulty_bsub(monkeypatch, tmp_path, bsub_script, expectation):
bsub_path.write_text(f"#!/bin/sh\n{bsub_script}")
bsub_path.chmod(bsub_path.stat().st_mode | stat.S_IEXEC)
driver = LsfDriver()
driver._max_bsub_attempts = 1
with expectation:
await driver.submit(0, "sleep")

Expand Down Expand Up @@ -578,7 +579,7 @@ async def test_that_bsub_will_retry_and_fail(
bsub_path.write_text(f"#!/bin/sh\necho {error_msg} >&2\nexit {exit_code}")
bsub_path.chmod(bsub_path.stat().st_mode | stat.S_IEXEC)
driver = LsfDriver()
driver._bsub_retries = 2
driver._max_bsub_attempts = 2
driver._sleep_time_between_cmd_retries = 0.2
match_str = (
f'failed after 2 attempts with exit code {exit_code}.*error: "{error_msg if error_msg else "<empty>"}"'
Expand Down Expand Up @@ -662,7 +663,7 @@ async def test_that_bsub_will_retry_and_succeed(
)
bsub_path.chmod(bsub_path.stat().st_mode | stat.S_IEXEC)
driver = LsfDriver()
driver._bsub_retries = 2
driver._max_bsub_attempts = 2
driver._sleep_time_between_cmd_retries = 0.2
await driver.submit(0, "sleep 10")

Expand Down
6 changes: 3 additions & 3 deletions tests/ert/unit_tests/scheduler/test_openpbs_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ async def test_that_qsub_will_retry_and_fail(
qsub_path.write_text(f"#!/bin/sh\necho {error_msg} >&2\nexit {exit_code}")
qsub_path.chmod(qsub_path.stat().st_mode | stat.S_IEXEC)
driver = OpenPBSDriver()
driver._num_pbs_cmd_retries = 2
driver._max_pbs_cmd_attempts = 2
driver._sleep_time_between_cmd_retries = 0.2
match_str = (
f'failed after 2 attempts with exit code {exit_code}.*error: "{error_msg}"'
Expand Down Expand Up @@ -452,7 +452,7 @@ async def test_that_qsub_will_retry_and_succeed(
)
qsub_path.chmod(qsub_path.stat().st_mode | stat.S_IEXEC)
driver = OpenPBSDriver()
driver._num_pbs_cmd_retries = 2
driver._max_pbs_cmd_attempts = 2
driver._sleep_time_between_cmd_retries = 0.2
await driver.submit(0, "sleep 10")

Expand Down Expand Up @@ -490,7 +490,7 @@ async def test_that_qdel_will_retry_and_succeed(
)
qdel_path.chmod(qdel_path.stat().st_mode | stat.S_IEXEC)
driver = OpenPBSDriver()
driver._num_pbs_cmd_retries = 2
driver._max_pbs_cmd_attempts = 2
driver._retry_pbs_cmd_interval = 0.2
driver._iens2jobid[0] = 111
await driver.kill(0)
Expand Down

0 comments on commit ca1806e

Please sign in to comment.