Commit

Merge pull request #527 from opensafely-core/docker-timeouts
fix: retry on docker inspect timeouts
bloodearnest authored Dec 5, 2022
2 parents 0c11aab + 1925c26 commit e8b4fd0
Showing 7 changed files with 126 additions and 5 deletions.
1 change: 1 addition & 0 deletions jobrunner/config.py
@@ -96,6 +96,7 @@ def _is_valid_backend_name(name):

MAX_WORKERS = int(os.environ.get("MAX_WORKERS") or max(cpu_count() - 1, 1))
MAX_DB_WORKERS = int(os.environ.get("MAX_DB_WORKERS") or MAX_WORKERS)
MAX_RETRIES = int(os.environ.get("MAX_RETRIES", 0))

# This is a crude mechanism for preventing a single large JobRequest with lots
# of associated Jobs from hogging all the resources. We want this configurable
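The new setting follows the same environment-variable pattern as the other limits in config.py. Note that the default of 0 means retries are effectively disabled until a backend opts in: the first ExecutorRetry from an executor is already treated as "too many" (see the run.py change below). A minimal sketch of how a deployment might enable retries (the value 5 is purely illustrative):

    import os

    # A deployment opts in by setting MAX_RETRIES before the job-runner
    # process starts.
    os.environ.setdefault("MAX_RETRIES", "5")

    # Same parsing as config.py: if the variable is unset, the default is 0,
    # i.e. retries disabled.
    MAX_RETRIES = int(os.environ.get("MAX_RETRIES", 0))
    print(MAX_RETRIES)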
13 changes: 10 additions & 3 deletions jobrunner/executors/local.py
@@ -12,6 +12,7 @@
from jobrunner.executors.volumes import copy_file, get_volume_api
from jobrunner.job_executor import (
ExecutorAPI,
ExecutorRetry,
ExecutorState,
JobDefinition,
JobResults,
@@ -187,9 +188,15 @@ def cleanup(self, job):

def get_status(self, job):
name = container_name(job)
job_running = docker.container_inspect(
name, "State.Running", none_if_not_exists=True
)
try:
job_running = docker.container_inspect(
name,
"State.Running",
none_if_not_exists=True,
timeout=10,
)
except docker.DockerTimeoutError:
            raise ExecutorRetry(f"timed out inspecting container {name}")

if job_running is None:
# no volume for this job found
6 changes: 6 additions & 0 deletions jobrunner/job_executor.py
@@ -61,6 +61,12 @@ class JobResults:
message: str = None


class ExecutorRetry(Exception):
"""Indicates to the job scheduler that there's a temporary issue and to try again later."""

pass


class ExecutorAPI:
"""
API for managing job execution.
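For executor implementations, the contract is simply to raise ExecutorRetry from a polling call when the underlying check fails transiently; the scheduler in run.py then decides whether to retry or give up. A minimal sketch of the pattern, assuming a hypothetical executor whose backend probe can time out (FlakyExecutor and _probe_backend are illustrative names, not part of this codebase):

    from jobrunner.job_executor import ExecutorAPI, ExecutorRetry

    class FlakyExecutor(ExecutorAPI):
        def get_status(self, job):
            try:
                # _probe_backend is a stand-in for whatever remote call the
                # executor needs to make (docker inspect, an HTTP API, ...).
                return self._probe_backend(job)
            except TimeoutError as exc:
                # Temporary failure: ask the scheduler to call get_status()
                # again on a later loop, instead of failing the job outright.
                raise ExecutorRetry(f"backend timed out for job {job.id}") from exc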
5 changes: 4 additions & 1 deletion jobrunner/lib/docker.py
@@ -301,7 +301,7 @@ def container_is_running(name):
return container_inspect(name, "State.Running", none_if_not_exists=True) or False


def container_inspect(name, key="", none_if_not_exists=False):
def container_inspect(name, key="", none_if_not_exists=False, timeout=None):
"""
Retrieves metadata about the named container. By default will return
everything but `key` can be a dotted path to a specific piece of metadata.
@@ -315,7 +315,10 @@ def container_inspect(name, key="", none_if_not_exists=False):
["container", "inspect", "--format", "{{json .%s}}" % key, name],
check=True,
capture_output=True,
timeout=timeout,
)
except subprocess.TimeoutExpired:
raise DockerTimeoutError(f"container_inspect timeout for {name}")
except subprocess.CalledProcessError as e:
if (
none_if_not_exists
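Callers that pass the new timeout argument must now be prepared for DockerTimeoutError rather than a subprocess that hangs indefinitely. A short usage sketch mirroring the local executor change above (the container name is made up):

    from jobrunner.lib import docker

    try:
        running = docker.container_inspect(
            "os-job-example", "State.Running", none_if_not_exists=True, timeout=10
        )
    except docker.DockerTimeoutError:
        # A slow docker daemon is treated as "status unknown", not as fatal.
        running = None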
22 changes: 21 additions & 1 deletion jobrunner/run.py
@@ -14,6 +14,7 @@
from jobrunner.executors import get_executor_api
from jobrunner.job_executor import (
ExecutorAPI,
ExecutorRetry,
ExecutorState,
JobDefinition,
Privacy,
@@ -27,6 +28,13 @@

log = logging.getLogger(__name__)

# used to track the number of times an executor has asked to retry a job
EXECUTOR_RETRIES = {}


class RetriesExceeded(Exception):
pass


class InvalidTransition(Exception):
pass
@@ -188,7 +196,19 @@ def handle_job(job, api, mode=None, paused=None):
)
return

initial_status = api.get_status(definition)
try:
initial_status = api.get_status(definition)
except ExecutorRetry:
retries = EXECUTOR_RETRIES.get(job.id, 0)
if retries >= config.MAX_RETRIES:
raise RetriesExceeded(
f"Too many retries for job {job.id} from executor"
) from ExecutorRetry
else:
EXECUTOR_RETRIES[job.id] = retries + 1
return
else:
EXECUTOR_RETRIES.pop(job.id, None)

# handle the simple no change needed states.
if initial_status.state in STABLE_STATES:
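One subtlety worth spelling out: MAX_RETRIES bounds the number of consecutive ExecutorRetry results tolerated per job, so with the default of 0 the very first one is escalated, and with MAX_RETRIES = 3 it is the fourth consecutive one that raises RetriesExceeded, which is exactly what the tests below exercise. A tiny standalone sketch of that counting:

    MAX_RETRIES = 3
    retries = 0  # per-job counter, as kept in EXECUTOR_RETRIES[job.id]

    for attempt in range(1, 6):
        # pretend every get_status() call raised ExecutorRetry
        if retries >= MAX_RETRIES:
            print(f"attempt {attempt}: RetriesExceeded")  # fires on attempt 4
            break
        retries += 1
        print(f"attempt {attempt}: deferred, retry count now {retries}")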
27 changes: 27 additions & 0 deletions tests/test_local_executor.py
@@ -622,3 +622,30 @@ def test_delete_files_bad_privacy(tmp_work_dir):
populate_workspace("test", "file.txt")
with pytest.raises(Exception):
api.delete_files("test", None, ["file.txt"])


def test_get_status_timeout(tmp_work_dir, monkeypatch):

job = JobDefinition(
id="test_get_status_timeout",
job_request_id="test_request_id",
study=None,
workspace="test",
action="action",
created_at=int(time.time()),
image="ghcr.io/opensafely-core/busybox",
args=["sleep", "1"],
env={},
inputs=[],
output_spec={},
allow_database_access=False,
)

def inspect(*args, **kwargs):
raise docker.DockerTimeoutError("timeout")

monkeypatch.setattr(local.docker, "container_inspect", inspect)
api = local.LocalDockerAPI()

with pytest.raises(local.ExecutorRetry):
api.get_status(job)
57 changes: 57 additions & 0 deletions tests/test_run.py
@@ -575,6 +575,63 @@ def error(*args, **kwargs):
assert spans[-1].name == "JOB"


def test_handle_single_job_retries_exceeded(db, monkeypatch):
api = StubExecutorAPI()
job = api.add_test_job(ExecutorState.EXECUTED, State.RUNNING, StatusCode.EXECUTED)

def retry(*args, **kwargs):
raise run.ExecutorRetry("retry")

monkeypatch.setattr(api, "get_status", retry)
monkeypatch.setattr(config, "MAX_RETRIES", 3)

run.handle_single_job(job, api)
run.handle_single_job(job, api)
run.handle_single_job(job, api)

with pytest.raises(run.RetriesExceeded):
run.handle_single_job(job, api)

assert job.state is State.FAILED

spans = get_trace()
assert spans[-3].name == "EXECUTED"
error_span = spans[-2]
assert error_span.name == "INTERNAL_ERROR"
assert error_span.status.status_code == trace.StatusCode.ERROR
assert error_span.events[0].name == "exception"
assert (
error_span.events[0].attributes["exception.message"]
== f"Too many retries for job {job.id} from executor"
)
assert spans[-1].name == "JOB"


def test_handle_single_job_retries_not_exceeded(db, monkeypatch):
api = StubExecutorAPI()
job = api.add_test_job(ExecutorState.EXECUTED, State.RUNNING, StatusCode.EXECUTED)

def retry(*args, **kwargs):
raise run.ExecutorRetry("retry")

orig_get_status = api.get_status

monkeypatch.setattr(api, "get_status", retry)
monkeypatch.setattr(config, "MAX_RETRIES", 3)

run.handle_single_job(job, api)
run.handle_single_job(job, api)
run.handle_single_job(job, api)

monkeypatch.setattr(api, "get_status", orig_get_status)

# do *not* blow up this time
run.handle_single_job(job, api)

assert job.state is State.RUNNING
assert job.id not in run.EXECUTOR_RETRIES


def test_handle_single_job_shortcuts_synchronous(db):
api = StubExecutorAPI()
job = api.add_test_job(ExecutorState.UNKNOWN, State.PENDING, StatusCode.CREATED)