From fb835a2cb16d9dd1e12b9ae837677a5abff55454 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guilhem=20Barth=C3=A9s?= Date: Mon, 2 Sep 2024 15:44:18 +0200 Subject: [PATCH 1/8] chore: change default value for `IMAGE_BUILD_TIMEOUT` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Guilhem Barthés --- backend/backend/settings/deps/image_build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/backend/settings/deps/image_build.py b/backend/backend/settings/deps/image_build.py index fb348c930..375336e5a 100644 --- a/backend/backend/settings/deps/image_build.py +++ b/backend/backend/settings/deps/image_build.py @@ -1,7 +1,7 @@ import os # How long we wait before throwing errors, in seconds -IMAGE_BUILD_TIMEOUT = 3 * 60 * 60 # 3 hours +IMAGE_BUILD_TIMEOUT = 60 * 60 # 1 hour # Delay before two check IMAGE_BUILD_CHECK_DELAY = 5 BUILDER_KANIKO_STARTUP_MAX_ATTEMPTS = int(os.getenv("BUILDER_KANIKO_STARTUP_MAX_ATTEMPTS", 60)) From 4b8880d17e5025caa74d8e6d3832e6e5cfdc8cec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guilhem=20Barth=C3=A9s?= Date: Mon, 2 Sep 2024 15:52:42 +0200 Subject: [PATCH 2/8] feat: add `builder.timeout` to change `IMAGE_BUILD_TIMEOUT` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Guilhem Barthés --- backend/backend/settings/deps/image_build.py | 2 +- charts/substra-backend/Chart.yaml | 2 +- charts/substra-backend/templates/statefulset-builder.yaml | 2 ++ charts/substra-backend/values.yaml | 4 ++++ 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/backend/backend/settings/deps/image_build.py b/backend/backend/settings/deps/image_build.py index 375336e5a..2f92195e9 100644 --- a/backend/backend/settings/deps/image_build.py +++ b/backend/backend/settings/deps/image_build.py @@ -1,7 +1,7 @@ import os # How long we wait before throwing errors, in seconds -IMAGE_BUILD_TIMEOUT = 60 * 60 # 1 hour +IMAGE_BUILD_TIMEOUT = int(os.getenv("IMAGE_BUILD_TIMEOUT", 60 * 60)) # 1 hour # Delay before two check IMAGE_BUILD_CHECK_DELAY = 5 BUILDER_KANIKO_STARTUP_MAX_ATTEMPTS = int(os.getenv("BUILDER_KANIKO_STARTUP_MAX_ATTEMPTS", 60)) diff --git a/charts/substra-backend/Chart.yaml b/charts/substra-backend/Chart.yaml index 3b3b75a7b..e34178caf 100644 --- a/charts/substra-backend/Chart.yaml +++ b/charts/substra-backend/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 name: substra-backend home: https://github.com/Substra -version: 26.10.2 +version: 26.11.0 appVersion: 0.48.0 kubeVersion: '>= 1.19.0-0' description: Main package for Substra diff --git a/charts/substra-backend/templates/statefulset-builder.yaml b/charts/substra-backend/templates/statefulset-builder.yaml index f3d12cc17..3f81b6d3d 100644 --- a/charts/substra-backend/templates/statefulset-builder.yaml +++ b/charts/substra-backend/templates/statefulset-builder.yaml @@ -199,6 +199,8 @@ spec: value: {{ toYaml .Values.builder.kanikoStartup.maxAttempts | quote }} - name: BUILDER_KANIKO_STARTUP_PENDING_STATE_WAIT_SECONDS value: {{ toYaml .Values.builder.kanikoStartup.checkDelay | quote }} + - name: IMAGE_BUILD_TIMEOUT + value: {{ toYaml .Values.builder.timeout | quote }} ports: - name: http containerPort: 8000 diff --git a/charts/substra-backend/values.yaml b/charts/substra-backend/values.yaml index 5ea060618..32ba8b9db 100644 --- a/charts/substra-backend/values.yaml +++ b/charts/substra-backend/values.yaml @@ -583,6 +583,10 @@ builder: ## concurrency: 1 + ## @param builder.timeout Number of seconds to wait before failing a build + ## + timeout: 3600 + ## @param builder.kanikoStartup.maxAttempts Number of checks done before considering a kaniko pod has failed to spawn ## @param builder.kanikoStartup.checkDelay Time, in seconds, to wait when a container is in pending state before checking again its status ## From 2ba4dc2c4d1703f6aa9a29b7de1e79c91e1dcbb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guilhem=20Barth=C3=A9s?= Date: Mon, 2 Sep 2024 18:05:37 +0200 Subject: [PATCH 3/8] feat: add test for `substrap.lock_local.lock_resource` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Guilhem Barthés --- backend/substrapp/tests/test_lock_local.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 backend/substrapp/tests/test_lock_local.py diff --git a/backend/substrapp/tests/test_lock_local.py b/backend/substrapp/tests/test_lock_local.py new file mode 100644 index 000000000..ea38b185a --- /dev/null +++ b/backend/substrapp/tests/test_lock_local.py @@ -0,0 +1,19 @@ +from pathlib import Path + +import pytest + +from substrapp import lock_local + + +# Test that `lock_local.lock_resource() returns the correct excception +def test_lock_local_timeout(): + resource_type = "test" + unique_identifier = "1" + # Create fake lock + slug = f"{resource_type}_{unique_identifier}" + lock_path = Path(lock_local._get_lock_file_path(slug)) + lock_path.write_text("unique_id") + + with pytest.raises(Exception): + with lock_local.lock_resource(resource_type, unique_identifier, timeout=1): + pass From 07a84c29165f27e1d7caae0811285d84df76c9fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guilhem=20Barth=C3=A9s?= Date: Mon, 2 Sep 2024 18:06:20 +0200 Subject: [PATCH 4/8] chore: remove unused `IMAGE_BUILD_TIMEOUT` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Guilhem Barthés --- backend/substrapp/compute_tasks/image_builder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/substrapp/compute_tasks/image_builder.py b/backend/substrapp/compute_tasks/image_builder.py index fc4d50429..318cb925e 100644 --- a/backend/substrapp/compute_tasks/image_builder.py +++ b/backend/substrapp/compute_tasks/image_builder.py @@ -12,7 +12,6 @@ logger = structlog.get_logger(__name__) -IMAGE_BUILD_TIMEOUT = settings.IMAGE_BUILD_TIMEOUT IMAGE_BUILD_CHECK_DELAY = settings.IMAGE_BUILD_CHECK_DELAY REGISTRY = settings.REGISTRY REGISTRY_SCHEME = settings.REGISTRY_SCHEME From b10cc083a9c59df9079f3e2aeaf817d4cc3f7195 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guilhem=20Barth=C3=A9s?= Date: Mon, 2 Sep 2024 18:07:17 +0200 Subject: [PATCH 5/8] fix: replace `Exception` by `LockError` as `Exception` does not accept custom named arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Guilhem Barthés --- backend/substrapp/exceptions.py | 10 ++++++++++ backend/substrapp/lock_local.py | 4 +++- backend/substrapp/tests/test_lock_local.py | 3 ++- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/backend/substrapp/exceptions.py b/backend/substrapp/exceptions.py index b3b64c1c8..416849a41 100644 --- a/backend/substrapp/exceptions.py +++ b/backend/substrapp/exceptions.py @@ -32,3 +32,13 @@ class IntegrityError(Exception): class ServerMediasNoSubdirError(Exception): """A supplied servermedias path didn't contain the expected subdir""" + + +class LockError(Exception): + """Cannot get a lock file to write on""" + + lock_file: str + + def __init__(self, *args, lock_file: str, **kwargs) -> None: + self.lock_file = lock_file + super().__init__(args, kwargs) diff --git a/backend/substrapp/lock_local.py b/backend/substrapp/lock_local.py index c7683f524..ea35f41b7 100644 --- a/backend/substrapp/lock_local.py +++ b/backend/substrapp/lock_local.py @@ -7,6 +7,8 @@ import structlog +from substrapp.exceptions import LockError + logger = structlog.get_logger(__name__) LOCK_FILE_FOLDER = "/tmp" # nosec @@ -47,7 +49,7 @@ def lock_resource( did_wait = True logger.debug("Lock: Waiting for lock to be released", lock_file=lock_file) if time.time() - start > timeout: - raise Exception(f"Failed to acquire lock after {timeout} seconds", lock_file=lock_file) + raise LockError(f"Failed to acquire lock after {timeout} seconds", lock_file=lock_file) time.sleep(delay) logger.debug("Lock: Acquired", lock_file=lock_file) diff --git a/backend/substrapp/tests/test_lock_local.py b/backend/substrapp/tests/test_lock_local.py index ea38b185a..22d019de3 100644 --- a/backend/substrapp/tests/test_lock_local.py +++ b/backend/substrapp/tests/test_lock_local.py @@ -3,6 +3,7 @@ import pytest from substrapp import lock_local +from substrapp.exceptions import LockError # Test that `lock_local.lock_resource() returns the correct excception @@ -14,6 +15,6 @@ def test_lock_local_timeout(): lock_path = Path(lock_local._get_lock_file_path(slug)) lock_path.write_text("unique_id") - with pytest.raises(Exception): + with pytest.raises(LockError): with lock_local.lock_resource(resource_type, unique_identifier, timeout=1): pass From 0e7c45bed82a32db332441a004e34f4461956b2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guilhem=20Barth=C3=A9s?= Date: Tue, 3 Sep 2024 10:23:33 +0200 Subject: [PATCH 6/8] chore: update chart changelog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Guilhem Barthés --- charts/substra-backend/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/charts/substra-backend/README.md b/charts/substra-backend/README.md index 5ff5d83cb..6cbb86751 100644 --- a/charts/substra-backend/README.md +++ b/charts/substra-backend/README.md @@ -236,6 +236,7 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `builder.enabled` | Enable builder service | `true` | | `builder.replicaCount` | Replica count for the builder service | `1` | | `builder.concurrency` | Maximum amount of tasks to process in parallel | `1` | +| `builder.timeout` | Number of seconds to wait before failing a build | `3600` | | `builder.kanikoStartup.maxAttempts` | Number of checks done before considering a kaniko pod has failed to spawn | `60` | | `builder.kanikoStartup.checkDelay` | Time, in seconds, to wait when a container is in pending state before checking again its status | `2` | | `builder.image.registry` | Substra backend server image registry | `ghcr.io` | From f783fa325cdaea83aae69449a022dba4df3c8d9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guilhem=20Barth=C3=A9s?= Date: Tue, 3 Sep 2024 10:28:02 +0200 Subject: [PATCH 7/8] doc: update chart changelog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Guilhem Barthés --- charts/substra-backend/CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/charts/substra-backend/CHANGELOG.md b/charts/substra-backend/CHANGELOG.md index e7e97888b..ac595b37a 100644 --- a/charts/substra-backend/CHANGELOG.md +++ b/charts/substra-backend/CHANGELOG.md @@ -1,6 +1,12 @@ # Changelog +## [26.11.0] - 2024-09-17 + +# Added + +New configurable value `.Values.builder.timeout` configuring the duration before a building task would fail. + ## [26.10.2] - 2024-09-17 # Fixed From 6d33e50f2095ff37e05850187842380c1f553371 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guilhem=20Barth=C3=A9s?= Date: Tue, 3 Sep 2024 12:41:16 +0200 Subject: [PATCH 8/8] doc: change fragment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Guilhem Barthés --- changes/992.added | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/992.added diff --git a/changes/992.added b/changes/992.added new file mode 100644 index 000000000..f72589d6e --- /dev/null +++ b/changes/992.added @@ -0,0 +1 @@ +Add configurable builder timeout through the env var `IMAGE_BUILD_TIMEOUT`