From 6f515976f7bd49651f4064762d5e0c6920f77c09 Mon Sep 17 00:00:00 2001 From: Sylvain <35365065+sanderegg@users.noreply.github.com> Date: Mon, 13 Nov 2023 18:08:58 +0100 Subject: [PATCH 1/2] =?UTF-8?q?=E2=99=BB=EF=B8=8FAutoscaling:=20Debug=20lo?= =?UTF-8?q?gs=20for=20issue=20with=20scaling=20up=20(#5025)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../utils/utils_docker.py | 33 +++++++++++------- services/autoscaling/tests/unit/conftest.py | 1 + .../unit/test_modules_auto_scaling_dynamic.py | 34 +++++++++++++++---- 3 files changed, 48 insertions(+), 20 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py b/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py index 0e5e88585c2..65b8ce8af05 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py @@ -107,21 +107,24 @@ def _check_if_node_is_removable(node: Node) -> bool: def _is_task_waiting_for_resources(task: Task) -> bool: # NOTE: https://docs.docker.com/engine/swarm/how-swarm-mode-works/swarm-task-states/ - if ( - not task.Status - or not task.Status.State - or not task.Status.Message - or not task.Status.Err + with log_context( + logger, level=logging.DEBUG, msg=f"_is_task_waiting_for_resources: {task}" ): - return False - return ( - task.Status.State == TaskState.pending - and task.Status.Message == _PENDING_DOCKER_TASK_MESSAGE - and ( - _INSUFFICIENT_RESOURCES_DOCKER_TASK_ERR in task.Status.Err - or _NOT_SATISFIED_SCHEDULING_CONSTRAINTS_TASK_ERR in task.Status.Err + if ( + not task.Status + or not task.Status.State + or not task.Status.Message + or not task.Status.Err + ): + return False + return ( + task.Status.State == TaskState.pending + and task.Status.Message == _PENDING_DOCKER_TASK_MESSAGE + and ( + _INSUFFICIENT_RESOURCES_DOCKER_TASK_ERR in task.Status.Err + or 
_NOT_SATISFIED_SCHEDULING_CONSTRAINTS_TASK_ERR in task.Status.Err + ) ) - ) async def _associated_service_has_no_node_placement_contraints( @@ -187,6 +190,10 @@ async def pending_service_tasks_with_insufficient_resources( ) sorted_tasks = sorted(tasks, key=_by_created_dt) + logger.debug( + "found following tasks that might trigger autoscaling: %s", + [task.ID for task in tasks], + ) return [ task diff --git a/services/autoscaling/tests/unit/conftest.py b/services/autoscaling/tests/unit/conftest.py index 619ec908545..e388acba225 100644 --- a/services/autoscaling/tests/unit/conftest.py +++ b/services/autoscaling/tests/unit/conftest.py @@ -514,6 +514,7 @@ def aws_allowed_ec2_instance_type_names() -> list[InstanceTypeType]: "t2.2xlarge", "g3.4xlarge", "g4dn.2xlarge", + "g4dn.8xlarge", "r5n.4xlarge", "r5n.8xlarge", ] diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 0044c157b0d..cb41d2d3278 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -379,7 +379,9 @@ async def _assert_ec2_instances( assert len(all_instances["Reservations"]) == num_reservations for reservation in all_instances["Reservations"]: assert "Instances" in reservation - assert len(reservation["Instances"]) == num_instances + assert ( + len(reservation["Instances"]) == num_instances + ), f"created {num_instances} instances of {reservation['Instances'][0]['InstanceType'] if num_instances > 0 else 'n/a'}" for instance in reservation["Instances"]: assert "InstanceType" in instance assert instance["InstanceType"] == instance_type @@ -440,7 +442,7 @@ async def _assert_ec2_instances( ), ], ) -async def test_cluster_scaling_up_and_down( +async def test_cluster_scaling_up_and_down( # noqa: PLR0915 minimal_configuration: None, service_monitored_labels: dict[DockerLabelKey, str], app_settings: 
ApplicationSettings, @@ -686,6 +688,7 @@ async def test_cluster_scaling_up_and_down( @dataclass(frozen=True) class _ScaleUpParams: + imposed_instance_type: str | None service_resources: Resources num_services: int expected_instance_type: str @@ -697,15 +700,28 @@ class _ScaleUpParams: [ pytest.param( _ScaleUpParams( + imposed_instance_type=None, service_resources=Resources( cpus=5, ram=parse_obj_as(ByteSize, "36Gib") ), num_services=10, - expected_instance_type="g3.4xlarge", + expected_instance_type="g3.4xlarge", # 1 GPU, 16 CPUs, 122GiB expected_num_instances=4, ), id="sim4life-light", - ) + ), + pytest.param( + _ScaleUpParams( + imposed_instance_type="g4dn.8xlarge", + service_resources=Resources( + cpus=5, ram=parse_obj_as(ByteSize, "20480MB") + ), + num_services=7, + expected_instance_type="g4dn.8xlarge", # 1 GPU, 32 CPUs, 128GiB + expected_num_instances=2, + ), + id="sim4life", + ), ], ) async def test_cluster_scaling_up_starts_multiple_instances( @@ -714,13 +730,12 @@ async def test_cluster_scaling_up_starts_multiple_instances( app_settings: ApplicationSettings, initialized_app: FastAPI, create_service: Callable[ - [dict[str, Any], dict[DockerLabelKey, str], str], Awaitable[Service] + [dict[str, Any], dict[DockerLabelKey, str], str, list[str]], Awaitable[Service] ], task_template: dict[str, Any], create_task_reservations: Callable[[int, int], dict[str, Any]], ec2_client: EC2Client, mock_tag_node: mock.Mock, - fake_node: Node, scale_up_params: _ScaleUpParams, mock_rabbitmq_post_message: mock.Mock, mock_find_node_with_name: mock.Mock, @@ -741,6 +756,11 @@ async def test_cluster_scaling_up_starts_multiple_instances( ), service_monitored_labels, "pending", + [ + f"node.labels.{DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY}=={scale_up_params.imposed_instance_type}" + ] + if scale_up_params.imposed_instance_type + else [], ) for _ in range(scale_up_params.num_services) ) @@ -756,7 +776,7 @@ async def test_cluster_scaling_up_starts_multiple_instances( 
ec2_client, num_reservations=1, num_instances=scale_up_params.expected_num_instances, - instance_type="g3.4xlarge", + instance_type=scale_up_params.expected_instance_type, instance_state="running", ) From 9cabec07f034fa1bd1e76dacbba54e07b02f39bb Mon Sep 17 00:00:00 2001 From: Pedro Crespo-Valero <32402063+pcrespov@users.noreply.github.com> Date: Mon, 13 Nov 2023 22:27:34 +0100 Subject: [PATCH 2/2] =?UTF-8?q?=F0=9F=93=9D=20Maintenance:=20cleanup=20rea?= =?UTF-8?q?dme=20and=20vscode=20settings=20(#5023)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .vscode/extensions.json | 4 ++++ .vscode/settings.template.json | 26 ++++++++------------------ README.md | 12 ++++-------- 3 files changed, 16 insertions(+), 26 deletions(-) diff --git a/.vscode/extensions.json b/.vscode/extensions.json index 24bd21ba4e7..65f20197c67 100644 --- a/.vscode/extensions.json +++ b/.vscode/extensions.json @@ -3,9 +3,13 @@ "charliermarsh.ruff", "eamodio.gitlens", "exiasr.hadolint", + "ms-azuretools.vscode-docker", + "ms-python.black-formatter", + "ms-python.pylint", "ms-python.python", "njpwerner.autodocstring", "samuelcolvin.jinjahtml", "timonwong.shellcheck", + "vscode-icons-team.vscode-icons", ] } diff --git a/.vscode/settings.template.json b/.vscode/settings.template.json index 731b7d3b0f8..3a4405e9594 100644 --- a/.vscode/settings.template.json +++ b/.vscode/settings.template.json @@ -1,5 +1,7 @@ // This is a template. 
Clone and replace extension ".template.json" by ".json" { + "autoDocstring.docstringFormat": "pep257", + "editor.tabSize": 2, "editor.insertSpaces": true, "editor.detectIndentation": false, @@ -27,9 +29,8 @@ "**/.git/subtree-cache/**": true, "**/node_modules/*/**": true }, - "python.formatting.autopep8Args": [ - "--max-line-length 140" - ], + "python.analysis.autoImportCompletions": true, + "python.analysis.typeCheckingMode": "basic", "python.analysis.extraPaths": [ "./packages/models-library/src", "./packages/postgres-database/src", @@ -54,26 +55,15 @@ "[makefile]": { "editor.insertSpaces": false }, - "python.testing.pytestEnabled": true, - "autoDocstring.docstringFormat": "pep257", "hadolint.hadolintPath": "${workspaceFolder}/scripts/hadolint.bash", "hadolint.cliOptions": [], - "shellcheck.executablePath": "${workspaceFolder}/scripts/shellcheck.bash", - "shellcheck.run": "onSave", - "shellcheck.enableQuickFix": true, - "python.formatting.provider": "black", - "isort.path": [ - "${workspaceFolder}/.venv/bin/isort" - ], - "isort.args": [ - "--settings-path=${workspaceFolder}/.isort.cfg" - ], - "python.analysis.autoImportCompletions": true, - "python.analysis.typeCheckingMode": "basic", "ruff.lint.args": [ "--config=${workspaceFolder}/.ruff.toml" ], "ruff.path": [ "${workspaceFolder}/.venv/bin/ruff" - ] + ], + "shellcheck.executablePath": "${workspaceFolder}/scripts/shellcheck.bash", + "shellcheck.run": "onSave", + "shellcheck.enableQuickFix": true } diff --git a/README.md b/README.md index 02051728b81..1cb462938a5 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@

- [![black_badge]](https://github.com/psf/black) [![ci_badge]](https://github.com/ITISFoundation/osparc-simcore/actions/workflows/ci-testing-deploy.yml) @@ -28,7 +27,6 @@ [osparc_status]:https://img.shields.io/badge/dynamic/json?label=osparc.io&query=%24.status.description&url=https%3A%2F%2Fstatus.osparc.io%2Fapi%2Fv2%2Fstatus.json - The SIM-CORE, named **o2S2PARC** – **O**pen **O**nline **S**imulations for **S**timulating **P**eripheral **A**ctivity to **R**elieve **C**onditions – is one of the three integrative cores of the SPARC program’s Data Resource Center (DRC). The aim of o2S2PARC is to establish a comprehensive, freely accessible, intuitive, and interactive online platform for simulating peripheral nerve system neuromodulation/ stimulation and its impact on organ physiology in a precise and predictive manner. To achieve this, the platform will comprise both state-of-the art and highly detailed animal and human anatomical models with realistic tissue property distributions that make it possible to perform simulations ranging from the molecular scale up to the complexity of the human body. 
@@ -72,15 +70,14 @@ Services are deployed in two stacks:``simcore-stack`` comprises all core-service To build and run: - git -- docker +- [docker](https://docs.docker.com/engine/install/ubuntu/#installation-methods) - make >=4.2 - awk, jq (optional tools within makefiles) To develop, in addition: -- python 3.10 -- nodejs for client part (this dependency will be deprecated soon) -- swagger-cli (make sure to have a recent version of nodejs) +- *python 3.10*: we recommend using the python manager [pyenv](https://brain2life.hashnode.dev/how-to-install-pyenv-python-version-manager-on-ubuntu-2004) +- *nodejs* for client part: we recommend using the node manager [nvm](https://github.com/nvm-sh/nvm#install--update-script) - [vscode] (highly recommended) To verify current base OS, Docker and Python build versions have a look at: @@ -135,7 +132,7 @@ To upgrade a single requirement named `fastapi`run: - [Git release workflow](docs/releasing-workflow-instructions.md) - Public [releases](https://github.com/ITISFoundation/osparc-simcore/releases) -- Production in https://osparc.io +- Production in <https://osparc.io> - [Staging instructions](docs/releasing-workflow-instructions.md#staging-example) - [User Manual](https://itisfoundation.github.io/osparc-manual/) @@ -191,6 +188,5 @@ This project is licensed under the terms of the [MIT license](LICENSE).

-[chocolatey]:https://chocolatey.org/ [vscode]:https://code.visualstudio.com/ [WSL2]:https://docs.microsoft.com/en-us/windows/wsl